From a62cfc3ccf2d096b28e041f2d8bbbe25f68ad00f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E6=BB=8B=E6=B6=B5=20Zephyr=20Wang?=
 <wi24rd+gh@fedoraproject.org>
Date: Sat, 29 May 2021 18:10:00 +0800
Subject: [PATCH 001/718] Fix typo in common.h

---
 common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common.h b/common.h
index ac795937c..d27fc4cb7 100644
--- a/common.h
+++ b/common.h
@@ -392,7 +392,7 @@ typedef int blasint;
 #endif
 
 /***
-To alloc job_t on heap or statck.
+To alloc job_t on heap or stack.
 please https://github.com/xianyi/OpenBLAS/issues/246
 ***/
 #if defined(OS_WINDOWS)

From 448fe1c42a928f2a090f7ec1a5be9b6e81ccbf0f Mon Sep 17 00:00:00 2001
From: Xianyi Zhang <xianyi@perfxlab.com>
Date: Wed, 24 Aug 2022 19:24:01 +0800
Subject: [PATCH 002/718] Test on PolarFire Soc.

`make NOFORTRAN=1 CC=gcc`
---
 TargetList.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TargetList.txt b/TargetList.txt
index deef75819..99d603d03 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -118,7 +118,7 @@ Z13
 Z14
 
 10.RISC-V 64:
-RISCV64_GENERIC
+RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
 C910V
 
 11.LOONGARCH64:

From bef47917bd72f35c151038fee0cf485445476863 Mon Sep 17 00:00:00 2001
From: Heller Zheng <heller.zheng@microchip.com>
Date: Tue, 15 Nov 2022 00:06:25 -0800
Subject: [PATCH 003/718] Initial version for riscv sifive x280

---
 Makefile.install                       |   7 +
 Makefile.prebuild                      |   8 +
 Makefile.riscv64                       |   8 +
 README.md                              |   5 +
 TargetList.txt                         |   1 +
 benchmark/Makefile                     |   6 +
 common_riscv64.h                       |   4 +
 cpuid_riscv64.c                        |   2 +
 getarch.c                              |  12 +
 kernel/riscv64/KERNEL.x280             | 267 ++++++++
 kernel/riscv64/amax_rvv.c              | 102 +++
 kernel/riscv64/amin_rvv.c              | 102 +++
 kernel/riscv64/asum_rvv.c              |  99 +++
 kernel/riscv64/axpby_rvv.c             | 171 +++++
 kernel/riscv64/axpy_rvv.c              | 109 +++
 kernel/riscv64/copy_rvv.c              |  94 +++
 kernel/riscv64/dot_rvv.c               | 126 ++++
 kernel/riscv64/gemm_beta_rvv.c         |  89 +++
 kernel/riscv64/gemm_ncopy_2_rvv.c      |  92 +++
 kernel/riscv64/gemm_ncopy_4_rvv.c      | 123 ++++
 kernel/riscv64/gemm_ncopy_8_rvv.c      | 164 +++++
 kernel/riscv64/gemm_ncopy_rvv_v1.c     |  76 +++
 kernel/riscv64/gemm_tcopy_2_rvv.c      | 108 +++
 kernel/riscv64/gemm_tcopy_4_rvv.c      | 236 +++++++
 kernel/riscv64/gemm_tcopy_8_rvv.c      | 264 ++++++++
 kernel/riscv64/gemm_tcopy_rvv_v1.c     |  74 +++
 kernel/riscv64/gemmkernel_2x2_rvv.c    | 214 ++++++
 kernel/riscv64/gemmkernel_4x4_rvv.c    | 508 ++++++++++++++
 kernel/riscv64/gemmkernel_rvv_v1x8.c   | 601 +++++++++++++++++
 kernel/riscv64/gemv_n_rvv.c            |  94 +++
 kernel/riscv64/gemv_t_rvv.c            | 119 ++++
 kernel/riscv64/iamax_rvv.c             | 150 +++++
 kernel/riscv64/iamin_rvv.c             | 151 +++++
 kernel/riscv64/imax_rvv.c              | 147 +++++
 kernel/riscv64/imin_rvv.c              | 147 +++++
 kernel/riscv64/izamax_rvv.c            | 162 +++++
 kernel/riscv64/izamin_rvv.c            | 161 +++++
 kernel/riscv64/max_rvv.c               |  98 +++
 kernel/riscv64/min_rvv.c               |  98 +++
 kernel/riscv64/nrm2_rvv.c              | 117 ++++
 kernel/riscv64/rot_rvv.c               | 149 +++++
 kernel/riscv64/scal_rvv.c              |  80 +++
 kernel/riscv64/sum_rvv.c               |  95 +++
 kernel/riscv64/swap_rvv.c              | 142 ++++
 kernel/riscv64/symm_lcopy_rvv_v1.c     | 101 +++
 kernel/riscv64/symm_ucopy_rvv_v1.c     | 100 +++
 kernel/riscv64/symv_L_rvv.c            | 224 +++++++
 kernel/riscv64/symv_U_rvv.c            | 221 +++++++
 kernel/riscv64/trmm_lncopy_rvv_v1.c    | 138 ++++
 kernel/riscv64/trmm_ltcopy_rvv_v1.c    | 134 ++++
 kernel/riscv64/trmm_uncopy_rvv_v1.c    | 136 ++++
 kernel/riscv64/trmm_utcopy_rvv_v1.c    | 133 ++++
 kernel/riscv64/trmmkernel_2x2_rvv.c    | 342 ++++++++++
 kernel/riscv64/trmmkernel_4x4_rvv.c    | 881 +++++++++++++++++++++++++
 kernel/riscv64/trmmkernel_rvv_v1x8.c   | 685 +++++++++++++++++++
 kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 847 ++++++++++++++++++++++++
 kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 840 +++++++++++++++++++++++
 kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 792 ++++++++++++++++++++++
 kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 828 +++++++++++++++++++++++
 kernel/riscv64/trsm_lncopy_rvv_v1.c    | 122 ++++
 kernel/riscv64/trsm_ltcopy_rvv_v1.c    | 122 ++++
 kernel/riscv64/trsm_uncopy_rvv_v1.c    | 121 ++++
 kernel/riscv64/trsm_utcopy_rvv_v1.c    | 123 ++++
 kernel/riscv64/zamax_rvv.c             | 113 ++++
 kernel/riscv64/zamin_rvv.c             | 112 ++++
 kernel/riscv64/zasum_rvv.c             | 108 +++
 kernel/riscv64/zaxpby_rvv.c            | 151 +++++
 kernel/riscv64/zaxpy_rvv.c             | 154 +++++
 kernel/riscv64/zcopy_rvv.c             | 105 +++
 kernel/riscv64/zdot_rvv.c              | 170 +++++
 kernel/riscv64/zgemm_beta_rvv.c        | 117 ++++
 kernel/riscv64/zgemv_n_rvv.c           | 170 +++++
 kernel/riscv64/zgemv_t_rvv.c           | 172 +++++
 kernel/riscv64/znrm2_rvv.c             | 122 ++++
 kernel/riscv64/zrot_rvv.c              | 181 +++++
 kernel/riscv64/zscal_rvv.c             | 148 +++++
 kernel/riscv64/zsum_rvv.c              |  97 +++
 kernel/riscv64/zswap_rvv.c             | 156 +++++
 kernel/riscv64/ztrmmkernel_2x2_rvv.c   | 596 +++++++++++++++++
 param.h                                |  44 ++
 80 files changed, 15188 insertions(+)
 create mode 100644 kernel/riscv64/KERNEL.x280
 create mode 100644 kernel/riscv64/amax_rvv.c
 create mode 100644 kernel/riscv64/amin_rvv.c
 create mode 100644 kernel/riscv64/asum_rvv.c
 create mode 100644 kernel/riscv64/axpby_rvv.c
 create mode 100644 kernel/riscv64/axpy_rvv.c
 create mode 100644 kernel/riscv64/copy_rvv.c
 create mode 100644 kernel/riscv64/dot_rvv.c
 create mode 100644 kernel/riscv64/gemm_beta_rvv.c
 create mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c
 create mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c
 create mode 100644 kernel/riscv64/gemm_ncopy_8_rvv.c
 create mode 100644 kernel/riscv64/gemm_ncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c
 create mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c
 create mode 100644 kernel/riscv64/gemm_tcopy_8_rvv.c
 create mode 100644 kernel/riscv64/gemm_tcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c
 create mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c
 create mode 100644 kernel/riscv64/gemmkernel_rvv_v1x8.c
 create mode 100644 kernel/riscv64/gemv_n_rvv.c
 create mode 100644 kernel/riscv64/gemv_t_rvv.c
 create mode 100644 kernel/riscv64/iamax_rvv.c
 create mode 100644 kernel/riscv64/iamin_rvv.c
 create mode 100644 kernel/riscv64/imax_rvv.c
 create mode 100644 kernel/riscv64/imin_rvv.c
 create mode 100644 kernel/riscv64/izamax_rvv.c
 create mode 100644 kernel/riscv64/izamin_rvv.c
 create mode 100644 kernel/riscv64/max_rvv.c
 create mode 100644 kernel/riscv64/min_rvv.c
 create mode 100644 kernel/riscv64/nrm2_rvv.c
 create mode 100644 kernel/riscv64/rot_rvv.c
 create mode 100644 kernel/riscv64/scal_rvv.c
 create mode 100644 kernel/riscv64/sum_rvv.c
 create mode 100644 kernel/riscv64/swap_rvv.c
 create mode 100644 kernel/riscv64/symm_lcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/symm_ucopy_rvv_v1.c
 create mode 100644 kernel/riscv64/symv_L_rvv.c
 create mode 100644 kernel/riscv64/symv_U_rvv.c
 create mode 100644 kernel/riscv64/trmm_lncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/trmm_ltcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/trmm_uncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/trmm_utcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c
 create mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c
 create mode 100644 kernel/riscv64/trmmkernel_rvv_v1x8.c
 create mode 100644 kernel/riscv64/trsm_kernel_LN_rvv_v1.c
 create mode 100644 kernel/riscv64/trsm_kernel_LT_rvv_v1.c
 create mode 100644 kernel/riscv64/trsm_kernel_RN_rvv_v1.c
 create mode 100644 kernel/riscv64/trsm_kernel_RT_rvv_v1.c
 create mode 100644 kernel/riscv64/trsm_lncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/trsm_ltcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/trsm_uncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/trsm_utcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/zamax_rvv.c
 create mode 100644 kernel/riscv64/zamin_rvv.c
 create mode 100644 kernel/riscv64/zasum_rvv.c
 create mode 100644 kernel/riscv64/zaxpby_rvv.c
 create mode 100644 kernel/riscv64/zaxpy_rvv.c
 create mode 100644 kernel/riscv64/zcopy_rvv.c
 create mode 100644 kernel/riscv64/zdot_rvv.c
 create mode 100644 kernel/riscv64/zgemm_beta_rvv.c
 create mode 100644 kernel/riscv64/zgemv_n_rvv.c
 create mode 100644 kernel/riscv64/zgemv_t_rvv.c
 create mode 100644 kernel/riscv64/znrm2_rvv.c
 create mode 100644 kernel/riscv64/zrot_rvv.c
 create mode 100644 kernel/riscv64/zscal_rvv.c
 create mode 100644 kernel/riscv64/zsum_rvv.c
 create mode 100644 kernel/riscv64/zswap_rvv.c
 create mode 100644 kernel/riscv64/ztrmmkernel_2x2_rvv.c

diff --git a/Makefile.install b/Makefile.install
index 87b5bc870..f1adaa271 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -8,6 +8,7 @@ PREFIX ?= /opt/OpenBLAS
 OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
 OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
 OPENBLAS_BINARY_DIR := $(PREFIX)/bin
+OPENBLAS_RELEASE_DIR := $(PREFIX)/release
 OPENBLAS_BUILD_DIR := $(CURDIR)
 OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE)
 OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
@@ -38,6 +39,7 @@ install : 	lib.grd
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
+	@-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)"
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
 	@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@@ -202,3 +204,8 @@ endif
 	@echo "  endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo Install OK!
+#Generating release tar
+	@echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz
+	@tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release .
+
+	
diff --git a/Makefile.prebuild b/Makefile.prebuild
index 0be4f1274..e6a8eab59 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -55,6 +55,14 @@ ifeq ($(TARGET), C910V)
 TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
 endif
 
+ifeq ($(TARGET), x280)
+TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280
+endif
+
+ifeq ($(TARGET), RISCV64_GENERIC)
+TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
+endif
+
 all: getarch_2nd
 	./getarch_2nd  0 >> $(TARGET_MAKE)
 	./getarch_2nd  1 >> $(TARGET_CONF)
diff --git a/Makefile.riscv64 b/Makefile.riscv64
index ce91e03ec..d6eaf552d 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -2,3 +2,11 @@ ifeq ($(CORE), C910V)
 CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif
+ifeq ($(CORE), x280)
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math
+FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static
+endif
+ifeq ($(CORE), RISCV64_GENERIC)
+CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
+endif
\ No newline at end of file
diff --git a/README.md b/README.md
index 6ce85e08e..6ecb46178 100644
--- a/README.md
+++ b/README.md
@@ -186,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   ```
   (also known to work on C906)
 
+- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0.
+  ```sh
+  make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
+  ```
+
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
diff --git a/TargetList.txt b/TargetList.txt
index deef75819..6c533361e 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -120,6 +120,7 @@ Z14
 10.RISC-V 64:
 RISCV64_GENERIC
 C910V
+x280
 
 11.LOONGARCH64:
 LOONGSONGENERIC
diff --git a/benchmark/Makefile b/benchmark/Makefile
index f2f3b354a..734c83a26 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
 #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
 LIBESSL = -lesslsmp  $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
 
+# x280 temporary workaround for gfortran
+ifeq ($(TARGET), x280)
+CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
+endif
+
+
 ifneq ($(NO_LAPACK), 1)
 GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 		    scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
diff --git a/common_riscv64.h b/common_riscv64.h
index 7ddbe80a4..221a79901 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -92,6 +92,10 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define SEEK_ADDRESS
 
 #if defined(C910V)
+#include <riscv-vector.h>
+#endif
+
+#if defined(x280)
 #include <riscv_vector.h>
 #endif
 
diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 894d2b873..5326787e6 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define CPU_GENERIC   0
 #define CPU_C910V     1
+#define CPU_x280        2
 
 static char *cpuname[] = {
   "RISCV64_GENERIC",
   "C910V"
+  "x280"
 };
 
 int detect(void){
diff --git a/getarch.c b/getarch.c
index cde5b4e83..0d197285c 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1677,6 +1677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define LIBNAME   "c910v"
 #define CORENAME  "C910V"
 #endif
+#endif
+#ifdef FORCE_x280
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "x280"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-Dx280 " \
+       "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "x280"
+#define CORENAME  "x280"
 #else
 #endif
 
diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280
new file mode 100644
index 000000000..2eb60f2b4
--- /dev/null
+++ b/kernel/riscv64/KERNEL.x280
@@ -0,0 +1,267 @@
+# **********************************************************************************
+# Copyright (c) 2022, The OpenBLAS Project
+# All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# 3. Neither the name of the OpenBLAS project nor the names of
+# its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# **********************************************************************************
+
+SAMAXKERNEL  = amax_rvv.c
+DAMAXKERNEL  = amax_rvv.c
+CAMAXKERNEL  = zamax_rvv.c
+ZAMAXKERNEL  = zamax_rvv.c
+
+SAMINKERNEL  = amin_rvv.c
+DAMINKERNEL  = amin_rvv.c
+CAMINKERNEL  = zamin_rvv.c
+ZAMINKERNEL  = zamin_rvv.c
+
+SMAXKERNEL   = max_rvv.c
+DMAXKERNEL   = max_rvv.c
+
+SMINKERNEL   = min_rvv.c
+DMINKERNEL   = min_rvv.c
+
+ISAMAXKERNEL = iamax_rvv.c
+IDAMAXKERNEL = iamax_rvv.c
+ICAMAXKERNEL = izamax_rvv.c
+IZAMAXKERNEL = izamax_rvv.c
+
+ISAMINKERNEL = iamin_rvv.c
+IDAMINKERNEL = iamin_rvv.c
+ICAMINKERNEL = izamin_rvv.c
+IZAMINKERNEL = izamin_rvv.c
+
+ISMAXKERNEL  = imax_rvv.c
+IDMAXKERNEL  = imax_rvv.c
+
+ISMINKERNEL  = imin_rvv.c
+IDMINKERNEL  = imin_rvv.c
+
+SASUMKERNEL  = asum_rvv.c
+DASUMKERNEL  = asum_rvv.c
+CASUMKERNEL  = zasum_rvv.c
+ZASUMKERNEL  = zasum_rvv.c
+
+SSUMKERNEL  = sum_rvv.c
+DSUMKERNEL  = sum_rvv.c
+CSUMKERNEL  = zsum_rvv.c
+ZSUMKERNEL  = zsum_rvv.c
+
+SAXPYKERNEL  = axpy_rvv.c
+DAXPYKERNEL  = axpy_rvv.c
+CAXPYKERNEL  = zaxpy_rvv.c
+ZAXPYKERNEL  = zaxpy_rvv.c
+
+SAXPBYKERNEL  = axpby_rvv.c
+DAXPBYKERNEL  = axpby_rvv.c
+CAXPBYKERNEL  = zaxpby_rvv.c
+ZAXPBYKERNEL  = zaxpby_rvv.c
+
+SCOPYKERNEL  = copy_rvv.c
+DCOPYKERNEL  = copy_rvv.c
+CCOPYKERNEL  = zcopy_rvv.c
+ZCOPYKERNEL  = zcopy_rvv.c
+
+SDOTKERNEL   = dot_rvv.c
+DDOTKERNEL   = dot_rvv.c
+CDOTKERNEL   = zdot_rvv.c
+ZDOTKERNEL   = zdot_rvv.c
+DSDOTKERNEL  = dot_rvv.c
+
+SNRM2KERNEL  = nrm2_rvv.c
+DNRM2KERNEL  = nrm2_rvv.c
+CNRM2KERNEL  = znrm2_rvv.c
+ZNRM2KERNEL  = znrm2_rvv.c
+
+SROTKERNEL   = rot_rvv.c
+DROTKERNEL   = rot_rvv.c
+CROTKERNEL   = zrot_rvv.c
+ZROTKERNEL   = zrot_rvv.c
+
+SSCALKERNEL  = scal_rvv.c
+DSCALKERNEL  = scal_rvv.c
+CSCALKERNEL  = zscal_rvv.c
+ZSCALKERNEL  = zscal_rvv.c
+
+SSWAPKERNEL  = swap_rvv.c
+DSWAPKERNEL  = swap_rvv.c
+CSWAPKERNEL  = zswap_rvv.c
+ZSWAPKERNEL  = zswap_rvv.c
+
+SGEMVNKERNEL = gemv_n_rvv.c
+DGEMVNKERNEL = gemv_n_rvv.c
+CGEMVNKERNEL = zgemv_n_rvv.c
+ZGEMVNKERNEL = zgemv_n_rvv.c
+
+SGEMVTKERNEL = gemv_t_rvv.c
+DGEMVTKERNEL = gemv_t_rvv.c
+CGEMVTKERNEL = zgemv_t_rvv.c
+ZGEMVTKERNEL = zgemv_t_rvv.c
+
+CTRMMKERNEL	= ztrmmkernel_2x2_rvv.c
+ZTRMMKERNEL	= ztrmmkernel_2x2_rvv.c
+
+# SGEMM_UNROLL_N set in params.h
+ifeq ($(SGEMM_UNROLL_N), 2)
+SGEMMKERNEL    =  gemmkernel_2x2_rvv.c
+SGEMMONCOPY    =  gemm_ncopy_2_rvv.c
+SGEMMOTCOPY    =  gemm_tcopy_2_rvv.c
+SGEMMONCOPYOBJ =  sgemm_oncopy.o
+SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+
+STRMMKERNEL	= trmmkernel_2x2_rvv.c
+else ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMKERNEL    =  gemmkernel_4x4_rvv.c
+SGEMMONCOPY    =  gemm_ncopy_4_rvv.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+SGEMMONCOPYOBJ =  sgemm_oncopy.o
+SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+
+STRMMKERNEL	= trmmkernel_4x4_rvv.c
+else ifeq ($(SGEMM_UNROLL_N), 8)
+# UNROLL_M is VLMAX
+SGEMMKERNEL    =  gemmkernel_rvv_v1x8.c
+SGEMMINCOPY    =  gemm_ncopy_rvv_v1.c
+SGEMMITCOPY    =  gemm_tcopy_rvv_v1.c
+SGEMMONCOPY    =  gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c
+SGEMMOTCOPY    =  gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+STRMMKERNEL	= trmmkernel_rvv_v1x8.c 
+
+STRMMUNCOPY_M  =  trmm_uncopy_rvv_v1.c
+STRMMLNCOPY_M  =  trmm_lncopy_rvv_v1.c
+STRMMUTCOPY_M  =  trmm_utcopy_rvv_v1.c
+STRMMLTCOPY_M  =  trmm_ltcopy_rvv_v1.c
+
+SSYMMUCOPY_M   =  symm_ucopy_rvv_v1.c 
+SSYMMLCOPY_M   =  symm_lcopy_rvv_v1.c
+endif
+
+# SGEMM_UNROLL_N set in params.h
+ifeq ($(DGEMM_UNROLL_N), 2)
+DGEMMKERNEL    = gemmkernel_2x2_rvv.c
+DGEMMONCOPY    = gemm_ncopy_2_rvv.c
+DGEMMOTCOPY    = gemm_tcopy_2_rvv.c
+DGEMMONCOPYOBJ = dgemm_oncopy.o
+DGEMMOTCOPYOBJ = dgemm_otcopy.o
+
+DTRMMKERNEL	= trmmkernel_2x2_rvv.c
+else ifeq ($(DGEMM_UNROLL_N), 4)
+DGEMMKERNEL    =  gemmkernel_4x4_rvv.c
+DGEMMONCOPY    =  gemm_ncopy_4_rvv.c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+DGEMMONCOPYOBJ =  dgemm_oncopy.o
+DGEMMOTCOPYOBJ =  dgemm_otcopy.o
+
+DTRMMKERNEL	= trmmkernel_4x4_rvv.c
+else ifeq ($(DGEMM_UNROLL_N), 8)
+# UNROLL_M is VLMAX
+DGEMMKERNEL    =  gemmkernel_rvv_v1x8.c
+DGEMMINCOPY    =  gemm_ncopy_rvv_v1.c
+DGEMMITCOPY    =  gemm_tcopy_rvv_v1.c
+DGEMMONCOPY    =  gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c
+DGEMMOTCOPY    =  gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DTRMMKERNEL = trmmkernel_rvv_v1x8.c
+DTRMMUNCOPY_M  =  trmm_uncopy_rvv_v1.c
+DTRMMLNCOPY_M  =  trmm_lncopy_rvv_v1.c
+DTRMMUTCOPY_M  =  trmm_utcopy_rvv_v1.c
+DTRMMLTCOPY_M  =  trmm_ltcopy_rvv_v1.c
+
+DSYMMUCOPY_M   =  symm_ucopy_rvv_v1.c
+DSYMMLCOPY_M   =  symm_lcopy_rvv_v1.c
+endif
+
+CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ =  cgemm_oncopy.o
+CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+
+ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ =  zgemm_oncopy.o
+ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
+
+STRSMKERNEL_LN	=  trsm_kernel_LN_rvv_v1.c
+STRSMKERNEL_LT	=  trsm_kernel_LT_rvv_v1.c
+STRSMKERNEL_RN	=  trsm_kernel_RN_rvv_v1.c
+STRSMKERNEL_RT	=  trsm_kernel_RT_rvv_v1.c
+
+DTRSMKERNEL_LN	=  trsm_kernel_LN_rvv_v1.c
+DTRSMKERNEL_LT	=  trsm_kernel_LT_rvv_v1.c
+DTRSMKERNEL_RN	=  trsm_kernel_RN_rvv_v1.c
+DTRSMKERNEL_RT	=  trsm_kernel_RT_rvv_v1.c
+
+CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+TRSMCOPYLN_M    = trsm_lncopy_rvv_v1.c
+TRSMCOPYLT_M    = trsm_ltcopy_rvv_v1.c
+TRSMCOPYUN_M    = trsm_uncopy_rvv_v1.c
+TRSMCOPYUT_M    = trsm_utcopy_rvv_v1.c
+
+SSYMV_U_KERNEL =  symv_U_rvv.c 
+SSYMV_L_KERNEL =  symv_L_rvv.c
+DSYMV_U_KERNEL =  symv_U_rvv.c 
+DSYMV_L_KERNEL =  symv_L_rvv.c
+CSYMV_U_KERNEL =  ../generic/zsymv_k.c
+CSYMV_L_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
+
+
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL	= ../generic/cabs.c
+DCABS_KERNEL	= ../generic/cabs.c
+QCABS_KERNEL	= ../generic/cabs.c
+
+ifndef SGEMM_BETA
+SGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = zgemm_beta_rvv.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = zgemm_beta_rvv.c
+endif
diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c
new file mode 100644
index 000000000..c9c6e7f73
--- /dev/null
+++ b/kernel/riscv64/amax_rvv.c
@@ -0,0 +1,102 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT vfmax_vv_f32m8
+#define VFABSV_FLOAT vfabs_v_f32m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#define VFABSV_FLOAT vfabs_v_f64m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT maxf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(maxf);
+
+    FLOAT_V_T vx, vmax;
+    FLOAT_V_T_M1 v_res;
+
+    v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmax = VFMVVF_FLOAT(0.0, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
+        }
+
+    }
+
+    v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+
+    return(maxf);
+}
diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c
new file mode 100644
index 000000000..370b6c338
--- /dev/null
+++ b/kernel/riscv64/amin_rvv.c
@@ -0,0 +1,102 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMINVV_FLOAT vfmin_vv_f32m8
+#define VFABSV_FLOAT vfabs_v_f32m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMINVV_FLOAT vfmin_vv_f64m8
+#define VFABSV_FLOAT vfabs_v_f64m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT minf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(minf);
+
+    FLOAT_V_T vx, vmin;
+    FLOAT_V_T_M1 v_res;
+    
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+            vmin = VFMINVV_FLOAT(vmin, vx, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+            vmin = VFMINVV_FLOAT(vmin, vx, vl);
+        }
+
+    }
+
+    v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+
+    return(minf);
+}
diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c
new file mode 100644
index 000000000..4f711c9be
--- /dev/null
+++ b/kernel/riscv64/asum_rvv.c
@@ -0,0 +1,99 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFADDVV_FLOAT vfadd_vv_f32m8
+#define VFABSV_FLOAT vfabs_v_f32m8
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFADDVV_FLOAT vfadd_vv_f64m8
+#define VFABSV_FLOAT vfabs_v_f64m8
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT asumf = 0.0;
+    if (n <= 0 || inc_x <= 0) return(asumf);
+
+    FLOAT_V_T vx, vsum;
+    FLOAT_V_T_M1 v_res;
+
+    v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vsum = VFMVVF_FLOAT(0.0, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+            vsum = VFADDVV_FLOAT(vsum, vx, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+            vsum = VFADDVV_FLOAT(vsum, vx, vl);
+        }
+
+    }
+
+    v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); 
+    asumf = VFMVFS_FLOAT_M1(v_res);
+    return(asumf);
+}
diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c
new file mode 100644
index 000000000..7c35c563d
--- /dev/null
+++ b/kernel/riscv64/axpby_rvv.c
@@ -0,0 +1,171 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMACCVF_FLOAT vfmacc_vf_f32m8
+#define VFMULVF_FLOAT vfmul_vf_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMACCVF_FLOAT vfmacc_vf_f64m8
+#define VFMULVF_FLOAT vfmul_vf_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#endif
+
+int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
+{
+    FLOAT_V_T vx, vy;
+
+    if ( n < 0     )  return(0);
+
+    if ( beta == 0.0 ) {
+        if ( alpha == 0.0 ) {
+            if (1 == inc_y) {
+                memset(&y[0], 0, n * sizeof(FLOAT));
+            } else {
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                size_t vl = VSETVL(n);
+                vy = VFMVVF_FLOAT(0.0, vl);
+                for ( ; n > 0; n -= vl, y += vl*stride_y) {
+                    vl = VSETVL(n);
+                    VSSEV_FLOAT(y, stride_y, vy, vl);
+                }
+            }
+
+        } else {
+            if ((1 == inc_x) && (1 == inc_y)) {
+                for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+                    vl = VSETVL(n);
+                    vx = VLEV_FLOAT(x, vl);
+                    vy = VFMULVF_FLOAT(vx, alpha, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else if (1 == inc_x) {
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+                    vl = VSETVL(n);
+                    vx = VLEV_FLOAT(x, vl);
+                    vy = VFMULVF_FLOAT(vx, alpha, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            } else if (1 == inc_y) {
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+                    vl = VSETVL(n);
+                    vx = VLSEV_FLOAT(x, stride_x, vl);
+                    vy = VFMULVF_FLOAT(vx, alpha, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else {
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+                    vl = VSETVL(n);
+                    vx = VLSEV_FLOAT(x, stride_x, vl);
+                    vy = VFMULVF_FLOAT(vx, alpha, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            }
+        }
+
+    } else {
+        if ( alpha == 0.0 ) {
+            if (1 == inc_y) {
+                for (size_t vl; n > 0; n -= vl, y += vl) {
+                    vl = VSETVL(n);
+                    vy = VLEV_FLOAT(y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else {
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                for (size_t vl; n > 0; n -= vl, y += vl*inc_y) {
+                    vl = VSETVL(n);
+                    vy = VLSEV_FLOAT(y, stride_y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            }
+
+        } else {
+            if ((1 == inc_x) && (1 == inc_y)) {
+                for (size_t vl; n > 0; n -= vl, y += vl) {
+                    vl = VSETVL(n);
+                    vy = VLEV_FLOAT(y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else if (1 == inc_x) {
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+                    vl = VSETVL(n);
+                    vx = VLEV_FLOAT(x, vl);
+                    vy = VLSEV_FLOAT(y, stride_y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            } else if (1 == inc_y) {
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+                    vl = VSETVL(n);
+                    vx = VLSEV_FLOAT(x, stride_x, vl);
+                    vy = VLEV_FLOAT(y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else {
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+                    vl = VSETVL(n);
+                    vx = VLSEV_FLOAT(x, stride_x, vl);
+                    vy = VLSEV_FLOAT(y, stride_y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            }
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c
new file mode 100644
index 000000000..3986f4e21
--- /dev/null
+++ b/kernel/riscv64/axpy_rvv.c
@@ -0,0 +1,109 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMACCVF_FLOAT vfmacc_vf_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMACCVF_FLOAT vfmacc_vf_f64m8
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    if ( n <= 0    ) return(0);
+    if ( da == 0.0 ) return(0);
+
+    FLOAT_V_T vx, vy;
+
+    if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+            VSEV_FLOAT (y, vy, vl);
+        }
+
+    } else if (1 == inc_y) {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+            VSEV_FLOAT(y, vy, vl);
+        }
+
+    } else if (1 == inc_x) {
+
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+            VSSEV_FLOAT(y, stride_y, vy, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+            VSSEV_FLOAT(y, stride_y, vy, vl);
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c
new file mode 100644
index 000000000..5d5a8bd04
--- /dev/null
+++ b/kernel/riscv64/copy_rvv.c
@@ -0,0 +1,94 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    if(n < 0)  return(0);
+
+    FLOAT_V_T v0;
+
+    if(inc_x == 1 && inc_y == 1) {
+
+        for(size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+            v0 = VLEV_FLOAT(x, vl);
+            VSEV_FLOAT(y, v0, vl);
+        }
+
+    } else if (inc_y == 1) {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            VSEV_FLOAT(y, v0, vl);
+        }
+
+    } else if(inc_x == 1) {
+
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = VSETVL(n);
+            v0 = VLEV_FLOAT(x, vl);
+            VSSEV_FLOAT(y, stride_y, v0, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = VSETVL(n);
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            VSSEV_FLOAT(y, stride_y, v0, vl);
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c
new file mode 100644
index 000000000..60dcac2f5
--- /dev/null
+++ b/kernel/riscv64/dot_rvv.c
@@ -0,0 +1,126 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DSDOT)
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{
+    double dot = 0.0;
+
+    if ( n <= 0 ) return(dot);
+
+    size_t vlmax = vsetvlmax_e64m8();
+    vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax);
+
+    if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = vsetvl_e64m8(n);
+
+#if !defined(DOUBLE)
+            vfloat32m4_t vx = vle32_v_f32m4(x, vl);
+            vfloat32m4_t vy = vle32_v_f32m4(y, vl);
+
+            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
+#else
+            vfloat64m8_t vx = vle64_v_f64m8(x, vl);
+            vfloat64m8_t vy = vle64_v_f64m8(y, vl);
+
+            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
+#endif
+        }
+
+    } else if (1 == inc_x) {
+            
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = vsetvl_e64m8(n);
+
+#if !defined(DOUBLE)
+            vfloat32m4_t vx = vle32_v_f32m4(x, vl);
+            vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl);
+
+            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
+#else
+            vfloat64m8_t vx = vle64_v_f64m8(x, vl);
+            vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl);
+
+            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
+#endif
+        }
+    } else if (1 == inc_y) {
+            
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = vsetvl_e64m8(n);
+
+#if !defined(DOUBLE)
+            vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl);
+            vfloat32m4_t vy = vle32_v_f32m4(y, vl);
+
+            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
+#else
+            vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl);
+            vfloat64m8_t vy = vle64_v_f64m8(y, vl);
+
+            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
+#endif
+        }
+    } else {
+            
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = vsetvl_e64m8(n);
+
+#if !defined(DOUBLE)
+            vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl);
+            vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl);
+
+            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
+#else
+            vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl);
+            vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl);
+
+            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
+#endif
+        }
+    }
+
+    vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax);
+    vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax);
+    dot = vfmv_f_s_f64m1_f64(vec_sum);
+
+    return(dot);
+}
diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c
new file mode 100644
index 000000000..34d1ea078
--- /dev/null
+++ b/kernel/riscv64/gemm_beta_rvv.c
@@ -0,0 +1,89 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMULVF_FLOAT vfmul_vf_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMULVF_FLOAT vfmul_vf_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#endif
+
+// Optimizes the implementation in ../generic/gemm_beta.c
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
+          IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
+          FLOAT *c, BLASLONG ldc)
+{
+    BLASLONG chunk;
+    FLOAT *c_offset;
+	size_t vl;
+    FLOAT_V_T vx;
+
+    if (beta == ZERO) {
+
+        vl = VSETVL(m);
+        vx = VFMVVF_FLOAT(0.0, vl);
+
+        for( ; n > 0; n--, c += ldc) {
+            c_offset = c;
+
+            for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) {
+                vl = VSETVL(chunk);
+
+                VSEV_FLOAT(c_offset, vx, vl);
+			}
+		}
+
+	} else {
+
+        for( ; n > 0; n--, c += ldc) {
+            c_offset = c;
+
+            for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl) {
+                vl = VSETVL(chunk);
+
+                vx = VLEV_FLOAT(c_offset, vl);
+                vx = VFMULVF_FLOAT(vx, beta, vl);
+                VSEV_FLOAT(c_offset, vx, vl);
+			}
+		}
+
+	}
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c
new file mode 100644
index 000000000..5f55bc349
--- /dev/null
+++ b/kernel/riscv64/gemm_ncopy_2_rvv.c
@@ -0,0 +1,92 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLEV_FLOAT vle32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEG2_FLOAT vsseg2e32_v_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLEV_FLOAT vle64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEG2_FLOAT vsseg2e64_v_f64m4
+#endif
+
+// Optimizes the implementation in ../generic/gemm_ncopy_2.c
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+    BLASLONG i, j;
+    IFLOAT *a_offset, *a_offset1, *a_offset2;
+    IFLOAT *b_offset;
+    FLOAT_V_T v1, v2;
+    size_t vl;
+
+    //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU
+
+    a_offset = a;
+    b_offset = b;
+
+    for(j = (n >> 1); j > 0; j--) {
+
+        a_offset1 = a_offset;
+        a_offset2 = a_offset + lda;
+        a_offset += 2 * lda;
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset1, vl);
+            v2 = VLEV_FLOAT(a_offset2, vl);
+            VSSEG2_FLOAT(b_offset, v1, v2, vl);
+
+	        a_offset1 += vl;
+	        a_offset2 += vl;
+	        b_offset += vl*2;
+        }
+    }
+
+    if (n & 1) {
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset, vl);
+            VSEV_FLOAT(b_offset, v1, vl);
+
+	        a_offset += vl;
+	        b_offset += vl;
+        }
+	}
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c
new file mode 100644
index 000000000..4d4efe4c9
--- /dev/null
+++ b/kernel/riscv64/gemm_ncopy_4_rvv.c
@@ -0,0 +1,123 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VSSEG4_FLOAT vsseg4e32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VSSEG4_FLOAT vsseg4e64_v_f64m2
+#endif
+
+// Optimizes the implementation in ../generic/gemm_ncopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
+{
+    BLASLONG i, j;
+
+    FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+    FLOAT *b_offset;
+
+    FLOAT_V_T v1, v2, v3, v4;
+    size_t vl;
+
+    //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda);
+
+    a_offset = a;
+    b_offset = b;
+
+    for(j = (n >> 2); j > 0; j--) {
+        a_offset1  = a_offset;
+        a_offset2  = a_offset1 + lda;
+        a_offset3  = a_offset2 + lda;
+        a_offset4  = a_offset3 + lda;
+        a_offset += 4 * lda;
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset1, vl);
+            v2 = VLEV_FLOAT(a_offset2, vl);
+            v3 = VLEV_FLOAT(a_offset3, vl);
+            v4 = VLEV_FLOAT(a_offset4, vl);
+
+            VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);
+
+            a_offset1 += vl;
+            a_offset2 += vl;
+            a_offset3 += vl;
+            a_offset4 += vl;
+            b_offset += vl*4;
+        }
+    }
+
+    if (n & 2) {
+        a_offset1  = a_offset;
+        a_offset2  = a_offset1 + lda;
+        a_offset += 2 * lda;
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset1, vl);
+            v2 = VLEV_FLOAT(a_offset2, vl);
+
+            VSSEG2_FLOAT(b_offset, v1, v2, vl);
+
+            a_offset1 += vl;
+            a_offset2 += vl;
+            b_offset += vl*2;
+        }
+    }
+
+    if (n & 1) {
+        a_offset1  = a_offset;
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset1, vl);
+
+            VSEV_FLOAT(b_offset, v1, vl);
+
+            a_offset1 += vl;
+            b_offset += vl;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c
new file mode 100644
index 000000000..525b223c2
--- /dev/null
+++ b/kernel/riscv64/gemm_ncopy_8_rvv.c
@@ -0,0 +1,164 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m1(n)
+#define FLOAT_V_T vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m1
+#define VSEV_FLOAT vse32_v_f32m1
+#define VSSEG2_FLOAT vsseg2e32_v_f32m1
+#define VSSEG4_FLOAT vsseg4e32_v_f32m1
+#define VSSEG8_FLOAT vsseg8e32_v_f32m1
+#else
+#define VSETVL(n) vsetvl_e64m1(n)
+#define FLOAT_V_T vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m1
+#define VSEV_FLOAT vse64_v_f64m1
+#define VSSEG2_FLOAT vsseg2e64_v_f64m1
+#define VSSEG4_FLOAT vsseg4e64_v_f64m1
+#define VSSEG8_FLOAT vsseg8e64_v_f64m1
+#endif
+
+// Optimizes the implementation in ../generic/gemm_ncopy_8.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
+{
+    BLASLONG i, j;
+
+    FLOAT *a_offset;
+    FLOAT *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+    FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8;
+    FLOAT *b_offset;
+
+    FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8;
+    size_t vl;
+
+    //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
+
+    a_offset = a;
+    b_offset = b;
+
+    for(j = (n >> 3); j > 0; j--) {
+        a_offset1  = a_offset;
+        a_offset2  = a_offset1 + lda;
+        a_offset3  = a_offset2 + lda;
+        a_offset4  = a_offset3 + lda;
+        a_offset5  = a_offset4 + lda;
+        a_offset6  = a_offset5 + lda;
+        a_offset7  = a_offset6 + lda;
+        a_offset8  = a_offset7 + lda;
+        a_offset += 8 * lda;
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset1, vl);
+            v2 = VLEV_FLOAT(a_offset2, vl);
+            v3 = VLEV_FLOAT(a_offset3, vl);
+            v4 = VLEV_FLOAT(a_offset4, vl);
+            v5 = VLEV_FLOAT(a_offset5, vl);
+            v6 = VLEV_FLOAT(a_offset6, vl);
+            v7 = VLEV_FLOAT(a_offset7, vl);
+            v8 = VLEV_FLOAT(a_offset8, vl);
+
+            VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl);
+
+            a_offset1 += vl;
+            a_offset2 += vl;
+            a_offset3 += vl;
+            a_offset4 += vl;
+            a_offset5 += vl;
+            a_offset6 += vl;
+            a_offset7 += vl;
+            a_offset8 += vl;
+            b_offset += vl*8;
+        }
+    }
+
+    if (n & 4) {
+        a_offset1  = a_offset;
+        a_offset2  = a_offset1 + lda;
+        a_offset3  = a_offset2 + lda;
+        a_offset4  = a_offset3 + lda;
+        a_offset += 4 * lda;
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset1, vl);
+            v2 = VLEV_FLOAT(a_offset2, vl);
+            v3 = VLEV_FLOAT(a_offset3, vl);
+            v4 = VLEV_FLOAT(a_offset4, vl);
+
+            VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);
+
+            a_offset1 += vl;
+            a_offset2 += vl;
+            a_offset3 += vl;
+            a_offset4 += vl;
+            b_offset += vl*4;
+        }
+    }
+
+    if (n & 2) {
+        a_offset1  = a_offset;
+        a_offset2  = a_offset1 + lda;
+        a_offset += 2 * lda;
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset1, vl);
+            v2 = VLEV_FLOAT(a_offset2, vl);
+
+            VSSEG2_FLOAT(b_offset, v1, v2, vl);
+
+            a_offset1 += vl;
+            a_offset2 += vl;
+            b_offset += vl*2;
+        }
+    }
+
+    if (n & 1) {
+        a_offset1  = a_offset;
+
+        for(i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            v1 = VLEV_FLOAT(a_offset1, vl);
+
+            VSEV_FLOAT(b_offset, v1, vl);
+
+            a_offset1 += vl;
+            b_offset += vl;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c
new file mode 100644
index 000000000..2c5230752
--- /dev/null
+++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c
@@ -0,0 +1,76 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
+{
+    BLASLONG i, j;
+
+    FLOAT *a_offset;
+    FLOAT *a_offset1;
+    FLOAT *b_offset;
+
+    FLOAT_V_T v0;
+    size_t vl;
+
+    //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
+
+    a_offset = a;
+    b_offset = b;
+
+    for(j = n; j > 0; j -= vl) {
+        vl = VSETVL(j);
+
+        a_offset1 = a_offset;
+        a_offset += vl * lda;
+
+        for(i = m; i > 0; i--) {
+            v0 = VLSEV_FLOAT(a_offset1, lda * sizeof(FLOAT), vl);
+            VSEV_FLOAT(b_offset, v0, vl);
+
+            a_offset1++;
+            b_offset += vl;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c
new file mode 100644
index 000000000..963e1be69
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_2_rvv.c
@@ -0,0 +1,108 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
+#define VSSSEG4_FLOAT vssseg4e32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
+#define VSSSEG4_FLOAT vssseg4e64_v_f64m2
+#endif
+
+// Optimizes the implementation in ../generic/gemm_tcopy_2.c
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+    BLASLONG i, j;
+    IFLOAT *a_offset, *a_offset1, *a_offset2;
+    IFLOAT *b_offset, *b_offset1, *b_offset2;
+    FLOAT_V_T v1a, v1b, v2a, v2b;
+    size_t vl;
+
+    //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU
+
+    a_offset = a;
+    b_offset = b;
+    b_offset2 = b + m * (n & ~1);
+
+    for(i = (m >> 1); i > 0; i--) {
+
+        a_offset1 = a_offset;
+        a_offset2 = a_offset + lda;
+        a_offset += 2 * lda;
+
+        b_offset1 = b_offset;
+        b_offset += 4;
+
+        for(j = (n >> 1); j > 0; j -= vl) {
+            vl = VSETVL(j);
+
+            VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl);
+            VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl);
+
+            VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl);
+
+            a_offset1 += vl * 2;
+	        a_offset2 += vl * 2;
+            b_offset1 += vl * m * 2;
+        }
+
+        if (n & 1) {
+	        *(b_offset2 + 0) = *(a_offset1 + 0);
+	        *(b_offset2 + 1) = *(a_offset2 + 0);
+	        b_offset2 += 2;
+        }
+    }
+
+    if (m & 1) {
+
+        for(j = (n >> 1); j > 0; j -= vl) {
+            vl = VSETVL(j);
+
+            VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl);
+
+            VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl);
+
+            a_offset += vl * 2;
+            b_offset += vl * m * 2;
+        }
+
+        if (n & 1){
+            *(b_offset2 + 0) = *(a_offset + 0);
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c
new file mode 100644
index 000000000..ac9974b24
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_4_rvv.c
@@ -0,0 +1,236 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
+#define VSSSEG4_FLOAT vssseg4e32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
+#define VSSSEG4_FLOAT vssseg4e64_v_f64m2
+#endif
+
+// Optimizes the implementation in ../generic/gemm_tcopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
+{
+    BLASLONG i, j;
+
+    FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+    FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3;
+    FLOAT  ctemp1,  ctemp2,  ctemp3,  ctemp4;
+    FLOAT  ctemp5,  ctemp6,  ctemp7,  ctemp8;
+    FLOAT  ctemp9, ctemp10, ctemp11, ctemp12;
+    FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+
+    //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda);
+
+    a_offset = a;
+    b_offset = b;
+
+    b_offset2 = b + m  * (n & ~3);
+    b_offset3 = b + m  * (n & ~1);
+
+    for(j = (m >> 2); j > 0; j--) {
+        a_offset1 = a_offset;
+        a_offset2 = a_offset1 + lda;
+        a_offset3 = a_offset2 + lda;
+        a_offset4 = a_offset3 + lda;
+        a_offset += 4 * lda;
+
+        b_offset1 = b_offset;
+        b_offset += 16;
+
+        for(i = (n >> 2); i > 0; i--) {
+            v1 = VLEV_FLOAT(a_offset1, 4);
+            v2 = VLEV_FLOAT(a_offset2, 4);
+            v3 = VLEV_FLOAT(a_offset3, 4);
+            v4 = VLEV_FLOAT(a_offset4, 4);
+
+            a_offset1 += 4;
+            a_offset2 += 4;
+            a_offset3 += 4;
+            a_offset4 += 4;
+
+            VSEV_FLOAT(b_offset1, v1, 4);
+            VSEV_FLOAT(b_offset2+4, v2, 4);
+            VSEV_FLOAT(b_offset2+8, v3, 4);
+            VSEV_FLOAT(b_offset2+12, v4, 4);
+
+            b_offset1 += m * 4;
+        }
+
+        if (n & 2) {
+            v1 = VLEV_FLOAT(a_offset1, 2);
+            v2 = VLEV_FLOAT(a_offset2, 2);
+            v3 = VLEV_FLOAT(a_offset3, 2);
+            v4 = VLEV_FLOAT(a_offset4, 2);
+
+            a_offset1 += 2;
+            a_offset2 += 2;
+            a_offset3 += 2;
+            a_offset4 += 2;
+
+            VSEV_FLOAT(b_offset2, v1, 2);
+            VSEV_FLOAT(b_offset2+2, v2, 2);
+            VSEV_FLOAT(b_offset2+4, v3, 2);
+            VSEV_FLOAT(b_offset2+6, v4, 2);
+
+            b_offset2 += 8;
+        }
+
+        if (n & 1) {
+            v1 = VLEV_FLOAT(a_offset1, 1);
+            v2 = VLEV_FLOAT(a_offset2, 1);
+            v3 = VLEV_FLOAT(a_offset3, 1);
+            v4 = VLEV_FLOAT(a_offset4, 1);
+
+            VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1);
+
+            b_offset3 += 4;
+        }
+
+    }
+
+// TODO cleanup
+
+  if (m & 2){
+    a_offset1  = a_offset;
+    a_offset2  = a_offset1 + lda;
+    a_offset  += 2 * lda;
+
+    b_offset1  = b_offset;
+    b_offset  += 8;
+
+    i = (n >> 2);
+    if (i > 0){
+      do{
+	ctemp1  = *(a_offset1 + 0);
+	ctemp2  = *(a_offset1 + 1);
+	ctemp3  = *(a_offset1 + 2);
+	ctemp4  = *(a_offset1 + 3);
+
+	ctemp5  = *(a_offset2 + 0);
+	ctemp6  = *(a_offset2 + 1);
+	ctemp7  = *(a_offset2 + 2);
+	ctemp8  = *(a_offset2 + 3);
+
+	a_offset1 += 4;
+	a_offset2 += 4;
+
+	*(b_offset1 +  0) = ctemp1;
+	*(b_offset1 +  1) = ctemp2;
+	*(b_offset1 +  2) = ctemp3;
+	*(b_offset1 +  3) = ctemp4;
+
+	*(b_offset1 +  4) = ctemp5;
+	*(b_offset1 +  5) = ctemp6;
+	*(b_offset1 +  6) = ctemp7;
+	*(b_offset1 +  7) = ctemp8;
+
+	b_offset1 += m * 4;
+	i --;
+      }while(i > 0);
+    }
+
+    if (n & 2) {
+      ctemp1  = *(a_offset1 + 0);
+      ctemp2  = *(a_offset1 + 1);
+
+      ctemp3  = *(a_offset2 + 0);
+      ctemp4  = *(a_offset2 + 1);
+
+      a_offset1 += 2;
+      a_offset2 += 2;
+
+      *(b_offset2 +  0) = ctemp1;
+      *(b_offset2 +  1) = ctemp2;
+      *(b_offset2 +  2) = ctemp3;
+      *(b_offset2 +  3) = ctemp4;
+
+      b_offset2 += 4;
+    }
+
+    if (n & 1) {
+      ctemp1  = *(a_offset1 + 0);
+      ctemp2  = *(a_offset2 + 0);
+
+      *(b_offset3 +  0) = ctemp1;
+      *(b_offset3 +  1) = ctemp2;
+      b_offset3 += 2;
+    }
+  }
+
+  if (m & 1){
+    a_offset1  = a_offset;
+    b_offset1  = b_offset;
+
+    i = (n >> 2);
+    if (i > 0){
+      do{
+	ctemp1  = *(a_offset1 + 0);
+	ctemp2  = *(a_offset1 + 1);
+	ctemp3  = *(a_offset1 + 2);
+	ctemp4  = *(a_offset1 + 3);
+
+	a_offset1 += 4;
+
+	*(b_offset1 +  0) = ctemp1;
+	*(b_offset1 +  1) = ctemp2;
+	*(b_offset1 +  2) = ctemp3;
+	*(b_offset1 +  3) = ctemp4;
+
+	b_offset1 += 4 * m;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (n & 2) {
+      ctemp1  = *(a_offset1 + 0);
+      ctemp2  = *(a_offset1 + 1);
+      a_offset1 += 2;
+
+      *(b_offset2 +  0) = ctemp1;
+      *(b_offset2 +  1) = ctemp2;
+    }
+
+    if (n & 1) {
+      ctemp1  = *(a_offset1 + 0);
+      *(b_offset3 +  0) = ctemp1;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c
new file mode 100644
index 000000000..81c1f962b
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_8_rvv.c
@@ -0,0 +1,264 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m1(n)
+#define FLOAT_V_T vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m1
+#define VLSEV_FLOAT vlse32_v_f32m1
+#define VSEV_FLOAT vse32_v_f32m1
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1
+#define VSSEG2_FLOAT vsseg2e32_v_f32m1
+#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1
+#define VSSEG4_FLOAT vsseg4e32_v_f32m1
+#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1
+#define VSSEG8_FLOAT vsseg8e32_v_f32m1
+#else
+#define VSETVL(n) vsetvl_e64m1(n)
+#define FLOAT_V_T vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m1
+#define VLSEV_FLOAT vlse64_v_f64m1
+#define VSEV_FLOAT vse64_v_f64m1
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1
+#define VSSEG2_FLOAT vsseg2e64_v_f64m1
+#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1
+#define VSSEG4_FLOAT vsseg4e64_v_f64m1
+#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1
+#define VSSEG8_FLOAT vsseg8e64_v_f64m1
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+    BLASLONG i, j;
+
+    IFLOAT *aoffset;
+    IFLOAT *aoffset1;
+
+    IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
+
+    FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
+
+    // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
+
+    aoffset   = a;
+    boffset   = b;
+    boffset2  = b + m  * (n & ~7);
+    boffset3  = b + m  * (n & ~3);
+    boffset4  = b + m  * (n & ~1);
+
+    for(j = (m >> 3); j > 0; j--) {
+
+        aoffset1  = aoffset;
+        aoffset += 8 * lda;
+
+        boffset1  = boffset;
+        boffset  += 64;
+
+        for(i = (n >> 3); i > 0; i--) {
+            size_t vl = 8;
+
+            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 4) {
+            size_t vl = 8;
+
+            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+
+            aoffset1 += 4;
+            boffset2 += 32;
+        }
+
+        if (n & 2) {
+            size_t vl = 8;
+
+            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+
+            aoffset1 += 2;
+            boffset3 += 16;
+        }
+
+        if (n & 1) {
+            size_t vl = 8;
+
+            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSEV_FLOAT(boffset4, v0, vl);
+
+            aoffset1 += 1;
+            boffset4 += 8;
+        }
+
+    }
+
+    if (m & 4) {
+
+        aoffset1  = aoffset;
+        aoffset += 4 * lda;
+
+        boffset1  = boffset;
+        boffset  += 32;
+
+        for(i = (n >> 3); i > 0; i--) {
+            size_t vl = 4;
+
+            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 4) {
+            size_t vl = 4;
+
+            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+
+            aoffset1 += 4;
+            boffset2 += 16;
+        }
+
+        if (n & 2) {
+            size_t vl = 4;
+
+            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+
+            aoffset1 += 2;
+            boffset3 += 8;
+	}
+
+        if (n & 1) {
+            size_t vl = 4;
+
+            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSEV_FLOAT(boffset4, v0, vl);
+
+            aoffset1 += 1;
+            boffset4 += 4;
+        }
+    }
+
+    if (m & 2) {
+        aoffset1  = aoffset;
+        aoffset += 2 * lda;
+
+        boffset1  = boffset;
+        boffset  += 16;
+
+        for(i = (n >> 3); i > 0; i--) {
+            size_t vl = 2;
+
+            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 4) {
+            size_t vl = 2;
+
+            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+
+            aoffset1 += 4;
+            boffset2 += 8;
+        }
+
+        if (n & 2) {
+            size_t vl = 2;
+
+            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+
+            aoffset1 += 2;
+            boffset3 += 4;
+	}
+  
+        if (n & 1) {
+           size_t vl = 2;
+
+            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSEV_FLOAT(boffset4, v0, vl);
+
+            aoffset1 += 1;
+            boffset4 += 2;
+        }
+    }
+
+    if (m & 1) {
+        aoffset1  = aoffset;
+        boffset1  = boffset;
+
+        for(i = (n >> 3); i > 0; i--) {
+            size_t vl = 8;
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset1, v0, vl);
+
+            aoffset1 += 8;
+            boffset1 += 8 * m;
+        }
+
+        if (n & 4) {
+            size_t vl = 4;
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset2, v0, vl);
+
+            aoffset1 += 4;
+            //boffset2 += 4;
+        }
+
+        if (n & 2) {
+            size_t vl = 2;
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset3, v0, vl);
+
+            aoffset1 += 2;
+           // boffset3 += 2;
+        }
+
+        if (n & 1) {
+           *(boffset4) = *(aoffset1);
+           // aoffset1 ++;
+           // boffset4 ++;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c
new file mode 100644
index 000000000..a291b70b8
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c
@@ -0,0 +1,74 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+    BLASLONG i, j;
+
+    IFLOAT *aoffset;
+    IFLOAT *aoffset1;
+    IFLOAT *boffset;
+
+    FLOAT_V_T v0;
+    size_t vl;
+
+    //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
+
+    aoffset = a;
+    boffset = b;
+
+    for(j = n; j > 0; j -= vl) {
+        vl = VSETVL(j);
+
+        aoffset1 = aoffset;
+        aoffset += vl;
+
+        for(i = m; i > 0; i--) {
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset, v0, vl);
+
+            aoffset1 += lda;
+            boffset += vl;
+        }  
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c
new file mode 100644
index 000000000..ec8961ced
--- /dev/null
+++ b/kernel/riscv64/gemmkernel_2x2_rvv.c
@@ -0,0 +1,214 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEG2_FLOAT vlseg2e32_v_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFMACCVV_FLOAT vfmacc_vv_f32m4
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEG2_FLOAT vlseg2e64_v_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFMACCVV_FLOAT vfmacc_vv_f64m4
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+// Optimizes the implementation in ../generic/gemm_kernel_2x2.c
+
+int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
+#ifdef TRMMKERNEL
+		,BLASLONG offset
+#endif
+		)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1;
+    IFLOAT *ptrba,*ptrbb;
+
+    //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc);
+
+    FLOAT_V_T va0, va1, vb0, vb1;
+    FLOAT_V_T vres0, vres1, vres2, vres3;
+    FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3;
+    FLOAT_V_T_M1 v_z0;
+
+    v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    size_t vl;
+
+    for (j = bn/2; j > 0; j--) {
+        C0 = C;
+        C1 = C0 + ldc;
+        ptrba = ba;
+
+        for (i = bm/2; i > 0; i--) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl); 
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); 
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
+
+                ptrba += vl*2;
+                ptrbb += vl*2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            C0 += 2;
+            C1 += 2;
+        }
+
+        if(bm & 1) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                va0 = VLEV_FLOAT(ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                ptrba += vl;
+                ptrbb += vl*2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
+
+            C0 += 1;
+            C1 += 1;
+        }
+
+        bb += (bk<<1);
+        C += (ldc<<1);
+    }
+
+    if(bn & 1) {
+        C0 = C;
+        ptrba = ba;
+        for (i = bm/2; i > 0; i--) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+
+                ptrba += vl*2;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+
+            C0 += 2;
+        }
+
+        if(bm & 1) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                va0 = VLEV_FLOAT(ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+
+                ptrba += vl;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+
+            C0 += 1;
+        }
+
+        bb += (bk<<0);
+        C += ldc;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c
new file mode 100644
index 000000000..aa58bcc76
--- /dev/null
+++ b/kernel/riscv64/gemmkernel_4x4_rvv.c
@@ -0,0 +1,508 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m1(n)
+#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m1_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m1
+#define VLSEG2_FLOAT vlseg2e32_v_f32m1
+#define VLSEG4_FLOAT vlseg4e32_v_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m1
+#define VFMACCVF_FLOAT vfmacc_vf_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m1
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m1(n)
+#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m1_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m1
+#define VLSEG2_FLOAT vlseg2e64_v_f64m1
+#define VLSEG4_FLOAT vlseg4e64_v_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m1
+#define VFMACCVF_FLOAT vfmacc_vf_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m1
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+// Optimizes the implementation in ../generic/gemm_kernel_2x2.c
+
+int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
+#ifdef TRMMKERNEL
+		,BLASLONG offset
+#endif
+		)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*C2,*C3;
+    IFLOAT *ptrba,*ptrbb;
+
+    //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU
+
+    FLOAT_V_T va0, va1, va2, va3; 
+    FLOAT_V_T vb0, vb1, vb2, vb3;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+    FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15;
+    FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3;
+    FLOAT_V_T_M1 v_z0;
+
+    v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    size_t vl;
+
+    for (j = bn/4; j > 0; j--) {
+        C0 = C;
+        C1 = C0 + ldc;
+        C2 = C1 + ldc;
+        C3 = C2 + ldc;
+        ptrba = ba;
+
+        for (i = bm/4; i > 0; i--) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+            vres4 = VFMVVF_FLOAT(0.0, vlmax);
+            vres5 = VFMVVF_FLOAT(0.0, vlmax);
+            vres6 = VFMVVF_FLOAT(0.0, vlmax);
+            vres7 = VFMVVF_FLOAT(0.0, vlmax);
+            vres8 = VFMVVF_FLOAT(0.0, vlmax);
+            vres9 = VFMVVF_FLOAT(0.0, vlmax);
+            vres10 = VFMVVF_FLOAT(0.0, vlmax);
+            vres11 = VFMVVF_FLOAT(0.0, vlmax);
+            vres12 = VFMVVF_FLOAT(0.0, vlmax);
+            vres13 = VFMVVF_FLOAT(0.0, vlmax);
+            vres14 = VFMVVF_FLOAT(0.0, vlmax);
+            vres15 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); 
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); 
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl);
+
+                vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl);
+                vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl);
+                vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl);
+                vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl);
+
+                vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl);
+                vres13 = VFMACCVV_FLOAT(vres13, va3, vb2, vl);
+                vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl);
+                vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl);
+
+                ptrba += vl*4;
+                ptrbb += vl*4;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax);
+            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax);
+            C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax);
+            C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            C0 += 4;
+            C1 += 4;
+            C2 += 4;
+            C3 += 4;
+        }
+
+        if(bm & 2) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+            vres4 = VFMVVF_FLOAT(0.0, vlmax);
+            vres5 = VFMVVF_FLOAT(0.0, vlmax);
+            vres6 = VFMVVF_FLOAT(0.0, vlmax);
+            vres7 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl); 
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); 
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl);
+
+                ptrba += vl*2;
+                ptrbb += vl*4;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+ 
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax);
+            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
+            C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax);
+            C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+
+            C0 += 2;
+            C1 += 2;
+            C2 += 2;
+            C3 += 2;
+        }
+
+        if(bm & 1) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                va0 = VLEV_FLOAT(ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); 
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
+
+                ptrba += vl;
+                ptrbb += vl*4;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            C0 += 1;
+            C1 += 1;
+            C2 += 1;
+            C3 += 1;
+        }
+
+        bb += (bk<<2);
+        C += (ldc<<2);
+    }
+
+    if(bn & 2) {
+
+        C0 = C;
+        C1 = C0 + ldc;
+        ptrba = ba;
+
+        for (i = bm/4; i > 0; i--) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+
+            vres4 = VFMVVF_FLOAT(0.0, vlmax);
+            vres5 = VFMVVF_FLOAT(0.0, vlmax);
+            vres6 = VFMVVF_FLOAT(0.0, vlmax);
+            vres7 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl);
+
+                ptrba += vl*4;
+                ptrbb += vl*2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax);
+            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            C0 += 4;
+            C1 += 4;
+        }
+
+        if(bm & 2) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
+
+                ptrba += vl*2;
+                ptrbb += vl*2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            C0 += 2;
+            C1 += 2;
+        }
+
+        if(bm & 1) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                va0 = VLEV_FLOAT(ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                ptrba += vl;
+                ptrbb += vl*2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
+
+            C0 += 1;
+            C1 += 1;
+        }
+
+        bb += (bk<<1);
+        C += (ldc<<1);
+    }
+
+    if(bn & 1) {
+        C0 = C;
+        ptrba = ba;
+        for (i = bm/4; i > 0; i--) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+
+                ptrba += vl*4;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            C0 += 4;
+        }
+
+        if(bm & 2) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+
+                ptrba += vl*2;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
+
+            C0 += 2;
+        }
+
+        if(bm & 1) {
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = bk; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                va0 = VLEV_FLOAT(ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+
+                ptrba += vl;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
+            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
+
+            C0 += 1;
+        }
+
+        bb += (bk<<0);
+        C += ldc;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c
new file mode 100644
index 000000000..5cd509f93
--- /dev/null
+++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c
@@ -0,0 +1,601 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#endif
+
+int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
+#ifdef TRMMKERNEL
+		,BLASLONG offset
+#endif
+		)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7;
+    IFLOAT *ptrba,*ptrbb;
+
+    //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug
+
+    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+    size_t vl;
+
+    // N:8
+    for (j = bn/8; j > 0; j--) {
+        C0 = C;
+        C1 = C0 + ldc;
+        C2 = C1 + ldc;
+        C3 = C2 + ldc;
+        C4 = C3 + ldc;
+        C5 = C4 + ldc;
+        C6 = C5 + ldc;
+        C7 = C6 + ldc;
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+            vres4 = VFMVVF_FLOAT(0.0, vl);
+            vres5 = VFMVVF_FLOAT(0.0, vl);
+            vres6 = VFMVVF_FLOAT(0.0, vl);
+            vres7 = VFMVVF_FLOAT(0.0, vl);
+#if 0
+            for (k = bk; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrba += vl;
+                ptrbb += 8;
+            }
+#else
+            // Unroll K
+            for (k = bk/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+ 
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+                ptrbb += 8;
+  
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
+                ptrbb += 8;
+
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
+                ptrbb += 8;
+
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
+                ptrbb += 8;
+
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
+                ptrbb += 8;
+
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
+                ptrbb += 8;
+
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
+                ptrbb += 8;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
+                ptrbb += 8;
+            }
+
+            // K remainder
+            for (k = bk&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+                ptrba += vl;
+            }
+#endif
+            va0 = VLEV_FLOAT(C0, vl);
+            va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VLEV_FLOAT(C1, vl);
+            va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            va2 = VLEV_FLOAT(C2, vl);
+            va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl);
+            VSEV_FLOAT(C2, va2, vl);
+
+            va3 = VLEV_FLOAT(C3, vl);
+            va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl);
+            VSEV_FLOAT(C3, va3, vl);
+
+            va4 = VLEV_FLOAT(C4, vl);
+            va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl);
+            VSEV_FLOAT(C4, va4, vl);
+
+            va5 = VLEV_FLOAT(C5, vl);
+            va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl);
+            VSEV_FLOAT(C5, va5, vl);
+
+            va6 = VLEV_FLOAT(C6, vl);
+            va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl);
+            VSEV_FLOAT(C6, va6, vl);
+
+            va7 = VLEV_FLOAT(C7, vl);
+            va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl);
+            VSEV_FLOAT(C7, va7, vl);
+
+            C0 += vl;
+            C1 += vl;
+            C2 += vl;
+            C3 += vl;
+            C4 += vl;
+            C5 += vl;
+            C6 += vl;
+            C7 += vl;
+        }
+
+        bb += (bk<<3);
+        C += (ldc<<3);
+    }
+
+    // N:4
+    if (bn & 4) {
+        C0 = C;
+        C1 = C0 + ldc;
+        C2 = C1 + ldc;
+        C3 = C2 + ldc;
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+
+#if 0
+            for (k = bk; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrba += vl;
+                ptrbb += 4;
+            }
+#else
+            // Unroll K
+            for (k = bk/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                ptrbb += 4;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+  
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
+                ptrbb += 4;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
+                ptrbb += 4;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
+                ptrbb += 4;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
+                ptrbb += 4;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
+                ptrbb += 4;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
+                ptrbb += 4;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
+                ptrbb += 4;
+            }
+
+            // K remainder
+            for (k = bk&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+                ptrba += vl;
+            }
+#endif
+            va0 = VLEV_FLOAT(C0, vl);
+            va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VLEV_FLOAT(C1, vl);
+            va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            va2 = VLEV_FLOAT(C2, vl);
+            va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl);
+            VSEV_FLOAT(C2, va2, vl);
+
+            va3 = VLEV_FLOAT(C3, vl);
+            va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl);
+            VSEV_FLOAT(C3, va3, vl);
+
+            C0 += vl;
+            C1 += vl;
+            C2 += vl;
+            C3 += vl;
+        }
+
+        bb += (bk<<2);
+        C += (ldc<<2);
+    }
+
+    // N:2
+    if (bn & 2) {
+        C0 = C;
+        C1 = C0 + ldc;
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+#if 0
+            for (k = bk; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrba += vl;
+                ptrbb += 2;
+            }
+#else
+            // Unroll K
+            for (k = bk/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                ptrbb += 2;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                ptrbb += 2;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                ptrbb += 2;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                ptrbb += 2;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                ptrbb += 2;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                ptrbb += 2;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl; 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                ptrbb += 2;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                ptrbb += 2;
+            }
+
+            // K remainder
+            for (k = bk&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrbb += 2;
+                ptrba += vl;
+            }
+#endif
+            va0 = VLEV_FLOAT(C0, vl);
+            va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VLEV_FLOAT(C1, vl);
+            va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            C0 += vl;
+            C1 += vl;
+        }
+
+        bb += (bk<<1);
+        C += (ldc<<1);
+    }
+
+    // N:1
+    if (bn & 1) {
+        C0 = C;
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+#if 0
+            for (k = bk; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+ 
+                ptrba += vl;
+                ptrbb += 1;
+            }
+#else
+            // Unroll K
+            for (k = bk/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                ptrbb += 1;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                ptrbb += 1;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                ptrbb += 1;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                ptrbb += 1;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                ptrbb += 1;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                ptrbb += 1;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                ptrbb += 1;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                ptrbb += 1;
+            }
+
+            // K remainder
+            for (k = bk&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+
+                ptrbb += 1;
+                ptrba += vl;
+            }
+#endif
+            va0 = VLEV_FLOAT(C0, vl);
+            va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
+            VSEV_FLOAT(C0, va0, vl);
+  
+            C0 += vl;
+        }
+
+        bb += (bk);
+        C += (ldc);
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c
new file mode 100644
index 000000000..9d2dee615
--- /dev/null
+++ b/kernel/riscv64/gemv_n_rvv.c
@@ -0,0 +1,94 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMACCVF_FLOAT vfmacc_vf_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMACCVF_FLOAT vfmacc_vf_f64m8
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    if(n < 0)  return(0);
+
+    FLOAT *a_ptr, *x_ptr;
+    BLASLONG i;
+    FLOAT_V_T va, vy;
+
+    if(inc_y == 1) {
+
+        for (size_t vl; m > 0; m -= vl, y += vl, a += vl) {
+            vl = VSETVL(m);
+            a_ptr = a;
+            x_ptr = x;
+            vy = VLEV_FLOAT(y, vl);
+            for(i = 0; i < n; i++) {
+                va = VLEV_FLOAT(a_ptr, vl);
+                vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl);
+
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+            VSEV_FLOAT(y, vy, vl);
+        }
+ 
+    } else {
+
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; m > 0; m -= vl, y += vl*inc_y, a += vl) {
+            vl = VSETVL(m);
+            a_ptr = a;
+            x_ptr = x;
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            for(i = 0; i < n; i++) {
+                va = VLEV_FLOAT(a_ptr, vl);
+                vy = VFMACCVF_FLOAT(vy, (alpha * (*x_ptr)), va, vl);
+
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+            VSSEV_FLOAT(y, stride_y, vy, vl);
+        }
+
+    }
+    return(0);
+}
diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c
new file mode 100644
index 000000000..a80af81b6
--- /dev/null
+++ b/kernel/riscv64/gemv_t_rvv.c
@@ -0,0 +1,119 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i, j;
+    FLOAT *a_ptr, *x_ptr;
+
+    FLOAT_V_T va, vx, vr;
+    FLOAT_V_T_M1 v_res, v_z0;
+    size_t vlmax = VSETVL_MAX_M1;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+    vlmax = VSETVL_MAX;
+
+    if(inc_x == 1) {
+
+        for(i = 0; i < n; i++) {
+            j = m;
+            a_ptr = a;
+            x_ptr = x;
+            vr = VFMVVF_FLOAT(0, vlmax);
+
+            for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) {
+                vl = VSETVL(j);
+
+                va = VLEV_FLOAT(a_ptr, vl);
+                vx = VLEV_FLOAT(x_ptr, vl);
+                vr = VFMACCVV_FLOAT(vr, va, vx, vl);
+            }
+
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+            *y += alpha * VFMVFS_FLOAT_M1(v_res);
+            y += inc_y;
+            a += lda;
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+  
+        for(i = 0; i < n; i++) {
+            j = m;
+            a_ptr = a;
+            x_ptr = x;
+            vr = VFMVVF_FLOAT(0, vlmax);
+
+            for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl*inc_x) {
+                vl = VSETVL(j);
+
+                va = VLEV_FLOAT(a_ptr, vl);
+                vx = VLSEV_FLOAT(x_ptr, stride_x, vl);
+                vr = VFMACCVV_FLOAT(vr, va, vx, vl);
+            }
+
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+            *y += alpha * VFMVFS_FLOAT_M1(v_res);
+            y += inc_y;
+            a += lda;
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c
new file mode 100644
index 000000000..8b33b3bcb
--- /dev/null
+++ b/kernel/riscv64/iamax_rvv.c
@@ -0,0 +1,150 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DOUBLE)
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
+#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFABSV_FLOAT vfabs_v_f64m8
+#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#define VFIRSTM vfirst_m_b8
+#define UINT_V_T vuint64m8_t
+#define VIDV_MASK_UINT vid_v_u64m8_m
+#define VIDV_UINT vid_v_u64m8
+#define VADDVX_MASK_UINT vadd_vx_u64m8_m
+#define VADDVX_UINT vadd_vx_u64m8
+#define VMVVX_UINT vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
+#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#else
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
+#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFABSV_FLOAT vfabs_v_f32m8
+#define VFMAXVV_FLOAT vfmax_vv_f32m8
+#define VFIRSTM vfirst_m_b4
+#define UINT_V_T vuint32m8_t
+#define VIDV_MASK_UINT vid_v_u32m8_m
+#define VIDV_UINT vid_v_u32m8
+#define VADDVX_MASK_UINT vadd_vx_u32m8_m
+#define VADDVX_UINT vadd_vx_u32m8
+#define VMVVX_UINT vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
+#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#endif
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+
+    FLOAT_V_T vx, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+  
+    size_t vlmax = VSETVL_MAX;
+    v_max_index = VMVVX_UINT(0, vlmax);
+    v_max = VFMVVF_FLOAT(-1, vlmax);
+    BLASLONG j=0;
+    FLOAT maxf=0.0;
+    
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+
+            //update v_max
+            v_max = VFMAXVV_FLOAT(v_max, vx, vl);
+        }
+
+    } else {
+  
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+
+            //update v_max
+            v_max = VFMAXVV_FLOAT(v_max, vx, vl);
+        }
+  
+    }
+
+    FLOAT_V_T_M1 v_res, v_z0;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+
+    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
+    max_index = VFIRSTM(mask, vlmax);
+    
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
+    max_index = VMVVXS_UINT(v_max_index);
+
+    return(max_index+1);
+}
diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c
new file mode 100644
index 000000000..585b37186
--- /dev/null
+++ b/kernel/riscv64/iamin_rvv.c
@@ -0,0 +1,151 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if defined(DOUBLE)
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
+#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFABSV_FLOAT vfabs_v_f64m8
+#define VFMINVV_FLOAT vfmin_vv_f64m8
+#define VFIRSTM vfirst_m_b8
+#define UINT_V_T vuint64m8_t
+#define VIDV_MASK_UINT vid_v_u64m8_m
+#define VIDV_UINT vid_v_u64m8
+#define VADDVX_MASK_UINT vadd_vx_u64m8_m
+#define VADDVX_UINT vadd_vx_u64m8
+#define VMVVX_UINT vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
+#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#else
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
+#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFABSV_FLOAT vfabs_v_f32m8
+#define VFMINVV_FLOAT vfmin_vv_f32m8
+#define VFIRSTM vfirst_m_b4
+#define UINT_V_T vuint32m8_t
+#define VIDV_MASK_UINT vid_v_u32m8_m
+#define VIDV_UINT vid_v_u32m8
+#define VADDVX_MASK_UINT vadd_vx_u32m8_m
+#define VADDVX_UINT vadd_vx_u32m8
+#define VMVVX_UINT vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
+#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#endif
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    unsigned int min_index = 0;
+    if (n <= 0 || inc_x <= 0) return(min_index);
+
+    FLOAT_V_T vx, v_min;
+    UINT_V_T v_min_index;
+    MASK_T mask;
+  
+    size_t vlmax = VSETVL_MAX;
+    v_min_index = VMVVX_UINT(0, vlmax);
+    v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
+    BLASLONG j=0;
+    FLOAT minf=0.0;
+    
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx, v_min, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT(v_min, vx, vl);
+        }
+
+    } else {
+  
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx, v_min, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT(v_min, vx, vl);
+        }
+  
+    }
+
+    FLOAT_V_T_M1 v_res, v_max;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
+
+    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
+    min_index = VFIRSTM(mask, vlmax);
+
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
+    min_index = VMVVXS_UINT(v_min_index);
+
+    return(min_index+1);
+}
diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c
new file mode 100644
index 000000000..d84ad968e
--- /dev/null
+++ b/kernel/riscv64/imax_rvv.c
@@ -0,0 +1,147 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if defined(DOUBLE)
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
+#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#define VFIRSTM vfirst_m_b8
+#define UINT_V_T vuint64m8_t
+#define VIDV_MASK_UINT vid_v_u64m8_m
+#define VIDV_UINT vid_v_u64m8
+#define VADDVX_MASK_UINT vadd_vx_u64m8_m
+#define VADDVX_UINT vadd_vx_u64m8
+#define VMVVX_UINT vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
+#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#else
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
+#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT vfmax_vv_f32m8
+#define VFIRSTM vfirst_m_b4
+#define UINT_V_T vuint32m8_t
+#define VIDV_MASK_UINT vid_v_u32m8_m
+#define VIDV_UINT vid_v_u32m8
+#define VADDVX_MASK_UINT vadd_vx_u32m8_m
+#define VADDVX_UINT vadd_vx_u32m8
+#define VMVVX_UINT vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
+#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#endif
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+
+    FLOAT_V_T vx, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+  
+    size_t vlmax = VSETVL_MAX;
+    v_max_index = VMVVX_UINT(0, vlmax);
+    v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax);
+    BLASLONG j=0;
+    FLOAT maxf=0.0;
+    
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx, vl);
+        }
+
+    } else {
+  
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx, vl);
+        }
+  
+    }
+
+    FLOAT_V_T_M1 v_res, v_min;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax);
+
+    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
+    max_index = VFIRSTM(mask, vlmax);
+    
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
+    max_index = VMVVXS_UINT(v_max_index);
+
+    return(max_index+1);
+}
diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c
new file mode 100644
index 000000000..fb734f6f8
--- /dev/null
+++ b/kernel/riscv64/imin_rvv.c
@@ -0,0 +1,147 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if defined(DOUBLE)
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
+#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMINVV_FLOAT vfmin_vv_f64m8
+#define VFIRSTM vfirst_m_b8
+#define UINT_V_T vuint64m8_t
+#define VIDV_MASK_UINT vid_v_u64m8_m
+#define VIDV_UINT vid_v_u64m8
+#define VADDVX_MASK_UINT vadd_vx_u64m8_m
+#define VADDVX_UINT vadd_vx_u64m8
+#define VMVVX_UINT vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
+#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#else
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define MASK_T vbool4_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
+#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMINVV_FLOAT vfmin_vv_f32m8
+#define VFIRSTM vfirst_m_b4
+#define UINT_V_T vuint32m8_t
+#define VIDV_MASK_UINT vid_v_u32m8_m
+#define VIDV_UINT vid_v_u32m8
+#define VADDVX_MASK_UINT vadd_vx_u32m8_m
+#define VADDVX_UINT vadd_vx_u32m8
+#define VMVVX_UINT vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
+#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#endif
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    unsigned int min_index = 0;
+    if (n <= 0 || inc_x <= 0) return(min_index);
+
+    FLOAT_V_T vx, v_min;
+    UINT_V_T v_min_index;
+    MASK_T mask;
+  
+    size_t vlmax = VSETVL_MAX;
+    v_min_index = VMVVX_UINT(0, vlmax);
+    v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
+    BLASLONG j=0;
+    FLOAT minf=0.0;
+    
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx, v_min, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT(v_min, vx, vl);
+        }
+
+    } else {
+  
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx, v_min, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT(v_min, vx, vl);
+        }
+  
+    }
+
+    FLOAT_V_T_M1 v_res, v_max;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
+
+    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
+    min_index = VFIRSTM(mask, vlmax);
+
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
+    min_index = VMVVXS_UINT(v_min_index);
+
+    return(min_index+1);
+}
diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c
new file mode 100644
index 000000000..9cb332cbb
--- /dev/null
+++ b/kernel/riscv64/izamax_rvv.c
@@ -0,0 +1,162 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DOUBLE)
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
+#define MASK_T vbool16_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
+#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
+#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFABSV_FLOAT vfabs_v_f64m4
+#define VFMAXVV_FLOAT vfmax_vv_f64m4
+#define VFADDVV_FLOAT vfadd_vv_f64m4
+#define VFIRSTM vfirst_m_b16
+#define UINT_V_T vuint64m4_t
+#define VIDV_MASK_UINT vid_v_u64m4_m
+#define VIDV_UINT vid_v_u64m4
+#define VADDVX_MASK_UINT vadd_vx_u64m4_m
+#define VADDVX_UINT vadd_vx_u64m4
+#define VMVVX_UINT vmv_v_x_u64m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
+#define VMVVXS_UINT vmv_x_s_u64m4_u64
+#else
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
+#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
+#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFABSV_FLOAT vfabs_v_f32m4
+#define VFMAXVV_FLOAT vfmax_vv_f32m4
+#define VFADDVV_FLOAT vfadd_vv_f32m4
+#define VFIRSTM vfirst_m_b8
+#define UINT_V_T vuint32m4_t
+#define VIDV_MASK_UINT vid_v_u32m4_m
+#define VIDV_UINT vid_v_u32m4
+#define VADDVX_MASK_UINT vadd_vx_u32m4_m
+#define VADDVX_UINT vadd_vx_u32m4
+#define VMVVX_UINT vmv_v_x_u32m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
+#define VMVVXS_UINT vmv_x_s_u32m4_u32
+#endif
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+
+    FLOAT_V_T vx0, vx1, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+  
+    size_t vlmax = VSETVL_MAX;
+    v_max_index = VMVVX_UINT(0, vlmax);
+    v_max = VFMVVF_FLOAT(-1, vlmax);
+    BLASLONG j=0;
+    FLOAT maxf=0.0;
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+
+            vx0 = VFABSV_FLOAT(vx0, vl);
+            vx1 = VFABSV_FLOAT(vx1, vl);
+
+            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx0, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
+        }
+    }
+    else {
+        
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
+            vl = VSETVL(n);
+        
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+        
+            vx0 = VFABSV_FLOAT(vx0, vl);
+            vx1 = VFABSV_FLOAT(vx1, vl);
+        
+            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
+        
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx0, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+        
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
+        }
+
+    }
+    FLOAT_V_T_M1 v_res, v_z0;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+
+    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
+    max_index = VFIRSTM(mask, vlmax);
+    
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
+    max_index = VMVVXS_UINT(v_max_index);
+
+    return(max_index+1);
+}
diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c
new file mode 100644
index 000000000..69771e5aa
--- /dev/null
+++ b/kernel/riscv64/izamin_rvv.c
@@ -0,0 +1,161 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if defined(DOUBLE)
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
+#define MASK_T vbool16_t
+#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
+#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
+#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFABSV_FLOAT vfabs_v_f64m4
+#define VFMINVV_FLOAT vfmin_vv_f64m4
+#define VFADDVV_FLOAT vfadd_vv_f64m4
+#define VFIRSTM vfirst_m_b16
+#define UINT_V_T vuint64m4_t
+#define VIDV_MASK_UINT vid_v_u64m4_m
+#define VIDV_UINT vid_v_u64m4
+#define VADDVX_MASK_UINT vadd_vx_u64m4_m
+#define VADDVX_UINT vadd_vx_u64m4
+#define VMVVX_UINT vmv_v_x_u64m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
+#define VMVVXS_UINT vmv_x_s_u64m4_u64
+#else
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
+#define MASK_T vbool8_t
+#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
+#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
+#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFABSV_FLOAT vfabs_v_f32m4
+#define VFMINVV_FLOAT vfmin_vv_f32m4
+#define VFADDVV_FLOAT vfadd_vv_f32m4
+#define VFIRSTM vfirst_m_b8
+#define UINT_V_T vuint32m4_t
+#define VIDV_MASK_UINT vid_v_u32m4_m
+#define VIDV_UINT vid_v_u32m4
+#define VADDVX_MASK_UINT vadd_vx_u32m4_m
+#define VADDVX_UINT vadd_vx_u32m4
+#define VMVVX_UINT vmv_v_x_u32m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
+#define VMVVXS_UINT vmv_x_s_u32m4_u32
+#endif
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    unsigned int min_index = 0;
+    if (n <= 0 || inc_x <= 0) return(min_index);
+
+    FLOAT_V_T vx0, vx1, v_min;
+    UINT_V_T v_min_index;
+    MASK_T mask;
+  
+    size_t vlmax = VSETVL_MAX;
+    v_min_index = VMVVX_UINT(0, vlmax);
+    v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
+    BLASLONG j=0;
+    FLOAT minf=0.0;
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+
+            vx0 = VFABSV_FLOAT(vx0, vl);
+            vx1 = VFABSV_FLOAT(vx1, vl);
+
+            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx0, v_min, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT(v_min, vx0, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+
+            vx0 = VFABSV_FLOAT(vx0, vl);
+            vx1 = VFABSV_FLOAT(vx1, vl);
+
+            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx0, v_min, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT(v_min, vx0, vl);
+        }
+
+    }
+
+    FLOAT_V_T_M1 v_res, v_max;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
+
+    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
+    min_index = VFIRSTM(mask, vlmax);
+
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
+    min_index = VMVVXS_UINT(v_min_index);
+
+    return(min_index+1);
+}
diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c
new file mode 100644
index 000000000..5b1380d2b
--- /dev/null
+++ b/kernel/riscv64/max_rvv.c
@@ -0,0 +1,98 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT vfmax_vv_f32m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT maxf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(maxf);
+
+    FLOAT_V_T vx, vmax;
+    FLOAT_V_T_M1 v_res;
+
+    v_res = VFMVVF_FLOAT_M1(-FLT_MAX, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmax = VFMVVF_FLOAT(-FLT_MAX, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
+       }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
+        }
+
+    }
+
+    v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+
+    return(maxf);
+}
diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c
new file mode 100644
index 000000000..bddcc0ba7
--- /dev/null
+++ b/kernel/riscv64/min_rvv.c
@@ -0,0 +1,98 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMINVV_FLOAT vfmin_vv_f32m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMINVV_FLOAT vfmin_vv_f64m8
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT minf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(minf);
+
+    FLOAT_V_T vx, vmin;
+    FLOAT_V_T_M1 v_res;
+    
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vmin = VFMINVV_FLOAT(vmin, vx, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vmin = VFMINVV_FLOAT(vmin, vx, vl);
+        }
+
+    }
+
+    v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+
+    return(minf);
+}
diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c
new file mode 100644
index 000000000..3f5d50397
--- /dev/null
+++ b/kernel/riscv64/nrm2_rvv.c
@@ -0,0 +1,117 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VFABSV_FLOAT vfabs_v_f32m8
+#define ABS fabsf
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VFABSV_FLOAT vfabs_v_f64m8
+#define ABS fabs
+#endif
+
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+
+    if( n <= 0 ) return(0.0);
+    if(n == 1) return (ABS(x[0]));
+
+    FLOAT_V_T vr, v0;
+    FLOAT_V_T_M1 v_max, v_res;
+    FLOAT scale = 0.0, ssq = 0.0;
+
+    size_t vlmax = VSETVL_MAX;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_max = VFMVVF_FLOAT_M1(0, vlmax);
+
+    vr = VFMVVF_FLOAT(0, vlmax);
+ 
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            v0 = VLEV_FLOAT(x, vl);
+            v0 = VFABSV_FLOAT(v0, vl);
+
+            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
+
+            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+        }
+
+    } else {
+ 
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl * inc_x) {
+            vl = VSETVL(n);
+
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            v0 = VFABSV_FLOAT(v0, vl);
+
+            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
+
+            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+        }
+
+    }
+
+    v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);
+
+    ssq = VFMVFS_FLOAT_M1(v_res);
+    scale = VFMVFS_FLOAT_M1(v_max);
+    ssq = ssq / (scale*scale);
+
+    return(scale * sqrt(ssq));
+}
diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c
new file mode 100644
index 000000000..7bf5e4270
--- /dev/null
+++ b/kernel/riscv64/rot_rvv.c
@@ -0,0 +1,149 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMACCVF_FLOAT vfmacc_vf_f32m8
+#define VFMULVF_FLOAT vfmul_vf_f32m8
+#define VFMSACVF_FLOAT vfmsac_vf_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMACCVF_FLOAT vfmacc_vf_f64m8
+#define VFMULVF_FLOAT vfmul_vf_f64m8
+#define VFMSACVF_FLOAT vfmsac_vf_f64m8
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+
+    if(n <= 0) return(0);
+
+    FLOAT_V_T v0, v1, vx, vy;
+
+    if (inc_x == 0 || inc_y == 0) {
+        BLASLONG i=0;
+        BLASLONG ix=0,iy=0;
+        FLOAT temp;
+        while(i < n)
+        {
+            temp   = c*x[ix] + s*y[iy] ;
+            y[iy]  = c*y[iy] - s*x[ix] ;
+            x[ix]  = temp ;
+
+            ix += inc_x ;
+            iy += inc_y ;
+            i++ ;
+        }
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLEV_FLOAT(y, vl);
+
+            v0 = VFMULVF_FLOAT(vx, c, vl);
+            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
+            VSEV_FLOAT(x, v0, vl);
+
+            v1 = VFMULVF_FLOAT(vx, s, vl);
+            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
+            VSEV_FLOAT(y, v1, vl);
+        }
+
+    } else if(inc_y == 1) {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLEV_FLOAT(y, vl);
+
+            v0 = VFMULVF_FLOAT(vx, c, vl);
+            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
+            VSSEV_FLOAT(x, stride_x, v0, vl);
+
+            v1 = VFMULVF_FLOAT(vx, s, vl);
+            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
+            VSEV_FLOAT(y, v1, vl);
+        }
+ 
+    } else if(inc_x == 1) {
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+
+            v0 = VFMULVF_FLOAT(vx, c, vl);
+            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
+            VSEV_FLOAT(x, v0, vl);
+
+            v1 = VFMULVF_FLOAT(vx, s, vl);
+            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
+            VSSEV_FLOAT(y, stride_y, v1, vl);
+        }
+
+    } else {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = VSETVL(n);
+ 
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+
+            v0 = VFMULVF_FLOAT(vx, c, vl);
+            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
+            VSSEV_FLOAT(x, stride_x, v0, vl);
+
+            v1 = VFMULVF_FLOAT(vx, s, vl);
+            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
+            VSSEV_FLOAT(y, stride_y, v1, vl);
+        }
+
+    }
+    
+    return(0);
+}
diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c
new file mode 100644
index 000000000..d2c0378bf
--- /dev/null
+++ b/kernel/riscv64/scal_rvv.c
@@ -0,0 +1,80 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMULVF_FLOAT vfmul_vf_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMULVF_FLOAT vfmul_vf_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    if ( (n <= 0) || (inc_x <= 0)) return(0);
+
+    FLOAT_V_T v0;
+ 
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            v0 = VLEV_FLOAT(x, vl);
+            v0 = VFMULVF_FLOAT(v0, da, vl);
+            VSEV_FLOAT(x, v0, vl);
+        }
+
+    } else { 
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            v0 = VFMULVF_FLOAT(v0, da, vl);
+            VSSEV_FLOAT(x, stride_x, v0, vl);
+        }
+
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c
new file mode 100644
index 000000000..1db0d09dd
--- /dev/null
+++ b/kernel/riscv64/sum_rvv.c
@@ -0,0 +1,95 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFADDVV_FLOAT vfadd_vv_f32m8
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFADDVV_FLOAT vfadd_vv_f64m8
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT sumf = 0.0;
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    FLOAT_V_T vx, vsum;
+    FLOAT_V_T_M1 v_res;
+
+    v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vsum = VFMVVF_FLOAT(0.0, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vsum = VFADDVV_FLOAT(vsum, vx, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vsum = VFADDVV_FLOAT(vsum, vx, vl);
+        }
+
+    }
+
+    v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax);
+    sumf = VFMVFS_FLOAT_M1(v_res);
+    return(sumf);
+}
diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c
new file mode 100644
index 000000000..2cf92f6ad
--- /dev/null
+++ b/kernel/riscv64/swap_rvv.c
@@ -0,0 +1,142 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    BLASLONG stride_x, stride_y;
+    FLOAT_V_T vx, vy;
+
+    if (n <= 0) return(0);
+
+    if (inc_x == 0 && inc_y == 0) {
+        if (n & 1) {
+            FLOAT temp = x[0];
+            x[0] = y[0];
+            y[0] = temp;
+        }
+        else {
+            return 0;
+        }
+    }
+    else if(inc_x == 0) {
+        FLOAT temp = x[0];
+        x[0] = y[(n - 1) * inc_y];
+        FLOAT* ptr = y + (n - 1) * inc_y;   // start from the last one
+        stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) {
+            vl = VSETVL(m);
+            vy = VLSEV_FLOAT(ptr - 1, stride_y, vl);
+            VSSEV_FLOAT(ptr, stride_y, vy, vl);
+        }
+        y[0] = temp;
+    }
+    else if(inc_y == 0) {
+        FLOAT temp = y[0];
+        y[0] = x[(n - 1) * inc_x];
+        FLOAT* ptr = x + (n - 1) * inc_x;   // start from the last one
+        stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) {
+            vl = VSETVL(m);
+            vx = VLSEV_FLOAT(ptr - 1, stride_x, vl);
+            VSSEV_FLOAT(ptr, stride_x, vx, vl);
+        }
+        x[0] = temp;
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            VSEV_FLOAT(y, vx, vl);
+            VSEV_FLOAT(x, vy, vl);
+        }
+  
+    } else if (inc_y == 1) {
+        stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            VSEV_FLOAT(y, vx, vl);
+            VSSEV_FLOAT(x, stride_x, vy, vl);
+        }
+ 
+    } else if(inc_x == 1) {
+        stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            VSSEV_FLOAT(y, stride_y, vx, vl);
+            VSEV_FLOAT(x, vy, vl);
+        }
+ 
+    } else {
+        stride_x = inc_x * sizeof(FLOAT);
+        stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            VSSEV_FLOAT(y, stride_y, vx, vl);
+            VSSEV_FLOAT(x, stride_x, vy, vl);
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c
new file mode 100644
index 000000000..f0def9617
--- /dev/null
+++ b/kernel/riscv64/symm_lcopy_rvv_v1.c
@@ -0,0 +1,101 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define INT_V_T     vint32m2_t
+#define VID_V_INT   vid_v_i32m2
+#define VADD_VX_INT vadd_vx_i32m2
+#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
+#define VBOOL_T     vbool16_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define INT_V_T     vint64m2_t
+#define VID_V_INT   vid_v_i64m2
+#define VADD_VX_INT vadd_vx_i64m2
+#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
+#define VBOOL_T     vbool32_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#endif
+
+// Optimizes the implementation in ../generic/symm_lcopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    BLASLONG i, js, offset;
+
+    FLOAT *ao1, *ao2;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+
+    FLOAT_V_T vb, va1, va2;
+    VBOOL_T vbool;
+    INT_V_T vindex_max, vindex;
+
+    size_t vl = VSETVL_MAX;
+    vindex_max   = VID_V_INT(vl);
+
+    for (js = n; js > 0; js -= vl, posX += vl) {
+        vl = VSETVL(js);
+        offset = posX - posY;
+
+        ao1 = a + posX + posY * lda;
+        ao2 = a + posY + (posX) * lda;
+
+        for (i = m; i > 0; i--, offset--) {
+            va2 = VLSEV_FLOAT(ao2, stride_lda, vl);
+            va1 = VLEV_FLOAT(ao1, vl);
+
+            // offset > (0 - vindex)   --->   (offset + vindex) > 0
+            vindex = VADD_VX_INT(vindex_max, offset, vl);
+            vbool  = VMSGT_VX_INT(vindex, 0, vl);
+
+            vb =  VMERGE_VVM_FLOAT(vbool, va2, va1, vl);
+            VSEV_FLOAT(b, vb, vl);
+
+            b += vl;
+            ao1 += lda;
+            ao2++;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c
new file mode 100644
index 000000000..958506df3
--- /dev/null
+++ b/kernel/riscv64/symm_ucopy_rvv_v1.c
@@ -0,0 +1,100 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define INT_V_T     vint32m2_t
+#define VID_V_INT   vid_v_i32m2
+#define VADD_VX_INT vadd_vx_i32m2
+#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
+#define VBOOL_T     vbool16_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define INT_V_T     vint64m2_t
+#define VID_V_INT   vid_v_i64m2
+#define VADD_VX_INT vadd_vx_i64m2
+#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
+#define VBOOL_T     vbool32_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#endif
+
+// Optimizes the implementation in ../generic/symm_ucopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    BLASLONG i, js, offset;
+
+    FLOAT *ao1, *ao2;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+    
+    FLOAT_V_T vb, va1, va2;
+    VBOOL_T vbool;
+    INT_V_T vindex_max, vindex;
+
+    size_t vl = VSETVL_MAX;
+    vindex_max   = VID_V_INT(vl);
+
+    for (js = n; js > 0; js -= vl, posX += vl) {
+        vl = VSETVL(js);
+        offset = posX - posY;
+
+        ao1 = a + posY + (posX + 0) * lda;
+        ao2 = a + posX + 0 + posY * lda;
+
+        for (i = m; i > 0; i--, offset--) {
+            va1 = VLSEV_FLOAT(ao1, stride_lda, vl);
+            va2 = VLEV_FLOAT(ao2, vl);
+
+            // offset > (0 - vindex)   --->   (offset + vindex) > 0
+            vindex = VADD_VX_INT(vindex_max, offset, vl);
+            vbool  = VMSGT_VX_INT(vindex, 0, vl);
+
+            vb =  VMERGE_VVM_FLOAT(vbool, va2, va1, vl);
+            VSEV_FLOAT(b, vb, vl);
+
+            b += vl;
+            ao1++;
+            ao2 += lda;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c
new file mode 100644
index 000000000..737abaae3
--- /dev/null
+++ b/kernel/riscv64/symv_L_rvv.c
@@ -0,0 +1,224 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL_MAX_M1       vsetvlmax_e32m1()
+#define VSETVL(n)           vsetvl_e32m8(n)
+#define VSETVL_MAX          vsetvlmax_e32m8()
+#define FLOAT_V_T_M1        vfloat32m1_t
+#define FLOAT_V_T           vfloat32m8_t
+#define VLEV_FLOAT          vle32_v_f32m8
+#define VSEV_FLOAT          vse32_v_f32m8
+#define VLSEV_FLOAT         vlse32_v_f32m8
+#define VSSEV_FLOAT         vsse32_v_f32m8
+#define VFMACCVV_FLOAT      vfmacc_vv_f32m8
+#define VFMACCVF_FLOAT      vfmacc_vf_f32m8
+#define VFNMSACVF_FLOAT     vfnmsac_vf_f32m8
+#define VFMULVF_FLOAT       vfmul_vf_f32m8
+#define VFMVVF_FLOAT        vfmv_v_f_f32m8
+#define VFMSACVF_FLOAT      vfmsac_vf_f32m8
+#define VFMVVF_FLOAT_M1     vfmv_v_f_f32m1
+#define VFREDSUM_FLOAT      vfredusum_vs_f32m8_f32m1
+#define VFMVFS_FLOAT_M1     vfmv_f_s_f32m1_f32
+#else
+#define VSETVL_MAX_M1       vsetvlmax_e64m1()
+#define VSETVL(n)           vsetvl_e64m8(n)
+#define VSETVL_MAX          vsetvlmax_e64m8()
+#define FLOAT_V_T_M1        vfloat64m1_t
+#define FLOAT_V_T           vfloat64m8_t
+#define VLEV_FLOAT          vle64_v_f64m8
+#define VSEV_FLOAT          vse64_v_f64m8
+#define VLSEV_FLOAT         vlse64_v_f64m8
+#define VSSEV_FLOAT         vsse64_v_f64m8
+#define VFMACCVV_FLOAT      vfmacc_vv_f64m8
+#define VFMACCVF_FLOAT      vfmacc_vf_f64m8
+#define VFNMSACVF_FLOAT     vfnmsac_vf_f64m8
+#define VFMULVF_FLOAT       vfmul_vf_f64m8
+#define VFMVVF_FLOAT        vfmv_v_f_f64m8
+#define VFMSACVF_FLOAT      vfmsac_vf_f64m8
+#define VFMVVF_FLOAT_M1     vfmv_v_f_f64m1
+#define VFREDSUM_FLOAT      vfredusum_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1     vfmv_f_s_f64m1_f64
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;
+        BLASLONG jx,jy;
+        FLOAT temp1;
+        FLOAT *a_ptr = a;
+
+        FLOAT_V_T_M1 v_res, v_z0;
+        size_t vlmax = VSETVL_MAX_M1, vl;
+        v_res = VFMVVF_FLOAT_M1(0, vlmax);
+        v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+        vlmax = VSETVL_MAX;
+
+        FLOAT_V_T va, vx, vy, vr;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+
+        if(inc_x == 1 && inc_y == 1)
+        {
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[j];
+                        y[j] += temp1 * a_ptr[j];
+                        i = j + 1;
+                        vr = VFMVVF_FLOAT(0, vlmax);
+                        for (k = (m-i); k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                vr = VFMVVF_FLOAT(0, vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VLEV_FLOAT(&y[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSEV_FLOAT(&y[i], vy, vl);
+
+                                vx = VLEV_FLOAT(&x[i], vl);
+                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                        }
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+
+                        y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                }
+        }
+        else if(inc_x == 1)
+        {
+                jy = 0;
+                stride_y = inc_y * sizeof(FLOAT);
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[j];
+                        y[jy] += temp1 * a_ptr[j];
+                        iy = jy + inc_y;
+                        i = j + 1;
+                        vr = VFMVVF_FLOAT(0, vlmax);
+                        for (k = (m-i); k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_yv = inc_y * vl;
+                                vr = VFMVVF_FLOAT(0, vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                                vx = VLEV_FLOAT(&x[i], vl);
+                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+
+                        y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
+                        jy    += inc_y;
+                        a_ptr += lda;
+                }
+        }
+        else if(inc_y == 1)
+        {
+                jx = 0;
+                stride_x = inc_x * sizeof(FLOAT);
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        y[j] += temp1 * a_ptr[j];
+                        ix = jx + inc_x;
+                        i = j + 1;
+                        vr = VFMVVF_FLOAT(0, vlmax);
+                        for (k = (m-i); k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                vr = VFMVVF_FLOAT(0, vl);
+                                inc_xv = inc_x * vl;
+
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VLEV_FLOAT(&y[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSEV_FLOAT(&y[i], vy, vl);
+
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                                ix += inc_xv;
+                        }
+
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+
+                        y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
+                        jx    += inc_x;
+                        a_ptr += lda;
+                }
+        }
+        else
+        {
+                stride_x = inc_x * sizeof(FLOAT);
+                stride_y = inc_y * sizeof(FLOAT);
+                jx = 0;
+                jy = 0;
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        y[jy] += temp1 * a_ptr[j];
+                        ix = jx + inc_x;
+                        iy = jy + inc_y;
+                        i = j + 1;
+                        vr = VFMVVF_FLOAT(0, vlmax);
+                        for (k = (m-i); k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_xv = inc_x * vl;
+                                inc_yv = inc_y * vl;
+                                vr = VFMVVF_FLOAT(0, vl);
+                                
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                                ix += inc_xv;
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+
+                        y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
+                        jx    += inc_x;
+                        jy    += inc_y;
+                        a_ptr += lda;
+                }
+        }
+        return(0);
+}
+
diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c
new file mode 100644
index 000000000..cb923be5d
--- /dev/null
+++ b/kernel/riscv64/symv_U_rvv.c
@@ -0,0 +1,221 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL_MAX_M1       vsetvlmax_e32m1()
+#define VSETVL(n)           vsetvl_e32m8(n)
+#define VSETVL_MAX          vsetvlmax_e32m8()
+#define FLOAT_V_T_M1        vfloat32m1_t
+#define FLOAT_V_T           vfloat32m8_t
+#define VLEV_FLOAT          vle32_v_f32m8
+#define VSEV_FLOAT          vse32_v_f32m8
+#define VLSEV_FLOAT         vlse32_v_f32m8
+#define VSSEV_FLOAT         vsse32_v_f32m8
+#define VFMACCVV_FLOAT      vfmacc_vv_f32m8
+#define VFMACCVF_FLOAT      vfmacc_vf_f32m8
+#define VFNMSACVF_FLOAT     vfnmsac_vf_f32m8
+#define VFMULVF_FLOAT       vfmul_vf_f32m8
+#define VFMVVF_FLOAT        vfmv_v_f_f32m8
+#define VFMSACVF_FLOAT      vfmsac_vf_f32m8
+#define VFMVVF_FLOAT_M1     vfmv_v_f_f32m1
+#define VFREDSUM_FLOAT      vfredusum_vs_f32m8_f32m1
+#define VFMVFS_FLOAT_M1     vfmv_f_s_f32m1_f32
+#else
+#define VSETVL_MAX_M1       vsetvlmax_e64m1()
+#define VSETVL(n)           vsetvl_e64m8(n)
+#define VSETVL_MAX          vsetvlmax_e64m8()
+#define FLOAT_V_T_M1        vfloat64m1_t
+#define FLOAT_V_T           vfloat64m8_t
+#define VLEV_FLOAT          vle64_v_f64m8
+#define VSEV_FLOAT          vse64_v_f64m8
+#define VLSEV_FLOAT         vlse64_v_f64m8
+#define VSSEV_FLOAT         vsse64_v_f64m8
+#define VFMACCVV_FLOAT      vfmacc_vv_f64m8
+#define VFMACCVF_FLOAT      vfmacc_vf_f64m8
+#define VFNMSACVF_FLOAT     vfnmsac_vf_f64m8
+#define VFMULVF_FLOAT       vfmul_vf_f64m8
+#define VFMVVF_FLOAT        vfmv_v_f_f64m8
+#define VFMSACVF_FLOAT      vfmsac_vf_f64m8
+#define VFMVVF_FLOAT_M1     vfmv_v_f_f64m1
+#define VFREDSUM_FLOAT      vfredusum_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1     vfmv_f_s_f64m1_f64
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;
+        BLASLONG jx,jy;
+        FLOAT temp1;
+        FLOAT *a_ptr = a;
+        FLOAT_V_T_M1 v_res, v_z0;
+        size_t vl_max = VSETVL_MAX_M1, vl;
+        v_res = VFMVVF_FLOAT_M1(0, vl_max);
+        v_z0 = VFMVVF_FLOAT_M1(0, vl_max);
+        vl_max = VSETVL_MAX;
+
+        FLOAT_V_T va, vx, vy, vr;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+
+        BLASLONG m1 = m - offset;
+        if(inc_x == 1 && inc_y == 1)
+        {
+                a_ptr += m1 * lda;
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[j];
+                        i = 0;
+                        vr = VFMVVF_FLOAT(0, vl_max);
+                        for (k = j; k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                vr = VFMVVF_FLOAT(0, vl);
+                                vy = VLEV_FLOAT(&y[i], vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSEV_FLOAT(&y[i], vy, vl);
+
+                                vx = VLEV_FLOAT(&x[i], vl);
+                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                        }
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+
+                        y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                }
+        }
+        else if(inc_x == 1)
+        {
+                jy = m1 * inc_y;
+                a_ptr += m1 * lda;
+                stride_y = inc_y * sizeof(FLOAT);
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[j];
+                        iy = 0;
+                        i = 0;
+                        vr = VFMVVF_FLOAT(0, vl_max);
+                        for (k = j; k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_yv = inc_y * vl;
+                                vr = VFMVVF_FLOAT(0, vl);
+                                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                                vx = VLEV_FLOAT(&x[i], vl);
+                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+
+                        y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                        jy    += inc_y;
+                }
+        }
+        else if(inc_y == 1)
+        {
+                jx = m1 * inc_x;
+                a_ptr += m1 * lda;
+                stride_x = inc_x * sizeof(FLOAT);
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        ix = 0;
+                        i = 0;
+                        vr = VFMVVF_FLOAT(0, vl_max);
+                        for (k = j; k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_xv = inc_x * vl;
+                                vr = VFMVVF_FLOAT(0, vl);
+
+                                vy = VLEV_FLOAT(&y[i], vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSEV_FLOAT(&y[i], vy, vl);
+
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+
+                                ix += inc_xv;
+                        }
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+
+                        y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                        jx    += inc_x;
+                }
+        }
+        else
+        {
+                jx = m1 * inc_x;
+                jy = m1 * inc_y;
+                a_ptr += m1 * lda;
+                stride_x = inc_x * sizeof(FLOAT);
+                stride_y = inc_y * sizeof(FLOAT);
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[jx];
+
+                        ix = 0;
+                        iy = 0;
+                        i = 0;
+                        vr = VFMVVF_FLOAT(0, vl_max);
+                        for (k = j; k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_xv = inc_x * vl;
+                                inc_yv = inc_y * vl;
+                                vr = VFMVVF_FLOAT(0, vl);
+                                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                ix += inc_xv;
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+
+                        y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                        jx    += inc_x;
+                        jy    += inc_y;
+                }
+        }
+        return(0);
+}
diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..73a8233f8
--- /dev/null
+++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c
@@ -0,0 +1,138 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vint32m2_t
+#define VID_V_UINT vid_v_i32m2
+#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#endif
+
+// Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+
+    FLOAT *ao;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+    
+    FLOAT_V_T vb, va1;
+
+    size_t vl;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY) 
+        {
+            ao = a + posY + posX * lda;
+        } 
+        else 
+        {
+            ao = a + posX + posY * lda;
+        }
+
+        i = 0;
+        do 
+        {
+            if (X > posY) 
+            {
+                va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                VSEV_FLOAT(b, va1, vl);
+
+                ao ++;
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else if (X < posY) 
+            {
+                ao += lda;
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    vb =  VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao++;
+                    b += vl;
+                }
+
+                X += vl;
+                i += vl;
+            }
+        } while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..2fe8cf79e
--- /dev/null
+++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c
@@ -0,0 +1,134 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#endif
+
+// Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+
+    FLOAT *ao;
+    
+    FLOAT_V_T vb, va1;
+    size_t vl;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY) 
+        {
+            ao = a + posY + posX * lda;
+        } 
+        else 
+        {
+            ao = a + posX + posY * lda;
+        }
+
+        i = 0;
+        do 
+        {
+            if (X > posY) 
+            {
+                ao ++;
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else if (X < posY) 
+            {
+                va1 = VLEV_FLOAT(ao, vl);
+                VSEV_FLOAT(b, va1, vl);
+
+                ao += lda;
+                b += vl;
+                X ++;
+                i ++;
+            }
+            else
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    vb =  VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao += lda;
+                    b += vl;
+                }
+                X += vl;
+                i += vl;
+
+            }
+        } while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..b64cd840d
--- /dev/null
+++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c
@@ -0,0 +1,136 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T    vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#endif
+
+// Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+    BLASLONG stride_lda = sizeof(FLOAT) * lda;
+    FLOAT *ao;
+
+    FLOAT_V_T vb, va1;
+    size_t vl;
+
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY) 
+        {
+            ao = a + posX + posY * lda;
+        } 
+        else 
+        {
+            ao = a + posY + posX * lda;
+        }
+
+        i = 0;
+        do
+        {
+            if (X < posY) 
+            {
+                va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                VSEV_FLOAT(b, va1, vl);
+
+                ao ++;
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else if (X > posY) 
+            {
+                ao += lda;
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    vb =  VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao++;
+                    b += vl;
+                }
+
+                X += vl;
+                i += vl;
+            }
+        }while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..b96daae5b
--- /dev/null
+++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c
@@ -0,0 +1,133 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#endif
+
+// Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, j, js, X;
+
+    FLOAT *ao;
+    FLOAT_V_T vb, va1;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+
+        X = posX;
+
+        if (posX <= posY) 
+        {
+            ao = a + posX + posY * lda;
+        } 
+        else 
+        {
+            ao = a + posY + posX * lda;
+        }
+
+        i = 0;
+        do
+        {
+            if (X < posY) 
+            {
+                ao ++;
+                b += vl;
+                X ++;
+                i++;
+            }
+            else if (X > posY)
+            {
+                va1 = VLEV_FLOAT(ao, vl);
+                VSEV_FLOAT(b, va1, vl);
+                ao += lda;
+                b += vl;
+                X++;
+                i++;
+            }
+            else
+            {
+                vindex  = VID_V_UINT(vl);
+                for (j = 0; j < vl; j++) 
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    vb =  VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao += lda;
+                    b += vl;
+                }
+                X += vl;
+                i += vl;
+            }
+        }while (i < m);
+        posY += vl;
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c
new file mode 100644
index 000000000..127e76970
--- /dev/null
+++ b/kernel/riscv64/trmmkernel_2x2_rvv.c
@@ -0,0 +1,342 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFMACCVV_FLOAT vfmacc_vv_f32m4
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFMACCVV_FLOAT vfmacc_vv_f64m4
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+
+// Optimizes the implementation in ../generic/trmmkernel_2x2.c
+
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+        ,BLASLONG offset
+#endif
+        )
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*ptrba,*ptrbb;
+   BLASLONG off, temp;
+
+   FLOAT_V_T va0, va1, vb0, vb1;
+   FLOAT_V_T vres0, vres1, vres2, vres3;
+   FLOAT_V_T_M1 v_res, v_z0;
+   v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+   size_t vl;
+   size_t vlmax = VSETVL_MAX;
+   
+#if defined(TRMMKERNEL) && !defined(LEFT)
+   off = -offset;
+#else
+   off = 0;
+#endif
+
+   for (j = bn/2; j > 0; j--)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+
+        for (i = bm/2; i > 0; i--)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || \
+              (!defined(LEFT) && !defined(TRANSA))
+             ptrbb = bb;
+#else
+              ptrba += off*2;
+              ptrbb = bb + off*2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || \
+             (!defined(LEFT) && defined(TRANSA))
+             temp = bk-off;
+#elif defined(LEFT)
+             temp = off+2;
+#else
+             temp = off+2;
+#endif
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
+
+                ptrba += vl * 2;
+                ptrbb += vl * 2;
+            }
+            v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
+            v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(v_res);
+            v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(v_res);
+            v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax);
+            C1[1] = alpha * VFMVFS_FLOAT_M1(v_res);
+
+#if ( defined(LEFT) && defined(TRANSA)) || \
+             (!defined(LEFT) && !defined(TRANSA))
+             temp = bk - off;
+#ifdef LEFT
+             temp -= 2;
+#else
+             temp -= 2;
+#endif
+             ptrba += temp*2;
+             ptrbb += temp*2;
+#endif
+#ifdef LEFT
+             off += 2;
+#endif
+             C0 = C0+2;
+             C1 = C1+2;
+        }
+
+        if (bm & 1)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+             ptrbb = bb;
+#else
+             ptrba += off;
+             ptrbb = bb+off*2;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+             temp = bk-off;
+#elif defined(LEFT)
+             temp = off+1;
+#else
+             temp = off+2;
+#endif
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                va0 = VLEV_FLOAT(ptrba, vl);
+                VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl);
+                  
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                ptrba += vl;
+                ptrbb += vl * 2;
+
+            }
+            v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
+            v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(v_res);
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+             temp = bk-off;
+#ifdef LEFT
+             temp -= 1;
+#else
+             temp -= 2;
+#endif
+             ptrba += temp;
+             ptrbb += temp*2;
+#endif
+#ifdef LEFT
+             off += 1;
+#endif
+             C0 = C0+1;
+             C1 = C1+1;
+        }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+   }
+
+   if (bn & 1)
+   {
+        C0 = C;
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+
+        for (i = bm/2; i > 0; i--)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || \
+              (!defined(LEFT) && !defined(TRANSA))
+              ptrbb = bb;
+#else
+              ptrba += off*2;
+              ptrbb = bb + off;
+#endif
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || \
+             (!defined(LEFT) && defined(TRANSA))
+             temp = bk-off;
+#elif defined(LEFT)
+             temp = off+2;
+#else
+             temp = off+1;
+#endif
+             vres0 = VFMVVF_FLOAT(0.0, vlmax);
+             vres1 = VFMVVF_FLOAT(0.0, vlmax);
+             
+             for (k = temp; k > 0; k -= vl)
+             {
+                 vl = VSETVL(k);
+                 vb0 = VLEV_FLOAT(ptrbb, vl);
+                 VLSEG_FLOAT(&va0, &va1, ptrba, vl);
+                   
+                 vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl);
+                 vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl);
+             
+                 ptrba += vl * 2;
+                 ptrbb += vl;
+             
+             }
+             v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
+             C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
+             v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
+             C0[1] = alpha * VFMVFS_FLOAT_M1(v_res);
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+             (!defined(LEFT) && !defined(TRANSA))
+             temp = bk - off;
+#ifdef LEFT
+             temp -= 2;
+#else
+             temp -= 1;
+#endif
+             ptrba += temp*2;
+             ptrbb += temp;
+#endif
+#ifdef LEFT
+             off += 2;
+#endif
+
+             C0 = C0+2;
+        }
+
+        if (bm & 1)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+             ptrbb = bb;
+#else
+             ptrba += off;
+             ptrbb = bb+off;
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+             temp = bk-off;
+#elif defined(LEFT)
+             temp = off + 1;
+#else
+             temp = off + 1;
+#endif
+             vres0 = VFMVVF_FLOAT(0.0, vlmax);
+             
+             for (k = temp; k > 0; k -= vl)
+             {
+                 vl = VSETVL(k);
+                 va0 = VLEV_FLOAT(ptrba, vl);
+                 vb0 = VLEV_FLOAT(ptrbb, vl);
+                   
+                 vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl);
+                 ptrba += vl;
+                 ptrbb += vl;
+             }
+             v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
+             C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+             temp = bk-off;
+#ifdef LEFT
+             temp -= 1;
+#else
+             temp -= 1;
+#endif
+             ptrba += temp;
+             ptrbb += temp;
+#endif
+#ifdef LEFT
+             off += 1;
+#endif
+             C0 = C0+1;
+        }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1;
+#endif
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
+
diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c
new file mode 100644
index 000000000..3e46c6348
--- /dev/null
+++ b/kernel/riscv64/trmmkernel_4x4_rvv.c
@@ -0,0 +1,881 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <stdbool.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n)           vsetvl_e32m2(n)
+#define VSETVL_MAX          vsetvlmax_e32m2()
+#define VSETVL_MAX_M1       vsetvlmax_e32m1()
+#define FLOAT_V_T           vfloat32m2_t
+#define FLOAT_V_T_M1        vfloat32m1_t
+#define VLEV_FLOAT          vle32_v_f32m2
+#define VLSEG4_FLOAT        vlseg4e32_v_f32m2
+#define VLSEG2_FLOAT        vlseg2e32_v_f32m2
+#define VFMVVF_FLOAT        vfmv_v_f_f32m2
+#define VFMUL_FLOAT         vfmul_vv_f32m2
+#define VFMACCVF_FLOAT      vfmacc_vf_f32m2
+#define VFMACCVV_FLOAT      vfmacc_vv_f32m2
+#define VFREDSUMVS_FLOAT    vfredusum_vs_f32m2_f32m1
+#define VFMVVF_FLOAT_M1     vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1     vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n)           vsetvl_e64m2(n)
+#define VSETVL_MAX          vsetvlmax_e64m2()
+#define VSETVL_MAX_M1       vsetvlmax_e64m1()
+#define FLOAT_V_T           vfloat64m2_t
+#define FLOAT_V_T_M1        vfloat64m1_t
+#define VLEV_FLOAT          vle64_v_f64m2
+#define VLSEG4_FLOAT        vlseg4e64_v_f64m2
+#define VLSEG2_FLOAT        vlseg2e64_v_f64m2
+#define VFMVVF_FLOAT        vfmv_v_f_f64m2
+#define VFMUL_FLOAT         vfmul_vv_f64m2
+#define VFMACCVF_FLOAT      vfmacc_vf_f64m2
+#define VFMACCVV_FLOAT      vfmacc_vv_f64m2
+#define VFREDSUMVS_FLOAT    vfredusum_vs_f64m2_f64m1
+#define VFMVVF_FLOAT_M1     vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1     vfmv_f_s_f64m1_f64
+#endif
+
+
+// Optimizes the implementation in ../generic/trmmkernel_4x4.c
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+
+   FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
+   FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0;
+   v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+   size_t vl;
+   size_t vlmax = VSETVL_MAX;
+
+   FLOAT_V_T vres0_0;
+   FLOAT_V_T vres0_1;
+   FLOAT_V_T vres0_2;
+   FLOAT_V_T vres0_3;
+
+   FLOAT_V_T vres1_0;
+   FLOAT_V_T vres1_1;
+   FLOAT_V_T vres1_2;
+   FLOAT_V_T vres1_3;
+
+   FLOAT_V_T vres2_0;
+   FLOAT_V_T vres2_1;
+   FLOAT_V_T vres2_2;
+   FLOAT_V_T vres2_3;
+
+   FLOAT_V_T vres3_0;
+   FLOAT_V_T vres3_1;
+   FLOAT_V_T vres3_2;
+   FLOAT_V_T vres3_3;
+
+   BLASLONG off, temp;
+
+   bool left;
+   bool transposed;
+   bool backwards;
+
+#ifdef LEFT
+   left = true;
+#else
+   left = false;
+#endif
+
+#ifdef TRANSA
+   transposed = true;
+#else
+   transposed = false;
+#endif
+
+   backwards = left != transposed;
+
+   if (!left) {
+      off = -offset;
+   }
+
+
+   for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+
+        if (left) {
+            off = offset;
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x4
+        {
+
+            ptrbb = bb;
+            if (backwards)
+            {
+                ptrba += off*4; // number of values in A
+                ptrbb += off*4; // number of values in B
+            }
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+            vres0_2 = VFMVVF_FLOAT(0, vlmax);
+            vres0_3 = VFMVVF_FLOAT(0, vlmax);
+            
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_1 = VFMVVF_FLOAT(0, vlmax);
+            vres1_2 = VFMVVF_FLOAT(0, vlmax);
+            vres1_3 = VFMVVF_FLOAT(0, vlmax);
+            
+            vres2_0 = VFMVVF_FLOAT(0, vlmax);
+            vres2_1 = VFMVVF_FLOAT(0, vlmax);
+            vres2_2 = VFMVVF_FLOAT(0, vlmax);
+            vres2_3 = VFMVVF_FLOAT(0, vlmax);
+            
+            vres3_0 = VFMVVF_FLOAT(0, vlmax);
+            vres3_1 = VFMVVF_FLOAT(0, vlmax);
+            vres3_2 = VFMVVF_FLOAT(0, vlmax);
+            vres3_3 = VFMVVF_FLOAT(0, vlmax);
+
+            temp = backwards ? bk-off :
+                         left ? off + 4 : // number of values in A
+                                off + 4;  // number of values in B
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+                vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
+                vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
+            
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
+                vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl);
+                vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl);
+
+                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
+                vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl);
+                vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl);
+                vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl);
+
+                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
+                vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl);
+                vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl);
+                vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl);
+
+                ptrba += vl * 4;
+                ptrbb += vl * 4;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax);
+            C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax);
+            C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            if (!backwards) {
+                temp = bk-off;
+                temp = left ? temp - 4 : // number of values in A
+                              temp - 4;  // number of values in B
+
+                ptrba += temp*4; // number of values in A
+                ptrbb += temp*4; // number of values in B
+            }
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+            C2 = C2+4;
+            C3 = C3+4;
+
+        }
+
+        if ( bm & 2 ) // do any 2x4 loop
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*4;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+            
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_1 = VFMVVF_FLOAT(0, vlmax);
+            
+            vres2_0 = VFMVVF_FLOAT(0, vlmax);
+            vres2_1 = VFMVVF_FLOAT(0, vlmax);
+            
+            vres3_0 = VFMVVF_FLOAT(0, vlmax);
+            vres3_1 = VFMVVF_FLOAT(0, vlmax);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;   // number of values in A
+#else
+            temp = off+4;   // number of values in B
+#endif
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+                vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
+                vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
+
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
+                vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl);
+                vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl);
+
+                ptrba += vl * 2;
+                ptrbb += vl * 4;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax);
+
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
+            
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax);
+
+            C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+            C1 = C1+2;
+            C2 = C2+2;
+            C3 = C3+2;
+
+        }
+
+        if ( bm & 1 ) // do any 1x4 loop
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*4;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+            vres2_0 = VFMVVF_FLOAT(0, vlmax);
+            vres3_0 = VFMVVF_FLOAT(0, vlmax);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;   // number of values in A
+#else
+            temp = off+4;   // number of values in B
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                va0 = VLEV_FLOAT(ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+                vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
+                vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
+
+                ptrba += vl;
+                ptrbb += vl * 4;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax);
+
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+            C2 = C2+1;
+            C3 = C3+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+    for (j=0; j<(bn&2); j+=2) // do the Mx2 loops 
+    {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x2
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*2;
+#endif
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+            vres0_2 = VFMVVF_FLOAT(0, vlmax);
+            vres0_3 = VFMVVF_FLOAT(0, vlmax);
+            
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_1 = VFMVVF_FLOAT(0, vlmax);
+            vres1_2 = VFMVVF_FLOAT(0, vlmax);
+            vres1_3 = VFMVVF_FLOAT(0, vlmax);
+        
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;   // number of values in A
+#else
+            temp = off+2;   // number of values in B
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+            
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
+            
+                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
+                vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl);
+            
+                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
+                vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl);
+            
+                ptrba += vl * 4;
+                ptrbb += vl * 2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+
+        }
+
+        if ( bm & 2 ) // do any 2x2 loop
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*2;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+            
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_1 = VFMVVF_FLOAT(0, vlmax);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;   // number of values in A
+#else
+            temp = off+2;   // number of values in B
+#endif
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+            
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
+            
+                ptrba += vl * 2;
+                ptrbb += vl * 2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax);
+
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+            C1 = C1+2;
+
+        }
+
+        if ( bm & 1 ) // do any 1x2 loop
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*2;
+#endif
+
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres1_0 = VFMVVF_FLOAT(0, vlmax);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;   // number of values in A
+#else
+            temp = off+2;   // number of values in B
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                va0 = VLEV_FLOAT(ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
+                    
+                ptrba += vl;
+                ptrbb += vl * 2;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+            C1 = C1+1;
+
+        }
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+    for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
+    {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+    off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*4;
+            ptrbb = bb + off*1;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+            vres0_2 = VFMVVF_FLOAT(0, vlmax);
+            vres0_3 = VFMVVF_FLOAT(0, vlmax);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+4;   // number of values in A
+#else
+            temp = off+1;   // number of values in B
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+           
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+            
+                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
+            
+                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
+            
+                ptrba += vl * 4;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
+            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
+            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 4; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*4;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 4; // number of values in A
+#endif
+
+            C0 = C0+4;
+
+        }
+
+        if ( bm & 2 ) // do any 2x1 loop
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*1;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+            vres0_1 = VFMVVF_FLOAT(0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+2;   // number of values in A
+#else
+            temp = off+1;   // number of values in B
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+           
+                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
+           
+                ptrba += vl * 2;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 2; // number of values in A
+#endif
+
+            C0 = C0+2;
+
+        }
+
+        if ( bm & 1 ) // do any 1x1 loop
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*1;
+            ptrbb = bb + off*1;
+#endif
+
+            vres0_0 = VFMVVF_FLOAT(0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+1;   // number of values in A
+#else
+            temp = off+1;   // number of values in B
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                va0 = VLEV_FLOAT(ptrba, vl);
+                vb0 = VLEV_FLOAT(ptrbb, vl);
+            
+                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
+                  
+                ptrba += vl;
+                ptrbb += vl;
+            }
+
+            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
+            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*1;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += 1; // number of values in A
+#endif
+
+            C0 = C0+1;
+
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+    }
+    return 0;
+}
diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c
new file mode 100644
index 000000000..97b14650c
--- /dev/null
+++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c
@@ -0,0 +1,685 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
+#endif
+
+
+// Optimizes the implementation in ../generic/trmmkernel_8x8.c
+
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+    //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc);
+
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
+
+    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+    size_t vl;
+
+    BLASLONG off, temp;
+
+#if !defined(LEFT)
+    off = -offset;
+#else
+    off = 0;
+#endif
+    for (j = bn/8; j > 0; j--)
+    {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+        C4 = C3+ldc;
+        C5 = C4+ldc;
+        C6 = C5+ldc;
+        C7 = C6+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl;
+            ptrbb = bb + off*8;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+            vres4 = VFMVVF_FLOAT(0.0, vl);
+            vres5 = VFMVVF_FLOAT(0.0, vl);
+            vres6 = VFMVVF_FLOAT(0.0, vl);
+            vres7 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+8;   // number of values in B
+#endif
+
+            for (k = temp/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+                ptrbb += 8;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
+                ptrbb += 8;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
+                ptrbb += 8;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
+                ptrbb += 8;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
+                ptrbb += 8;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
+                ptrbb += 8;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
+                ptrbb += 8;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
+                ptrbb += 8;
+            }
+
+            for (k = temp&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+                ptrba += vl;
+            }
+
+            va0 = VFMULVF_FLOAT(vres0, alpha, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VFMULVF_FLOAT(vres1, alpha, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            va2 = VFMULVF_FLOAT(vres2, alpha, vl);
+            VSEV_FLOAT(C2, va2, vl);
+
+            va3 = VFMULVF_FLOAT(vres3, alpha, vl);
+            VSEV_FLOAT(C3, va3, vl);
+
+            va4 = VFMULVF_FLOAT(vres4, alpha, vl);
+            VSEV_FLOAT(C4, va4, vl);
+
+            va5 = VFMULVF_FLOAT(vres5, alpha, vl);
+            VSEV_FLOAT(C5, va5, vl);
+
+            va6 = VFMULVF_FLOAT(vres6, alpha, vl);
+            VSEV_FLOAT(C6, va6, vl);
+
+            va7 = VFMULVF_FLOAT(vres7, alpha, vl);
+            VSEV_FLOAT(C7, va7, vl);
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            ptrba += temp*vl;
+            ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl;
+            C1 += vl;
+            C2 += vl;
+            C3 += vl;
+            C4 += vl;
+            C5 += vl;
+            C6 += vl;
+            C7 += vl;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 8;
+#endif
+
+        bb += (bk<<3);
+        C += (ldc<<3);
+    }
+
+    if (bn & 4)
+    {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl;
+            ptrbb = bb + off*4;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+4;   // number of values in B
+#endif
+
+            for (k = temp/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                ptrbb += 4;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
+                ptrbb += 4;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
+                ptrbb += 4;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
+                ptrbb += 4;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
+                ptrbb += 4;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
+                ptrbb += 4;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
+                ptrbb += 4;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
+                ptrbb += 4;
+            }
+
+            // K remainder
+            for (k = temp&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+                ptrba += vl;
+            }
+
+            va0 = VFMULVF_FLOAT(vres0, alpha, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VFMULVF_FLOAT(vres1, alpha, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            va2 = VFMULVF_FLOAT(vres2, alpha, vl);
+            VSEV_FLOAT(C2, va2, vl);
+
+            va3 = VFMULVF_FLOAT(vres3, alpha, vl);
+            VSEV_FLOAT(C3, va3, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*vl;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl;
+            C1 += vl;
+            C2 += vl;
+            C3 += vl;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4;
+#endif
+
+        bb += (bk<<2);
+        C += (ldc<<2);
+    }
+
+    if (bn & 2)
+    {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) 
+        {
+            vl = VSETVL(i);
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl;
+            ptrbb = bb + off*2;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+2;   // number of values in B
+#endif
+
+            for (k = temp/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                ptrbb += 2;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                ptrbb += 2;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                ptrbb += 2;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                ptrbb += 2;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                ptrbb += 2;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                ptrbb += 2;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                ptrbb += 2;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                ptrbb += 2;
+            }
+
+            // K remainder
+            for (k = temp&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrbb += 2;
+                ptrba += vl;
+            }
+            va0 = VFMULVF_FLOAT(vres0, alpha, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VFMULVF_FLOAT(vres1, alpha, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*vl;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl;
+            C1 += vl;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+
+        bb += (bk<<1);
+        C += (ldc<<1);
+    }
+
+    if (bn & 1)
+    {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl;
+            ptrbb = bb + off*1;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+1;   // number of values in B
+#endif
+
+            for (k = temp/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                ptrbb += 1;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                ptrbb += 1;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                ptrbb += 1;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                ptrbb += 1;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                ptrbb += 1;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                ptrbb += 1;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                ptrbb += 1;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                ptrbb += 1;
+            }
+
+            // K remainder
+            for (k = temp&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+
+                ptrbb += 1;
+                ptrba += vl;
+            }
+            va0 = VFMULVF_FLOAT(vres0, alpha, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*vl;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1;
+#endif
+
+        bb += (bk);
+        C += (ldc);
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
new file mode 100644
index 000000000..11a0398ca
--- /dev/null
+++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
@@ -0,0 +1,847 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSSEV_FLOAT vsse32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSSEV_FLOAT vsse64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#endif
+
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL   GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL   GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c
+
+#ifndef COMPLEX
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa,  bb;
+    FLOAT *pa, *pc;
+
+    int i, j, k;
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
+
+    size_t vl;
+    FLOAT_V_T va, vc;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--) 
+    {
+        aa = *(a + i);
+        for (j = 0; j < n; j ++) 
+        {
+            bb = *(c + i + j * ldc);
+            bb *= aa;
+            *b             = bb;
+            *(c + i + j * ldc) = bb;
+            b ++;
+
+            pa = a;
+            pc = c + j * ldc;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc = VLEV_FLOAT(pc, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
+                VSEV_FLOAT(pc, vc, vl);
+                pa += vl;
+                pc += vl;
+            }
+        }
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
+#elif GEMM_DEFAULT_UNROLL_N == 2
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa,  bb0, bb1;
+    FLOAT *pa, *pc, *pc0, *pc1;
+    FLOAT *pb0, *pb1;
+
+    int i, j, k;
+    fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
+
+    size_t vl;
+    FLOAT_V_T va, vc0, vc1;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--) 
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/2; j ++) 
+        {
+            //bb = *(c + i + j * ldc);
+            pb0 = pc + j * ldc * 2;
+            pb1 = pb0 + ldc;
+            //bb *= aa;
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            //*b             = bb;
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *pb0    = bb0;
+            *pb1    = bb1;
+
+            //*(c + i + j * ldc) = bb;
+            //b ++;
+
+            b += 2;
+            //pa = a + i + 1;
+            pc0 = c + j * ldc * 2;
+            pc1 = pc0 + ldc;
+            pa = a;
+            //pc = c + j * ldc;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+        }
+        pc += ldc * (n/2) * 2;
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = (*pb0) * aa;
+            *b      = bb0;
+            *pb0    = bb0;
+            b += 1;
+
+            pc0 = pc - i;
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+
+                pa += vl;
+                pc0 += vl;
+            }
+        }
+
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
+
+#elif GEMM_DEFAULT_UNROLL_N == 4
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa,  bb0, bb1, bb2, bb3;
+    FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3;
+    FLOAT *pb0, *pb1, *pb2, *pb3;
+
+    int i, j, k;
+
+    size_t vl;
+    FLOAT_V_T va, vc0, vc1, vc2, vc3;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--) 
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/4; j ++) 
+        {
+            pb0 = pc + j * ldc * 4;
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+            
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *(b+2)  = bb2;
+            *(b+3)  = bb3;
+
+            *pb0    = bb0;
+            *pb1    = bb1;
+            *pb2    = bb2;
+            *pb3    = bb3;
+
+            b += 4;
+
+            pc0 = c + j * ldc * 4;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+            }
+        }
+        pc += ldc * (n/4) * 4;
+
+        if (n & 2)
+        {
+            pb0 = pc + j * ldc * 2;
+            pb1 = pb0 + ldc;
+            
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+
+            *b      = bb0;
+            *(b+1)  = bb1;
+
+            *pb0    = bb0;
+            *pb1    = bb1;
+
+            b += 2;
+
+            pc0 = c + j * ldc * 2;
+            pc1 = pc0 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+            pc += ldc * 2;
+        }
+
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = (*pb0) * aa;
+            *b      = bb0;
+            *pb0    = bb0;
+            b += 1;
+
+            pc0 = pc - i;
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+
+                pa += vl;
+                pc0 += vl;
+            }
+        }
+
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
+#elif GEMM_DEFAULT_UNROLL_N == 8
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa,  bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
+    FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
+    FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
+
+    int i, j, k;
+
+    size_t vl;
+    FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
+
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--) 
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/8; j ++) 
+        {
+            pb0 = pc + j * ldc * 8;
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+            pb4 = pb3 + ldc;
+            pb5 = pb4 + ldc;
+            pb6 = pb5 + ldc;
+            pb7 = pb6 + ldc;
+            
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+            bb4 = (*pb4) * aa;
+            bb5 = (*pb5) * aa;
+            bb6 = (*pb6) * aa;
+            bb7 = (*pb7) * aa;
+
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *(b+2)  = bb2;
+            *(b+3)  = bb3;
+            *(b+4)  = bb4;
+            *(b+5)  = bb5;
+            *(b+6)  = bb6;
+            *(b+7)  = bb7;
+
+            *pb0    = bb0;
+            *pb1    = bb1;
+            *pb2    = bb2;
+            *pb3    = bb3;
+            *pb4    = bb4;
+            *pb5    = bb5;
+            *pb6    = bb6;
+            *pb7    = bb7;
+
+            b += 8;
+
+            pc0 = c + j * ldc * 8;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+            pc4 = pc3 + ldc;
+            pc5 = pc4 + ldc;
+            pc6 = pc5 + ldc;
+            pc7 = pc6 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+                vc4 = VLEV_FLOAT(pc4, vl);
+                vc5 = VLEV_FLOAT(pc5, vl);
+                vc6 = VLEV_FLOAT(pc6, vl);
+                vc7 = VLEV_FLOAT(pc7, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+                vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
+                vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
+                vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
+                vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+                VSEV_FLOAT(pc4, vc4, vl);
+                VSEV_FLOAT(pc5, vc5, vl);
+                VSEV_FLOAT(pc6, vc6, vl);
+                VSEV_FLOAT(pc7, vc7, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+                pc4 += vl;
+                pc5 += vl;
+                pc6 += vl;
+                pc7 += vl;
+            }
+        }
+        pc += ldc * (n/8) * 8;
+
+        if (n & 4)
+        {
+            pb0 = pc + j * ldc * 4;
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+            
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *(b+2)  = bb2;
+            *(b+3)  = bb3;
+
+            *pb0    = bb0;
+            *pb1    = bb1;
+            *pb2    = bb2;
+            *pb3    = bb3;
+
+            b += 4;
+
+            pc0 = c + j * ldc * 4;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+            }
+            pc += ldc * 4;
+        }
+
+        if (n & 2)
+        {
+            pb0 = pc + j * ldc * 2;
+            pb1 = pb0 + ldc;
+            
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+
+            *b      = bb0;
+            *(b+1)  = bb1;
+
+            *pb0    = bb0;
+            *pb1    = bb1;
+
+            b += 2;
+
+            pc0 = c + j * ldc * 2;
+            pc1 = pc0 + ldc;
+
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+
+                pa += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+            pc += ldc * 2;
+        }
+
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = (*pb0) * aa;
+            *b      = bb0;
+            *pb0    = bb0;
+            b += 1;
+
+            pc0 = pc - i;
+            pa = a;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+
+                pa += vl;
+                pc0 += vl;
+            }
+        }
+
+        a -= m;
+        b -= 2 * n;
+    }
+
+}
+#else
+static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa,  bb;
+
+  int i, j, k;
+
+  a += (m - 1) * m;
+  b += (m - 1) * n;
+
+  for (i = m - 1; i >= 0; i--) {
+
+    aa = *(a + i);
+
+    for (j = 0; j < n; j ++) {
+      bb = *(c + i + j * ldc);
+      bb *= aa;
+      *b             = bb;
+      *(c + i + j * ldc) = bb;
+      b ++;
+
+      for (k = 0; k < i; k ++){
+        *(c + k + j * ldc) -= bb * *(a + k);
+      }
+
+    }
+    a -= m;
+    b -= 2 * n;
+  }
+
+}
+
+#endif
+
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa1, aa2;
+  FLOAT bb1, bb2;
+  FLOAT cc1, cc2;
+
+  int i, j, k;
+
+  ldc *= 2;
+  a += (m - 1) * m * 2;
+  b += (m - 1) * n * 2;
+
+  for (i = m - 1; i >= 0; i--) {
+
+    aa1 = *(a + i * 2 + 0);
+    aa2 = *(a + i * 2 + 1);
+
+    for (j = 0; j < n; j ++) {
+      bb1 = *(c + i * 2 + 0 + j * ldc);
+      bb2 = *(c + i * 2 + 1 + j * ldc);
+
+#ifndef CONJ
+      cc1 = aa1 * bb1 - aa2 * bb2;
+      cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+      cc1 = aa1 * bb1 + aa2 * bb2;
+      cc2 = aa1 * bb2 - aa2 * bb1;
+#endif
+
+
+      *(b + 0) = cc1;
+      *(b + 1) = cc2;
+      *(c + i * 2 + 0 + j * ldc) = cc1;
+      *(c + i * 2 + 1 + j * ldc) = cc2;
+      b += 2;
+
+      for (k = 0; k < i; k ++){
+#ifndef CONJ
+        *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
+        *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#else
+        *(c + k * 2 + 0 + j * ldc) -=   cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
+        *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#endif
+      }
+
+    }
+    a -= m * 2;
+    b -= 4 * n;
+  }
+
+}
+
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k,  FLOAT dummy1,
+#ifdef COMPLEX
+    FLOAT dummy2,
+#endif
+    FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+  BLASLONG i, j;
+  FLOAT *aa, *cc;
+  BLASLONG  kk;
+  
+  size_t vl = VSETVL_MAX;
+
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+
+  j = (n >> GEMM_UNROLL_N_SHIFT);
+
+  while (j > 0) {
+
+    kk = m + offset;
+
+    i = m % vl;
+    if (i) {
+      aa = a + (m - i) * k * COMPSIZE;
+      cc = c + (m - i)     * COMPSIZE;
+
+      if (k - kk > 0) {
+        GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            aa + i             * kk * COMPSIZE,
+            b  + GEMM_UNROLL_N * kk * COMPSIZE,
+            cc,
+            ldc);
+      }
+
+      solve(i, GEMM_UNROLL_N,
+          aa + (kk - i) * i             * COMPSIZE,
+          b  + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
+          cc, ldc);
+
+      kk -= i;
+
+    }
+
+    int mod = i;
+    i = vl;
+    if (i <= m) {
+      aa = a + (m - mod - vl) * k * COMPSIZE;
+      cc = c + (m - mod - vl)     * COMPSIZE;
+
+      do {
+        if (k - kk > 0) {
+          GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+              ZERO,
+#endif
+              aa + vl * kk * COMPSIZE,
+              b +  GEMM_UNROLL_N * kk * COMPSIZE,
+              cc,
+              ldc);
+        }
+
+        solve(vl, GEMM_UNROLL_N,
+            aa + (kk - vl) * vl * COMPSIZE,
+            b  + (kk - vl) * GEMM_UNROLL_N * COMPSIZE,
+            cc, ldc);
+
+        aa -= vl * k * COMPSIZE;
+        cc -= vl     * COMPSIZE;
+        kk -= vl;
+
+        i += vl;
+      } while (i <= m);
+    }
+
+
+    b += GEMM_UNROLL_N * k * COMPSIZE;
+    c += GEMM_UNROLL_N * ldc * COMPSIZE;
+    j --;
+  }
+
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    j = (GEMM_UNROLL_N >> 1);
+    while (j > 0) {
+      if (n & j) {
+
+        kk = m + offset;
+
+        i = m % vl;
+        if (i) {
+          aa = a + (m - i) * k * COMPSIZE;
+          cc = c + (m - i)     * COMPSIZE;
+
+          if (k - kk > 0) {
+            GEMM_KERNEL(i, j, k - kk, dm1,
+#ifdef COMPLEX
+                ZERO,
+#endif
+                aa + i * kk * COMPSIZE,
+                b  + j * kk * COMPSIZE,
+                cc, ldc);
+          }
+
+          solve(i, j,
+              aa + (kk - i) * i * COMPSIZE,
+              b  + (kk - i) * j * COMPSIZE,
+              cc, ldc);
+
+          kk -= i;
+
+        }
+
+        int mod = i;
+        i = vl;
+        if (i <= m) {
+          aa = a + (m - mod - vl) * k * COMPSIZE;
+          cc = c + (m - mod - vl)     * COMPSIZE;
+
+          do {
+            if (k - kk > 0) {
+              GEMM_KERNEL(vl, j, k - kk, dm1,
+#ifdef COMPLEX
+                  ZERO,
+#endif
+                  aa + vl * kk * COMPSIZE,
+                  b +  j             * kk * COMPSIZE,
+                  cc,
+                  ldc);
+            }
+
+            solve(vl, j,
+                aa + (kk - vl) * vl * COMPSIZE,
+                b  + (kk - vl) * j             * COMPSIZE,
+                cc, ldc);
+
+            aa -= vl * k * COMPSIZE;
+            cc -= vl     * COMPSIZE;
+            kk -= vl;
+
+            i += vl;
+          } while (i <= m);
+        }
+
+        b += j * k   * COMPSIZE;
+        c += j * ldc * COMPSIZE;
+      }
+      j >>= 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
new file mode 100644
index 000000000..0380bd1bb
--- /dev/null
+++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
@@ -0,0 +1,840 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSSEV_FLOAT vsse32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSSEV_FLOAT vsse64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#endif
+
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL   GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL   GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c
+
+#ifndef COMPLEX
+#if GEMM_DEFAULT_UNROLL_N == 1
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
+{
+    FLOAT aa, bb;
+    FLOAT *pa, *pc;
+
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T va, vc;
+    for (i = 0; i < m; i++)
+    {
+        aa = *(a + i);
+        for (j = 0; j < n; j ++)
+        {
+            bb = *(c + i + j * ldc);
+            bb *= aa;
+            *b  = bb;
+            *(c + i + j * ldc) = bb;
+            b++;
+            pa = a + i + 1;
+            pc = c + j * ldc + i + 1;
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc = VLEV_FLOAT(pc, vl);
+                va = VLEV_FLOAT(pa, vl);
+                vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
+                VSEV_FLOAT(pc, vc, vl);
+                pa += vl;
+                pc += vl;
+            }
+        }
+        a += m;
+    }
+}
+#elif GEMM_DEFAULT_UNROLL_N == 2
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
+{
+
+    FLOAT aa, bb0, bb1;
+    FLOAT *pa, *pc, *pc0, *pc1;
+    FLOAT *pb0, *pb1;
+
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T va, vc0, vc1;
+    for (i = 0; i < m; i++)
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/2; j ++)
+        {
+            pb0 = pc + j * ldc * 2;
+            pb1 = pb0 + ldc;
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *pb0    = bb0;
+            *pb1    = bb1;
+            b += 2;
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            pc1 = pc0 + ldc;
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl  = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va  = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                pa  += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+        }
+        pc += ldc * (n/2) * 2;
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = *(pb0);
+            bb0 *= aa;
+            *b  = bb0;
+            *(c + i) = bb0;
+            b++;
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va  = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                pa  += vl;
+                pc0 += vl;
+            }
+        }
+
+        a += m;
+    }
+}
+#elif GEMM_DEFAULT_UNROLL_N == 4
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
+{
+
+    FLOAT aa, bb0, bb1, bb2, bb3;
+    FLOAT *pa, *pc;
+    FLOAT *pc0, *pc1, *pc2, *pc3;
+    FLOAT *pb0, *pb1, *pb2, *pb3;
+
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T va;
+    FLOAT_V_T vc0, vc1, vc2, vc3;
+    for (i = 0; i < m; i++)
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/4; j ++)
+        {
+            pb0 = pc;
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *(b+2)  = bb2;
+            *(b+3)  = bb3;
+
+            *pb0    = bb0;
+            *pb1    = bb1;
+            *pb2    = bb2;
+            *pb3    = bb3;
+            b += 4;
+
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl  = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+
+                va  = VLEV_FLOAT(pa, vl);
+
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+
+                pa  += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+            }
+        }
+        pc += ldc * (n/4) * 4;
+
+        if (n & 2)
+        {
+            pb0 = pc;
+            pb1 = pb0 + ldc;
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *pb0    = bb0;
+            *pb1    = bb1;
+            b += 2;
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            pc1 = pc0 + ldc;
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl  = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va  = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                pa  += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+            pc += ldc * 2;
+        }
+
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = *(pb0);
+            bb0 *= aa;
+            *b  = bb0;
+            *(c + i) = bb0;
+            b++;
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va  = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                pa  += vl;
+                pc0 += vl;
+            }
+        }
+
+        a += m;
+    }
+}
+#elif GEMM_DEFAULT_UNROLL_N == 8
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
+{
+
+    FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
+    FLOAT *pa, *pc;
+    FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
+    FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
+
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T va;
+    FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
+    for (i = 0; i < m; i++)
+    {
+        aa = *(a + i);
+        pc = c + i;
+        for (j = 0; j < n/8; j ++)
+        {
+            pb0 = pc + j * ldc * 8;
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+            pb4 = pb3 + ldc;
+            pb5 = pb4 + ldc;
+            pb6 = pb5 + ldc;
+            pb7 = pb6 + ldc;
+            
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+            bb4 = (*pb4) * aa;
+            bb5 = (*pb5) * aa;
+            bb6 = (*pb6) * aa;
+            bb7 = (*pb7) * aa;
+
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *(b+2)  = bb2;
+            *(b+3)  = bb3;
+            *(b+4)  = bb4;
+            *(b+5)  = bb5;
+            *(b+6)  = bb6;
+            *(b+7)  = bb7;
+
+            *pb0    = bb0;
+            *pb1    = bb1;
+            *pb2    = bb2;
+            *pb3    = bb3;
+            *pb4    = bb4;
+            *pb5    = bb5;
+            *pb6    = bb6;
+            *pb7    = bb7;
+            b += 8;
+
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+            pc4 = pc3 + ldc;
+            pc5 = pc4 + ldc;
+            pc6 = pc5 + ldc;
+            pc7 = pc6 + ldc;
+
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl  = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+                vc4 = VLEV_FLOAT(pc4, vl);
+                vc5 = VLEV_FLOAT(pc5, vl);
+                vc6 = VLEV_FLOAT(pc6, vl);
+                vc7 = VLEV_FLOAT(pc7, vl);
+
+                va  = VLEV_FLOAT(pa, vl);
+
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+                vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
+                vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
+                vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
+                vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
+
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+                VSEV_FLOAT(pc4, vc4, vl);
+                VSEV_FLOAT(pc5, vc5, vl);
+                VSEV_FLOAT(pc6, vc6, vl);
+                VSEV_FLOAT(pc7, vc7, vl);
+
+                pa  += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+                pc4 += vl;
+                pc5 += vl;
+                pc6 += vl;
+                pc7 += vl;
+            }
+        }
+        pc += ldc * (n/8) * 8;
+
+        if (n & 4)
+        {
+            pb0 = pc;
+            pb1 = pb0 + ldc;
+            pb2 = pb1 + ldc;
+            pb3 = pb2 + ldc;
+
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            bb2 = (*pb2) * aa;
+            bb3 = (*pb3) * aa;
+
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *(b+2)  = bb2;
+            *(b+3)  = bb3;
+
+            *pb0    = bb0;
+            *pb1    = bb1;
+            *pb2    = bb2;
+            *pb3    = bb3;
+            b += 4;
+
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            pc1 = pc0 + ldc;
+            pc2 = pc1 + ldc;
+            pc3 = pc2 + ldc;
+
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl  = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                vc2 = VLEV_FLOAT(pc2, vl);
+                vc3 = VLEV_FLOAT(pc3, vl);
+
+                va  = VLEV_FLOAT(pa, vl);
+
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
+
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                VSEV_FLOAT(pc2, vc2, vl);
+                VSEV_FLOAT(pc3, vc3, vl);
+
+                pa  += vl;
+                pc0 += vl;
+                pc1 += vl;
+                pc2 += vl;
+                pc3 += vl;
+            }
+            pc += ldc * 4;
+        }
+
+        if (n & 2)
+        {
+            pb0 = pc;
+            pb1 = pb0 + ldc;
+            bb0 = (*pb0) * aa;
+            bb1 = (*pb1) * aa;
+            *b      = bb0;
+            *(b+1)  = bb1;
+            *pb0    = bb0;
+            *pb1    = bb1;
+            b += 2;
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            pc1 = pc0 + ldc;
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl  = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                vc1 = VLEV_FLOAT(pc1, vl);
+                va  = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                VSEV_FLOAT(pc1, vc1, vl);
+                pa  += vl;
+                pc0 += vl;
+                pc1 += vl;
+            }
+            pc += ldc * 2;
+        }
+
+        if (n & 1)
+        {
+            pb0 = pc;
+            bb0 = *(pb0);
+            bb0 *= aa;
+            *b  = bb0;
+            *(c + i) = bb0;
+            b++;
+            pa = a + i + 1;
+            pc0 = pb0 + 1;
+            for (k = (m - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLEV_FLOAT(pc0, vl);
+                va  = VLEV_FLOAT(pa, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
+                VSEV_FLOAT(pc0, vc0, vl);
+                pa  += vl;
+                pc0 += vl;
+            }
+        }
+
+        a += m;
+    }
+}
+
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa, bb;
+
+  int i, j, k;
+
+  for (i = 0; i < m; i++) {
+
+    aa = *(a + i);
+
+    for (j = 0; j < n; j ++) {
+      bb = *(c + i + j * ldc);
+      bb *= aa;
+      *b             = bb;
+      *(c + i + j * ldc) = bb;
+      b ++;
+
+      for (k = i + 1; k < m; k ++){
+	*(c + k + j * ldc) -= bb * *(a + k);
+      }
+
+    }
+    a += m;
+  }
+}
+
+#endif
+
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa1, aa2;
+  FLOAT bb1, bb2;
+  FLOAT cc1, cc2;
+
+  int i, j, k;
+
+  ldc *= 2;
+
+  for (i = 0; i < m; i++) {
+
+    aa1 = *(a + i * 2 + 0);
+    aa2 = *(a + i * 2 + 1);
+
+    for (j = 0; j < n; j ++) {
+      bb1 = *(c + i * 2 + 0 + j * ldc);
+      bb2 = *(c + i * 2 + 1 + j * ldc);
+
+#ifndef CONJ
+      cc1 = aa1 * bb1 - aa2 * bb2;
+      cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+      cc1 = aa1 * bb1 + aa2 * bb2;
+      cc2 = aa1 * bb2 - aa2 * bb1;
+#endif
+
+      *(b + 0) = cc1;
+      *(b + 1) = cc2;
+      *(c + i * 2 + 0 + j * ldc) = cc1;
+      *(c + i * 2 + 1 + j * ldc) = cc2;
+      b += 2;
+
+      for (k = i + 1; k < m; k ++){
+#ifndef CONJ
+	*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
+	*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#else
+	*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
+	*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+#endif
+      }
+
+    }
+    a += m * 2;
+  }
+}
+
+
+static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa1, aa2;
+  FLOAT bb1, bb2;
+  FLOAT cc1, cc2;
+  FLOAT *pa, *pc;
+
+  int i, j, k;
+
+  size_t vl;
+  FLOAT_V_T va0, va1, vc0, vc1;
+
+  ldc *= 2;
+
+  for (i = 0; i < m; i++) {
+
+    aa1 = *(a + i * 2 + 0);
+    aa2 = *(a + i * 2 + 1);
+
+    for (j = 0; j < n; j ++) {
+      bb1 = *(c + i * 2 + 0 + j * ldc);
+      bb2 = *(c + i * 2 + 1 + j * ldc);
+
+#ifndef CONJ
+      cc1 = aa1 * bb1 - aa2 * bb2;
+      cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+      cc1 = aa1 * bb1 + aa2 * bb2;
+      cc2 = aa1 * bb2 - aa2 * bb1;
+#endif
+
+      *(b + 0) = cc1;
+      *(b + 1) = cc2;
+      *(c + i * 2 + 0 + j * ldc) = cc1;
+      *(c + i * 2 + 1 + j * ldc) = cc2;
+      b += 2;
+
+        pa = a + (i + 1) * 2;
+        pc = c + j * ldc + (i + 1) * 2;
+        for (k = (m - i - 1); k > 0; k -= vl)
+        {
+            vl = VSETVL(k);
+            VLSEG2_FLOAT(&va0, &va1, pa, vl);
+            VLSEG2_FLOAT(&vc0, &vc1, pc, vl);
+#ifndef CONJ
+            vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0);
+            vc0 = VFMACCVF_FLOAT(vc0, cc2, va1);
+            vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1);
+            vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0);
+#else
+            vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0);
+            vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1);
+            vc1 = VFMACCVF_FLOAT(vc1, cc1, va1);
+            vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0);
+#endif
+            VSSEG2_FLOAT(pc, vc0, vc1, vl);
+            pa += vl * 2;
+            pc += vl * 2;
+        }
+      }
+    }
+    a += m * 2;
+  }
+}
+
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+#ifdef COMPLEX
+	   FLOAT dummy2,
+#endif
+	   FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+  FLOAT *aa, *cc;
+  BLASLONG  kk;
+  BLASLONG i, j;
+
+  size_t vl = VSETVL_MAX;
+
+  //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+
+  j = (n >> GEMM_UNROLL_N_SHIFT);
+
+  while (j > 0) {
+
+    kk = offset;
+    aa = a;
+    cc = c;
+
+    i = vl;
+
+    while (i <= m) {
+
+      if (kk > 0) {
+        GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            aa, b, cc, ldc);
+      }
+
+      solve(vl, GEMM_UNROLL_N,
+          aa + kk * vl * COMPSIZE,
+          b  + kk * GEMM_UNROLL_N * COMPSIZE,
+          cc, ldc);
+
+      aa += vl * k * COMPSIZE;
+      cc += vl     * COMPSIZE;
+      kk += vl;
+      i += vl;
+    }
+
+    i = m % vl;
+    if (i) {
+      if (kk > 0) {
+        GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            aa, b, cc, ldc);
+      }
+      solve(i, GEMM_UNROLL_N,
+          aa + kk * i             * COMPSIZE,
+          b  + kk * GEMM_UNROLL_N * COMPSIZE,
+          cc, ldc);
+
+      aa += i * k * COMPSIZE;
+      cc += i     * COMPSIZE;
+      kk += i;
+
+    }
+
+    b += GEMM_UNROLL_N * k   * COMPSIZE;
+    c += GEMM_UNROLL_N * ldc * COMPSIZE;
+    j --;
+  }
+
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    j = (GEMM_UNROLL_N >> 1);
+    while (j > 0) {
+      if (n & j) {
+
+        kk = offset;
+        aa = a;
+        cc = c;
+
+        i = vl;
+
+        while (i <= m) {
+          if (kk > 0) {
+            GEMM_KERNEL(vl, j, kk, dm1,
+#ifdef COMPLEX
+                ZERO,
+#endif
+                aa,
+                b,
+                cc,
+                ldc);
+          }
+
+          solve(vl, j,
+              aa + kk * vl * COMPSIZE,
+              b  + kk * j             * COMPSIZE, cc, ldc);
+
+          aa += vl * k * COMPSIZE;
+          cc += vl     * COMPSIZE;
+          kk += vl;
+          i += vl;
+        }
+
+        i = m % vl;
+        if (i) {
+          if (kk > 0) {
+            GEMM_KERNEL(i, j, kk, dm1,
+#ifdef COMPLEX
+                ZERO,
+#endif
+                aa,
+                b,
+                cc,
+                ldc);
+          }
+
+          solve(i, j,
+              aa + kk * i * COMPSIZE,
+              b  + kk * j * COMPSIZE, cc, ldc);
+
+          aa += i * k * COMPSIZE;
+          cc += i     * COMPSIZE;
+          kk += i;
+
+        }
+
+        b += j * k   * COMPSIZE;
+        c += j * ldc * COMPSIZE;
+      }
+      j >>= 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
new file mode 100644
index 000000000..41368be60
--- /dev/null
+++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
@@ -0,0 +1,792 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSSEV_FLOAT vsse32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSSEV_FLOAT vsse64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#endif
+
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL   GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL   GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c
+
+#ifndef COMPLEX
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa, bb;
+    FLOAT *pb, *pc;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T vb, vc;
+
+    for (i = 0; i < n; i++) 
+    {
+        bb = *(b + i);
+
+        for (j = 0; j < m; j ++) 
+        {
+            aa = *(c + j + i * ldc);
+            aa *= bb;
+            *a  = aa;
+            *(c + j + i * ldc) = aa;
+            a ++;
+
+            pb = b + i + 1;
+            pc = c + j + (i + 1) *ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc = VLSEV_FLOAT(pc, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
+                VSSEV_FLOAT(pc, stride_ldc, vc, vl);
+                pb += vl;
+                pc ++;
+            }
+        }
+        b += n;
+    }
+}
+
+#elif GEMM_DEFAULT_UNROLL_N == 2
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa0, aa1, bb;
+    FLOAT *pb, *pc;
+    FLOAT *pa0, *pa1, *pc0, *pc1;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T vb, vc0, vc1;
+
+    for (i = 0; i < n; i++) 
+    {
+        bb = *(b + i);
+        pc = c + i * ldc;
+        for (j = 0; j < m/2; j ++) 
+        {
+            pa0 = pc + j * 2;
+            pa1 = pc + j * 2 + 1;
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+            *a      = aa0;
+            *(a + 1)= aa1;
+            a  += 2;
+
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            pc1 = pa1 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+                pb += vl;
+                pc0++;
+                pc1++;
+            }
+        }
+        pc += (m/2)*2;
+        if (m & 1)
+        {
+            pa0 = pc;
+            aa0 = *pa0 * bb;
+            
+            *pa0    = aa0;
+            *a      = aa0;
+            a  += 1;
+           
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                pb += vl;
+                pc0++;
+            }
+        }
+        b += n;
+    }
+}
+
+#elif GEMM_DEFAULT_UNROLL_N == 4
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT bb;
+    FLOAT aa0, aa1, aa2, aa3;
+    FLOAT *pb, *pc;
+    FLOAT *pa0, *pa1, *pa2, *pa3;
+    FLOAT *pc0, *pc1, *pc2, *pc3;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T vb, vc0, vc1, vc2, vc3;
+
+    for (i = 0; i < n; i++) 
+    {
+        bb = *(b + i);
+        pc = c + i * ldc;
+        for (j = 0; j < m/4; j ++) 
+        {
+            pa0 = pc + j * 4;
+            pa1 = pa0 + 1;
+            pa2 = pa1 + 1;
+            pa3 = pa2 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+            aa2 = *pa2 * bb;
+            aa3 = *pa3 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+            *pa2    = aa2;
+            *pa3    = aa3;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+            *(a + 2)= aa2;
+            *(a + 3)= aa3;
+
+            a  += 4;
+
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            pc1 = pa1 + ldc;
+            pc2 = pa2 + ldc;
+            pc3 = pa3 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
+                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+ 
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
+                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+                pc2++;
+                pc3++;
+            }
+        }
+        pc += (m/4)*4;
+
+        if (m & 2)
+        {
+            pa0 = pc;
+            pa1 = pa0 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+
+            a  += 2;
+
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            pc1 = pa1 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+ 
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+            }
+            pc += 2;
+        }
+
+        if (m & 1)
+        {
+            pa0 = pc;
+            aa0 = *pa0 * bb;
+            
+            *pa0    = aa0;
+            *a      = aa0;
+            a  += 1;
+           
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                pb += vl;
+                pc0++;
+            }
+        }
+        b += n;
+    }
+}
+
+#elif GEMM_DEFAULT_UNROLL_N == 8
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT bb;
+    FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
+    FLOAT *pb, *pc;
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
+    FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
+
+    for (i = 0; i < n; i++) 
+    {
+        bb = *(b + i);
+        pc = c + i * ldc;
+        for (j = 0; j < m/8; j ++) 
+        {
+            pa0 = pc + j * 8;
+            pa1 = pa0 + 1;
+            pa2 = pa1 + 1;
+            pa3 = pa2 + 1;
+            pa4 = pa3 + 1;
+            pa5 = pa4 + 1;
+            pa6 = pa5 + 1;
+            pa7 = pa6 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+            aa2 = *pa2 * bb;
+            aa3 = *pa3 * bb;
+            aa4 = *pa4 * bb;
+            aa5 = *pa5 * bb;
+            aa6 = *pa6 * bb;
+            aa7 = *pa7 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+            *pa2    = aa2;
+            *pa3    = aa3;
+            *pa4    = aa4;
+            *pa5    = aa5;
+            *pa6    = aa6;
+            *pa7    = aa7;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+            *(a + 2)= aa2;
+            *(a + 3)= aa3;
+            *(a + 4)= aa4;
+            *(a + 5)= aa5;
+            *(a + 6)= aa6;
+            *(a + 7)= aa7;
+
+            a  += 8;
+
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            pc1 = pa1 + ldc;
+            pc2 = pa2 + ldc;
+            pc3 = pa3 + ldc;
+            pc4 = pa4 + ldc;
+            pc5 = pa5 + ldc;
+            pc6 = pa6 + ldc;
+            pc7 = pa7 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
+                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
+                vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
+                vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
+                vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
+                vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+ 
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
+                vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
+                vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
+                vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
+                vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
+                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
+                VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
+                VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
+                VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
+                VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+                pc2++;
+                pc3++;
+                pc4++;
+                pc5++;
+                pc6++;
+                pc7++;
+            }
+        }
+        pc += (m/8)*8;
+
+        if (m & 4)
+        {
+            pa0 = pc;
+            pa1 = pa0 + 1;
+            pa2 = pa1 + 1;
+            pa3 = pa2 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+            aa2 = *pa2 * bb;
+            aa3 = *pa3 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+            *pa2    = aa2;
+            *pa3    = aa3;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+            *(a + 2)= aa2;
+            *(a + 3)= aa3;
+
+            a  += 4;
+
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            pc1 = pa1 + ldc;
+            pc2 = pa2 + ldc;
+            pc3 = pa3 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
+                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+ 
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
+                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+                pc2++;
+                pc3++;
+            }
+            pc += 4;
+        }
+
+        if (m & 2)
+        {
+            pa0 = pc;
+            pa1 = pa0 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+
+            a  += 2;
+
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            pc1 = pa1 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+ 
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+            }
+            pc += 2;
+        }
+
+        if (m & 1)
+        {
+            pa0 = pc;
+            aa0 = *pa0 * bb;
+            
+            *pa0    = aa0;
+            *a      = aa0;
+            a  += 1;
+           
+            pb = b + i + 1;
+            pc0 = pa0 + ldc;
+            for (k = (n - i - 1); k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                pb += vl;
+                pc0++;
+            }
+        }
+        b += n;
+    }
+}
+#else
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa, bb;
+
+  int i, j, k;
+
+  for (i = 0; i < n; i++) {
+
+    bb = *(b + i);
+
+    for (j = 0; j < m; j ++) {
+      aa = *(c + j + i * ldc);
+      aa *= bb;
+      *a  = aa;
+      *(c + j + i * ldc) = aa;
+      a ++;
+
+      for (k = i + 1; k < n; k ++){
+	*(c + j + k * ldc) -= aa * *(b + k);
+      }
+
+    }
+    b += n;
+  }
+}
+
+#endif
+
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa1, aa2;
+  FLOAT bb1, bb2;
+  FLOAT cc1, cc2;
+
+  int i, j, k;
+
+  ldc *= 2;
+
+  for (i = 0; i < n; i++) {
+
+    bb1 = *(b + i * 2 + 0);
+    bb2 = *(b + i * 2 + 1);
+
+    for (j = 0; j < m; j ++) {
+      aa1 = *(c + j * 2 + 0 + i * ldc);
+      aa2 = *(c + j * 2 + 1 + i * ldc);
+
+#ifndef CONJ
+      cc1 = aa1 * bb1 - aa2 * bb2;
+      cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+      cc1 =  aa1 * bb1 + aa2 * bb2;
+      cc2 = -aa1 * bb2 + aa2 * bb1;
+#endif
+
+      *(a + 0) = cc1;
+      *(a + 1) = cc2;
+      *(c + j * 2 + 0 + i * ldc) = cc1;
+      *(c + j * 2 + 1 + i * ldc) = cc2;
+      a += 2;
+
+      for (k = i + 1; k < n; k ++){
+#ifndef CONJ
+	*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
+	*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
+#else
+	*(c + j * 2 + 0 + k * ldc) -=   cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
+	*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
+#endif
+      }
+
+    }
+    b += n * 2;
+  }
+}
+
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+#ifdef COMPLEX
+	   FLOAT dummy2,
+#endif
+	   FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+  FLOAT *aa, *cc;
+  BLASLONG  kk;
+  BLASLONG i, j;
+
+  size_t vl = VSETVL_MAX;
+
+  //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+
+
+  j = (n >> GEMM_UNROLL_N_SHIFT);
+  kk = -offset;
+
+  while (j > 0) {
+
+    aa = a;
+    cc = c;
+
+    i = vl;
+
+    if (i <= m) {
+      do {
+	if (kk > 0) {
+	  GEMM_KERNEL(vl, GEMM_UNROLL_N, kk, dm1,
+#ifdef COMPLEX
+		      ZERO,
+#endif
+		      aa, b, cc, ldc);
+	}
+
+	solve(vl, GEMM_UNROLL_N,
+	      aa + kk * vl * COMPSIZE,
+	      b  + kk * GEMM_UNROLL_N * COMPSIZE,
+	      cc, ldc);
+
+	aa += vl * k * COMPSIZE;
+	cc += vl     * COMPSIZE;
+	i += vl;
+      } while (i <= m);
+    }
+
+
+    i = m % vl;
+    if (i) {
+      if (kk > 0) {
+        GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            aa, b, cc, ldc);
+      }
+      solve(i, GEMM_UNROLL_N,
+          aa + kk * i             * COMPSIZE,
+          b  + kk * GEMM_UNROLL_N * COMPSIZE,
+          cc, ldc);
+
+      aa += i * k * COMPSIZE;
+      cc += i     * COMPSIZE;
+
+    }
+
+    kk += GEMM_UNROLL_N;
+    b += GEMM_UNROLL_N * k   * COMPSIZE;
+    c += GEMM_UNROLL_N * ldc * COMPSIZE;
+    j --;
+  }
+
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    j = (GEMM_UNROLL_N >> 1);
+    while (j > 0) {
+      if (n & j) {
+
+	aa = a;
+	cc = c;
+
+  i = vl;
+
+	while (i <= m) {
+	  if (kk > 0) {
+	    GEMM_KERNEL(vl, j, kk, dm1,
+#ifdef COMPLEX
+			ZERO,
+#endif
+			aa,
+			b,
+			cc,
+			ldc);
+	  }
+
+	  solve(vl, j,
+		aa + kk * vl * COMPSIZE,
+		b  + kk * j             * COMPSIZE, cc, ldc);
+
+	  aa += vl * k * COMPSIZE;
+	  cc += vl     * COMPSIZE;
+	  i += vl;
+	}
+
+  i = m % vl;
+  if (i) {
+	      if (kk > 0) {
+		GEMM_KERNEL(i, j, kk, dm1,
+#ifdef COMPLEX
+			    ZERO,
+#endif
+			    aa,
+			    b,
+			    cc,
+			    ldc);
+	      }
+
+	      solve(i, j,
+		    aa + kk * i * COMPSIZE,
+		    b  + kk * j * COMPSIZE, cc, ldc);
+
+	      aa += i * k * COMPSIZE;
+	      cc += i     * COMPSIZE;
+
+  }
+
+	b += j * k   * COMPSIZE;
+	c += j * ldc * COMPSIZE;
+	kk += j;
+      }
+      j >>= 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
new file mode 100644
index 000000000..459c1663a
--- /dev/null
+++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
@@ -0,0 +1,828 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSSEV_FLOAT vsse32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSSEV_FLOAT vsse64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#endif
+
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL   GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL   GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c
+
+#ifndef COMPLEX
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa,  bb;
+  FLOAT *pb, *pc;
+  BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+
+  int i, j, k;
+  size_t vl;
+  FLOAT_V_T vb, vc;
+
+  a += (n - 1) * m;
+  b += (n - 1) * n;
+
+  for (i = n - 1; i >= 0; i--) {
+
+    bb = *(b + i);
+
+    for (j = 0; j < m; j ++) {
+      aa = *(c + j + i * ldc);
+      aa *= bb;
+      *a   = aa;
+      *(c + j + i * ldc) = aa;
+      a ++;
+
+        pb = b;
+        pc = c + j;
+        for (k = i; k > 0; k -= vl)
+        {
+            vl = VSETVL(k);
+            vc = VLSEV_FLOAT(pc, stride_ldc, vl);
+            vb = VLEV_FLOAT(pb, vl);
+            vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
+            VSSEV_FLOAT(pc, stride_ldc, vc, vl);
+            pb += vl;
+            pc++;
+        }
+    }
+    b -= n;
+    a -= 2 * m;
+  }
+
+}
+#elif GEMM_DEFAULT_UNROLL_N == 2
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa0, aa1, bb;
+    FLOAT *pb, *pc;
+    FLOAT *pa0, *pa1, *pc0, *pc1;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T vb, vc0, vc1;
+
+    a += (n - 1) * m;
+    b += (n - 1) * n;
+
+    for (i = n - 1; i >= 0; i--)
+    {
+        bb = *(b + i);
+        pc = c + i * ldc;
+        for (j = 0; j < m/2; j ++) 
+        {
+            pa0 = pc + j * 2;
+            pa1 = pc + j * 2 + 1;
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+            *a      = aa0;
+            *(a + 1)= aa1;
+            a  += 2;
+
+            pb  = b;
+            pc0 = c + j * 2;
+            pc1 = pc0 + 1;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+                pb += vl;
+                pc0++;
+                pc1++;
+            }
+        }
+        pc += (m/2)*2;
+
+        if (m & 1)
+        {
+            pa0 = pc;
+            aa0 = *pa0 * bb;
+            
+            *pa0    = aa0;
+            *a      = aa0;
+            a  += 1;
+           
+            pb = b;
+            pc0 = pc - i * ldc;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                pb += vl;
+                pc0++;
+            }
+        }
+        b -= n;
+        a -= 2 * m;
+    }
+}
+
+#elif GEMM_DEFAULT_UNROLL_N == 4
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa0, aa1, aa2, aa3;
+    FLOAT bb;
+    FLOAT *pb, *pc;
+    FLOAT *pa0, *pa1, *pa2, *pa3;
+    FLOAT *pc0, *pc1, *pc2, *pc3;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T vb, vc0, vc1, vc2, vc3;
+
+    a += (n - 1) * m;
+    b += (n - 1) * n;
+
+    for (i = n - 1; i >= 0; i--)
+    {
+        bb = *(b + i);
+        pc = c + i * ldc;
+        for (j = 0; j < m/4; j ++) 
+        {
+            pa0 = pc + j * 4;
+            pa1 = pa0 + 1;
+            pa2 = pa1 + 1;
+            pa3 = pa2 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+            aa2 = *pa2 * bb;
+            aa3 = *pa3 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+            *pa2    = aa2;
+            *pa3    = aa3;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+            *(a + 2)= aa2;
+            *(a + 3)= aa3;
+            a  += 4;
+
+            pb  = b;
+            pc0 = c + j * 4;
+            pc1 = pc0 + 1;
+            pc2 = pc1 + 1;
+            pc3 = pc2 + 1;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
+                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
+                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+                pc2++;
+                pc3++;
+            }
+        }
+        pc += (m/4)*4;
+
+        if (m & 2)
+        {
+            pa0 = pc + j * 2;
+            pa1 = pa0 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+            a  += 2;
+
+            pb  = b;
+            pc0 = c + j * 4;
+            pc1 = pc0 + 1;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+            }
+            pc += 2;
+        }
+
+        if (m & 1)
+        {
+            pa0 = pc;
+            aa0 = *pa0 * bb;
+            
+            *pa0    = aa0;
+            *a      = aa0;
+            a  += 1;
+           
+            pb = b;
+            pc0 = pc - i * ldc;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                pb += vl;
+                pc0++;
+            }
+        }
+        b -= n;
+        a -= 2 * m;
+    }
+}
+#elif GEMM_DEFAULT_UNROLL_N == 8
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
+    FLOAT bb;
+    FLOAT *pb, *pc;
+    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
+    FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+    int i, j, k;
+    size_t vl;
+    FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
+
+    a += (n - 1) * m;
+    b += (n - 1) * n;
+
+    for (i = n - 1; i >= 0; i--)
+    {
+        bb = *(b + i);
+        pc = c + i * ldc;
+        for (j = 0; j < m/8; j ++) 
+        {
+            pa0 = pc + j * 8;
+            pa1 = pa0 + 1;
+            pa2 = pa1 + 1;
+            pa3 = pa2 + 1;
+            pa4 = pa3 + 1;
+            pa5 = pa4 + 1;
+            pa6 = pa5 + 1;
+            pa7 = pa6 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+            aa2 = *pa2 * bb;
+            aa3 = *pa3 * bb;
+            aa4 = *pa4 * bb;
+            aa5 = *pa5 * bb;
+            aa6 = *pa6 * bb;
+            aa7 = *pa7 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+            *pa2    = aa2;
+            *pa3    = aa3;
+            *pa4    = aa4;
+            *pa5    = aa5;
+            *pa6    = aa6;
+            *pa7    = aa7;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+            *(a + 2)= aa2;
+            *(a + 3)= aa3;
+            *(a + 4)= aa4;
+            *(a + 5)= aa5;
+            *(a + 6)= aa6;
+            *(a + 7)= aa7;
+            a  += 8;
+
+            pb  = b;
+            pc0 = c + j * 8;
+            pc1 = pc0 + 1;
+            pc2 = pc1 + 1;
+            pc3 = pc2 + 1;
+            pc4 = pc3 + 1;
+            pc5 = pc4 + 1;
+            pc6 = pc5 + 1;
+            pc7 = pc6 + 1;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
+                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
+                vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
+                vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
+                vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
+                vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
+                vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
+                vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
+                vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
+                vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
+                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
+                VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
+                VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
+                VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
+                VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+                pc2++;
+                pc3++;
+                pc4++;
+                pc5++;
+                pc6++;
+                pc7++;
+            }
+        }
+        pc += (m/8)*8;
+
+        if (m & 4)
+        {
+            pa0 = pc;
+            pa1 = pa0 + 1;
+            pa2 = pa1 + 1;
+            pa3 = pa2 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+            aa2 = *pa2 * bb;
+            aa3 = *pa3 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+            *pa2    = aa2;
+            *pa3    = aa3;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+            *(a + 2)= aa2;
+            *(a + 3)= aa3;
+            a  += 4;
+
+            pb  = b;
+            pc0 = pc - i * ldc;
+            pc1 = pc0 + 1;
+            pc2 = pc1 + 1;
+            pc3 = pc2 + 1;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
+                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
+                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
+                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+                pc2++;
+                pc3++;
+            }
+            pc += 4;
+        }
+
+        if (m & 2)
+        {
+            pa0 = pc;
+            pa1 = pa0 + 1;
+            
+            aa0 = *pa0 * bb;
+            aa1 = *pa1 * bb;
+
+            *pa0    = aa0;
+            *pa1    = aa1;
+
+            *a      = aa0;
+            *(a + 1)= aa1;
+            a  += 2;
+
+            pb  = b;
+            pc0 = pc - i * ldc;
+            pc1 = pc0 + 1;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
+
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
+
+                pb += vl;
+                pc0++;
+                pc1++;
+            }
+            pc += 2;
+        }
+
+        if (m & 1)
+        {
+            pa0 = pc;
+            aa0 = *pa0 * bb;
+            
+            *pa0    = aa0;
+            *a      = aa0;
+            a  += 1;
+           
+            pb = b;
+            pc0 = pc - i * ldc;
+            for (k = i; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
+                vb = VLEV_FLOAT(pb, vl);
+                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
+                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
+                pb += vl;
+                pc0++;
+            }
+        }
+        b -= n;
+        a -= 2 * m;
+    }
+}
+
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa,  bb;
+
+  int i, j, k;
+
+  a += (n - 1) * m;
+  b += (n - 1) * n;
+
+  for (i = n - 1; i >= 0; i--) {
+
+    bb = *(b + i);
+
+    for (j = 0; j < m; j ++) {
+      aa = *(c + j + i * ldc);
+      aa *= bb;
+      *a   = aa;
+      *(c + j + i * ldc) = aa;
+      a ++;
+
+      for (k = 0; k < i; k ++){
+	*(c + j + k * ldc) -= aa * *(b + k);
+      }
+
+    }
+    b -= n;
+    a -= 2 * m;
+  }
+
+}
+
+#endif
+
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+  FLOAT aa1, aa2;
+  FLOAT bb1, bb2;
+  FLOAT cc1, cc2;
+
+  int i, j, k;
+
+  ldc *= 2;
+
+  a += (n - 1) * m * 2;
+  b += (n - 1) * n * 2;
+
+  for (i = n - 1; i >= 0; i--) {
+
+    bb1 = *(b + i * 2 + 0);
+    bb2 = *(b + i * 2 + 1);
+
+    for (j = 0; j < m; j ++) {
+
+      aa1 = *(c + j * 2 + 0 + i * ldc);
+      aa2 = *(c + j * 2 + 1 + i * ldc);
+
+#ifndef CONJ
+      cc1 = aa1 * bb1 - aa2 * bb2;
+      cc2 = aa1 * bb2 + aa2 * bb1;
+#else
+      cc1 =  aa1 * bb1  + aa2 * bb2;
+      cc2 = - aa1 * bb2 + aa2 * bb1;
+#endif
+
+      *(a + 0) = cc1;
+      *(a + 1) = cc2;
+
+      *(c + j * 2 + 0 + i * ldc) = cc1;
+      *(c + j * 2 + 1 + i * ldc) = cc2;
+      a += 2;
+
+      for (k = 0; k < i; k ++){
+#ifndef CONJ
+	*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
+	*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
+#else
+	*(c + j * 2 + 0 + k * ldc) -=   cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
+	*(c + j * 2 + 1 + k * ldc) -=  -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
+#endif
+      }
+
+    }
+    b -= n * 2;
+    a -= 4 * m;
+  }
+
+}
+
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k,  FLOAT dummy1,
+#ifdef COMPLEX
+    FLOAT dummy2,
+#endif
+    FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+  BLASLONG i, j;
+  FLOAT *aa, *cc;
+  BLASLONG  kk;
+
+  size_t vl = VSETVL_MAX;
+
+  //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+
+  kk = n - offset;
+  c += n * ldc * COMPSIZE;
+  b += n * k   * COMPSIZE;
+
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    j = 1;
+    while (j < GEMM_UNROLL_N) {
+      if (n & j) {
+
+        aa  = a;
+        b -= j * k  * COMPSIZE;
+        c -= j * ldc* COMPSIZE;
+        cc  = c;
+
+        i = vl;
+        if (i <= m) {
+
+          do {
+            if (k - kk > 0) {
+              GEMM_KERNEL(vl, j, k - kk, dm1,
+#ifdef COMPLEX
+                  ZERO,
+#endif
+                  aa + vl * kk * COMPSIZE,
+                  b  +  j            * kk * COMPSIZE,
+                  cc,
+                  ldc);
+            }
+
+            solve(vl, j,
+                aa + (kk - j) * vl * COMPSIZE,
+                b  + (kk - j) * j             * COMPSIZE,
+                cc, ldc);
+
+            aa += vl * k * COMPSIZE;
+            cc += vl     * COMPSIZE;
+            i += vl;
+          } while (i <= m);
+        }
+
+        i = m % vl;
+        if (i) {
+          if (k - kk > 0) {
+            GEMM_KERNEL(i, j, k - kk, dm1,
+#ifdef COMPLEX
+                ZERO,
+#endif
+                aa + i * kk * COMPSIZE,
+                b  + j * kk * COMPSIZE,
+                cc, ldc);
+          }
+
+          solve(i, j,
+              aa + (kk - j) * i * COMPSIZE,
+              b  + (kk - j) * j * COMPSIZE,
+              cc, ldc);
+
+          aa += i * k * COMPSIZE;
+          cc += i     * COMPSIZE;
+
+        }
+        kk -= j;
+      }
+      j <<= 1;
+    }
+  }
+
+  j = (n >> GEMM_UNROLL_N_SHIFT);
+
+  if (j > 0) {
+
+    do {
+      aa  = a;
+      b -= GEMM_UNROLL_N * k   * COMPSIZE;
+      c -= GEMM_UNROLL_N * ldc * COMPSIZE;
+      cc  = c;
+
+      i = vl;
+      if (i <= m) {
+	do {
+	  if (k - kk > 0) {
+	    GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+			ZERO,
+#endif
+			aa + vl * kk * COMPSIZE,
+			b  + GEMM_UNROLL_N * kk * COMPSIZE,
+			cc,
+			ldc);
+	  }
+
+	  solve(vl, GEMM_UNROLL_N,
+		aa + (kk - GEMM_UNROLL_N) * vl * COMPSIZE,
+		b  + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
+		cc, ldc);
+
+	  aa += vl * k * COMPSIZE;
+	  cc += vl     * COMPSIZE;
+	  i += vl;
+	} while (i <= m);
+      }
+
+      i = m % vl;
+      if (i) {
+	    if (k - kk > 0) {
+	      GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+			  ZERO,
+#endif
+			  aa + i             * kk * COMPSIZE,
+			  b  + GEMM_UNROLL_N * kk * COMPSIZE,
+			  cc,
+			  ldc);
+	    }
+
+	    solve(i, GEMM_UNROLL_N,
+		  aa + (kk - GEMM_UNROLL_N) * i             * COMPSIZE,
+		  b  + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
+		  cc, ldc);
+
+	    aa += i * k * COMPSIZE;
+	    cc += i     * COMPSIZE;
+
+      }
+
+      kk -= GEMM_UNROLL_N;
+      j --;
+    } while (j > 0);
+  }
+
+  return 0;
+}
+
+
diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..bacfb2b08
--- /dev/null
+++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c
@@ -0,0 +1,122 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSEV_FLOAT_M vse32_v_f32m2_m
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSEV_FLOAT_M vse64_v_f64m2_m
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+
+#endif
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_lncopy_sve.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+
+    FLOAT_V_T va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
+
+                    *(b + j) = INV(*(ao + j * lda));
+                    ao++;
+                    b += vl;
+                }
+                i += vl;
+                ii += vl;
+            }
+            else
+            {
+                if (ii > jj)
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao++;
+                b += vl;
+                i++;
+                ii++;
+            }
+        }
+
+        a += vl * lda;
+        jj += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..0fc7c9f24
--- /dev/null
+++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c
@@ -0,0 +1,122 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSEV_FLOAT_M vse32_v_f32m2_m
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSEV_FLOAT_M vse64_v_f64m2_m
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#endif
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_ltcopy_sve.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;
+
+    FLOAT_V_T va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    *(b + j) = INV(*(ao + j));
+
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
+
+                    b += vl;
+                    ao += lda;
+                }
+                i += vl;
+                ii += vl;
+            }
+            else 
+            {
+                if (ii < jj) 
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao += lda;
+                b += vl;
+                i ++;
+                ii ++;
+            }
+        }
+
+        a += vl;
+        jj += vl;
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..ee869a795
--- /dev/null
+++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSEV_FLOAT_M vse32_v_f32m2_m
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSEV_FLOAT_M vse64_v_f64m2_m
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#endif
+
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_uncopy_sve.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    BLASLONG i, ii, jj, js;
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+
+    FLOAT *ao;
+    jj = offset;
+
+    FLOAT_V_T va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        i = 0;
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    *(b + j) = INV(*(ao + j * lda));
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
+                    ao++;
+                    b += vl;
+                }
+                i += vl;
+                ii += vl;
+            } 
+            else
+            {
+                if (ii < jj) 
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao++;
+                b += vl;
+                i++;
+                ii++;
+            }
+        } 
+
+        a += vl * lda;
+        jj += vl;
+    }
+    return 0;
+}
diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..a324b0fa6
--- /dev/null
+++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c
@@ -0,0 +1,123 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VSEV_FLOAT_M vse32_v_f32m2_m
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VSEV_FLOAT_M vse64_v_f64m2_m
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#endif
+
+
+#ifndef UNIT
+#define INV(a) (ONE / (a))
+#else
+#define INV(a) (ONE)
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_utcopy_sve.c
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;
+    FLOAT_V_T va1;
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+  
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
+                    *(b + j) = INV(*(ao + j));
+
+                    ao += lda;
+                    b += vl;
+                }
+                i += vl;
+                ii += vl;
+            } 
+            else 
+            {
+                if (ii > jj) 
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao += lda;
+                b += vl;
+                i ++;
+                ii ++;
+            }
+        }
+
+        a += vl;
+        jj += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c
new file mode 100644
index 000000000..1917042be
--- /dev/null
+++ b/kernel/riscv64/zamax_rvv.c
@@ -0,0 +1,113 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT vfmax_vv_f32m4
+#define VFADDVV_FLOAT vfadd_vv_f32m4
+#define VFABSV_FLOAT vfabs_v_f32m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT vfmax_vv_f64m4
+#define VFADDVV_FLOAT vfadd_vv_f64m4
+#define VFABSV_FLOAT vfabs_v_f64m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT maxf=0.0;
+
+    if (n <= 0 || inc_x <= 0) return(maxf);
+
+    FLOAT_V_T v0, v1, vmax;
+    FLOAT_V_T_M1 v_res;
+
+    v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmax = VFMVVF_FLOAT(0.0, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&v0, &v1, x, vl);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v0 = VFADDVV_FLOAT(v0, v1, vl);
+            vmax = VFMAXVV_FLOAT(vmax, v0, vl);
+  
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v0 = VFADDVV_FLOAT(v0, v1, vl);
+            vmax = VFMAXVV_FLOAT(vmax, v0, vl);
+        }
+
+    }
+
+    v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+
+    return(maxf);
+}
diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c
new file mode 100644
index 000000000..3f027383a
--- /dev/null
+++ b/kernel/riscv64/zamin_rvv.c
@@ -0,0 +1,112 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMINVV_FLOAT vfmin_vv_f32m4
+#define VFADDVV_FLOAT vfadd_vv_f32m4
+#define VFABSV_FLOAT vfabs_v_f32m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMINVV_FLOAT vfmin_vv_f64m4
+#define VFADDVV_FLOAT vfadd_vv_f64m4
+#define VFABSV_FLOAT vfabs_v_f64m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT minf=0.0;
+
+    if (n <= 0 || inc_x <= 0) return(minf);
+
+    FLOAT_V_T v0, v1, vmin;
+    FLOAT_V_T_M1 v_res;
+
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&v0, &v1, x, vl);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v0 = VFADDVV_FLOAT(v0, v1, vl);
+            vmin = VFMINVV_FLOAT(vmin, v0, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v0 = VFADDVV_FLOAT(v0, v1, vl);
+            vmin = VFMINVV_FLOAT(vmin, v0, vl);
+        }
+
+    }
+
+    v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+
+    return(minf);
+}
diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c
new file mode 100644
index 000000000..7876646b3
--- /dev/null
+++ b/kernel/riscv64/zasum_rvv.c
@@ -0,0 +1,108 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m8(n)
+#define VSETVL_MAX vsetvlmax_e32m8()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VFADDVV_FLOAT vfadd_vv_f32m8
+#define VFABSV_FLOAT vfabs_v_f32m8
+#else
+#define VSETVL(n) vsetvl_e64m8(n)
+#define VSETVL_MAX vsetvlmax_e64m8()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VFADDVV_FLOAT vfadd_vv_f64m8
+#define VFABSV_FLOAT vfabs_v_f64m8
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT asumf = 0.0;
+    if (n <= 0 || inc_x <= 0) return(asumf);
+
+    FLOAT_V_T v0, v1;
+    size_t vlmax = VSETVL_MAX; 
+    FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            v0 = VLEV_FLOAT(x, vl);
+            v1 = VLEV_FLOAT(x+vl, vl);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
+        }
+
+    }
+    else {
+
+        int stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            v1 = VLSEV_FLOAT(x+1, stride_x, vl);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
+        }
+
+    }
+
+    FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax);
+    asumf += VFMVFS_FLOAT_M1(v_res);
+
+    return(asumf);
+}
diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c
new file mode 100644
index 000000000..66f52d9d0
--- /dev/null
+++ b/kernel/riscv64/zaxpby_rvv.c
@@ -0,0 +1,151 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/***************************************************************************
+* 2014/06/07 Saar
+*
+***************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n)       vsetvl_e32m4(n)
+#define FLOAT_V_T       vfloat32m4_t
+#define VLSEV_FLOAT     vlse32_v_f32m4
+#define VSSEV_FLOAT     vsse32_v_f32m4
+#define VFMACCVF_FLOAT  vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT    vfmv_v_f_f32m4
+#define VFMULVF_FLOAT   vfmul_vf_f32m4
+#define VFMSACVF_FLOAT  vfmsac_vf_f32m4
+#define VLSEG_FLOAT     vlseg2e32_v_f32m4
+#define VSSEG_FLOAT     vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT    vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT    vssseg2e32_v_f32m4
+#else
+#define VSETVL(n)       vsetvl_e64m4(n)
+#define FLOAT_V_T       vfloat64m4_t
+#define VLSEV_FLOAT     vlse64_v_f64m4
+#define VSSEV_FLOAT     vsse64_v_f64m4
+#define VFMACCVF_FLOAT  vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT    vfmv_v_f_f64m4
+#define VFMULVF_FLOAT   vfmul_vf_f64m4
+#define VFMSACVF_FLOAT  vfmsac_vf_f64m4
+#define VLSEG_FLOAT     vlseg2e64_v_f64m4
+#define VSSEG_FLOAT     vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT    vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT    vssseg2e64_v_f64m4
+#endif
+
+int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG inc_x2, inc_y2;
+
+    if ( n <= 0     )  return(0);
+
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+    
+    BLASLONG stride_x = inc_x2 * sizeof(FLOAT);
+    BLASLONG stride_y = inc_y2 * sizeof(FLOAT);
+    FLOAT_V_T vx0, vx1, vy0, vy1;
+
+    if ( beta_r == 0.0 && beta_i == 0.0)
+    {
+        if ( alpha_r == 0.0 && alpha_i == 0.0 )
+        {
+            size_t vl = VSETVL(n);
+            FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl);
+            for ( ; n > 0; n -= vl, y += vl*stride_y)
+            {
+                vl = VSETVL(n);
+                VSSSEG_FLOAT(y, stride_y, temp, temp, vl);
+            }
+        }
+        else
+        {
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) 
+            {
+                vl = VSETVL(n);
+                VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+                
+                vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl);
+                vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl);
+
+                vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl);
+
+                VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+            }
+        }
+    }
+    else
+    {
+        FLOAT_V_T v0, v1;
+
+        if ( alpha_r == 0.0 && alpha_i == 0.0 )
+        {
+            for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) 
+            {
+                vl = VSETVL(n);
+                VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+                
+                v0 = VFMULVF_FLOAT(vy1, beta_i, vl);
+                v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl);
+
+                v1 = VFMULVF_FLOAT(vy1, beta_r, vl);
+                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
+
+                VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
+            }
+        }
+        else
+        {
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) 
+            {
+                vl = VSETVL(n);
+                VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+                VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+                v0 = VFMULVF_FLOAT(vx0, alpha_r, vl);
+                v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl);
+                v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl);
+                v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl);
+                
+                v1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
+                v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl);
+                v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl);
+                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
+
+                VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
+            }
+        }
+    }
+    return(0);
+
+}
diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c
new file mode 100644
index 000000000..777bcb728
--- /dev/null
+++ b/kernel/riscv64/zaxpy_rvv.c
@@ -0,0 +1,154 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    if(n < 0) return(0);
+    if(da_r == 0.0 && da_i == 0.0) return(0);
+ 
+    FLOAT_V_T vx0, vx1, vy0, vy1;
+
+    if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+        #if !defined(CONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #else
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #endif
+            VSSEG_FLOAT(y, vy0, vy1, vl);
+        }
+
+    } else if (inc_x == 1) {
+
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+        #if !defined(CONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #else
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #endif
+            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+        }
+
+    } else if (inc_y == 1) {
+
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+
+        #if !defined(CONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #else
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #endif
+            VSSEG_FLOAT(y, vy0, vy1, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+        #if !defined(CONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #else
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #endif
+            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c
new file mode 100644
index 000000000..5d8322bbb
--- /dev/null
+++ b/kernel/riscv64/zcopy_rvv.c
@@ -0,0 +1,105 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL_M8(n) vsetvl_e32m8(n)
+#define FLOAT_V_T_M8 vfloat32m8_t
+#define VLEV_FLOAT_M8 vle32_v_f32m8
+#define VSEV_FLOAT_M8 vse32_v_f32m8
+
+#define VSETVL_M4(n) vsetvl_e32m4(n)
+#define FLOAT_V_T_M4 vfloat32m4_t
+#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4
+#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4
+#else
+#define VSETVL_M8(n) vsetvl_e64m8(n)
+#define FLOAT_V_T_M8 vfloat64m8_t
+#define VLEV_FLOAT_M8 vle64_v_f64m8
+#define VSEV_FLOAT_M8 vse64_v_f64m8
+
+#define VSETVL_M4(n) vsetvl_e64m4(n)
+#define FLOAT_V_T_M4 vfloat64m4_t
+#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4
+#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    if(n < 0) return(0);
+
+    if(inc_x == 1 && inc_y == 1) {
+
+        FLOAT_V_T_M8 vx;
+        n *= 2; // convert to words
+
+        for(size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL_M8(n);
+            vx = VLEV_FLOAT_M8(x, vl);
+            VSEV_FLOAT_M8(y, vx, vl);
+        }
+
+    }else if (1 == inc_x) {
+
+        FLOAT_V_T_M4 vr, vi;
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL_M4(n);
+            VLSEG_FLOAT_M4(&vr, &vi, x, vl);
+            VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl);
+        }
+    } else if (1 == inc_y) {
+
+        FLOAT_V_T_M4 vr, vi;
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL_M4(n);
+            VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl);
+            VSSEG_FLOAT_M4(y, vr, vi, vl);
+        }
+    } else {
+
+        FLOAT_V_T_M4 vr, vi;
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL_M4(n);
+            VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl);
+            VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl);
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c
new file mode 100644
index 000000000..7eae6f608
--- /dev/null
+++ b/kernel/riscv64/zdot_rvv.c
@@ -0,0 +1,170 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VFMSACVV_FLOAT vfmsac_vv_f32m4
+#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VFMSACVV_FLOAT vfmsac_vv_f64m4
+#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    OPENBLAS_COMPLEX_FLOAT result;
+    CREAL(result) = 0.0;
+    CIMAG(result) = 0.0;
+
+    if ( n <= 0 ) return(result);
+
+    FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1;
+    FLOAT_V_T_M1 v_res, v_z0;
+    size_t vlmax_m1 = VSETVL_MAX_M1;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax_m1);
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1);
+
+    size_t vlmax = VSETVL_MAX;
+    vr0 = VFMVVF_FLOAT(0, vlmax);
+    vr1 = VFMVVF_FLOAT(0, vlmax);
+ 
+    if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+
+            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
+        #if !defined(CONJ)
+            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
+        #else
+            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
+        #endif
+        }
+
+    }  else if (inc_x == 1){
+
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
+        #if !defined(CONJ)
+            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
+        #else
+            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
+        #endif
+        }
+    } else if (inc_y == 1){
+
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+
+            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
+        #if !defined(CONJ)
+            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
+        #else
+            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
+        #endif
+        }
+    }else {
+
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
+        #if !defined(CONJ)
+            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
+        #else
+            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
+        #endif
+        }
+    }
+
+    v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax);
+    CREAL(result) = VFMVFS_FLOAT_M1(v_res);
+    v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax);
+    CIMAG(result) = VFMVFS_FLOAT_M1(v_res);
+ 
+   return(result);
+}
diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c
new file mode 100644
index 000000000..a89752d18
--- /dev/null
+++ b/kernel/riscv64/zgemm_beta_rvv.c
@@ -0,0 +1,117 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMULVF_FLOAT vfmul_vf_f32m4
+#define VFADDVV_FLOAT vfadd_vv_f32m4
+#define VFSUBVV_FLOAT vfsub_vv_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMULVF_FLOAT vfmul_vf_f64m4
+#define VFADDVV_FLOAT vfadd_vv_f64m4
+#define VFSUBVV_FLOAT vfsub_vv_f64m4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
+          FLOAT beta_r, FLOAT beta_i,
+          FLOAT *dummy2, BLASLONG dummy3,
+          FLOAT *dummy4, BLASLONG dummy5,
+          FLOAT *c, BLASLONG ldc)
+{
+    BLASLONG chunk;
+    FLOAT *c_offset;
+	size_t vl;
+    FLOAT_V_T vr, vi, v1, v2, v3, v4;
+
+    ldc *= 2;
+    c_offset = c;
+
+    if (beta_r == 0.0 && beta_i == 0.0) {
+
+        vl = VSETVL(m);
+        vr = VFMVVF_FLOAT(0.0, vl);
+        vi = VFMVVF_FLOAT(0.0, vl);
+
+        for( ; n > 0; n--, c += ldc) {
+            c_offset = c;
+
+            for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) {
+                vl = VSETVL(chunk);
+
+                VSSEG_FLOAT(c_offset, vr, vi, vl);
+			}
+		}
+
+    } else {
+
+        for( ; n > 0; n--, c += ldc) {
+            c_offset = c;
+
+            for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) {
+                vl = VSETVL(chunk);
+
+                VLSEG_FLOAT(&vr, &vi, c_offset, vl);
+
+                v1 = VFMULVF_FLOAT(vr, beta_r, vl);
+                v2 = VFMULVF_FLOAT(vi, beta_i, vl);
+
+                v3 = VFMULVF_FLOAT(vi, beta_r, vl);
+                v4 = VFMULVF_FLOAT(vr, beta_i, vl);
+
+				vr = VFSUBVV_FLOAT(v1, v2, vl);
+				vi = VFADDVV_FLOAT(v3, v4, vl);
+
+                VSSEG_FLOAT(c_offset, vr, vi, vl);
+			}
+		}
+
+	}
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c
new file mode 100644
index 000000000..2eeb61b45
--- /dev/null
+++ b/kernel/riscv64/zgemv_n_rvv.c
@@ -0,0 +1,170 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i;
+    BLASLONG ix;
+    FLOAT *a_ptr;
+    FLOAT temp_r, temp_i;
+    FLOAT_V_T va0, va1, vy0, vy1;
+
+    BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2;
+
+    BLASLONG inc_x2 = inc_x * 2;
+    BLASLONG lda2 = lda * 2;
+    if (inc_y == 1)
+    {
+        for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) {
+            vl = VSETVL(m);
+            a_ptr = a;
+            ix = 0;
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+
+            for(i = 0; i < n; i++){
+#if !defined(XCONJ)
+                temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
+                temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
+#else
+                temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
+                temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
+#endif
+
+                VLSEG_FLOAT(&va0, &va1, a_ptr, vl);
+#if !defined(CONJ)
+#if !defined(XCONJ)
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
+#else
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
+#endif
+#else
+#if !defined(XCONJ)
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
+#else
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
+#endif
+#endif
+                a_ptr += lda2;
+                ix += inc_x2;
+            }
+            VSSEG_FLOAT(y, vy0, vy1, vl);
+        }
+
+    }
+    else
+    {
+        for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(m);
+            a_ptr = a;
+            ix = 0;
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+            for(i = 0; i < n; i++){
+#if !defined(XCONJ)
+                temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
+                temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
+#else
+                temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
+                temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
+#endif
+
+                VLSEG_FLOAT(&va0, &va1, a_ptr, vl);
+#if !defined(CONJ)
+#if !defined(XCONJ)
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
+#else
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
+#endif
+#else
+#if !defined(XCONJ)
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
+#else
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
+#endif
+#endif
+                a_ptr += lda2;
+                ix += inc_x2;
+            }
+            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+        }
+    }
+    return(0);
+}
diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c
new file mode 100644
index 000000000..b682d5cd8
--- /dev/null
+++ b/kernel/riscv64/zgemv_t_rvv.c
@@ -0,0 +1,172 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m4
+#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m4
+#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i = 0, j = 0;
+    BLASLONG ix = 0, iy = 0;
+    FLOAT *a_ptr = a;
+    FLOAT temp_r, temp_i;
+
+    FLOAT_V_T va0, va1, vx0, vx1, vr, vi; 
+    FLOAT_V_T_M1 v_res, v_z0;
+
+    BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+    //BLASLONG stride_a = sizeof(FLOAT) * 2;
+    BLASLONG inc_y2 = inc_y * 2;
+    BLASLONG lda2 = lda * 2;
+
+    size_t vlmax = VSETVL_MAX_M1;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+    vlmax = VSETVL(m);
+
+    if (inc_x == 1)
+    {
+        for(i = 0; i < n; i++) {    
+            j = 0;
+            ix = 0;
+            vr = VFMVVF_FLOAT(0, vlmax);
+            vi = VFMVVF_FLOAT(0, vlmax);
+            for(size_t vl, k = m; k > 0; k -= vl) {
+                vl = VSETVL(k);
+
+                VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl);
+                VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl);
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
+                vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
+                vi = VFMACCVV_FLOAT(vi, va1, vx0, vl);
+#else
+                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
+                vr = VFMACCVV_FLOAT(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
+                vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl);
+#endif
+                j += vl * 2;
+                ix += vl * inc_x * 2;
+            }
+            
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+            temp_r = VFMVFS_FLOAT_M1(v_res);
+            v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax);
+            temp_i = VFMVFS_FLOAT_M1(v_res);
+
+#if !defined(XCONJ)
+            y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
+            y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+            y[iy]   += alpha_r * temp_r + alpha_i * temp_i;
+            y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+            iy += inc_y2;
+            a_ptr += lda2;
+        }
+    }
+    else
+    {
+        for(i = 0; i < n; i++) {    
+            j = 0;
+            ix = 0;
+            vr = VFMVVF_FLOAT(0, vlmax);
+            vi = VFMVVF_FLOAT(0, vlmax);
+            for(size_t vl, k = m; k > 0; k -= vl) {
+                vl = VSETVL(k);
+    
+                VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl);
+                VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl);
+    
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
+                vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
+                vi = VFMACCVV_FLOAT(vi, va1, vx0, vl);
+#else
+                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
+                vr = VFMACCVV_FLOAT(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
+                vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl);
+#endif
+                j += vl * 2;
+                ix += vl * inc_x * 2;
+            }
+            
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+            temp_r = VFMVFS_FLOAT_M1(v_res);
+            v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax);
+            temp_i = VFMVFS_FLOAT_M1(v_res);
+    
+#if !defined(XCONJ)
+            y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
+            y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+            y[iy]   += alpha_r * temp_r + alpha_i * temp_i;
+            y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+            iy += inc_y2;
+            a_ptr += lda2;
+        }
+
+    }
+
+
+    return(0);
+}
diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c
new file mode 100644
index 000000000..921ddb8cb
--- /dev/null
+++ b/kernel/riscv64/znrm2_rvv.c
@@ -0,0 +1,122 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT vfmacc_vv_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VFABSV_FLOAT vfabs_v_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT vfmacc_vv_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VFABSV_FLOAT vfabs_v_f64m4
+#endif
+
+// TODO: Should single precision use the widening MAC, or perhaps all should be double? 
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+
+    if ( n <= 0 ) return(0.0);
+
+    FLOAT_V_T vr, v0, v1;
+    FLOAT_V_T_M1 v_max, v_res;
+    FLOAT scale = 0.0, ssq = 0.0;
+
+    size_t vlmax = VSETVL_MAX;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_max = VFMVVF_FLOAT_M1(0, vlmax);
+
+    vr = VFMVVF_FLOAT(0, vlmax);
+
+    if (inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&v0, &v1, x, vl);
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
+            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+
+            v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl);
+            vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+ 
+            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
+            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+
+            v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl);
+            vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
+        }
+
+    }
+
+    v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);
+
+    ssq = VFMVFS_FLOAT_M1(v_res);
+    scale = VFMVFS_FLOAT_M1(v_max);
+    ssq = ssq / (scale*scale);
+
+    return(scale * sqrt(ssq));
+}
diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c
new file mode 100644
index 000000000..68066a00b
--- /dev/null
+++ b/kernel/riscv64/zrot_rvv.c
@@ -0,0 +1,181 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFMULVF_FLOAT vfmul_vf_f32m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFMULVF_FLOAT vfmul_vf_f64m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+
+    if (n <= 0) return(0);
+
+    FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
+
+    if (inc_x == 0 && inc_y == 0) {
+        BLASLONG i=0;
+        BLASLONG ix=0,iy=0;
+        FLOAT temp[2];
+        BLASLONG inc_x2;
+        BLASLONG inc_y2;
+
+        inc_x2 = 2 * inc_x ;
+        inc_y2 = 2 * inc_y ;
+
+        while(i < n)
+        {
+            temp[0]   = c*x[ix]   + s*y[iy] ;
+            temp[1]   = c*x[ix+1] + s*y[iy+1] ;
+            y[iy]     = c*y[iy]   - s*x[ix] ;
+            y[iy+1]   = c*y[iy+1] - s*x[ix+1] ;
+            x[ix]     = temp[0] ;
+            x[ix+1]   = temp[1] ;
+
+            ix += inc_x2 ;
+            iy += inc_y2 ;
+            i++ ;
+        }
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+
+            vt0 = VFMULVF_FLOAT(vx0, c, vl);
+            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
+            vt1 = VFMULVF_FLOAT(vx1, c, vl);
+            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
+            vy0 = VFMULVF_FLOAT(vy0, c, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
+            vy1 = VFMULVF_FLOAT(vy1, c, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
+
+            VSSEG_FLOAT(x, vt0, vt1, vl);
+            VSSEG_FLOAT(y, vy0, vy1, vl);
+        }
+
+    } else if (inc_x == 1){
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+            vt0 = VFMULVF_FLOAT(vx0, c, vl);
+            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
+            vt1 = VFMULVF_FLOAT(vx1, c, vl);
+            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
+            vy0 = VFMULVF_FLOAT(vy0, c, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
+            vy1 = VFMULVF_FLOAT(vy1, c, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
+
+            VSSEG_FLOAT(x, vt0, vt1, vl);
+            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+        }
+
+    } else if (inc_y == 1){
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+
+            vt0 = VFMULVF_FLOAT(vx0, c, vl);
+            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
+            vt1 = VFMULVF_FLOAT(vx1, c, vl);
+            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
+            vy0 = VFMULVF_FLOAT(vy0, c, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
+            vy1 = VFMULVF_FLOAT(vy1, c, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
+
+            VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl);
+            VSSEG_FLOAT(y, vy0, vy1, vl);
+        }
+
+    } else {
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+            vt0 = VFMULVF_FLOAT(vx0, c, vl);
+            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
+            vt1 = VFMULVF_FLOAT(vx1, c, vl);
+            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
+            vy0 = VFMULVF_FLOAT(vy0, c, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
+            vy1 = VFMULVF_FLOAT(vy1, c, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
+
+            VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl);
+            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c
new file mode 100644
index 000000000..079c36a2d
--- /dev/null
+++ b/kernel/riscv64/zscal_rvv.c
@@ -0,0 +1,148 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define FLOAT_V_T vfloat32m4_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VFMULVF_FLOAT vfmul_vf_f32m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define FLOAT_V_T vfloat64m4_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFMULVF_FLOAT vfmul_vf_f64m4
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+
+    if((n <= 0) || (inc_x <= 0)) return(0);
+
+    FLOAT_V_T vt, vr, vi;
+    BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+    size_t vlmax = VSETVL_MAX;
+
+    if(da_r == 0.0 && da_i == 0.0) {
+
+        vr = VFMVVF_FLOAT(0.0, vlmax);
+        vi = VFMVVF_FLOAT(0.0, vlmax);
+
+        if(inc_x == 1) {
+
+            for (size_t vl; n > 0; n -= vl, x += vl*2) {
+                vl = VSETVL(n);
+
+                VSSEG_FLOAT(x, vr, vi, vl);
+            }
+
+        } else {
+
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+                vl = VSETVL(n);
+
+                VSSSEG_FLOAT(x, stride_x, vr, vi, vl);
+            }
+        }
+
+    } else if(da_r == 0.0) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+            
+            VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
+
+            vt = VFMULVF_FLOAT(vi, -da_i, vl);
+            vi = VFMULVF_FLOAT(vr, da_i, vl);
+
+            VSSSEG_FLOAT(x, stride_x, vt, vi, vl);
+        }
+
+    } else if(da_i == 0.0) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
+
+            vr = VFMULVF_FLOAT(vr, da_r, vl);
+            vi = VFMULVF_FLOAT(vi, da_r, vl);
+
+            VSSSEG_FLOAT(x, stride_x, vr, vi, vl);
+        }
+
+    } else {
+
+        if(inc_x == 1) {
+
+            for (size_t vl; n > 0; n -= vl, x += vl*2) {
+                vl = VSETVL(n);
+
+                VLSEG_FLOAT(&vr, &vi, x, vl);
+
+                vt = VFMULVF_FLOAT(vr, da_r, vl);
+                vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
+                vi = VFMULVF_FLOAT(vi, da_r, vl);
+                vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
+
+                VSSEG_FLOAT(x, vt, vi, vl);
+            }
+
+        } else {
+
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+                vl = VSETVL(n);
+
+                VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
+
+                vt = VFMULVF_FLOAT(vr, da_r, vl);
+                vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
+                vi = VFMULVF_FLOAT(vi, da_r, vl);
+                vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
+
+                VSSSEG_FLOAT(x, stride_x, vt, vi, vl);
+            }
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c
new file mode 100644
index 000000000..3928fbe27
--- /dev/null
+++ b/kernel/riscv64/zsum_rvv.c
@@ -0,0 +1,97 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL_MAX vsetvlmax_e32m4()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
+#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VFADDVV_FLOAT vfadd_vv_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL_MAX vsetvlmax_e64m4()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
+#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VFADDVV_FLOAT vfadd_vv_f64m4
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT sumf = 0.0;
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    FLOAT_V_T v0, v1;
+    size_t vlmax = VSETVL_MAX; 
+    FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&v0, &v1, x, vl);
+
+            v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
+
+            v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
+        }
+
+    }
+
+    FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax);
+    sumf += VFMVFS_FLOAT_M1(v_res);
+
+    return(sumf);
+}
diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c
new file mode 100644
index 000000000..86f9103d3
--- /dev/null
+++ b/kernel/riscv64/zswap_rvv.c
@@ -0,0 +1,156 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
+#define VLSEG_FLOAT vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
+#define VSSEG_FLOAT vsseg2e32_v_f32m4
+#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#else
+#define VSETVL(n) vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
+#define VLSEG_FLOAT vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
+#define VSSEG_FLOAT vsseg2e64_v_f64m4
+#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+
+    if (n <= 0) return(0);
+
+    FLOAT_V_T vx0, vx1, vy0, vy1;
+
+    if (inc_x == 0 && inc_y == 0) {
+        if (n & 1) {
+            FLOAT temp[2];
+            temp[0] = x[0];
+            temp[1] = x[1];
+            x[0] = y[0];
+            x[1] = y[1];
+            y[0] = temp[0];
+            y[1] = temp[1];
+        }
+        else {
+            return 0;
+        }
+    }
+    else if(inc_x == 0) {
+        FLOAT temp[2];
+        temp[0] = x[0];
+        temp[1] = x[1];
+        x[0] = y[(n - 1) * inc_y * 2];
+        x[0] = y[(n - 1) * inc_y * 2 + 1];
+        FLOAT* ptr = y + (n - 1) * inc_y * 2;   // start from the last one
+        BLASLONG stride_y = (0 - inc_y) * sizeof(FLOAT) * 2; // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) {
+            vl = VSETVL(m);
+            VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl);
+            VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl);
+        }
+        y[0] = temp[0];
+        y[1] = temp[1];
+    }
+    else if(inc_y == 0) {
+        FLOAT temp[2];
+        temp[0] = y[0];
+        temp[1] = y[1];
+        y[0] = x[(n - 1) * inc_x * 2];
+        y[0] = x[(n - 1) * inc_x * 2 + 1];
+        FLOAT* ptr = x + (n - 1) * inc_x * 2;   // start from the last one
+        BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) {
+            vl = VSETVL(m);
+            VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl);
+            VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl);
+        }
+        x[0] = temp[0];
+        x[1] = temp[1];
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+
+            VSSEG_FLOAT(y, vx0, vx1, vl);
+            VSSEG_FLOAT(x, vy0, vy1, vl);
+        }
+
+    } else if (inc_x == 1){
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+            VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl);
+            VSSEG_FLOAT(x, vy0, vy1, vl);
+        }
+
+    } else if (inc_y == 1){
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+
+            VSSEG_FLOAT(y, vx0, vx1, vl);
+            VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl);
+        }
+
+    } else {
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+            VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl);
+            VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl);
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c
new file mode 100644
index 000000000..3486a4648
--- /dev/null
+++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c
@@ -0,0 +1,596 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define VSETVL_MAX_M1 vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m2_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEG4_FLOAT vlseg4e32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFMACCVV_FLOAT vfmacc_vv_f32m2
+#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2
+#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define VSETVL_MAX_M1 vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m2_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEG4_FLOAT vlseg4e64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFMACCVV_FLOAT vfmacc_vv_f64m2
+#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2
+#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1
+#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#endif
+
+// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c
+
+
+/********************************
+  ADD1 a*c
+  ADD2 b*c
+  ADD3 a*d
+  ADD4 b*d
+ *********************************/
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
+        FLOAT* C,BLASLONG ldc, BLASLONG offset)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*ptrba,*ptrbb;
+    FLOAT res0,res1;
+    BLASLONG off, temp;
+    
+    FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+    FLOAT_V_T_M1 v_m1_res0, v_m1_res1;
+    FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+
+    size_t vl;
+    size_t vlmax = VSETVL_MAX;
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    off = -offset;
+#else
+    off = 0;
+#endif
+
+    for (j = bn/2; j > 0; j--)
+    {
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+        C0 = C;
+        C1 = C0+2*ldc;
+        ptrba = ba;
+
+        for (i = bm/2; i > 0; i--)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2*2;
+            ptrbb = bb+off*2*2;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+            vres4 = VFMVVF_FLOAT(0.0, vlmax);
+            vres5 = VFMVVF_FLOAT(0.0, vlmax);
+            vres6 = VFMVVF_FLOAT(0.0, vlmax);
+            vres7 = VFMVVF_FLOAT(0.0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk - off;
+#elif defined(LEFT)
+            temp = off + 2;
+#else
+            temp = off + 2;
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
+                vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
+
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
+                vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
+                vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
+                vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
+
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
+                vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
+                vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
+
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
+                vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
+                vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
+                vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
+
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
+                vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
+                vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
+
+#endif
+                ptrba += vl * 4;
+                ptrbb += vl * 4;
+            }
+            
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[0] = res0 * alphar - res1 * alphai;
+            C0[1] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[2] = res0 * alphar - res1 * alphai;
+            C0[3] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C1[0] = res0 * alphar - res1 * alphai;
+            C1[1] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C1[2] = res0 * alphar - res1 * alphai;
+            C1[3] = res1 * alphar + res0 * alphai;
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+            (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2;
+#else
+            temp -= 2;
+#endif
+
+            ptrba += temp*2*2;
+            ptrbb += temp*2*2;
+
+#endif
+
+#ifdef LEFT
+            off += 2;
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+        }
+
+        if (bm & 1)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*2*2;
+#endif
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk - off;
+#elif defined(LEFT)
+            temp = off+1;
+#else
+            temp = off+2;
+#endif
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
+
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
+
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
+
+#endif
+                ptrba += vl * 2;
+                ptrbb += vl * 4;
+            }
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[0] = res0 * alphar - res1 * alphai;
+            C0[1] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C1[0] = res0 * alphar - res1 * alphai;
+            C1[1] = res1 * alphar + res0 * alphai;
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+            (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1;
+#else
+            temp -= 2;
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*2*2;
+#endif
+#ifdef LEFT
+            off += 1;
+#endif
+            C0 = C0+2;
+            C1 = C1+2;
+        }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+    if (bn & 1)
+    {
+        C0 = C;
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+
+        for (i = bm/2; i > 0; i--)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2*2;
+            ptrbb = bb+off*2;
+#endif
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk - off;
+#elif defined(LEFT)
+            temp = off + 2;
+#else
+            temp = off + 1;
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
+
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
+
+#endif
+                ptrba += vl * 4;
+                ptrbb += vl * 2;
+            }
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[0] = res0 * alphar - res1 * alphai;
+            C0[1] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[2] = res0 * alphar - res1 * alphai;
+            C0[3] = res1 * alphar + res0 * alphai;
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+            (!defined(LEFT) && !defined(TRANSA))
+            temp = bk-off;
+#ifdef LEFT
+            temp -= 2;
+#else
+            temp -= 1;
+#endif
+            ptrba += temp*2*2;
+            ptrbb += temp*2;
+#endif
+#ifdef LEFT
+            off += 2;
+#endif
+            C0 = C0+4;
+        }
+
+        if (bm & 1)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*2;
+#endif
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off + 1;
+#else
+            temp = off + 1;
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+#endif
+                ptrba += vl * 2;
+                ptrbb += vl * 2;
+                
+            }
+            
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            
+            C0[0] = res0 * alphar - res1 * alphai;
+            C0[1] = res1 * alphar + res0 * alphai;
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+            (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1;
+#else
+            temp -= 1;
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*2;
+            
+#endif
+#ifdef LEFT
+            off += 1;
+#endif
+            C0 = C0+2;
+        }
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+    return 0;
+}
diff --git a/param.h b/param.h
index 514b13a3a..62b675d6c 100644
--- a/param.h
+++ b/param.h
@@ -3038,6 +3038,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
+#if defined(x280)
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  16 // 4 // 16 // 2
+#define SGEMM_DEFAULT_UNROLL_N  8// 4 // 4 // 2
+
+/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N)
+ * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro.
+ * If VLMAX size is ever more than 1024, this should be increased also. */
+#define SGEMM_DEFAULT_UNROLL_MN  32
+
+#define DGEMM_DEFAULT_UNROLL_M  16 //2 // 8
+#define DGEMM_DEFAULT_UNROLL_N  8 //2 // 4
+#define DGEMM_DEFAULT_UNROLL_MN  32
+
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  2
+
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  2
+
+#define SGEMM_DEFAULT_P	160
+#define DGEMM_DEFAULT_P	160
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P	16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
 #ifdef C910V
 #define GEMM_DEFAULT_OFFSET_A 0
 #define GEMM_DEFAULT_OFFSET_B 0

From 5d0d1c555195a391fe5d029427dfbf7b942ecdf9 Mon Sep 17 00:00:00 2001
From: Heller Zheng <heller.zheng@microchip.com>
Date: Tue, 15 Nov 2022 18:22:21 -0800
Subject: [PATCH 004/718] Remove redundant files

---
 Makefile.install                    |   5 -
 kernel/riscv64/KERNEL.x280          |  36 +-
 kernel/riscv64/gemm_ncopy_2_rvv.c   |  92 ---
 kernel/riscv64/gemm_ncopy_4_rvv.c   | 123 ----
 kernel/riscv64/gemm_tcopy_2_rvv.c   | 108 ----
 kernel/riscv64/gemm_tcopy_4_rvv.c   | 236 --------
 kernel/riscv64/gemmkernel_2x2_rvv.c | 214 -------
 kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ----------------
 kernel/riscv64/trmmkernel_2x2_rvv.c | 342 -----------
 kernel/riscv64/trmmkernel_4x4_rvv.c | 881 ----------------------------
 10 files changed, 2 insertions(+), 2543 deletions(-)
 delete mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c
 delete mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c
 delete mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c
 delete mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c
 delete mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c
 delete mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c
 delete mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c
 delete mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c

diff --git a/Makefile.install b/Makefile.install
index f1adaa271..168d08f72 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -8,7 +8,6 @@ PREFIX ?= /opt/OpenBLAS
 OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
 OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
 OPENBLAS_BINARY_DIR := $(PREFIX)/bin
-OPENBLAS_RELEASE_DIR := $(PREFIX)/release
 OPENBLAS_BUILD_DIR := $(CURDIR)
 OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE)
 OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
@@ -39,7 +38,6 @@ install : 	lib.grd
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
-	@-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)"
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
 	@-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
 	@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@@ -204,8 +202,5 @@ endif
 	@echo "  endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo Install OK!
-#Generating release tar
-	@echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz
-	@tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release .
 
 	
diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280
index 2eb60f2b4..4d64354fb 100644
--- a/kernel/riscv64/KERNEL.x280
+++ b/kernel/riscv64/KERNEL.x280
@@ -122,23 +122,7 @@ CTRMMKERNEL	= ztrmmkernel_2x2_rvv.c
 ZTRMMKERNEL	= ztrmmkernel_2x2_rvv.c
 
 # SGEMM_UNROLL_N set in params.h
-ifeq ($(SGEMM_UNROLL_N), 2)
-SGEMMKERNEL    =  gemmkernel_2x2_rvv.c
-SGEMMONCOPY    =  gemm_ncopy_2_rvv.c
-SGEMMOTCOPY    =  gemm_tcopy_2_rvv.c
-SGEMMONCOPYOBJ =  sgemm_oncopy.o
-SGEMMOTCOPYOBJ =  sgemm_otcopy.o
-
-STRMMKERNEL	= trmmkernel_2x2_rvv.c
-else ifeq ($(SGEMM_UNROLL_N), 4)
-SGEMMKERNEL    =  gemmkernel_4x4_rvv.c
-SGEMMONCOPY    =  gemm_ncopy_4_rvv.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
-SGEMMONCOPYOBJ =  sgemm_oncopy.o
-SGEMMOTCOPYOBJ =  sgemm_otcopy.o
-
-STRMMKERNEL	= trmmkernel_4x4_rvv.c
-else ifeq ($(SGEMM_UNROLL_N), 8)
+ifeq ($(SGEMM_UNROLL_N), 8)
 # UNROLL_M is VLMAX
 SGEMMKERNEL    =  gemmkernel_rvv_v1x8.c
 SGEMMINCOPY    =  gemm_ncopy_rvv_v1.c
@@ -162,23 +146,7 @@ SSYMMLCOPY_M   =  symm_lcopy_rvv_v1.c
 endif
 
 # SGEMM_UNROLL_N set in params.h
-ifeq ($(DGEMM_UNROLL_N), 2)
-DGEMMKERNEL    = gemmkernel_2x2_rvv.c
-DGEMMONCOPY    = gemm_ncopy_2_rvv.c
-DGEMMOTCOPY    = gemm_tcopy_2_rvv.c
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
-
-DTRMMKERNEL	= trmmkernel_2x2_rvv.c
-else ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMKERNEL    =  gemmkernel_4x4_rvv.c
-DGEMMONCOPY    =  gemm_ncopy_4_rvv.c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
-DGEMMONCOPYOBJ =  dgemm_oncopy.o
-DGEMMOTCOPYOBJ =  dgemm_otcopy.o
-
-DTRMMKERNEL	= trmmkernel_4x4_rvv.c
-else ifeq ($(DGEMM_UNROLL_N), 8)
+ifeq ($(DGEMM_UNROLL_N), 8)
 # UNROLL_M is VLMAX
 DGEMMKERNEL    =  gemmkernel_rvv_v1x8.c
 DGEMMINCOPY    =  gemm_ncopy_rvv_v1.c
diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c
deleted file mode 100644
index 5f55bc349..000000000
--- a/kernel/riscv64/gemm_ncopy_2_rvv.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/***************************************************************************
-Copyright (c) 2022, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include "common.h"
-
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEG2_FLOAT vsseg2e32_v_f32m4
-#else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEG2_FLOAT vsseg2e64_v_f64m4
-#endif
-
-// Optimizes the implementation in ../generic/gemm_ncopy_2.c
-
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
-{
-    BLASLONG i, j;
-    IFLOAT *a_offset, *a_offset1, *a_offset2;
-    IFLOAT *b_offset;
-    FLOAT_V_T v1, v2;
-    size_t vl;
-
-    //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU
-
-    a_offset = a;
-    b_offset = b;
-
-    for(j = (n >> 1); j > 0; j--) {
-
-        a_offset1 = a_offset;
-        a_offset2 = a_offset + lda;
-        a_offset += 2 * lda;
-
-        for(i = m; i > 0; i -= vl) {
-            vl = VSETVL(i);
-
-            v1 = VLEV_FLOAT(a_offset1, vl);
-            v2 = VLEV_FLOAT(a_offset2, vl);
-            VSSEG2_FLOAT(b_offset, v1, v2, vl);
-
-	        a_offset1 += vl;
-	        a_offset2 += vl;
-	        b_offset += vl*2;
-        }
-    }
-
-    if (n & 1) {
-
-        for(i = m; i > 0; i -= vl) {
-            vl = VSETVL(i);
-
-            v1 = VLEV_FLOAT(a_offset, vl);
-            VSEV_FLOAT(b_offset, v1, vl);
-
-	        a_offset += vl;
-	        b_offset += vl;
-        }
-	}
-
-    return 0;
-}
diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c
deleted file mode 100644
index 4d4efe4c9..000000000
--- a/kernel/riscv64/gemm_ncopy_4_rvv.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/***************************************************************************
-Copyright (c) 2022, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include "common.h"
-
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VSSEG4_FLOAT vsseg4e32_v_f32m2
-#else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VSSEG4_FLOAT vsseg4e64_v_f64m2
-#endif
-
-// Optimizes the implementation in ../generic/gemm_ncopy_4.c
-
-int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
-{
-    BLASLONG i, j;
-
-    FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
-    FLOAT *b_offset;
-
-    FLOAT_V_T v1, v2, v3, v4;
-    size_t vl;
-
-    //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda);
-
-    a_offset = a;
-    b_offset = b;
-
-    for(j = (n >> 2); j > 0; j--) {
-        a_offset1  = a_offset;
-        a_offset2  = a_offset1 + lda;
-        a_offset3  = a_offset2 + lda;
-        a_offset4  = a_offset3 + lda;
-        a_offset += 4 * lda;
-
-        for(i = m; i > 0; i -= vl) {
-            vl = VSETVL(i);
-
-            v1 = VLEV_FLOAT(a_offset1, vl);
-            v2 = VLEV_FLOAT(a_offset2, vl);
-            v3 = VLEV_FLOAT(a_offset3, vl);
-            v4 = VLEV_FLOAT(a_offset4, vl);
-
-            VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);
-
-            a_offset1 += vl;
-            a_offset2 += vl;
-            a_offset3 += vl;
-            a_offset4 += vl;
-            b_offset += vl*4;
-        }
-    }
-
-    if (n & 2) {
-        a_offset1  = a_offset;
-        a_offset2  = a_offset1 + lda;
-        a_offset += 2 * lda;
-
-        for(i = m; i > 0; i -= vl) {
-            vl = VSETVL(i);
-
-            v1 = VLEV_FLOAT(a_offset1, vl);
-            v2 = VLEV_FLOAT(a_offset2, vl);
-
-            VSSEG2_FLOAT(b_offset, v1, v2, vl);
-
-            a_offset1 += vl;
-            a_offset2 += vl;
-            b_offset += vl*2;
-        }
-    }
-
-    if (n & 1) {
-        a_offset1  = a_offset;
-
-        for(i = m; i > 0; i -= vl) {
-            vl = VSETVL(i);
-
-            v1 = VLEV_FLOAT(a_offset1, vl);
-
-            VSEV_FLOAT(b_offset, v1, vl);
-
-            a_offset1 += vl;
-            b_offset += vl;
-        }
-    }
-
-    return 0;
-}
diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c
deleted file mode 100644
index 963e1be69..000000000
--- a/kernel/riscv64/gemm_tcopy_2_rvv.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/***************************************************************************
-Copyright (c) 2022, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include "common.h"
-
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
-#define VSSSEG4_FLOAT vssseg4e32_v_f32m2
-#else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
-#define VSSSEG4_FLOAT vssseg4e64_v_f64m2
-#endif
-
-// Optimizes the implementation in ../generic/gemm_tcopy_2.c
-
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
-{
-    BLASLONG i, j;
-    IFLOAT *a_offset, *a_offset1, *a_offset2;
-    IFLOAT *b_offset, *b_offset1, *b_offset2;
-    FLOAT_V_T v1a, v1b, v2a, v2b;
-    size_t vl;
-
-    //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU
-
-    a_offset = a;
-    b_offset = b;
-    b_offset2 = b + m * (n & ~1);
-
-    for(i = (m >> 1); i > 0; i--) {
-
-        a_offset1 = a_offset;
-        a_offset2 = a_offset + lda;
-        a_offset += 2 * lda;
-
-        b_offset1 = b_offset;
-        b_offset += 4;
-
-        for(j = (n >> 1); j > 0; j -= vl) {
-            vl = VSETVL(j);
-
-            VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl);
-            VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl);
-
-            VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl);
-
-            a_offset1 += vl * 2;
-	        a_offset2 += vl * 2;
-            b_offset1 += vl * m * 2;
-        }
-
-        if (n & 1) {
-	        *(b_offset2 + 0) = *(a_offset1 + 0);
-	        *(b_offset2 + 1) = *(a_offset2 + 0);
-	        b_offset2 += 2;
-        }
-    }
-
-    if (m & 1) {
-
-        for(j = (n >> 1); j > 0; j -= vl) {
-            vl = VSETVL(j);
-
-            VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl);
-
-            VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl);
-
-            a_offset += vl * 2;
-            b_offset += vl * m * 2;
-        }
-
-        if (n & 1){
-            *(b_offset2 + 0) = *(a_offset + 0);
-        }
-    }
-
-    return 0;
-}
diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c
deleted file mode 100644
index ac9974b24..000000000
--- a/kernel/riscv64/gemm_tcopy_4_rvv.c
+++ /dev/null
@@ -1,236 +0,0 @@
-/***************************************************************************
-Copyright (c) 2022, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include "common.h"
-
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
-#define VSSSEG4_FLOAT vssseg4e32_v_f32m2
-#else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
-#define VSSSEG4_FLOAT vssseg4e64_v_f64m2
-#endif
-
-// Optimizes the implementation in ../generic/gemm_tcopy_4.c
-
-int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
-{
-    BLASLONG i, j;
-
-    FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
-    FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3;
-    FLOAT  ctemp1,  ctemp2,  ctemp3,  ctemp4;
-    FLOAT  ctemp5,  ctemp6,  ctemp7,  ctemp8;
-    FLOAT  ctemp9, ctemp10, ctemp11, ctemp12;
-    FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
-
-    //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda);
-
-    a_offset = a;
-    b_offset = b;
-
-    b_offset2 = b + m  * (n & ~3);
-    b_offset3 = b + m  * (n & ~1);
-
-    for(j = (m >> 2); j > 0; j--) {
-        a_offset1 = a_offset;
-        a_offset2 = a_offset1 + lda;
-        a_offset3 = a_offset2 + lda;
-        a_offset4 = a_offset3 + lda;
-        a_offset += 4 * lda;
-
-        b_offset1 = b_offset;
-        b_offset += 16;
-
-        for(i = (n >> 2); i > 0; i--) {
-            v1 = VLEV_FLOAT(a_offset1, 4);
-            v2 = VLEV_FLOAT(a_offset2, 4);
-            v3 = VLEV_FLOAT(a_offset3, 4);
-            v4 = VLEV_FLOAT(a_offset4, 4);
-
-            a_offset1 += 4;
-            a_offset2 += 4;
-            a_offset3 += 4;
-            a_offset4 += 4;
-
-            VSEV_FLOAT(b_offset1, v1, 4);
-            VSEV_FLOAT(b_offset2+4, v2, 4);
-            VSEV_FLOAT(b_offset2+8, v3, 4);
-            VSEV_FLOAT(b_offset2+12, v4, 4);
-
-            b_offset1 += m * 4;
-        }
-
-        if (n & 2) {
-            v1 = VLEV_FLOAT(a_offset1, 2);
-            v2 = VLEV_FLOAT(a_offset2, 2);
-            v3 = VLEV_FLOAT(a_offset3, 2);
-            v4 = VLEV_FLOAT(a_offset4, 2);
-
-            a_offset1 += 2;
-            a_offset2 += 2;
-            a_offset3 += 2;
-            a_offset4 += 2;
-
-            VSEV_FLOAT(b_offset2, v1, 2);
-            VSEV_FLOAT(b_offset2+2, v2, 2);
-            VSEV_FLOAT(b_offset2+4, v3, 2);
-            VSEV_FLOAT(b_offset2+6, v4, 2);
-
-            b_offset2 += 8;
-        }
-
-        if (n & 1) {
-            v1 = VLEV_FLOAT(a_offset1, 1);
-            v2 = VLEV_FLOAT(a_offset2, 1);
-            v3 = VLEV_FLOAT(a_offset3, 1);
-            v4 = VLEV_FLOAT(a_offset4, 1);
-
-            VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1);
-
-            b_offset3 += 4;
-        }
-
-    }
-
-// TODO cleanup
-
-  if (m & 2){
-    a_offset1  = a_offset;
-    a_offset2  = a_offset1 + lda;
-    a_offset  += 2 * lda;
-
-    b_offset1  = b_offset;
-    b_offset  += 8;
-
-    i = (n >> 2);
-    if (i > 0){
-      do{
-	ctemp1  = *(a_offset1 + 0);
-	ctemp2  = *(a_offset1 + 1);
-	ctemp3  = *(a_offset1 + 2);
-	ctemp4  = *(a_offset1 + 3);
-
-	ctemp5  = *(a_offset2 + 0);
-	ctemp6  = *(a_offset2 + 1);
-	ctemp7  = *(a_offset2 + 2);
-	ctemp8  = *(a_offset2 + 3);
-
-	a_offset1 += 4;
-	a_offset2 += 4;
-
-	*(b_offset1 +  0) = ctemp1;
-	*(b_offset1 +  1) = ctemp2;
-	*(b_offset1 +  2) = ctemp3;
-	*(b_offset1 +  3) = ctemp4;
-
-	*(b_offset1 +  4) = ctemp5;
-	*(b_offset1 +  5) = ctemp6;
-	*(b_offset1 +  6) = ctemp7;
-	*(b_offset1 +  7) = ctemp8;
-
-	b_offset1 += m * 4;
-	i --;
-      }while(i > 0);
-    }
-
-    if (n & 2) {
-      ctemp1  = *(a_offset1 + 0);
-      ctemp2  = *(a_offset1 + 1);
-
-      ctemp3  = *(a_offset2 + 0);
-      ctemp4  = *(a_offset2 + 1);
-
-      a_offset1 += 2;
-      a_offset2 += 2;
-
-      *(b_offset2 +  0) = ctemp1;
-      *(b_offset2 +  1) = ctemp2;
-      *(b_offset2 +  2) = ctemp3;
-      *(b_offset2 +  3) = ctemp4;
-
-      b_offset2 += 4;
-    }
-
-    if (n & 1) {
-      ctemp1  = *(a_offset1 + 0);
-      ctemp2  = *(a_offset2 + 0);
-
-      *(b_offset3 +  0) = ctemp1;
-      *(b_offset3 +  1) = ctemp2;
-      b_offset3 += 2;
-    }
-  }
-
-  if (m & 1){
-    a_offset1  = a_offset;
-    b_offset1  = b_offset;
-
-    i = (n >> 2);
-    if (i > 0){
-      do{
-	ctemp1  = *(a_offset1 + 0);
-	ctemp2  = *(a_offset1 + 1);
-	ctemp3  = *(a_offset1 + 2);
-	ctemp4  = *(a_offset1 + 3);
-
-	a_offset1 += 4;
-
-	*(b_offset1 +  0) = ctemp1;
-	*(b_offset1 +  1) = ctemp2;
-	*(b_offset1 +  2) = ctemp3;
-	*(b_offset1 +  3) = ctemp4;
-
-	b_offset1 += 4 * m;
-
-	i --;
-      }while(i > 0);
-    }
-
-    if (n & 2) {
-      ctemp1  = *(a_offset1 + 0);
-      ctemp2  = *(a_offset1 + 1);
-      a_offset1 += 2;
-
-      *(b_offset2 +  0) = ctemp1;
-      *(b_offset2 +  1) = ctemp2;
-    }
-
-    if (n & 1) {
-      ctemp1  = *(a_offset1 + 0);
-      *(b_offset3 +  0) = ctemp1;
-    }
-  }
-
-  return 0;
-}
diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c
deleted file mode 100644
index ec8961ced..000000000
--- a/kernel/riscv64/gemmkernel_2x2_rvv.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/***************************************************************************
-Copyright (c) 2022, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include "common.h"
-
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEG2_FLOAT vlseg2e32_v_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEG2_FLOAT vlseg2e64_v_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#endif
-
-// Optimizes the implementation in ../generic/gemm_kernel_2x2.c
-
-int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
-#ifdef TRMMKERNEL
-		,BLASLONG offset
-#endif
-		)
-{
-    BLASLONG i,j,k;
-    FLOAT *C0,*C1;
-    IFLOAT *ptrba,*ptrbb;
-
-    //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc);
-
-    FLOAT_V_T va0, va1, vb0, vb1;
-    FLOAT_V_T vres0, vres1, vres2, vres3;
-    FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3;
-    FLOAT_V_T_M1 v_z0;
-
-    v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
-    size_t vlmax = VSETVL_MAX;
-    size_t vl;
-
-    for (j = bn/2; j > 0; j--) {
-        C0 = C;
-        C1 = C0 + ldc;
-        ptrba = ba;
-
-        for (i = bm/2; i > 0; i--) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-            vres2 = VFMVVF_FLOAT(0.0, vlmax);
-            vres3 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl); 
-                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); 
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
-                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
-
-                ptrba += vl*2;
-                ptrbb += vl*2;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            C0 += 2;
-            C1 += 2;
-        }
-
-        if(bm & 1) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                va0 = VLEV_FLOAT(ptrba, vl);
-                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
-
-                ptrba += vl;
-                ptrbb += vl*2;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
-
-            C0 += 1;
-            C1 += 1;
-        }
-
-        bb += (bk<<1);
-        C += (ldc<<1);
-    }
-
-    if(bn & 1) {
-        C0 = C;
-        ptrba = ba;
-        for (i = bm/2; i > 0; i--) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
-                vb0 = VLEV_FLOAT(ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-
-                ptrba += vl*2;
-                ptrbb += vl;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-
-            C0 += 2;
-        }
-
-        if(bm & 1) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                va0 = VLEV_FLOAT(ptrba, vl);
-                vb0 = VLEV_FLOAT(ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-
-                ptrba += vl;
-                ptrbb += vl;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-
-            C0 += 1;
-        }
-
-        bb += (bk<<0);
-        C += ldc;
-    }
-
-    return 0;
-}
diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c
deleted file mode 100644
index aa58bcc76..000000000
--- a/kernel/riscv64/gemmkernel_4x4_rvv.c
+++ /dev/null
@@ -1,508 +0,0 @@
-/***************************************************************************
-Copyright (c) 2022, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include "common.h"
-
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m1(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m1_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m1
-#define VLSEG2_FLOAT vlseg2e32_v_f32m1
-#define VLSEG4_FLOAT vlseg4e32_v_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m1
-#define VFMACCVF_FLOAT vfmacc_vf_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m1
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#else
-#define VSETVL(n) vsetvl_e64m1(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m1_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m1
-#define VLSEG2_FLOAT vlseg2e64_v_f64m1
-#define VLSEG4_FLOAT vlseg4e64_v_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m1
-#define VFMACCVF_FLOAT vfmacc_vf_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m1
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#endif
-
-// Optimizes the implementation in ../generic/gemm_kernel_2x2.c
-
-int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
-#ifdef TRMMKERNEL
-		,BLASLONG offset
-#endif
-		)
-{
-    BLASLONG i,j,k;
-    FLOAT *C0,*C1,*C2,*C3;
-    IFLOAT *ptrba,*ptrbb;
-
-    //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU
-
-    FLOAT_V_T va0, va1, va2, va3; 
-    FLOAT_V_T vb0, vb1, vb2, vb3;
-    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
-    FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15;
-    FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3;
-    FLOAT_V_T_M1 v_z0;
-
-    v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
-    size_t vlmax = VSETVL_MAX;
-    size_t vl;
-
-    for (j = bn/4; j > 0; j--) {
-        C0 = C;
-        C1 = C0 + ldc;
-        C2 = C1 + ldc;
-        C3 = C2 + ldc;
-        ptrba = ba;
-
-        for (i = bm/4; i > 0; i--) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-            vres2 = VFMVVF_FLOAT(0.0, vlmax);
-            vres3 = VFMVVF_FLOAT(0.0, vlmax);
-            vres4 = VFMVVF_FLOAT(0.0, vlmax);
-            vres5 = VFMVVF_FLOAT(0.0, vlmax);
-            vres6 = VFMVVF_FLOAT(0.0, vlmax);
-            vres7 = VFMVVF_FLOAT(0.0, vlmax);
-            vres8 = VFMVVF_FLOAT(0.0, vlmax);
-            vres9 = VFMVVF_FLOAT(0.0, vlmax);
-            vres10 = VFMVVF_FLOAT(0.0, vlmax);
-            vres11 = VFMVVF_FLOAT(0.0, vlmax);
-            vres12 = VFMVVF_FLOAT(0.0, vlmax);
-            vres13 = VFMVVF_FLOAT(0.0, vlmax);
-            vres14 = VFMVVF_FLOAT(0.0, vlmax);
-            vres15 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); 
-                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); 
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
-                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
-
-                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
-                vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
-                vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl);
-                vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl);
-
-                vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl);
-                vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl);
-                vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl);
-                vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl);
-
-                vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl);
-                vres13 = VFMACCVV_FLOAT(vres13, va3, vb2, vl);
-                vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl);
-                vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl);
-
-                ptrba += vl*4;
-                ptrbb += vl*4;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax);
-            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax);
-            C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax);
-            C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            C0 += 4;
-            C1 += 4;
-            C2 += 4;
-            C3 += 4;
-        }
-
-        if(bm & 2) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-            vres2 = VFMVVF_FLOAT(0.0, vlmax);
-            vres3 = VFMVVF_FLOAT(0.0, vlmax);
-            vres4 = VFMVVF_FLOAT(0.0, vlmax);
-            vres5 = VFMVVF_FLOAT(0.0, vlmax);
-            vres6 = VFMVVF_FLOAT(0.0, vlmax);
-            vres7 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl); 
-                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); 
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
-                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
-
-                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
-                vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
-                vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl);
-                vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl);
-
-                ptrba += vl*2;
-                ptrbb += vl*4;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
- 
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax);
-            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
-            C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax);
-            C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-
-            C0 += 2;
-            C1 += 2;
-            C2 += 2;
-            C3 += 2;
-        }
-
-        if(bm & 1) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-            vres2 = VFMVVF_FLOAT(0.0, vlmax);
-            vres3 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                va0 = VLEV_FLOAT(ptrba, vl);
-                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); 
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
-                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
-                vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
-
-                ptrba += vl;
-                ptrbb += vl*4;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            C0 += 1;
-            C1 += 1;
-            C2 += 1;
-            C3 += 1;
-        }
-
-        bb += (bk<<2);
-        C += (ldc<<2);
-    }
-
-    if(bn & 2) {
-
-        C0 = C;
-        C1 = C0 + ldc;
-        ptrba = ba;
-
-        for (i = bm/4; i > 0; i--) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-            vres2 = VFMVVF_FLOAT(0.0, vlmax);
-            vres3 = VFMVVF_FLOAT(0.0, vlmax);
-
-            vres4 = VFMVVF_FLOAT(0.0, vlmax);
-            vres5 = VFMVVF_FLOAT(0.0, vlmax);
-            vres6 = VFMVVF_FLOAT(0.0, vlmax);
-            vres7 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
-                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
-                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
-
-                vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl);
-                vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl);
-                vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl);
-                vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl);
-
-                ptrba += vl*4;
-                ptrbb += vl*2;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax);
-            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            C0 += 4;
-            C1 += 4;
-        }
-
-        if(bm & 2) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-            vres2 = VFMVVF_FLOAT(0.0, vlmax);
-            vres3 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
-                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
-                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
-
-                ptrba += vl*2;
-                ptrbb += vl*2;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            C0 += 2;
-            C1 += 2;
-        }
-
-        if(bm & 1) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                va0 = VLEV_FLOAT(ptrba, vl);
-                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
-
-                ptrba += vl;
-                ptrbb += vl*2;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
-
-            C0 += 1;
-            C1 += 1;
-        }
-
-        bb += (bk<<1);
-        C += (ldc<<1);
-    }
-
-    if(bn & 1) {
-        C0 = C;
-        ptrba = ba;
-        for (i = bm/4; i > 0; i--) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-            vres2 = VFMVVF_FLOAT(0.0, vlmax);
-            vres3 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
-                vb0 = VLEV_FLOAT(ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
-                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
-
-                ptrba += vl*4;
-                ptrbb += vl;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-            C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
-            C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            C0 += 4;
-        }
-
-        if(bm & 2) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
-                vb0 = VLEV_FLOAT(ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-
-                ptrba += vl*2;
-                ptrbb += vl;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
-
-            C0 += 2;
-        }
-
-        if(bm & 1) {
-            ptrbb = bb;
-
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = bk; k > 0; k -= vl) {
-                vl = VSETVL(k);
-
-                va0 = VLEV_FLOAT(ptrba, vl);
-                vb0 = VLEV_FLOAT(ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-
-                ptrba += vl;
-                ptrbb += vl;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
-            C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
-
-            C0 += 1;
-        }
-
-        bb += (bk<<0);
-        C += ldc;
-    }
-
-    return 0;
-}
diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c
deleted file mode 100644
index 127e76970..000000000
--- a/kernel/riscv64/trmmkernel_2x2_rvv.c
+++ /dev/null
@@ -1,342 +0,0 @@
-/***************************************************************************
-Copyright (c) 2022, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include "common.h"
-
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#endif
-
-
-// Optimizes the implementation in ../generic/trmmkernel_2x2.c
-
-
-int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
-#ifdef TRMMKERNEL
-        ,BLASLONG offset
-#endif
-        )
-{
-   BLASLONG i,j,k;
-   FLOAT *C0,*C1,*ptrba,*ptrbb;
-   BLASLONG off, temp;
-
-   FLOAT_V_T va0, va1, vb0, vb1;
-   FLOAT_V_T vres0, vres1, vres2, vres3;
-   FLOAT_V_T_M1 v_res, v_z0;
-   v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
-   size_t vl;
-   size_t vlmax = VSETVL_MAX;
-   
-#if defined(TRMMKERNEL) && !defined(LEFT)
-   off = -offset;
-#else
-   off = 0;
-#endif
-
-   for (j = bn/2; j > 0; j--)
-   {
-        C0 = C;
-        C1 = C0+ldc;
-#if defined(TRMMKERNEL) && defined(LEFT)
-        off = offset;
-#endif
-        ptrba = ba;
-
-        for (i = bm/2; i > 0; i--)
-        {
-#if (defined(LEFT) &&  defined(TRANSA)) || \
-              (!defined(LEFT) && !defined(TRANSA))
-             ptrbb = bb;
-#else
-              ptrba += off*2;
-              ptrbb = bb + off*2;
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || \
-             (!defined(LEFT) && defined(TRANSA))
-             temp = bk-off;
-#elif defined(LEFT)
-             temp = off+2;
-#else
-             temp = off+2;
-#endif
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-            vres2 = VFMVVF_FLOAT(0.0, vlmax);
-            vres3 = VFMVVF_FLOAT(0.0, vlmax);
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                VLSEG_FLOAT(&va0, &va1, ptrba, vl);
-                VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl);
-
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
-                vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
-                vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
-
-                ptrba += vl * 2;
-                ptrbb += vl * 2;
-            }
-            v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
-            C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
-            v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
-            C0[1] = alpha * VFMVFS_FLOAT_M1(v_res);
-            v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax);
-            C1[0] = alpha * VFMVFS_FLOAT_M1(v_res);
-            v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax);
-            C1[1] = alpha * VFMVFS_FLOAT_M1(v_res);
-
-#if ( defined(LEFT) && defined(TRANSA)) || \
-             (!defined(LEFT) && !defined(TRANSA))
-             temp = bk - off;
-#ifdef LEFT
-             temp -= 2;
-#else
-             temp -= 2;
-#endif
-             ptrba += temp*2;
-             ptrbb += temp*2;
-#endif
-#ifdef LEFT
-             off += 2;
-#endif
-             C0 = C0+2;
-             C1 = C1+2;
-        }
-
-        if (bm & 1)
-        {
-#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
-             ptrbb = bb;
-#else
-             ptrba += off;
-             ptrbb = bb+off*2;
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-             temp = bk-off;
-#elif defined(LEFT)
-             temp = off+1;
-#else
-             temp = off+2;
-#endif
-            vres0 = VFMVVF_FLOAT(0.0, vlmax);
-            vres1 = VFMVVF_FLOAT(0.0, vlmax);
-
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                va0 = VLEV_FLOAT(ptrba, vl);
-                VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl);
-                  
-                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
-                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
-
-                ptrba += vl;
-                ptrbb += vl * 2;
-
-            }
-            v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
-            C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
-            v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
-            C1[0] = alpha * VFMVFS_FLOAT_M1(v_res);
-
-#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-             temp = bk-off;
-#ifdef LEFT
-             temp -= 1;
-#else
-             temp -= 2;
-#endif
-             ptrba += temp;
-             ptrbb += temp*2;
-#endif
-#ifdef LEFT
-             off += 1;
-#endif
-             C0 = C0+1;
-             C1 = C1+1;
-        }
-#if defined(TRMMKERNEL) && !defined(LEFT)
-        off += 2;
-#endif
-        k = (bk<<1);
-        bb = bb+k;
-        i = (ldc<<1);
-        C = C+i;
-   }
-
-   if (bn & 1)
-   {
-        C0 = C;
-#if defined(TRMMKERNEL) &&  defined(LEFT)
-        off = offset;
-#endif
-        ptrba = ba;
-
-        for (i = bm/2; i > 0; i--)
-        {
-#if (defined(LEFT) &&  defined(TRANSA)) || \
-              (!defined(LEFT) && !defined(TRANSA))
-              ptrbb = bb;
-#else
-              ptrba += off*2;
-              ptrbb = bb + off;
-#endif
-
-
-#if (defined(LEFT) && !defined(TRANSA)) || \
-             (!defined(LEFT) && defined(TRANSA))
-             temp = bk-off;
-#elif defined(LEFT)
-             temp = off+2;
-#else
-             temp = off+1;
-#endif
-             vres0 = VFMVVF_FLOAT(0.0, vlmax);
-             vres1 = VFMVVF_FLOAT(0.0, vlmax);
-             
-             for (k = temp; k > 0; k -= vl)
-             {
-                 vl = VSETVL(k);
-                 vb0 = VLEV_FLOAT(ptrbb, vl);
-                 VLSEG_FLOAT(&va0, &va1, ptrba, vl);
-                   
-                 vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl);
-                 vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl);
-             
-                 ptrba += vl * 2;
-                 ptrbb += vl;
-             
-             }
-             v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
-             C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
-             v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
-             C0[1] = alpha * VFMVFS_FLOAT_M1(v_res);
-
-#if ( defined(LEFT) &&  defined(TRANSA)) || \
-             (!defined(LEFT) && !defined(TRANSA))
-             temp = bk - off;
-#ifdef LEFT
-             temp -= 2;
-#else
-             temp -= 1;
-#endif
-             ptrba += temp*2;
-             ptrbb += temp;
-#endif
-#ifdef LEFT
-             off += 2;
-#endif
-
-             C0 = C0+2;
-        }
-
-        if (bm & 1)
-        {
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-             ptrbb = bb;
-#else
-             ptrba += off;
-             ptrbb = bb+off;
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-             temp = bk-off;
-#elif defined(LEFT)
-             temp = off + 1;
-#else
-             temp = off + 1;
-#endif
-             vres0 = VFMVVF_FLOAT(0.0, vlmax);
-             
-             for (k = temp; k > 0; k -= vl)
-             {
-                 vl = VSETVL(k);
-                 va0 = VLEV_FLOAT(ptrba, vl);
-                 vb0 = VLEV_FLOAT(ptrbb, vl);
-                   
-                 vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl);
-                 ptrba += vl;
-                 ptrbb += vl;
-             }
-             v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
-             C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
-
-#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-             temp = bk-off;
-#ifdef LEFT
-             temp -= 1;
-#else
-             temp -= 1;
-#endif
-             ptrba += temp;
-             ptrbb += temp;
-#endif
-#ifdef LEFT
-             off += 1;
-#endif
-             C0 = C0+1;
-        }
-#if defined(TRMMKERNEL) && !defined(LEFT)
-        off += 1;
-#endif
-        k = (bk<<0);
-        bb = bb+k;
-        C = C+ldc;
-   }
-   return 0;
-}
-
diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c
deleted file mode 100644
index 3e46c6348..000000000
--- a/kernel/riscv64/trmmkernel_4x4_rvv.c
+++ /dev/null
@@ -1,881 +0,0 @@
-/***************************************************************************
-Copyright (c) 2022, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*****************************************************************************/
-
-#include "common.h"
-#include <stdbool.h>
-
-#if !defined(DOUBLE)
-#define VSETVL(n)           vsetvl_e32m2(n)
-#define VSETVL_MAX          vsetvlmax_e32m2()
-#define VSETVL_MAX_M1       vsetvlmax_e32m1()
-#define FLOAT_V_T           vfloat32m2_t
-#define FLOAT_V_T_M1        vfloat32m1_t
-#define VLEV_FLOAT          vle32_v_f32m2
-#define VLSEG4_FLOAT        vlseg4e32_v_f32m2
-#define VLSEG2_FLOAT        vlseg2e32_v_f32m2
-#define VFMVVF_FLOAT        vfmv_v_f_f32m2
-#define VFMUL_FLOAT         vfmul_vv_f32m2
-#define VFMACCVF_FLOAT      vfmacc_vf_f32m2
-#define VFMACCVV_FLOAT      vfmacc_vv_f32m2
-#define VFREDSUMVS_FLOAT    vfredusum_vs_f32m2_f32m1
-#define VFMVVF_FLOAT_M1     vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1     vfmv_f_s_f32m1_f32
-#else
-#define VSETVL(n)           vsetvl_e64m2(n)
-#define VSETVL_MAX          vsetvlmax_e64m2()
-#define VSETVL_MAX_M1       vsetvlmax_e64m1()
-#define FLOAT_V_T           vfloat64m2_t
-#define FLOAT_V_T_M1        vfloat64m1_t
-#define VLEV_FLOAT          vle64_v_f64m2
-#define VLSEG4_FLOAT        vlseg4e64_v_f64m2
-#define VLSEG2_FLOAT        vlseg2e64_v_f64m2
-#define VFMVVF_FLOAT        vfmv_v_f_f64m2
-#define VFMUL_FLOAT         vfmul_vv_f64m2
-#define VFMACCVF_FLOAT      vfmacc_vf_f64m2
-#define VFMACCVV_FLOAT      vfmacc_vv_f64m2
-#define VFREDSUMVS_FLOAT    vfredusum_vs_f64m2_f64m1
-#define VFMVVF_FLOAT_M1     vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1     vfmv_f_s_f64m1_f64
-#endif
-
-
-// Optimizes the implementation in ../generic/trmmkernel_4x4.c
-
-int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
-{
-
-   BLASLONG i,j,k;
-   FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
-
-   FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
-   FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0;
-   v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
-   size_t vl;
-   size_t vlmax = VSETVL_MAX;
-
-   FLOAT_V_T vres0_0;
-   FLOAT_V_T vres0_1;
-   FLOAT_V_T vres0_2;
-   FLOAT_V_T vres0_3;
-
-   FLOAT_V_T vres1_0;
-   FLOAT_V_T vres1_1;
-   FLOAT_V_T vres1_2;
-   FLOAT_V_T vres1_3;
-
-   FLOAT_V_T vres2_0;
-   FLOAT_V_T vres2_1;
-   FLOAT_V_T vres2_2;
-   FLOAT_V_T vres2_3;
-
-   FLOAT_V_T vres3_0;
-   FLOAT_V_T vres3_1;
-   FLOAT_V_T vres3_2;
-   FLOAT_V_T vres3_3;
-
-   BLASLONG off, temp;
-
-   bool left;
-   bool transposed;
-   bool backwards;
-
-#ifdef LEFT
-   left = true;
-#else
-   left = false;
-#endif
-
-#ifdef TRANSA
-   transposed = true;
-#else
-   transposed = false;
-#endif
-
-   backwards = left != transposed;
-
-   if (!left) {
-      off = -offset;
-   }
-
-
-   for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops 
-   {
-        C0 = C;
-        C1 = C0+ldc;
-        C2 = C1+ldc;
-        C3 = C2+ldc;
-
-
-        if (left) {
-            off = offset;
-        }
-
-        ptrba = ba;
-
-        for (i=0; i<bm/4; i+=1) // do blocks of 4x4
-        {
-
-            ptrbb = bb;
-            if (backwards)
-            {
-                ptrba += off*4; // number of values in A
-                ptrbb += off*4; // number of values in B
-            }
-
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-            vres0_1 = VFMVVF_FLOAT(0, vlmax);
-            vres0_2 = VFMVVF_FLOAT(0, vlmax);
-            vres0_3 = VFMVVF_FLOAT(0, vlmax);
-            
-            vres1_0 = VFMVVF_FLOAT(0, vlmax);
-            vres1_1 = VFMVVF_FLOAT(0, vlmax);
-            vres1_2 = VFMVVF_FLOAT(0, vlmax);
-            vres1_3 = VFMVVF_FLOAT(0, vlmax);
-            
-            vres2_0 = VFMVVF_FLOAT(0, vlmax);
-            vres2_1 = VFMVVF_FLOAT(0, vlmax);
-            vres2_2 = VFMVVF_FLOAT(0, vlmax);
-            vres2_3 = VFMVVF_FLOAT(0, vlmax);
-            
-            vres3_0 = VFMVVF_FLOAT(0, vlmax);
-            vres3_1 = VFMVVF_FLOAT(0, vlmax);
-            vres3_2 = VFMVVF_FLOAT(0, vlmax);
-            vres3_3 = VFMVVF_FLOAT(0, vlmax);
-
-            temp = backwards ? bk-off :
-                         left ? off + 4 : // number of values in A
-                                off + 4;  // number of values in B
-
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
-                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
-                vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
-                vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
-            
-                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
-                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
-                vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl);
-                vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl);
-
-                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
-                vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl);
-                vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl);
-                vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl);
-
-                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
-                vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl);
-                vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl);
-                vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl);
-
-                ptrba += vl * 4;
-                ptrbb += vl * 4;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax);
-            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax);
-            C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax);
-            C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            if (!backwards) {
-                temp = bk-off;
-                temp = left ? temp - 4 : // number of values in A
-                              temp - 4;  // number of values in B
-
-                ptrba += temp*4; // number of values in A
-                ptrbb += temp*4; // number of values in B
-            }
-#ifdef LEFT
-            off += 4; // number of values in A
-#endif
-
-            C0 = C0+4;
-            C1 = C1+4;
-            C2 = C2+4;
-            C3 = C3+4;
-
-        }
-
-        if ( bm & 2 ) // do any 2x4 loop
-        {
-
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            ptrbb = bb;
-#else
-            ptrba += off*2;
-            ptrbb = bb + off*4;
-#endif
-
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-            vres0_1 = VFMVVF_FLOAT(0, vlmax);
-            
-            vres1_0 = VFMVVF_FLOAT(0, vlmax);
-            vres1_1 = VFMVVF_FLOAT(0, vlmax);
-            
-            vres2_0 = VFMVVF_FLOAT(0, vlmax);
-            vres2_1 = VFMVVF_FLOAT(0, vlmax);
-            
-            vres3_0 = VFMVVF_FLOAT(0, vlmax);
-            vres3_1 = VFMVVF_FLOAT(0, vlmax);
-
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-            temp = bk-off;
-#elif defined(LEFT)
-            temp = off+2;   // number of values in A
-#else
-            temp = off+4;   // number of values in B
-#endif
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
-                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
-                vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
-                vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
-
-                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
-                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
-                vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl);
-                vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl);
-
-                ptrba += vl * 2;
-                ptrbb += vl * 4;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax);
-
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
-            
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax);
-
-            C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            temp = bk - off;
-#ifdef LEFT
-            temp -= 2; // number of values in A
-#else
-            temp -= 4; // number of values in B
-#endif
-            ptrba += temp*2;
-            ptrbb += temp*4;
-#endif
-
-#ifdef LEFT
-            off += 2; // number of values in A
-#endif
-
-            C0 = C0+2;
-            C1 = C1+2;
-            C2 = C2+2;
-            C3 = C3+2;
-
-        }
-
-        if ( bm & 1 ) // do any 1x4 loop
-        {
-
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            ptrbb = bb;
-#else
-            ptrba += off*1;
-            ptrbb = bb + off*4;
-#endif
-
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-            vres1_0 = VFMVVF_FLOAT(0, vlmax);
-            vres2_0 = VFMVVF_FLOAT(0, vlmax);
-            vres3_0 = VFMVVF_FLOAT(0, vlmax);
-
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-            temp = bk-off;
-#elif defined(LEFT)
-            temp = off+1;   // number of values in A
-#else
-            temp = off+4;   // number of values in B
-#endif
-
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                va0 = VLEV_FLOAT(ptrba, vl);
-                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
-                vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
-                vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
-
-                ptrba += vl;
-                ptrbb += vl * 4;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax);
-
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            temp = bk - off;
-#ifdef LEFT
-            temp -= 1; // number of values in A
-#else
-            temp -= 4; // number of values in B
-#endif
-            ptrba += temp*1;
-            ptrbb += temp*4;
-#endif
-
-#ifdef LEFT
-            off += 1; // number of values in A
-#endif
-
-            C0 = C0+1;
-            C1 = C1+1;
-            C2 = C2+1;
-            C3 = C3+1;
-
-        }
-
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
-        off += 4;
-#endif
-
-        k = (bk<<2);
-        bb = bb+k;
-        i = (ldc<<2);
-        C = C+i;
-    }
-
-    for (j=0; j<(bn&2); j+=2) // do the Mx2 loops 
-    {
-        C0 = C;
-        C1 = C0+ldc;
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-        off = offset;
-#endif
-
-        ptrba = ba;
-
-        for (i=0; i<bm/4; i+=1) // do blocks of 4x2
-        {
-
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            ptrbb = bb;
-#else
-            ptrba += off*4;
-            ptrbb = bb + off*2;
-#endif
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-            vres0_1 = VFMVVF_FLOAT(0, vlmax);
-            vres0_2 = VFMVVF_FLOAT(0, vlmax);
-            vres0_3 = VFMVVF_FLOAT(0, vlmax);
-            
-            vres1_0 = VFMVVF_FLOAT(0, vlmax);
-            vres1_1 = VFMVVF_FLOAT(0, vlmax);
-            vres1_2 = VFMVVF_FLOAT(0, vlmax);
-            vres1_3 = VFMVVF_FLOAT(0, vlmax);
-        
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-            temp = bk-off;
-#elif defined(LEFT)
-            temp = off+4;   // number of values in A
-#else
-            temp = off+2;   // number of values in B
-#endif
-
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
-                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
-            
-                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
-                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
-            
-                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
-                vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl);
-            
-                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
-                vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl);
-            
-                ptrba += vl * 4;
-                ptrbb += vl * 2;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax);
-            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            temp = bk - off;
-#ifdef LEFT
-            temp -= 4; // number of values in A
-#else
-            temp -= 2; // number of values in B
-#endif
-            ptrba += temp*4;
-            ptrbb += temp*2;
-#endif
-
-#ifdef LEFT
-            off += 4; // number of values in A
-#endif
-
-            C0 = C0+4;
-            C1 = C1+4;
-
-        }
-
-        if ( bm & 2 ) // do any 2x2 loop
-        {
-
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            ptrbb = bb;
-#else
-            ptrba += off*2;
-            ptrbb = bb + off*2;
-#endif
-
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-            vres0_1 = VFMVVF_FLOAT(0, vlmax);
-            
-            vres1_0 = VFMVVF_FLOAT(0, vlmax);
-            vres1_1 = VFMVVF_FLOAT(0, vlmax);
-
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-            temp = bk-off;
-#elif defined(LEFT)
-            temp = off+2;   // number of values in A
-#else
-            temp = off+2;   // number of values in B
-#endif
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
-                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
-            
-                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
-                vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
-            
-                ptrba += vl * 2;
-                ptrbb += vl * 2;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax);
-
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            temp = bk - off;
-#ifdef LEFT
-            temp -= 2; // number of values in A
-#else
-            temp -= 2; // number of values in B
-#endif
-            ptrba += temp*2;
-            ptrbb += temp*2;
-#endif
-
-#ifdef LEFT
-            off += 2; // number of values in A
-#endif
-
-            C0 = C0+2;
-            C1 = C1+2;
-
-        }
-
-        if ( bm & 1 ) // do any 1x2 loop
-        {
-
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            ptrbb = bb;
-#else
-            ptrba += off*1;
-            ptrbb = bb + off*2;
-#endif
-
-
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-            vres1_0 = VFMVVF_FLOAT(0, vlmax);
-
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-            temp = bk-off;
-#elif defined(LEFT)
-            temp = off+1;   // number of values in A
-#else
-            temp = off+2;   // number of values in B
-#endif
-
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                va0 = VLEV_FLOAT(ptrba, vl);
-                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-                vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
-                    
-                ptrba += vl;
-                ptrbb += vl * 2;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax);
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1);
-
-
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            temp = bk - off;
-#ifdef LEFT
-            temp -= 1; // number of values in A
-#else
-            temp -= 2; // number of values in B
-#endif
-            ptrba += temp*1;
-            ptrbb += temp*2;
-#endif
-
-#ifdef LEFT
-            off += 1; // number of values in A
-#endif
-
-            C0 = C0+1;
-            C1 = C1+1;
-
-        }
-
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
-        off += 2;
-#endif
-
-        k = (bk<<1);
-        bb = bb+k;
-        i = (ldc<<1);
-        C = C+i;
-    }
-
-    for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
-    {
-        C0 = C;
-
-#if defined(TRMMKERNEL) &&  defined(LEFT)
-    off = offset;
-#endif
-
-        ptrba = ba;
-
-        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
-        {
-
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            ptrbb = bb;
-#else
-            ptrba += off*4;
-            ptrbb = bb + off*1;
-#endif
-
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-            vres0_1 = VFMVVF_FLOAT(0, vlmax);
-            vres0_2 = VFMVVF_FLOAT(0, vlmax);
-            vres0_3 = VFMVVF_FLOAT(0, vlmax);
-
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-            temp = bk-off;
-#elif defined(LEFT)
-            temp = off+4;   // number of values in A
-#else
-            temp = off+1;   // number of values in B
-#endif
-
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
-                vb0 = VLEV_FLOAT(ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-           
-                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
-            
-                vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
-            
-                vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
-            
-                ptrba += vl * 4;
-                ptrbb += vl;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
-            vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
-            vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-            C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
-            C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
-
-
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            temp = bk - off;
-#ifdef LEFT
-            temp -= 4; // number of values in A
-#else
-            temp -= 1; // number of values in B
-#endif
-            ptrba += temp*4;
-            ptrbb += temp*1;
-#endif
-
-#ifdef LEFT
-            off += 4; // number of values in A
-#endif
-
-            C0 = C0+4;
-
-        }
-
-        if ( bm & 2 ) // do any 2x1 loop
-        {
-
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            ptrbb = bb;
-#else
-            ptrba += off*2;
-            ptrbb = bb + off*1;
-#endif
-
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-            vres0_1 = VFMVVF_FLOAT(0, vlmax);
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-            temp = bk-off;
-#elif defined(LEFT)
-            temp = off+2;   // number of values in A
-#else
-            temp = off+1;   // number of values in B
-#endif
-
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
-                vb0 = VLEV_FLOAT(ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-           
-                vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
-           
-                ptrba += vl * 2;
-                ptrbb += vl;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-            C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
-
-
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            temp = bk - off;
-#ifdef LEFT
-            temp -= 2; // number of values in A
-#else
-            temp -= 1; // number of values in B
-#endif
-            ptrba += temp*2;
-            ptrbb += temp*1;
-#endif
-
-#ifdef LEFT
-            off += 2; // number of values in A
-#endif
-
-            C0 = C0+2;
-
-        }
-
-        if ( bm & 1 ) // do any 1x1 loop
-        {
-
-#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            ptrbb = bb;
-#else
-            ptrba += off*1;
-            ptrbb = bb + off*1;
-#endif
-
-            vres0_0 = VFMVVF_FLOAT(0, vlmax);
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-            temp = bk-off;
-#elif defined(LEFT)
-            temp = off+1;   // number of values in A
-#else
-            temp = off+1;   // number of values in B
-#endif
-
-            for (k = temp; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                va0 = VLEV_FLOAT(ptrba, vl);
-                vb0 = VLEV_FLOAT(ptrbb, vl);
-            
-                vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
-                  
-                ptrba += vl;
-                ptrbb += vl;
-            }
-
-            vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
-            C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
-
-
-#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-            temp = bk - off;
-#ifdef LEFT
-            temp -= 1; // number of values in A
-#else
-            temp -= 1; // number of values in B
-#endif
-            ptrba += temp*1;
-            ptrbb += temp*1;
-#endif
-
-#ifdef LEFT
-            off += 1; // number of values in A
-#endif
-
-            C0 = C0+1;
-
-        }
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
-        off += 1;
-#endif
-
-        k = (bk<<0);
-        bb = bb+k;
-        C = C+ldc;
-    }
-    return 0;
-}

From 9702d57b11351a5360a2f0326c69c3f550c784d2 Mon Sep 17 00:00:00 2001
From: HellerZheng <heller.zheng@microchip.com>
Date: Wed, 16 Nov 2022 11:11:04 +0800
Subject: [PATCH 005/718] Update Makefile.install

---
 Makefile.install | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Makefile.install b/Makefile.install
index 168d08f72..87b5bc870 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -202,5 +202,3 @@ endif
 	@echo "  endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
 	@echo Install OK!
-
-	

From 3918d8504e7720d94221025ae6078a2459ccb104 Mon Sep 17 00:00:00 2001
From: Heller Zheng <heller.zheng@microchip.com>
Date: Mon, 21 Nov 2022 19:06:07 -0800
Subject: [PATCH 006/718] nrm2 simple optimization

---
 kernel/riscv64/nrm2_rvv.c | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c
index 3f5d50397..979c31648 100644
--- a/kernel/riscv64/nrm2_rvv.c
+++ b/kernel/riscv64/nrm2_rvv.c
@@ -39,9 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMACCVV_FLOAT vfmacc_vv_f32m8
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VFABSV_FLOAT vfabs_v_f32m8
 #define ABS fabsf
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
@@ -54,9 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMACCVV_FLOAT vfmacc_vv_f64m8
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VFABSV_FLOAT vfabs_v_f64m8
 #define ABS fabs
 #endif
 
@@ -68,12 +64,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     if(n == 1) return (ABS(x[0]));
 
     FLOAT_V_T vr, v0;
-    FLOAT_V_T_M1 v_max, v_res;
-    FLOAT scale = 0.0, ssq = 0.0;
+    FLOAT_V_T_M1 v_res;
+    FLOAT ssq = 0.0;
 
     size_t vlmax = VSETVL_MAX;
     v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_max = VFMVVF_FLOAT_M1(0, vlmax);
 
     vr = VFMVVF_FLOAT(0, vlmax);
  
@@ -83,9 +78,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             vl = VSETVL(n);
 
             v0 = VLEV_FLOAT(x, vl);
-            v0 = VFABSV_FLOAT(v0, vl);
-
-            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
 
             vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
         }
@@ -98,20 +90,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             vl = VSETVL(n);
 
             v0 = VLSEV_FLOAT(x, stride_x, vl);
-            v0 = VFABSV_FLOAT(v0, vl);
-
-            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
 
             vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
         }
-
     }
 
     v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);
 
     ssq = VFMVFS_FLOAT_M1(v_res);
-    scale = VFMVFS_FLOAT_M1(v_max);
-    ssq = ssq / (scale*scale);
 
-    return(scale * sqrt(ssq));
+    return sqrt(ssq);
 }

From 387e8970cd8ce581a6c7bc48418860966140f621 Mon Sep 17 00:00:00 2001
From: Heller Zheng <heller.zheng@microchip.com>
Date: Mon, 28 Nov 2022 21:42:29 -0800
Subject: [PATCH 007/718] Fix merge problem; Update compiling COMMON_OPT per
 review comments.

---
 Makefile.prebuild | 2 +-
 Makefile.riscv64  | 6 +++---
 common_riscv64.h  | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Makefile.prebuild b/Makefile.prebuild
index e6a8eab59..c4f4a2602 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -56,7 +56,7 @@ TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
 endif
 
 ifeq ($(TARGET), x280)
-TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -mcpu=sifive-x280
+TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_GENERIC)
diff --git a/Makefile.riscv64 b/Makefile.riscv64
index d6eaf552d..d091984a6 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -3,10 +3,10 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif
 ifeq ($(CORE), x280)
-CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -mllvm --riscv-v-vector-bits-min=512 -mcpu=sifive-x280 -ffast-math
-FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_xsfvqmaccqoq_xsfvfhbfmin -mabi=lp64d -menable-experimental-extensions -static
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math
+FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
-endif
\ No newline at end of file
+endif
diff --git a/common_riscv64.h b/common_riscv64.h
index 221a79901..2092bd5ab 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define SEEK_ADDRESS
 
 #if defined(C910V)
-#include <riscv-vector.h>
+#include <riscv_vector.h>
 #endif
 
 #if defined(x280)

From c19dff0a31c58163dc386b0c4270e75f576b97be Mon Sep 17 00:00:00 2001
From: Xianyi Zhang <xianyi@perfxlab.com>
Date: Wed, 25 Jan 2023 19:33:32 +0800
Subject: [PATCH 008/718] Fix T-Head RVV intrinsic API changes.

---
 kernel/riscv64/amax_vector.c     |  8 ++++----
 kernel/riscv64/amin_vector.c     |  8 ++++----
 kernel/riscv64/asum_vector.c     |  8 ++++----
 kernel/riscv64/axpby_vector.c    | 16 ++++++++--------
 kernel/riscv64/axpy_vector.c     | 16 ++++++++--------
 kernel/riscv64/copy_vector.c     | 16 ++++++++--------
 kernel/riscv64/dot_vector.c      |  8 ++++----
 kernel/riscv64/gemv_n_vector.c   | 16 ++++++++--------
 kernel/riscv64/gemv_t_vector.c   |  8 ++++----
 kernel/riscv64/iamax_vector.c    |  8 ++++----
 kernel/riscv64/iamin_vector.c    |  8 ++++----
 kernel/riscv64/imax_vector.c     |  8 ++++----
 kernel/riscv64/imin_vector.c     |  8 ++++----
 kernel/riscv64/izamax_vector.c   |  4 ++--
 kernel/riscv64/izamin_vector.c   |  4 ++--
 kernel/riscv64/max_vector.c      |  8 ++++----
 kernel/riscv64/min_vector.c      |  8 ++++----
 kernel/riscv64/nrm2_vector.c     |  8 ++++----
 kernel/riscv64/rot_vector.c      | 16 ++++++++--------
 kernel/riscv64/scal_vector.c     | 16 ++++++++--------
 kernel/riscv64/swap_vector.c     | 16 ++++++++--------
 kernel/riscv64/symv_L_vector.c   | 16 ++++++++--------
 kernel/riscv64/symv_U_vector.c   | 16 ++++++++--------
 kernel/riscv64/zamax_vector.c    |  4 ++--
 kernel/riscv64/zamin_vector.c    |  4 ++--
 kernel/riscv64/zasum_vector.c    |  8 ++++----
 kernel/riscv64/zaxpby_vector.c   |  8 ++++----
 kernel/riscv64/zaxpy_vector.c    |  8 ++++----
 kernel/riscv64/zcopy_vector.c    |  8 ++++----
 kernel/riscv64/zdot_vector.c     |  8 ++++----
 kernel/riscv64/zgemv_n_vector.c  | 16 ++++++++--------
 kernel/riscv64/zgemv_t_vector.c  |  4 ++--
 kernel/riscv64/zhemv_LM_vector.c |  8 ++++----
 kernel/riscv64/zhemv_UV_vector.c |  8 ++++----
 kernel/riscv64/znrm2_vector.c    |  8 ++++----
 kernel/riscv64/zrot_vector.c     | 16 ++++++++--------
 kernel/riscv64/zscal_vector.c    |  8 ++++----
 kernel/riscv64/zswap_vector.c    | 16 ++++++++--------
 38 files changed, 190 insertions(+), 190 deletions(-)

diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c
index b778d3e55..1b7799340 100644
--- a/kernel/riscv64/amax_vector.c
+++ b/kernel/riscv64/amax_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
index fd2f83dc9..f9b7defae 100644
--- a/kernel/riscv64/amin_vector.c
+++ b/kernel/riscv64/amin_vector.c
@@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c
index a82275153..fc73362bc 100644
--- a/kernel/riscv64/asum_vector.c
+++ b/kernel/riscv64/asum_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c
index 988c57ec2..676dfd474 100644
--- a/kernel/riscv64/axpby_vector.c
+++ b/kernel/riscv64/axpby_vector.c
@@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c
index 98b9f6814..6f921f2d6 100644
--- a/kernel/riscv64/axpy_vector.c
+++ b/kernel/riscv64/axpy_vector.c
@@ -30,18 +30,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #endif
 
diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c
index a46136d6c..fee5e195d 100644
--- a/kernel/riscv64/copy_vector.c
+++ b/kernel/riscv64/copy_vector.c
@@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m8(n)
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c
index 64efc6c40..f47e0c0b5 100644
--- a/kernel/riscv64/dot_vector.c
+++ b/kernel/riscv64/dot_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -45,8 +45,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c
index 32ca8618b..bb9ab8e5a 100644
--- a/kernel/riscv64/gemv_n_vector.c
+++ b/kernel/riscv64/gemv_n_vector.c
@@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #endif
 
diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c
index 7683641fa..7d0b70cbb 100644
--- a/kernel/riscv64/gemv_t_vector.c
+++ b/kernel/riscv64/gemv_t_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -46,8 +46,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c
index ecb4cd7a9..9fea522f7 100644
--- a/kernel/riscv64/iamax_vector.c
+++ b/kernel/riscv64/iamax_vector.c
@@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
@@ -60,8 +60,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
index c72bb94cc..4e81e7848 100644
--- a/kernel/riscv64/iamin_vector.c
+++ b/kernel/riscv64/iamin_vector.c
@@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
@@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c
index c2d787ab8..ca48a3c48 100644
--- a/kernel/riscv64/imax_vector.c
+++ b/kernel/riscv64/imax_vector.c
@@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
@@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c
index dfe9a3310..2a677098d 100644
--- a/kernel/riscv64/imin_vector.c
+++ b/kernel/riscv64/imin_vector.c
@@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
@@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c
index fdbdc3ae8..66a101566 100644
--- a/kernel/riscv64/izamax_vector.c
+++ b/kernel/riscv64/izamax_vector.c
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
@@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c
index 59c720310..818193a9e 100644
--- a/kernel/riscv64/izamin_vector.c
+++ b/kernel/riscv64/izamin_vector.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
@@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c
index b988513c9..7f31e9a53 100644
--- a/kernel/riscv64/max_vector.c
+++ b/kernel/riscv64/max_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
@@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c
index be0803df6..14b7e01ed 100644
--- a/kernel/riscv64/min_vector.c
+++ b/kernel/riscv64/min_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
@@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c
index 2a83e2a52..cf6fdb741 100644
--- a/kernel/riscv64/nrm2_vector.c
+++ b/kernel/riscv64/nrm2_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -55,8 +55,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c
index 9b48d1c69..43a65e552 100644
--- a/kernel/riscv64/rot_vector.c
+++ b/kernel/riscv64/rot_vector.c
@@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFMSACVF_FLOAT vfmsac_vf_f32m4
@@ -42,10 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFMSACVF_FLOAT vfmsac_vf_f64m4
diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c
index 7a3153b7c..8b9ef5a3e 100644
--- a/kernel/riscv64/scal_vector.c
+++ b/kernel/riscv64/scal_vector.c
@@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m8(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #define VFMULVF_FLOAT vfmul_vf_f32m8
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #define VFMULVF_FLOAT vfmul_vf_f64m8
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
 #endif
diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c
index d9421e2f1..b16592808 100644
--- a/kernel/riscv64/swap_vector.c
+++ b/kernel/riscv64/swap_vector.c
@@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m8(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c
index 6588f4dda..58ec17b03 100644
--- a/kernel/riscv64/symv_L_vector.c
+++ b/kernel/riscv64/symv_L_vector.c
@@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,10 +48,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c
index 31104eae6..34ff0e30a 100644
--- a/kernel/riscv64/symv_U_vector.c
+++ b/kernel/riscv64/symv_U_vector.c
@@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c
index 9dbeba90f..bfb282ae0 100644
--- a/kernel/riscv64/zamax_vector.c
+++ b/kernel/riscv64/zamax_vector.c
@@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c
index dc58075ac..d9eca7f10 100644
--- a/kernel/riscv64/zamin_vector.c
+++ b/kernel/riscv64/zamin_vector.c
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c
index 8386ab62e..0d1cc42f1 100644
--- a/kernel/riscv64/zasum_vector.c
+++ b/kernel/riscv64/zasum_vector.c
@@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c
index 3eca20415..5e6034ac5 100644
--- a/kernel/riscv64/zaxpby_vector.c
+++ b/kernel/riscv64/zaxpby_vector.c
@@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
@@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
index 303d3541e..4ccfe4a81 100644
--- a/kernel/riscv64/zaxpy_vector.c
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
 #endif
diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c
index 600f02bba..55a480a35 100644
--- a/kernel/riscv64/zcopy_vector.c
+++ b/kernel/riscv64/zcopy_vector.c
@@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #endif
 
 
diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c
index ec38ed9d2..0900206b3 100644
--- a/kernel/riscv64/zdot_vector.c
+++ b/kernel/riscv64/zdot_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c
index b5ee1f054..3095c28f9 100644
--- a/kernel/riscv64/zgemv_n_vector.c
+++ b/kernel/riscv64/zgemv_n_vector.c
@@ -29,19 +29,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
 #endif
diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c
index e930dc2a2..a7a8a5279 100644
--- a/kernel/riscv64/zgemv_t_vector.c
+++ b/kernel/riscv64/zgemv_t_vector.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c
index 275ee9131..0a284a999 100644
--- a/kernel/riscv64/zhemv_LM_vector.c
+++ b/kernel/riscv64/zhemv_LM_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c
index 2f46977d4..33b7c9c25 100644
--- a/kernel/riscv64/zhemv_UV_vector.c
+++ b/kernel/riscv64/zhemv_UV_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
index 59d0e219d..cadabdb75 100644
--- a/kernel/riscv64/znrm2_vector.c
+++ b/kernel/riscv64/znrm2_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -53,8 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c
index 2fdd8135a..858dfd173 100644
--- a/kernel/riscv64/zrot_vector.c
+++ b/kernel/riscv64/zrot_vector.c
@@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
@@ -41,10 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 64323aa3a..d275b75f8 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
@@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
index 7550294b5..c1dcaccab 100644
--- a/kernel/riscv64/zswap_vector.c
+++ b/kernel/riscv64/zswap_vector.c
@@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m8(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)

From 63cf4d01668f8f6c73a05039bc36785ba78b0940 Mon Sep 17 00:00:00 2001
From: Heller Zheng <heller.zheng@microchip.com>
Date: Wed, 1 Feb 2023 19:13:44 -0800
Subject: [PATCH 009/718] add riscv level3 C,Z kernel functions.

---
 kernel/riscv64/KERNEL.x280             |  85 +++-
 kernel/riscv64/trmm_lncopy_rvv_v1.c    |   8 +-
 kernel/riscv64/trsm_kernel_LN_rvv_v1.c | 644 +++---------------------
 kernel/riscv64/trsm_kernel_LT_rvv_v1.c | 658 +++----------------------
 kernel/riscv64/trsm_kernel_RN_rvv_v1.c | 610 +++--------------------
 kernel/riscv64/trsm_kernel_RT_rvv_v1.c | 623 +++--------------------
 kernel/riscv64/zgemm_ncopy_4_rvv.c     | 121 +++++
 kernel/riscv64/zgemm_ncopy_rvv_v1.c    |  74 +++
 kernel/riscv64/zgemm_tcopy_4_rvv.c     | 181 +++++++
 kernel/riscv64/zgemm_tcopy_rvv_v1.c    |  74 +++
 kernel/riscv64/zgemmkernel_rvv_v1x4.c  | 475 ++++++++++++++++++
 kernel/riscv64/zhemm_ltcopy_rvv_v1.c   | 124 +++++
 kernel/riscv64/zhemm_utcopy_rvv_v1.c   | 120 +++++
 kernel/riscv64/zsymm_lcopy_rvv_v1.c    | 106 ++++
 kernel/riscv64/zsymm_ucopy_rvv_v1.c    | 106 ++++
 kernel/riscv64/ztrmm_lncopy_rvv_v1.c   | 145 ++++++
 kernel/riscv64/ztrmm_ltcopy_rvv_v1.c   | 143 ++++++
 kernel/riscv64/ztrmm_uncopy_rvv_v1.c   | 144 ++++++
 kernel/riscv64/ztrmm_utcopy_rvv_v1.c   | 140 ++++++
 kernel/riscv64/ztrmmkernel_rvv_v1x4.c  | 574 +++++++++++++++++++++
 kernel/riscv64/ztrsm_lncopy_rvv_v1.c   | 115 +++++
 kernel/riscv64/ztrsm_ltcopy_rvv_v1.c   | 114 +++++
 kernel/riscv64/ztrsm_uncopy_rvv_v1.c   | 113 +++++
 kernel/riscv64/ztrsm_utcopy_rvv_v1.c   | 115 +++++
 param.h                                |  10 +-
 25 files changed, 3342 insertions(+), 2280 deletions(-)
 create mode 100644 kernel/riscv64/zgemm_ncopy_4_rvv.c
 create mode 100644 kernel/riscv64/zgemm_ncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/zgemm_tcopy_4_rvv.c
 create mode 100644 kernel/riscv64/zgemm_tcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/zgemmkernel_rvv_v1x4.c
 create mode 100644 kernel/riscv64/zhemm_ltcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/zhemm_utcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/zsymm_lcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/zsymm_ucopy_rvv_v1.c
 create mode 100644 kernel/riscv64/ztrmm_lncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/ztrmm_uncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/ztrmm_utcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/ztrmmkernel_rvv_v1x4.c
 create mode 100644 kernel/riscv64/ztrsm_lncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
 create mode 100644 kernel/riscv64/ztrsm_uncopy_rvv_v1.c
 create mode 100644 kernel/riscv64/ztrsm_utcopy_rvv_v1.c

diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280
index 4d64354fb..217d8534e 100644
--- a/kernel/riscv64/KERNEL.x280
+++ b/kernel/riscv64/KERNEL.x280
@@ -118,8 +118,8 @@ DGEMVTKERNEL = gemv_t_rvv.c
 CGEMVTKERNEL = zgemv_t_rvv.c
 ZGEMVTKERNEL = zgemv_t_rvv.c
 
-CTRMMKERNEL	= ztrmmkernel_2x2_rvv.c
-ZTRMMKERNEL	= ztrmmkernel_2x2_rvv.c
+CTRMMKERNEL     = ztrmmkernel_rvv_v1x4.c
+ZTRMMKERNEL     = ztrmmkernel_rvv_v1x4.c
 
 # SGEMM_UNROLL_N set in params.h
 ifeq ($(SGEMM_UNROLL_N), 8)
@@ -168,17 +168,28 @@ DSYMMUCOPY_M   =  symm_ucopy_rvv_v1.c
 DSYMMLCOPY_M   =  symm_lcopy_rvv_v1.c
 endif
 
-CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
-CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
-CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-CGEMMONCOPYOBJ =  cgemm_oncopy.o
-CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+CGEMMKERNEL    =  zgemmkernel_rvv_v1x4.c
+CGEMMINCOPY    =  zgemm_ncopy_rvv_v1.c
+CGEMMITCOPY    =  zgemm_tcopy_rvv_v1.c
+CGEMMONCOPY    =  zgemm_ncopy_4_rvv.c
+CGEMMOTCOPY    =  zgemm_tcopy_4_rvv.c
 
-ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
-ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
-ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-ZGEMMONCOPYOBJ =  zgemm_oncopy.o
-ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
+CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZGEMMKERNEL    = zgemmkernel_rvv_v1x4.c
+
+ZGEMMINCOPY    =  zgemm_ncopy_rvv_v1.c
+ZGEMMITCOPY    =  zgemm_tcopy_rvv_v1.c
+ZGEMMONCOPY    =  zgemm_ncopy_4_rvv.c
+ZGEMMOTCOPY    =  zgemm_tcopy_4_rvv.c
+
+ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 STRSMKERNEL_LN	=  trsm_kernel_LN_rvv_v1.c
 STRSMKERNEL_LT	=  trsm_kernel_LT_rvv_v1.c
@@ -190,20 +201,25 @@ DTRSMKERNEL_LT	=  trsm_kernel_LT_rvv_v1.c
 DTRSMKERNEL_RN	=  trsm_kernel_RN_rvv_v1.c
 DTRSMKERNEL_RT	=  trsm_kernel_RT_rvv_v1.c
 
-CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+CTRSMKERNEL_LN  =  trsm_kernel_LN_rvv_v1.c
+CTRSMKERNEL_LT  =  trsm_kernel_LT_rvv_v1.c
+CTRSMKERNEL_RN  =  trsm_kernel_RN_rvv_v1.c
+CTRSMKERNEL_RT  =  trsm_kernel_RT_rvv_v1.c
 
-ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+ZTRSMKERNEL_LN  =  trsm_kernel_LN_rvv_v1.c
+ZTRSMKERNEL_LT  =  trsm_kernel_LT_rvv_v1.c
+ZTRSMKERNEL_RN  =  trsm_kernel_RN_rvv_v1.c
+ZTRSMKERNEL_RT  =  trsm_kernel_RT_rvv_v1.c
 
-TRSMCOPYLN_M    = trsm_lncopy_rvv_v1.c
-TRSMCOPYLT_M    = trsm_ltcopy_rvv_v1.c
-TRSMCOPYUN_M    = trsm_uncopy_rvv_v1.c
-TRSMCOPYUT_M    = trsm_utcopy_rvv_v1.c
+TRSMCOPYLN_M    =  trsm_lncopy_rvv_v1.c
+TRSMCOPYLT_M    =  trsm_ltcopy_rvv_v1.c
+TRSMCOPYUN_M    =  trsm_uncopy_rvv_v1.c
+TRSMCOPYUT_M    =  trsm_utcopy_rvv_v1.c
+
+ZTRSMCOPYLN_M   =  ztrsm_lncopy_rvv_v1.c
+ZTRSMCOPYLT_M   =  ztrsm_ltcopy_rvv_v1.c
+ZTRSMCOPYUN_M   =  ztrsm_uncopy_rvv_v1.c
+ZTRSMCOPYUT_M   =  ztrsm_utcopy_rvv_v1.c
 
 SSYMV_U_KERNEL =  symv_U_rvv.c 
 SSYMV_L_KERNEL =  symv_L_rvv.c
@@ -214,6 +230,27 @@ CSYMV_L_KERNEL =  ../generic/zsymv_k.c
 ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
 ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
 
+ZHEMMLTCOPY_M    =  zhemm_ltcopy_rvv_v1.c
+ZHEMMUTCOPY_M    =  zhemm_utcopy_rvv_v1.c
+
+CHEMMLTCOPY_M    =  zhemm_ltcopy_rvv_v1.c
+CHEMMUTCOPY_M    =  zhemm_utcopy_rvv_v1.c
+
+ZSYMMUCOPY_M    =  zsymm_ucopy_rvv_v1.c
+ZSYMMLCOPY_M    =  zsymm_lcopy_rvv_v1.c
+
+CSYMMUCOPY_M    =  zsymm_ucopy_rvv_v1.c
+CSYMMLCOPY_M    =  zsymm_lcopy_rvv_v1.c
+
+ZTRMMUNCOPY_M  =  ztrmm_uncopy_rvv_v1.c
+ZTRMMLNCOPY_M  =  ztrmm_lncopy_rvv_v1.c
+ZTRMMUTCOPY_M  =  ztrmm_utcopy_rvv_v1.c
+ZTRMMLTCOPY_M  =  ztrmm_ltcopy_rvv_v1.c
+
+CTRMMUNCOPY_M  =  ztrmm_uncopy_rvv_v1.c
+CTRMMLNCOPY_M  =  ztrmm_lncopy_rvv_v1.c
+CTRMMUTCOPY_M  =  ztrmm_utcopy_rvv_v1.c
+CTRMMLTCOPY_M  =  ztrmm_ltcopy_rvv_v1.c
 
 LSAME_KERNEL = ../generic/lsame.c
 
diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c
index 73a8233f8..3457ca3e1 100644
--- a/kernel/riscv64/trmm_lncopy_rvv_v1.c
+++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c
@@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSEV_FLOAT vse32_v_f32m2
 #define VLSEV_FLOAT vlse32_v_f32m2
 #define VBOOL_T vbool16_t
-#define UINT_V_T vint32m2_t
-#define VID_V_UINT vid_v_i32m2
-#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
 #define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
 #else
 #define VSETVL(n) vsetvl_e64m2(n)
diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
index 11a0398ca..2cba06b38 100644
--- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
@@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m2(n)
 #define VSETVL_MAX vsetvlmax_e32m2()
 #define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
 #define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
 #define VSSEV_FLOAT vsse32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
 #define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
 #define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFMULVF_FLOAT vfmul_vf_f32m2
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
 #else
 #define VSETVL(n) vsetvl_e64m2(n)
 #define VSETVL_MAX vsetvlmax_e64m2()
 #define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
 #define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
 #define VSSEV_FLOAT vsse64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
 #define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFMULVF_FLOAT vfmul_vf_f64m2
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
 #endif
 
 
@@ -88,606 +91,107 @@ static FLOAT dm1 = -1.;
 
 #ifndef COMPLEX
 
-#if GEMM_DEFAULT_UNROLL_N == 1
-
 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-    FLOAT aa,  bb;
-    FLOAT *pa, *pc;
+    FLOAT aa;
+    FLOAT* pc;
 
     int i, j, k;
-    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
 
-    size_t vl;
-    FLOAT_V_T va, vc;
-
-    a += (m - 1) * m;
-    b += (m - 1) * n;
-
-    for (i = m - 1; i >= 0; i--) 
-    {
-        aa = *(a + i);
-        for (j = 0; j < n; j ++) 
-        {
-            bb = *(c + i + j * ldc);
-            bb *= aa;
-            *b             = bb;
-            *(c + i + j * ldc) = bb;
-            b ++;
-
-            pa = a;
-            pc = c + j * ldc;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc = VLEV_FLOAT(pc, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
-                VSEV_FLOAT(pc, vc, vl);
-                pa += vl;
-                pc += vl;
-            }
-        }
-        a -= m;
-        b -= 2 * n;
-    }
-
-}
-#elif GEMM_DEFAULT_UNROLL_N == 2
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
 
-    FLOAT aa,  bb0, bb1;
-    FLOAT *pa, *pc, *pc0, *pc1;
-    FLOAT *pb0, *pb1;
-
-    int i, j, k;
-    fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  offset = %4ld\n", __FILE__, __FUNCTION__, m, n, ldc); // Debug
+    FLOAT_V_T vb, vc;
 
     size_t vl;
-    FLOAT_V_T va, vc0, vc1;
 
     a += (m - 1) * m;
     b += (m - 1) * n;
 
-    for (i = m - 1; i >= 0; i--) 
-    {
-        aa = *(a + i);
-        pc = c + i;
-        for (j = 0; j < n/2; j ++) 
-        {
-            //bb = *(c + i + j * ldc);
-            pb0 = pc + j * ldc * 2;
-            pb1 = pb0 + ldc;
-            //bb *= aa;
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            //*b             = bb;
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *pb0    = bb0;
-            *pb1    = bb1;
-
-            //*(c + i + j * ldc) = bb;
-            //b ++;
-
-            b += 2;
-            //pa = a + i + 1;
-            pc0 = c + j * ldc * 2;
-            pc1 = pc0 + ldc;
-            pa = a;
-            //pc = c + j * ldc;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-
-                pa += vl;
-                pc0 += vl;
-                pc1 += vl;
-            }
-        }
-        pc += ldc * (n/2) * 2;
-        if (n & 1)
-        {
-            pb0 = pc;
-            bb0 = (*pb0) * aa;
-            *b      = bb0;
-            *pb0    = bb0;
-            b += 1;
-
-            pc0 = pc - i;
-            pa = a;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-
-                pa += vl;
-                pc0 += vl;
-            }
-        }
-
-        a -= m;
-        b -= 2 * n;
-    }
-
-}
-
-#elif GEMM_DEFAULT_UNROLL_N == 4
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-    FLOAT aa,  bb0, bb1, bb2, bb3;
-    FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3;
-    FLOAT *pb0, *pb1, *pb2, *pb3;
-
-    int i, j, k;
-
-    size_t vl;
-    FLOAT_V_T va, vc0, vc1, vc2, vc3;
-
-    a += (m - 1) * m;
-    b += (m - 1) * n;
-
-    for (i = m - 1; i >= 0; i--) 
-    {
-        aa = *(a + i);
-        pc = c + i;
-        for (j = 0; j < n/4; j ++) 
-        {
-            pb0 = pc + j * ldc * 4;
-            pb1 = pb0 + ldc;
-            pb2 = pb1 + ldc;
-            pb3 = pb2 + ldc;
-            
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            bb2 = (*pb2) * aa;
-            bb3 = (*pb3) * aa;
-
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *(b+2)  = bb2;
-            *(b+3)  = bb3;
-
-            *pb0    = bb0;
-            *pb1    = bb1;
-            *pb2    = bb2;
-            *pb3    = bb3;
-
-            b += 4;
-
-            pc0 = c + j * ldc * 4;
-            pc1 = pc0 + ldc;
-            pc2 = pc1 + ldc;
-            pc3 = pc2 + ldc;
-
-            pa = a;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                vc2 = VLEV_FLOAT(pc2, vl);
-                vc3 = VLEV_FLOAT(pc3, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                VSEV_FLOAT(pc2, vc2, vl);
-                VSEV_FLOAT(pc3, vc3, vl);
-
-                pa += vl;
-                pc0 += vl;
-                pc1 += vl;
-                pc2 += vl;
-                pc3 += vl;
-            }
-        }
-        pc += ldc * (n/4) * 4;
-
-        if (n & 2)
-        {
-            pb0 = pc + j * ldc * 2;
-            pb1 = pb0 + ldc;
-            
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-
-            *b      = bb0;
-            *(b+1)  = bb1;
-
-            *pb0    = bb0;
-            *pb1    = bb1;
-
-            b += 2;
-
-            pc0 = c + j * ldc * 2;
-            pc1 = pc0 + ldc;
-
-            pa = a;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-
-                pa += vl;
-                pc0 += vl;
-                pc1 += vl;
-            }
-            pc += ldc * 2;
-        }
-
-        if (n & 1)
-        {
-            pb0 = pc;
-            bb0 = (*pb0) * aa;
-            *b      = bb0;
-            *pb0    = bb0;
-            b += 1;
-
-            pc0 = pc - i;
-            pa = a;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-
-                pa += vl;
-                pc0 += vl;
-            }
-        }
-
-        a -= m;
-        b -= 2 * n;
-    }
-
-}
-#elif GEMM_DEFAULT_UNROLL_N == 8
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-    FLOAT aa,  bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
-    FLOAT *pa, *pc, *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
-    FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
-
-    int i, j, k;
-
-    size_t vl;
-    FLOAT_V_T va, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
-
-    a += (m - 1) * m;
-    b += (m - 1) * n;
+    for (i = m - 1; i >= 0; i--) {
 
-    for (i = m - 1; i >= 0; i--) 
-    {
         aa = *(a + i);
-        pc = c + i;
-        for (j = 0; j < n/8; j ++) 
-        {
-            pb0 = pc + j * ldc * 8;
-            pb1 = pb0 + ldc;
-            pb2 = pb1 + ldc;
-            pb3 = pb2 + ldc;
-            pb4 = pb3 + ldc;
-            pb5 = pb4 + ldc;
-            pb6 = pb5 + ldc;
-            pb7 = pb6 + ldc;
-            
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            bb2 = (*pb2) * aa;
-            bb3 = (*pb3) * aa;
-            bb4 = (*pb4) * aa;
-            bb5 = (*pb5) * aa;
-            bb6 = (*pb6) * aa;
-            bb7 = (*pb7) * aa;
-
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *(b+2)  = bb2;
-            *(b+3)  = bb3;
-            *(b+4)  = bb4;
-            *(b+5)  = bb5;
-            *(b+6)  = bb6;
-            *(b+7)  = bb7;
-
-            *pb0    = bb0;
-            *pb1    = bb1;
-            *pb2    = bb2;
-            *pb3    = bb3;
-            *pb4    = bb4;
-            *pb5    = bb5;
-            *pb6    = bb6;
-            *pb7    = bb7;
-
-            b += 8;
-
-            pc0 = c + j * ldc * 8;
-            pc1 = pc0 + ldc;
-            pc2 = pc1 + ldc;
-            pc3 = pc2 + ldc;
-            pc4 = pc3 + ldc;
-            pc5 = pc4 + ldc;
-            pc6 = pc5 + ldc;
-            pc7 = pc6 + ldc;
-
-            pa = a;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                vc2 = VLEV_FLOAT(pc2, vl);
-                vc3 = VLEV_FLOAT(pc3, vl);
-                vc4 = VLEV_FLOAT(pc4, vl);
-                vc5 = VLEV_FLOAT(pc5, vl);
-                vc6 = VLEV_FLOAT(pc6, vl);
-                vc7 = VLEV_FLOAT(pc7, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
-                vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
-                vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
-                vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
-                vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                VSEV_FLOAT(pc2, vc2, vl);
-                VSEV_FLOAT(pc3, vc3, vl);
-                VSEV_FLOAT(pc4, vc4, vl);
-                VSEV_FLOAT(pc5, vc5, vl);
-                VSEV_FLOAT(pc6, vc6, vl);
-                VSEV_FLOAT(pc7, vc7, vl);
-
-                pa += vl;
-                pc0 += vl;
-                pc1 += vl;
-                pc2 += vl;
-                pc3 += vl;
-                pc4 += vl;
-                pc5 += vl;
-                pc6 += vl;
-                pc7 += vl;
-            }
-        }
-        pc += ldc * (n/8) * 8;
-
-        if (n & 4)
-        {
-            pb0 = pc + j * ldc * 4;
-            pb1 = pb0 + ldc;
-            pb2 = pb1 + ldc;
-            pb3 = pb2 + ldc;
-            
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            bb2 = (*pb2) * aa;
-            bb3 = (*pb3) * aa;
-
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *(b+2)  = bb2;
-            *(b+3)  = bb3;
-
-            *pb0    = bb0;
-            *pb1    = bb1;
-            *pb2    = bb2;
-            *pb3    = bb3;
-
-            b += 4;
-
-            pc0 = c + j * ldc * 4;
-            pc1 = pc0 + ldc;
-            pc2 = pc1 + ldc;
-            pc3 = pc2 + ldc;
-
-            pa = a;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                vc2 = VLEV_FLOAT(pc2, vl);
-                vc3 = VLEV_FLOAT(pc3, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                VSEV_FLOAT(pc2, vc2, vl);
-                VSEV_FLOAT(pc3, vc3, vl);
-
-                pa += vl;
-                pc0 += vl;
-                pc1 += vl;
-                pc2 += vl;
-                pc3 += vl;
+        pc  = c;
+        for (j = n; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            vb = VLSEV_FLOAT(pc + i, stride_ldc, vl);
+            vb = VFMULVF_FLOAT(vb, aa, vl);
+            VSEV_FLOAT(b, vb, vl);
+            VSSEV_FLOAT(pc + i, stride_ldc, vb, vl);
+            b   += vl;
+
+            for (k = 0; k < i; k ++) {
+                vc = VLSEV_FLOAT(pc + k, stride_ldc, vl);
+                vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl);
+                VSSEV_FLOAT(pc + k, stride_ldc, vc, vl);
             }
-            pc += ldc * 4;
+            pc  += vl * ldc;
         }
-
-        if (n & 2)
-        {
-            pb0 = pc + j * ldc * 2;
-            pb1 = pb0 + ldc;
-            
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-
-            *b      = bb0;
-            *(b+1)  = bb1;
-
-            *pb0    = bb0;
-            *pb1    = bb1;
-
-            b += 2;
-
-            pc0 = c + j * ldc * 2;
-            pc1 = pc0 + ldc;
-
-            pa = a;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-
-                pa += vl;
-                pc0 += vl;
-                pc1 += vl;
-            }
-            pc += ldc * 2;
-        }
-
-        if (n & 1)
-        {
-            pb0 = pc;
-            bb0 = (*pb0) * aa;
-            *b      = bb0;
-            *pb0    = bb0;
-            b += 1;
-
-            pc0 = pc - i;
-            pa = a;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-
-                pa += vl;
-                pc0 += vl;
-            }
-        }
-
         a -= m;
         b -= 2 * n;
     }
 
 }
-#else
-static inline void solve_generic(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-  FLOAT aa,  bb;
-
-  int i, j, k;
-
-  a += (m - 1) * m;
-  b += (m - 1) * n;
-
-  for (i = m - 1; i >= 0; i--) {
-
-    aa = *(a + i);
-
-    for (j = 0; j < n; j ++) {
-      bb = *(c + i + j * ldc);
-      bb *= aa;
-      *b             = bb;
-      *(c + i + j * ldc) = bb;
-      b ++;
-
-      for (k = 0; k < i; k ++){
-        *(c + k + j * ldc) -= bb * *(a + k);
-      }
-
-    }
-    a -= m;
-    b -= 2 * n;
-  }
-
-}
-
-#endif
-
 #else
 
 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
 
-  FLOAT aa1, aa2;
-  FLOAT bb1, bb2;
-  FLOAT cc1, cc2;
-
-  int i, j, k;
+    FLOAT aa1, aa2;
+    FLOAT *pc;
+    int i, j, k;
 
-  ldc *= 2;
-  a += (m - 1) * m * 2;
-  b += (m - 1) * n * 2;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;
 
-  for (i = m - 1; i >= 0; i--) {
+    FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
+    size_t vl;
+    a += (m - 1) * m * 2;
+    b += (m - 1) * n * 2;
 
-    aa1 = *(a + i * 2 + 0);
-    aa2 = *(a + i * 2 + 1);
+    for (i = m - 1; i >= 0; i--) {
 
-    for (j = 0; j < n; j ++) {
-      bb1 = *(c + i * 2 + 0 + j * ldc);
-      bb2 = *(c + i * 2 + 1 + j * ldc);
+        aa1 = *(a + i * 2 + 0);
+        aa2 = *(a + i * 2 + 1);
+        pc  = c;
 
+        for (j = n; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl);
 #ifndef CONJ
-      cc1 = aa1 * bb1 - aa2 * bb2;
-      cc2 = aa1 * bb2 + aa2 * bb1;
+            vs1 =   VFMULVF_FLOAT(vb1, aa1, vl);
+            vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
+            vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
+            vs2 =  VFMACCVF_FLOAT(vs2, aa2, vb1, vl);
 #else
-      cc1 = aa1 * bb1 + aa2 * bb2;
-      cc2 = aa1 * bb2 - aa2 * bb1;
+            vs1 =   VFMULVF_FLOAT(vb1, aa1, vl);
+            vs1 =  VFMACCVF_FLOAT(vs1, aa2, vb2, vl);
+            vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
+            vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
 #endif
+            VSSEG2_FLOAT(b, vs1, vs2, vl);
+            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl);
+            b   += vl * 2;
 
-
-      *(b + 0) = cc1;
-      *(b + 1) = cc2;
-      *(c + i * 2 + 0 + j * ldc) = cc1;
-      *(c + i * 2 + 1 + j * ldc) = cc2;
-      b += 2;
-
-      for (k = 0; k < i; k ++){
+            for (k = 0; k < i; k ++) {
+                VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl);
 #ifndef CONJ
-        *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
-        *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
-#else
-        *(c + k * 2 + 0 + j * ldc) -=   cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
-        *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
+                vc1 =  VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
+#else                                                        
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
+                vc2 =  VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
 #endif
-      }
-
+                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl);
+            }
+            pc  += vl * ldc * 2;
+        }
+        a -= m * 2;
+        b -= 4 * n;
     }
-    a -= m * 2;
-    b -= 4 * n;
-  }
-
 }
 
+
 #endif
 
 
diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
index 0380bd1bb..492a5631f 100644
--- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
@@ -31,28 +31,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m2(n)
 #define VSETVL_MAX vsetvlmax_e32m2()
 #define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
 #define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
 #define VSSEV_FLOAT vsse32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
 #define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
 #define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFMULVF_FLOAT vfmul_vf_f32m2
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
 #else
 #define VSETVL(n) vsetvl_e64m2(n)
 #define VSETVL_MAX vsetvlmax_e64m2()
 #define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
 #define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
 #define VSSEV_FLOAT vsse64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
 #define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFMULVF_FLOAT vfmul_vf_f64m2
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
 #endif
 
 
@@ -87,468 +90,39 @@ static FLOAT dm1 = -1.;
 // Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c
 
 #ifndef COMPLEX
-#if GEMM_DEFAULT_UNROLL_N == 1
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
-{
-    FLOAT aa, bb;
-    FLOAT *pa, *pc;
 
-    int i, j, k;
-    size_t vl;
-    FLOAT_V_T va, vc;
-    for (i = 0; i < m; i++)
-    {
-        aa = *(a + i);
-        for (j = 0; j < n; j ++)
-        {
-            bb = *(c + i + j * ldc);
-            bb *= aa;
-            *b  = bb;
-            *(c + i + j * ldc) = bb;
-            b++;
-            pa = a + i + 1;
-            pc = c + j * ldc + i + 1;
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc = VLEV_FLOAT(pc, vl);
-                va = VLEV_FLOAT(pa, vl);
-                vc = VFNMSACVF_FLOAT(vc, bb, va, vl);
-                VSEV_FLOAT(pc, vc, vl);
-                pa += vl;
-                pc += vl;
-            }
-        }
-        a += m;
-    }
-}
-#elif GEMM_DEFAULT_UNROLL_N == 2
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
-{
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
 
-    FLOAT aa, bb0, bb1;
-    FLOAT *pa, *pc, *pc0, *pc1;
-    FLOAT *pb0, *pb1;
+    FLOAT aa;
+    FLOAT* pc;
 
     int i, j, k;
-    size_t vl;
-    FLOAT_V_T va, vc0, vc1;
-    for (i = 0; i < m; i++)
-    {
-        aa = *(a + i);
-        pc = c + i;
-        for (j = 0; j < n/2; j ++)
-        {
-            pb0 = pc + j * ldc * 2;
-            pb1 = pb0 + ldc;
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *pb0    = bb0;
-            *pb1    = bb1;
-            b += 2;
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            pc1 = pc0 + ldc;
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl  = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                va  = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                pa  += vl;
-                pc0 += vl;
-                pc1 += vl;
-            }
-        }
-        pc += ldc * (n/2) * 2;
-        if (n & 1)
-        {
-            pb0 = pc;
-            bb0 = *(pb0);
-            bb0 *= aa;
-            *b  = bb0;
-            *(c + i) = bb0;
-            b++;
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                va  = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                pa  += vl;
-                pc0 += vl;
-            }
-        }
-
-        a += m;
-    }
-}
-#elif GEMM_DEFAULT_UNROLL_N == 4
 
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
-{
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
 
-    FLOAT aa, bb0, bb1, bb2, bb3;
-    FLOAT *pa, *pc;
-    FLOAT *pc0, *pc1, *pc2, *pc3;
-    FLOAT *pb0, *pb1, *pb2, *pb3;
+    FLOAT_V_T vb, vc;
 
-    int i, j, k;
     size_t vl;
-    FLOAT_V_T va;
-    FLOAT_V_T vc0, vc1, vc2, vc3;
-    for (i = 0; i < m; i++)
-    {
-        aa = *(a + i);
-        pc = c + i;
-        for (j = 0; j < n/4; j ++)
-        {
-            pb0 = pc;
-            pb1 = pb0 + ldc;
-            pb2 = pb1 + ldc;
-            pb3 = pb2 + ldc;
-
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            bb2 = (*pb2) * aa;
-            bb3 = (*pb3) * aa;
-
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *(b+2)  = bb2;
-            *(b+3)  = bb3;
-
-            *pb0    = bb0;
-            *pb1    = bb1;
-            *pb2    = bb2;
-            *pb3    = bb3;
-            b += 4;
-
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            pc1 = pc0 + ldc;
-            pc2 = pc1 + ldc;
-            pc3 = pc2 + ldc;
-
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl  = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                vc2 = VLEV_FLOAT(pc2, vl);
-                vc3 = VLEV_FLOAT(pc3, vl);
-
-                va  = VLEV_FLOAT(pa, vl);
-
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
-
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                VSEV_FLOAT(pc2, vc2, vl);
-                VSEV_FLOAT(pc3, vc3, vl);
-
-                pa  += vl;
-                pc0 += vl;
-                pc1 += vl;
-                pc2 += vl;
-                pc3 += vl;
-            }
-        }
-        pc += ldc * (n/4) * 4;
-
-        if (n & 2)
-        {
-            pb0 = pc;
-            pb1 = pb0 + ldc;
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *pb0    = bb0;
-            *pb1    = bb1;
-            b += 2;
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            pc1 = pc0 + ldc;
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl  = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                va  = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                pa  += vl;
-                pc0 += vl;
-                pc1 += vl;
-            }
-            pc += ldc * 2;
-        }
 
-        if (n & 1)
-        {
-            pb0 = pc;
-            bb0 = *(pb0);
-            bb0 *= aa;
-            *b  = bb0;
-            *(c + i) = bb0;
-            b++;
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                va  = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                pa  += vl;
-                pc0 += vl;
-            }
-        }
-
-        a += m;
-    }
-}
-#elif GEMM_DEFAULT_UNROLL_N == 8
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) 
-{
-
-    FLOAT aa, bb0, bb1, bb2, bb3, bb4, bb5, bb6, bb7;
-    FLOAT *pa, *pc;
-    FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
-    FLOAT *pb0, *pb1, *pb2, *pb3, *pb4, *pb5, *pb6, *pb7;
+    for (i = 0; i < m; i++) {
 
-    int i, j, k;
-    size_t vl;
-    FLOAT_V_T va;
-    FLOAT_V_T vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
-    for (i = 0; i < m; i++)
-    {
         aa = *(a + i);
-        pc = c + i;
-        for (j = 0; j < n/8; j ++)
-        {
-            pb0 = pc + j * ldc * 8;
-            pb1 = pb0 + ldc;
-            pb2 = pb1 + ldc;
-            pb3 = pb2 + ldc;
-            pb4 = pb3 + ldc;
-            pb5 = pb4 + ldc;
-            pb6 = pb5 + ldc;
-            pb7 = pb6 + ldc;
-            
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            bb2 = (*pb2) * aa;
-            bb3 = (*pb3) * aa;
-            bb4 = (*pb4) * aa;
-            bb5 = (*pb5) * aa;
-            bb6 = (*pb6) * aa;
-            bb7 = (*pb7) * aa;
-
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *(b+2)  = bb2;
-            *(b+3)  = bb3;
-            *(b+4)  = bb4;
-            *(b+5)  = bb5;
-            *(b+6)  = bb6;
-            *(b+7)  = bb7;
-
-            *pb0    = bb0;
-            *pb1    = bb1;
-            *pb2    = bb2;
-            *pb3    = bb3;
-            *pb4    = bb4;
-            *pb5    = bb5;
-            *pb6    = bb6;
-            *pb7    = bb7;
-            b += 8;
-
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            pc1 = pc0 + ldc;
-            pc2 = pc1 + ldc;
-            pc3 = pc2 + ldc;
-            pc4 = pc3 + ldc;
-            pc5 = pc4 + ldc;
-            pc6 = pc5 + ldc;
-            pc7 = pc6 + ldc;
-
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl  = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                vc2 = VLEV_FLOAT(pc2, vl);
-                vc3 = VLEV_FLOAT(pc3, vl);
-                vc4 = VLEV_FLOAT(pc4, vl);
-                vc5 = VLEV_FLOAT(pc5, vl);
-                vc6 = VLEV_FLOAT(pc6, vl);
-                vc7 = VLEV_FLOAT(pc7, vl);
-
-                va  = VLEV_FLOAT(pa, vl);
-
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
-                vc4 = VFNMSACVF_FLOAT(vc4, bb4, va, vl);
-                vc5 = VFNMSACVF_FLOAT(vc5, bb5, va, vl);
-                vc6 = VFNMSACVF_FLOAT(vc6, bb6, va, vl);
-                vc7 = VFNMSACVF_FLOAT(vc7, bb7, va, vl);
-
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                VSEV_FLOAT(pc2, vc2, vl);
-                VSEV_FLOAT(pc3, vc3, vl);
-                VSEV_FLOAT(pc4, vc4, vl);
-                VSEV_FLOAT(pc5, vc5, vl);
-                VSEV_FLOAT(pc6, vc6, vl);
-                VSEV_FLOAT(pc7, vc7, vl);
-
-                pa  += vl;
-                pc0 += vl;
-                pc1 += vl;
-                pc2 += vl;
-                pc3 += vl;
-                pc4 += vl;
-                pc5 += vl;
-                pc6 += vl;
-                pc7 += vl;
-            }
-        }
-        pc += ldc * (n/8) * 8;
-
-        if (n & 4)
-        {
-            pb0 = pc;
-            pb1 = pb0 + ldc;
-            pb2 = pb1 + ldc;
-            pb3 = pb2 + ldc;
-
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            bb2 = (*pb2) * aa;
-            bb3 = (*pb3) * aa;
-
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *(b+2)  = bb2;
-            *(b+3)  = bb3;
-
-            *pb0    = bb0;
-            *pb1    = bb1;
-            *pb2    = bb2;
-            *pb3    = bb3;
-            b += 4;
-
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            pc1 = pc0 + ldc;
-            pc2 = pc1 + ldc;
-            pc3 = pc2 + ldc;
-
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl  = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                vc2 = VLEV_FLOAT(pc2, vl);
-                vc3 = VLEV_FLOAT(pc3, vl);
-
-                va  = VLEV_FLOAT(pa, vl);
-
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, bb2, va, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, bb3, va, vl);
-
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                VSEV_FLOAT(pc2, vc2, vl);
-                VSEV_FLOAT(pc3, vc3, vl);
-
-                pa  += vl;
-                pc0 += vl;
-                pc1 += vl;
-                pc2 += vl;
-                pc3 += vl;
+        pc  = c;
+        for (j = n; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            vb = VLSEV_FLOAT(pc + i, stride_ldc, vl);
+            vb = VFMULVF_FLOAT(vb, aa, vl);
+            VSEV_FLOAT(b, vb, vl);
+            VSSEV_FLOAT(pc + i, stride_ldc, vb, vl);
+            b   += vl;
+
+            for (k = i + 1; k < m; k++) {
+                vc = VLSEV_FLOAT(pc + k, stride_ldc, vl);
+                vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl);
+                VSSEV_FLOAT(pc + k, stride_ldc, vc, vl);
             }
-            pc += ldc * 4;
+            pc  += vl * ldc;
         }
-
-        if (n & 2)
-        {
-            pb0 = pc;
-            pb1 = pb0 + ldc;
-            bb0 = (*pb0) * aa;
-            bb1 = (*pb1) * aa;
-            *b      = bb0;
-            *(b+1)  = bb1;
-            *pb0    = bb0;
-            *pb1    = bb1;
-            b += 2;
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            pc1 = pc0 + ldc;
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl  = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                vc1 = VLEV_FLOAT(pc1, vl);
-                va  = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, bb1, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                VSEV_FLOAT(pc1, vc1, vl);
-                pa  += vl;
-                pc0 += vl;
-                pc1 += vl;
-            }
-            pc += ldc * 2;
-        }
-
-        if (n & 1)
-        {
-            pb0 = pc;
-            bb0 = *(pb0);
-            bb0 *= aa;
-            *b  = bb0;
-            *(c + i) = bb0;
-            b++;
-            pa = a + i + 1;
-            pc0 = pb0 + 1;
-            for (k = (m - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLEV_FLOAT(pc0, vl);
-                va  = VLEV_FLOAT(pa, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, bb0, va, vl);
-                VSEV_FLOAT(pc0, vc0, vl);
-                pa  += vl;
-                pc0 += vl;
-            }
-        }
-
         a += m;
     }
 }
@@ -557,146 +131,60 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
 
-  FLOAT aa, bb;
-
-  int i, j, k;
-
-  for (i = 0; i < m; i++) {
-
-    aa = *(a + i);
-
-    for (j = 0; j < n; j ++) {
-      bb = *(c + i + j * ldc);
-      bb *= aa;
-      *b             = bb;
-      *(c + i + j * ldc) = bb;
-      b ++;
-
-      for (k = i + 1; k < m; k ++){
-	*(c + k + j * ldc) -= bb * *(a + k);
-      }
-
-    }
-    a += m;
-  }
-}
-
-#endif
-
-#else
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-  FLOAT aa1, aa2;
-  FLOAT bb1, bb2;
-  FLOAT cc1, cc2;
-
-  int i, j, k;
-
-  ldc *= 2;
-
-  for (i = 0; i < m; i++) {
-
-    aa1 = *(a + i * 2 + 0);
-    aa2 = *(a + i * 2 + 1);
-
-    for (j = 0; j < n; j ++) {
-      bb1 = *(c + i * 2 + 0 + j * ldc);
-      bb2 = *(c + i * 2 + 1 + j * ldc);
-
-#ifndef CONJ
-      cc1 = aa1 * bb1 - aa2 * bb2;
-      cc2 = aa1 * bb2 + aa2 * bb1;
-#else
-      cc1 = aa1 * bb1 + aa2 * bb2;
-      cc2 = aa1 * bb2 - aa2 * bb1;
-#endif
-
-      *(b + 0) = cc1;
-      *(b + 1) = cc2;
-      *(c + i * 2 + 0 + j * ldc) = cc1;
-      *(c + i * 2 + 1 + j * ldc) = cc2;
-      b += 2;
-
-      for (k = i + 1; k < m; k ++){
-#ifndef CONJ
-	*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1);
-	*(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
-#else
-	*(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1);
-	*(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0);
-#endif
-      }
-
-    }
-    a += m * 2;
-  }
-}
-
-
-static inline void solve_N1(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-  FLOAT aa1, aa2;
-  FLOAT bb1, bb2;
-  FLOAT cc1, cc2;
-  FLOAT *pa, *pc;
-
-  int i, j, k;
-
-  size_t vl;
-  FLOAT_V_T va0, va1, vc0, vc1;
+    FLOAT aa1, aa2;
+    FLOAT *pc;
+    int i, j, k;
 
-  ldc *= 2;
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;
 
-  for (i = 0; i < m; i++) {
+    FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
+    size_t vl;
 
-    aa1 = *(a + i * 2 + 0);
-    aa2 = *(a + i * 2 + 1);
+    ldc *= 2;
 
-    for (j = 0; j < n; j ++) {
-      bb1 = *(c + i * 2 + 0 + j * ldc);
-      bb2 = *(c + i * 2 + 1 + j * ldc);
+    for (i = 0; i < m; i++) {
+        aa1 = *(a + i * 2 + 0);
+        aa2 = *(a + i * 2 + 1);
+        pc  = c;
 
+        for (j = n; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl);
 #ifndef CONJ
-      cc1 = aa1 * bb1 - aa2 * bb2;
-      cc2 = aa1 * bb2 + aa2 * bb1;
+            vs1 =   VFMULVF_FLOAT(vb1, aa1, vl);
+            vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
+            vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
+            vs2 =  VFMACCVF_FLOAT(vs2, aa2, vb1, vl);
 #else
-      cc1 = aa1 * bb1 + aa2 * bb2;
-      cc2 = aa1 * bb2 - aa2 * bb1;
+            vs1 =   VFMULVF_FLOAT(vb1, aa1, vl);
+            vs1 =  VFMACCVF_FLOAT(vs1, aa2, vb2, vl);
+            vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
+            vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
 #endif
+            VSSEG2_FLOAT(b, vs1, vs2, vl);
+            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl);
+            b   += vl * 2;
 
-      *(b + 0) = cc1;
-      *(b + 1) = cc2;
-      *(c + i * 2 + 0 + j * ldc) = cc1;
-      *(c + i * 2 + 1 + j * ldc) = cc2;
-      b += 2;
-
-        pa = a + (i + 1) * 2;
-        pc = c + j * ldc + (i + 1) * 2;
-        for (k = (m - i - 1); k > 0; k -= vl)
-        {
-            vl = VSETVL(k);
-            VLSEG2_FLOAT(&va0, &va1, pa, vl);
-            VLSEG2_FLOAT(&vc0, &vc1, pc, vl);
+            for (k = i + 1; k < m; k++) {
+                VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl);
 #ifndef CONJ
-            vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0);
-            vc0 = VFMACCVF_FLOAT(vc0, cc2, va1);
-            vc1 = VFNMSACVF_FLOAT(vc1, cc1, va1);
-            vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0);
-#else
-            vc0 = VFNMSACVF_FLOAT(vc0, cc1, va0);
-            vc0 = VFNMSACVF_FLOAT(vc0, cc2, va1);
-            vc1 = VFMACCVF_FLOAT(vc1, cc1, va1);
-            vc1 = VFNMSACVF_FLOAT(vc1, cc2, va0);
+                vc1 =  VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
+#else                                                        
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
+                vc2 =  VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
 #endif
-            VSSEG2_FLOAT(pc, vc0, vc1, vl);
-            pa += vl * 2;
-            pc += vl * 2;
+                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl);
+            }
+            pc  += vl * ldc * 2;
         }
-      }
+
+        a += m * 2;
     }
-    a += m * 2;
-  }
 }
 
 #endif
@@ -714,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
 
   size_t vl = VSETVL_MAX;
 
-  //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
 
   j = (n >> GEMM_UNROLL_N_SHIFT);
 
diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
index 41368be60..4751ae012 100644
--- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
@@ -32,28 +32,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m2()
 #define FLOAT_V_T vfloat32m2_t
 #define VLEV_FLOAT vle32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
 #define VSSEV_FLOAT vsse32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
 #define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
 #define VFMACCVF_FLOAT vfmacc_vf_f32m2
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
 #else
 #define VSETVL(n) vsetvl_e64m2(n)
 #define VSETVL_MAX vsetvlmax_e64m2()
 #define FLOAT_V_T vfloat64m2_t
 #define VLEV_FLOAT vle64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
 #define VSSEV_FLOAT vsse64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
 #define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT vfmacc_vf_f64m2
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
 #endif
 
-
 static FLOAT dm1 = -1.;
 
 #ifdef CONJ
@@ -86,569 +90,99 @@ static FLOAT dm1 = -1.;
 
 #ifndef COMPLEX
 
-#if GEMM_DEFAULT_UNROLL_N == 1
-
 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
 
-    FLOAT aa, bb;
-    FLOAT *pb, *pc;
-    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
-    int i, j, k;
-    size_t vl;
-    FLOAT_V_T vb, vc;
-
-    for (i = 0; i < n; i++) 
-    {
-        bb = *(b + i);
-
-        for (j = 0; j < m; j ++) 
-        {
-            aa = *(c + j + i * ldc);
-            aa *= bb;
-            *a  = aa;
-            *(c + j + i * ldc) = aa;
-            a ++;
-
-            pb = b + i + 1;
-            pc = c + j + (i + 1) *ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc = VLSEV_FLOAT(pc, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
-                VSSEV_FLOAT(pc, stride_ldc, vc, vl);
-                pb += vl;
-                pc ++;
-            }
-        }
-        b += n;
-    }
-}
-
-#elif GEMM_DEFAULT_UNROLL_N == 2
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+    FLOAT bb;
+    FLOAT *pci, *pcj;
 
-    FLOAT aa0, aa1, bb;
-    FLOAT *pb, *pc;
-    FLOAT *pa0, *pa1, *pc0, *pc1;
-    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
     int i, j, k;
-    size_t vl;
-    FLOAT_V_T vb, vc0, vc1;
-
-    for (i = 0; i < n; i++) 
-    {
-        bb = *(b + i);
-        pc = c + i * ldc;
-        for (j = 0; j < m/2; j ++) 
-        {
-            pa0 = pc + j * 2;
-            pa1 = pc + j * 2 + 1;
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-            *a      = aa0;
-            *(a + 1)= aa1;
-            a  += 2;
-
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            pc1 = pa1 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-                pb += vl;
-                pc0++;
-                pc1++;
-            }
-        }
-        pc += (m/2)*2;
-        if (m & 1)
-        {
-            pa0 = pc;
-            aa0 = *pa0 * bb;
-            
-            *pa0    = aa0;
-            *a      = aa0;
-            a  += 1;
-           
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                pb += vl;
-                pc0++;
-            }
-        }
-        b += n;
-    }
-}
-
-#elif GEMM_DEFAULT_UNROLL_N == 4
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+    FLOAT_V_T va, vc;
 
-    FLOAT bb;
-    FLOAT aa0, aa1, aa2, aa3;
-    FLOAT *pb, *pc;
-    FLOAT *pa0, *pa1, *pa2, *pa3;
-    FLOAT *pc0, *pc1, *pc2, *pc3;
-    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
-    int i, j, k;
     size_t vl;
-    FLOAT_V_T vb, vc0, vc1, vc2, vc3;
+    for (i = 0; i < n; i++) {
 
-    for (i = 0; i < n; i++) 
-    {
         bb = *(b + i);
-        pc = c + i * ldc;
-        for (j = 0; j < m/4; j ++) 
-        {
-            pa0 = pc + j * 4;
-            pa1 = pa0 + 1;
-            pa2 = pa1 + 1;
-            pa3 = pa2 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-            aa2 = *pa2 * bb;
-            aa3 = *pa3 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-            *pa2    = aa2;
-            *pa3    = aa3;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-            *(a + 2)= aa2;
-            *(a + 3)= aa3;
-
-            a  += 4;
-
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            pc1 = pa1 + ldc;
-            pc2 = pa2 + ldc;
-            pc3 = pa3 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
-                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
- 
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
-                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-                pc2++;
-                pc3++;
-            }
-        }
-        pc += (m/4)*4;
-
-        if (m & 2)
-        {
-            pa0 = pc;
-            pa1 = pa0 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-
-            a  += 2;
-
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            pc1 = pa1 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
- 
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-            }
-            pc += 2;
-        }
-
-        if (m & 1)
-        {
-            pa0 = pc;
-            aa0 = *pa0 * bb;
-            
-            *pa0    = aa0;
-            *a      = aa0;
-            a  += 1;
-           
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                pb += vl;
-                pc0++;
+        pci = c + i * ldc;
+        pcj = c;
+        for (j = m; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            va = VLEV_FLOAT(pci, vl);
+            va = VFMULVF_FLOAT(va, bb, vl);
+            VSEV_FLOAT(a, va, vl);
+            VSEV_FLOAT(pci, va, vl);
+            a   += vl;
+            pci += vl;
+            for (k = i + 1; k < n; k ++){
+                vc = VLEV_FLOAT(pcj + k * ldc, vl);
+                vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl);
+                VSEV_FLOAT(pcj + k * ldc, vc, vl);
             }
+            pcj += vl;
         }
         b += n;
     }
 }
 
-#elif GEMM_DEFAULT_UNROLL_N == 8
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-    FLOAT bb;
-    FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
-    FLOAT *pb, *pc;
-    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
-    FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
-    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
-    int i, j, k;
-    size_t vl;
-    FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
-
-    for (i = 0; i < n; i++) 
-    {
-        bb = *(b + i);
-        pc = c + i * ldc;
-        for (j = 0; j < m/8; j ++) 
-        {
-            pa0 = pc + j * 8;
-            pa1 = pa0 + 1;
-            pa2 = pa1 + 1;
-            pa3 = pa2 + 1;
-            pa4 = pa3 + 1;
-            pa5 = pa4 + 1;
-            pa6 = pa5 + 1;
-            pa7 = pa6 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-            aa2 = *pa2 * bb;
-            aa3 = *pa3 * bb;
-            aa4 = *pa4 * bb;
-            aa5 = *pa5 * bb;
-            aa6 = *pa6 * bb;
-            aa7 = *pa7 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-            *pa2    = aa2;
-            *pa3    = aa3;
-            *pa4    = aa4;
-            *pa5    = aa5;
-            *pa6    = aa6;
-            *pa7    = aa7;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-            *(a + 2)= aa2;
-            *(a + 3)= aa3;
-            *(a + 4)= aa4;
-            *(a + 5)= aa5;
-            *(a + 6)= aa6;
-            *(a + 7)= aa7;
-
-            a  += 8;
-
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            pc1 = pa1 + ldc;
-            pc2 = pa2 + ldc;
-            pc3 = pa3 + ldc;
-            pc4 = pa4 + ldc;
-            pc5 = pa5 + ldc;
-            pc6 = pa6 + ldc;
-            pc7 = pa7 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
-                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
-                vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
-                vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
-                vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
-                vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
- 
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
-                vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
-                vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
-                vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
-                vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
-                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
-                VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
-                VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
-                VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
-                VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-                pc2++;
-                pc3++;
-                pc4++;
-                pc5++;
-                pc6++;
-                pc7++;
-            }
-        }
-        pc += (m/8)*8;
-
-        if (m & 4)
-        {
-            pa0 = pc;
-            pa1 = pa0 + 1;
-            pa2 = pa1 + 1;
-            pa3 = pa2 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-            aa2 = *pa2 * bb;
-            aa3 = *pa3 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-            *pa2    = aa2;
-            *pa3    = aa3;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-            *(a + 2)= aa2;
-            *(a + 3)= aa3;
-
-            a  += 4;
-
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            pc1 = pa1 + ldc;
-            pc2 = pa2 + ldc;
-            pc3 = pa3 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
-                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
- 
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
-                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-                pc2++;
-                pc3++;
-            }
-            pc += 4;
-        }
-
-        if (m & 2)
-        {
-            pa0 = pc;
-            pa1 = pa0 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-
-            a  += 2;
-
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            pc1 = pa1 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
- 
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-            }
-            pc += 2;
-        }
-
-        if (m & 1)
-        {
-            pa0 = pc;
-            aa0 = *pa0 * bb;
-            
-            *pa0    = aa0;
-            *a      = aa0;
-            a  += 1;
-           
-            pb = b + i + 1;
-            pc0 = pa0 + ldc;
-            for (k = (n - i - 1); k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                pb += vl;
-                pc0++;
-            }
-        }
-        b += n;
-    }
-}
 #else
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-  FLOAT aa, bb;
 
-  int i, j, k;
-
-  for (i = 0; i < n; i++) {
-
-    bb = *(b + i);
-
-    for (j = 0; j < m; j ++) {
-      aa = *(c + j + i * ldc);
-      aa *= bb;
-      *a  = aa;
-      *(c + j + i * ldc) = aa;
-      a ++;
-
-      for (k = i + 1; k < n; k ++){
-	*(c + j + k * ldc) -= aa * *(b + k);
-      }
-
-    }
-    b += n;
-  }
-}
-
-#endif
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
 
-#else
+    FLOAT bb1, bb2;
 
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+    FLOAT *pci, *pcj;
 
-  FLOAT aa1, aa2;
-  FLOAT bb1, bb2;
-  FLOAT cc1, cc2;
+    int i, j, k;
 
-  int i, j, k;
+    FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
 
-  ldc *= 2;
+    size_t vl;
 
-  for (i = 0; i < n; i++) {
+    for (i = 0; i < n; i++) {
 
-    bb1 = *(b + i * 2 + 0);
-    bb2 = *(b + i * 2 + 1);
+        bb1 = *(b + i * 2 + 0);
+        bb2 = *(b + i * 2 + 1);
 
-    for (j = 0; j < m; j ++) {
-      aa1 = *(c + j * 2 + 0 + i * ldc);
-      aa2 = *(c + j * 2 + 1 + i * ldc);
+        pci = c + i * ldc * 2;
+        pcj = c;
 
+        for (j = m; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            VLSEG2_FLOAT(&va1, &va2, pci, vl);
 #ifndef CONJ
-      cc1 = aa1 * bb1 - aa2 * bb2;
-      cc2 = aa1 * bb2 + aa2 * bb1;
+            vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
+            vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
+            vs2 =   VFMULVF_FLOAT(va1, bb2, vl);
+            vs2 =  VFMACCVF_FLOAT(vs2, bb1, va2, vl);
 #else
-      cc1 =  aa1 * bb1 + aa2 * bb2;
-      cc2 = -aa1 * bb2 + aa2 * bb1;
+            vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
+            vs1 =  VFMACCVF_FLOAT(vs1, bb2, va2, vl);
+            vs2 =   VFMULVF_FLOAT(va2, bb1, vl);
+            vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
 #endif
+            VSSEG2_FLOAT(a, vs1, vs2, vl);
+            VSSEG2_FLOAT(pci, vs1, vs2, vl);
+            a += vl * 2;
+            pci += vl * 2;
 
-      *(a + 0) = cc1;
-      *(a + 1) = cc2;
-      *(c + j * 2 + 0 + i * ldc) = cc1;
-      *(c + j * 2 + 1 + i * ldc) = cc2;
-      a += 2;
-
-      for (k = i + 1; k < n; k ++){
+            for (k = i + 1; k < n; k ++){
+                VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl);
 #ifndef CONJ
-	*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
-	*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
+                vc1 =  VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
 #else
-	*(c + j * 2 + 0 + k * ldc) -=   cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
-	*(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
+                vc2 =  VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
 #endif
-      }
-
+                VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl);
+            }
+            pcj += vl * 2;
+        }
+        b += n * 2;
     }
-    b += n * 2;
-  }
 }
 
 #endif
@@ -666,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
 
   size_t vl = VSETVL_MAX;
 
-  //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
 
 
   j = (n >> GEMM_UNROLL_N_SHIFT);
diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
index 459c1663a..93a9e6916 100644
--- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
@@ -32,25 +32,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m2()
 #define FLOAT_V_T vfloat32m2_t
 #define VLEV_FLOAT vle32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
 #define VSEV_FLOAT vse32_v_f32m2
-#define VSSEV_FLOAT vsse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
 #define VSSEG2_FLOAT vsseg2e32_v_f32m2
 #define VFMACCVF_FLOAT vfmacc_vf_f32m2
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
 #else
 #define VSETVL(n) vsetvl_e64m2(n)
 #define VSETVL_MAX vsetvlmax_e64m2()
 #define FLOAT_V_T vfloat64m2_t
 #define VLEV_FLOAT vle64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
 #define VSEV_FLOAT vse64_v_f64m2
-#define VSSEV_FLOAT vsse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
 #define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT vfmacc_vf_f64m2
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
 #endif
 
 
@@ -86,497 +85,38 @@ static FLOAT dm1 = -1.;
 
 #ifndef COMPLEX
 
-#if GEMM_DEFAULT_UNROLL_N == 1
 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
 
-  FLOAT aa,  bb;
-  FLOAT *pb, *pc;
-  BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
-
-  int i, j, k;
-  size_t vl;
-  FLOAT_V_T vb, vc;
-
-  a += (n - 1) * m;
-  b += (n - 1) * n;
-
-  for (i = n - 1; i >= 0; i--) {
-
-    bb = *(b + i);
-
-    for (j = 0; j < m; j ++) {
-      aa = *(c + j + i * ldc);
-      aa *= bb;
-      *a   = aa;
-      *(c + j + i * ldc) = aa;
-      a ++;
-
-        pb = b;
-        pc = c + j;
-        for (k = i; k > 0; k -= vl)
-        {
-            vl = VSETVL(k);
-            vc = VLSEV_FLOAT(pc, stride_ldc, vl);
-            vb = VLEV_FLOAT(pb, vl);
-            vc = VFNMSACVF_FLOAT(vc, aa, vb, vl);
-            VSSEV_FLOAT(pc, stride_ldc, vc, vl);
-            pb += vl;
-            pc++;
-        }
-    }
-    b -= n;
-    a -= 2 * m;
-  }
-
-}
-#elif GEMM_DEFAULT_UNROLL_N == 2
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+    FLOAT bb;
+    FLOAT *pci, *pcj;
 
-    FLOAT aa0, aa1, bb;
-    FLOAT *pb, *pc;
-    FLOAT *pa0, *pa1, *pc0, *pc1;
-    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
     int i, j, k;
-    size_t vl;
-    FLOAT_V_T vb, vc0, vc1;
-
-    a += (n - 1) * m;
-    b += (n - 1) * n;
+    FLOAT_V_T va, vc;
 
-    for (i = n - 1; i >= 0; i--)
-    {
-        bb = *(b + i);
-        pc = c + i * ldc;
-        for (j = 0; j < m/2; j ++) 
-        {
-            pa0 = pc + j * 2;
-            pa1 = pc + j * 2 + 1;
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-            *a      = aa0;
-            *(a + 1)= aa1;
-            a  += 2;
-
-            pb  = b;
-            pc0 = c + j * 2;
-            pc1 = pc0 + 1;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-                pb += vl;
-                pc0++;
-                pc1++;
-            }
-        }
-        pc += (m/2)*2;
-
-        if (m & 1)
-        {
-            pa0 = pc;
-            aa0 = *pa0 * bb;
-            
-            *pa0    = aa0;
-            *a      = aa0;
-            a  += 1;
-           
-            pb = b;
-            pc0 = pc - i * ldc;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                pb += vl;
-                pc0++;
-            }
-        }
-        b -= n;
-        a -= 2 * m;
-    }
-}
-
-#elif GEMM_DEFAULT_UNROLL_N == 4
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-    FLOAT aa0, aa1, aa2, aa3;
-    FLOAT bb;
-    FLOAT *pb, *pc;
-    FLOAT *pa0, *pa1, *pa2, *pa3;
-    FLOAT *pc0, *pc1, *pc2, *pc3;
-    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
-    int i, j, k;
     size_t vl;
-    FLOAT_V_T vb, vc0, vc1, vc2, vc3;
 
     a += (n - 1) * m;
     b += (n - 1) * n;
 
-    for (i = n - 1; i >= 0; i--)
-    {
-        bb = *(b + i);
-        pc = c + i * ldc;
-        for (j = 0; j < m/4; j ++) 
-        {
-            pa0 = pc + j * 4;
-            pa1 = pa0 + 1;
-            pa2 = pa1 + 1;
-            pa3 = pa2 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-            aa2 = *pa2 * bb;
-            aa3 = *pa3 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-            *pa2    = aa2;
-            *pa3    = aa3;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-            *(a + 2)= aa2;
-            *(a + 3)= aa3;
-            a  += 4;
-
-            pb  = b;
-            pc0 = c + j * 4;
-            pc1 = pc0 + 1;
-            pc2 = pc1 + 1;
-            pc3 = pc2 + 1;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
-                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
-                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-                pc2++;
-                pc3++;
-            }
-        }
-        pc += (m/4)*4;
-
-        if (m & 2)
-        {
-            pa0 = pc + j * 2;
-            pa1 = pa0 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-            a  += 2;
-
-            pb  = b;
-            pc0 = c + j * 4;
-            pc1 = pc0 + 1;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-            }
-            pc += 2;
-        }
-
-        if (m & 1)
-        {
-            pa0 = pc;
-            aa0 = *pa0 * bb;
-            
-            *pa0    = aa0;
-            *a      = aa0;
-            a  += 1;
-           
-            pb = b;
-            pc0 = pc - i * ldc;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                pb += vl;
-                pc0++;
-            }
-        }
-        b -= n;
-        a -= 2 * m;
-    }
-}
-#elif GEMM_DEFAULT_UNROLL_N == 8
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-    FLOAT aa0, aa1, aa2, aa3, aa4, aa5, aa6, aa7;
-    FLOAT bb;
-    FLOAT *pb, *pc;
-    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
-    FLOAT *pc0, *pc1, *pc2, *pc3, *pc4, *pc5, *pc6, *pc7;
-    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
-    int i, j, k;
-    size_t vl;
-    FLOAT_V_T vb, vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7;
-
-    a += (n - 1) * m;
-    b += (n - 1) * n;
+    for (i = n - 1; i >= 0; i--) {
 
-    for (i = n - 1; i >= 0; i--)
-    {
         bb = *(b + i);
-        pc = c + i * ldc;
-        for (j = 0; j < m/8; j ++) 
-        {
-            pa0 = pc + j * 8;
-            pa1 = pa0 + 1;
-            pa2 = pa1 + 1;
-            pa3 = pa2 + 1;
-            pa4 = pa3 + 1;
-            pa5 = pa4 + 1;
-            pa6 = pa5 + 1;
-            pa7 = pa6 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-            aa2 = *pa2 * bb;
-            aa3 = *pa3 * bb;
-            aa4 = *pa4 * bb;
-            aa5 = *pa5 * bb;
-            aa6 = *pa6 * bb;
-            aa7 = *pa7 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-            *pa2    = aa2;
-            *pa3    = aa3;
-            *pa4    = aa4;
-            *pa5    = aa5;
-            *pa6    = aa6;
-            *pa7    = aa7;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-            *(a + 2)= aa2;
-            *(a + 3)= aa3;
-            *(a + 4)= aa4;
-            *(a + 5)= aa5;
-            *(a + 6)= aa6;
-            *(a + 7)= aa7;
-            a  += 8;
-
-            pb  = b;
-            pc0 = c + j * 8;
-            pc1 = pc0 + 1;
-            pc2 = pc1 + 1;
-            pc3 = pc2 + 1;
-            pc4 = pc3 + 1;
-            pc5 = pc4 + 1;
-            pc6 = pc5 + 1;
-            pc7 = pc6 + 1;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
-                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
-                vc4 = VLSEV_FLOAT(pc4, stride_ldc, vl);
-                vc5 = VLSEV_FLOAT(pc5, stride_ldc, vl);
-                vc6 = VLSEV_FLOAT(pc6, stride_ldc, vl);
-                vc7 = VLSEV_FLOAT(pc7, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
-                vc4 = VFNMSACVF_FLOAT(vc4, aa4, vb, vl);
-                vc5 = VFNMSACVF_FLOAT(vc5, aa5, vb, vl);
-                vc6 = VFNMSACVF_FLOAT(vc6, aa6, vb, vl);
-                vc7 = VFNMSACVF_FLOAT(vc7, aa7, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
-                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
-                VSSEV_FLOAT(pc4, stride_ldc, vc4, vl);
-                VSSEV_FLOAT(pc5, stride_ldc, vc5, vl);
-                VSSEV_FLOAT(pc6, stride_ldc, vc6, vl);
-                VSSEV_FLOAT(pc7, stride_ldc, vc7, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-                pc2++;
-                pc3++;
-                pc4++;
-                pc5++;
-                pc6++;
-                pc7++;
-            }
-        }
-        pc += (m/8)*8;
-
-        if (m & 4)
-        {
-            pa0 = pc;
-            pa1 = pa0 + 1;
-            pa2 = pa1 + 1;
-            pa3 = pa2 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-            aa2 = *pa2 * bb;
-            aa3 = *pa3 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-            *pa2    = aa2;
-            *pa3    = aa3;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-            *(a + 2)= aa2;
-            *(a + 3)= aa3;
-            a  += 4;
-
-            pb  = b;
-            pc0 = pc - i * ldc;
-            pc1 = pc0 + 1;
-            pc2 = pc1 + 1;
-            pc3 = pc2 + 1;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vc2 = VLSEV_FLOAT(pc2, stride_ldc, vl);
-                vc3 = VLSEV_FLOAT(pc3, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-                vc2 = VFNMSACVF_FLOAT(vc2, aa2, vb, vl);
-                vc3 = VFNMSACVF_FLOAT(vc3, aa3, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-                VSSEV_FLOAT(pc2, stride_ldc, vc2, vl);
-                VSSEV_FLOAT(pc3, stride_ldc, vc3, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-                pc2++;
-                pc3++;
-            }
-            pc += 4;
-        }
-
-        if (m & 2)
-        {
-            pa0 = pc;
-            pa1 = pa0 + 1;
-            
-            aa0 = *pa0 * bb;
-            aa1 = *pa1 * bb;
-
-            *pa0    = aa0;
-            *pa1    = aa1;
-
-            *a      = aa0;
-            *(a + 1)= aa1;
-            a  += 2;
-
-            pb  = b;
-            pc0 = pc - i * ldc;
-            pc1 = pc0 + 1;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vc1 = VLSEV_FLOAT(pc1, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                vc1 = VFNMSACVF_FLOAT(vc1, aa1, vb, vl);
-
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                VSSEV_FLOAT(pc1, stride_ldc, vc1, vl);
-
-                pb += vl;
-                pc0++;
-                pc1++;
-            }
-            pc += 2;
-        }
-
-        if (m & 1)
-        {
-            pa0 = pc;
-            aa0 = *pa0 * bb;
-            
-            *pa0    = aa0;
-            *a      = aa0;
-            a  += 1;
-           
-            pb = b;
-            pc0 = pc - i * ldc;
-            for (k = i; k > 0; k -= vl)
-            {
-                vl = VSETVL(k);
-                vc0 = VLSEV_FLOAT(pc0, stride_ldc, vl);
-                vb = VLEV_FLOAT(pb, vl);
-                vc0 = VFNMSACVF_FLOAT(vc0, aa0, vb, vl);
-                VSSEV_FLOAT(pc0, stride_ldc, vc0, vl);
-                pb += vl;
-                pc0++;
+        pci = c + i * ldc;
+        pcj = c;
+        for (j = m; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            va = VLEV_FLOAT(pci, vl);
+            va = VFMULVF_FLOAT(va, bb, vl);
+            VSEV_FLOAT(a, va, vl);
+            VSEV_FLOAT(pci, va, vl);
+            a   += vl;
+            pci += vl;
+            for (k = 0; k < i; k ++){
+                vc = VLEV_FLOAT(pcj + k * ldc, vl);
+                vc = VFNMSACVF_FLOAT(vc, *(b + k), va, vl);
+                VSEV_FLOAT(pcj + k * ldc, vc, vl);
             }
+            pcj += vl;
         }
         b -= n;
         a -= 2 * m;
@@ -587,92 +127,65 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
 static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
 
-  FLOAT aa,  bb;
-
-  int i, j, k;
-
-  a += (n - 1) * m;
-  b += (n - 1) * n;
+    FLOAT bb1, bb2;
 
-  for (i = n - 1; i >= 0; i--) {
+    FLOAT *pci, *pcj;
 
-    bb = *(b + i);
-
-    for (j = 0; j < m; j ++) {
-      aa = *(c + j + i * ldc);
-      aa *= bb;
-      *a   = aa;
-      *(c + j + i * ldc) = aa;
-      a ++;
-
-      for (k = 0; k < i; k ++){
-	*(c + j + k * ldc) -= aa * *(b + k);
-      }
-
-    }
-    b -= n;
-    a -= 2 * m;
-  }
-
-}
-
-#endif
-
-#else
-
-static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
-
-  FLOAT aa1, aa2;
-  FLOAT bb1, bb2;
-  FLOAT cc1, cc2;
-
-  int i, j, k;
-
-  ldc *= 2;
+    int i, j, k;
 
-  a += (n - 1) * m * 2;
-  b += (n - 1) * n * 2;
+    FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
 
-  for (i = n - 1; i >= 0; i--) {
+    size_t vl;
 
-    bb1 = *(b + i * 2 + 0);
-    bb2 = *(b + i * 2 + 1);
+    a += (n - 1) * m * 2;
+    b += (n - 1) * n * 2;
 
-    for (j = 0; j < m; j ++) {
+    for (i = n - 1; i >= 0; i--) {
 
-      aa1 = *(c + j * 2 + 0 + i * ldc);
-      aa2 = *(c + j * 2 + 1 + i * ldc);
+        bb1 = *(b + i * 2 + 0);
+        bb2 = *(b + i * 2 + 1);
 
+        pci = c + i * ldc * 2;
+        pcj = c;
+        for (j = m; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            VLSEG2_FLOAT(&va1, &va2, pci, vl);
 #ifndef CONJ
-      cc1 = aa1 * bb1 - aa2 * bb2;
-      cc2 = aa1 * bb2 + aa2 * bb1;
+            vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
+            vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
+            vs2 =   VFMULVF_FLOAT(va1, bb2, vl);
+            vs2 =  VFMACCVF_FLOAT(vs2, bb1, va2, vl);
 #else
-      cc1 =  aa1 * bb1  + aa2 * bb2;
-      cc2 = - aa1 * bb2 + aa2 * bb1;
+            vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
+            vs1 =  VFMACCVF_FLOAT(vs1, bb2, va2, vl);
+            vs2 =   VFMULVF_FLOAT(va2, bb1, vl);
+            vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
 #endif
+            VSSEG2_FLOAT(a, vs1, vs2, vl);
+            VSSEG2_FLOAT(pci, vs1, vs2, vl);
+            a += vl * 2;
+            pci += vl * 2;
 
-      *(a + 0) = cc1;
-      *(a + 1) = cc2;
-
-      *(c + j * 2 + 0 + i * ldc) = cc1;
-      *(c + j * 2 + 1 + i * ldc) = cc2;
-      a += 2;
-
-      for (k = 0; k < i; k ++){
+            for (k = 0; k < i; k ++){
+                VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl);
 #ifndef CONJ
-	*(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1);
-	*(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
+                vc1 =  VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
 #else
-	*(c + j * 2 + 0 + k * ldc) -=   cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1);
-	*(c + j * 2 + 1 + k * ldc) -=  -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
+                vc2 =  VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
 #endif
-      }
-
+                VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl);
+            }
+            pcj += vl * 2;
+        }
+        b -= n * 2;
+        a -= 4 * m;
     }
-    b -= n * 2;
-    a -= 4 * m;
-  }
-
 }
 
 #endif
@@ -689,7 +202,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k,  FLOAT dummy1,
 
   size_t vl = VSETVL_MAX;
 
-  //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
 
   kk = n - offset;
   c += n * ldc * COMPSIZE;
diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c
new file mode 100644
index 000000000..389ee5d57
--- /dev/null
+++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m1(n)
+#define FLOAT_V_T vfloat32m1_t
+#define VLSEG2_FLOAT vlseg2e32_v_f32m1
+#define VSSEG2_FLOAT vsseg2e32_v_f32m1
+#define VSSEG4_FLOAT vsseg4e32_v_f32m1
+#define VSSEG8_FLOAT vsseg8e32_v_f32m1
+#else
+#define VSETVL(n) vsetvl_e64m1(n)
+#define FLOAT_V_T vfloat64m1_t
+#define VLSEG2_FLOAT vlseg2e64_v_f64m1
+#define VSSEG2_FLOAT vsseg2e64_v_f64m1
+#define VSSEG4_FLOAT vsseg4e64_v_f64m1
+#define VSSEG8_FLOAT vsseg8e64_v_f64m1
+#endif
+
+// Optimizes the implementation in ../generic/zgemm_ncopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+    BLASLONG i, j;
+
+    FLOAT *aoffset;
+    FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+
+    FLOAT *boffset;
+
+    FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42;
+    size_t vl;
+
+    aoffset = a;
+    boffset = b;
+    lda *= 2;
+
+    for (j = (n >> 2); j > 0; j--) {
+        aoffset1  = aoffset;
+        aoffset2  = aoffset1 + lda;
+        aoffset3  = aoffset2 + lda;
+        aoffset4  = aoffset3 + lda;
+        aoffset  += 4 * lda;
+
+        for (i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+            VLSEG2_FLOAT(&v11, &v12, aoffset1, vl);
+            VLSEG2_FLOAT(&v21, &v22, aoffset2, vl);
+            VLSEG2_FLOAT(&v31, &v32, aoffset3, vl);
+            VLSEG2_FLOAT(&v41, &v42, aoffset4, vl);
+
+            VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl);
+
+            aoffset1 += vl * 2;
+            aoffset2 += vl * 2;
+            aoffset3 += vl * 2;
+            aoffset4 += vl * 2;
+            boffset  += vl * 8;
+        }
+    }
+
+    if (n & 2) {
+        aoffset1  = aoffset;
+        aoffset2  = aoffset1 + lda;
+        aoffset  += 2 * lda;
+        
+        for (i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+            VLSEG2_FLOAT(&v11, &v12, aoffset1, vl);
+            VLSEG2_FLOAT(&v21, &v22, aoffset2, vl);
+        
+            VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl);
+        
+            aoffset1 += vl * 2;
+            aoffset2 += vl * 2;
+            boffset  += vl * 4;
+        }
+    }
+
+    if (n & 1) {
+        aoffset1  = aoffset;
+        aoffset  += lda;
+
+        for (i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+            VLSEG2_FLOAT(&v11, &v12, aoffset1, vl);
+
+            VSSEG2_FLOAT(boffset, v11, v12, vl);
+
+            aoffset1 += vl * 2;
+            boffset  += vl * 2;
+        }
+    }
+
+     return 0;
+}
diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c
new file mode 100644
index 000000000..df039bab6
--- /dev/null
+++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c
@@ -0,0 +1,74 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+
+    BLASLONG i, j;
+
+    FLOAT *a_offset;
+    FLOAT *a_offset1;
+    FLOAT *b_offset;
+
+    FLOAT_V_T v0, v1;
+    size_t vl;
+
+    //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
+    a_offset = a;
+    b_offset = b;
+
+    for(j = n; j > 0; j -= vl) {
+        vl = VSETVL(j);
+
+        a_offset1 = a_offset;
+        a_offset += vl * lda * 2;
+
+        for(i = m; i > 0; i--) {
+            VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG2_FLOAT(b_offset, v0, v1, vl);
+
+            a_offset1 += 2;
+            b_offset += vl * 2;
+        }
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c
new file mode 100644
index 000000000..1b34039c8
--- /dev/null
+++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c
@@ -0,0 +1,181 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m1(n)
+#define FLOAT_V_T vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m1
+#define VSEV_FLOAT vse32_v_f32m1
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1
+#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1
+#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1
+#define VSSEG2_FLOAT vsseg2e32_v_f32m1
+#define VSSEG4_FLOAT vsseg4e32_v_f32m1
+#define VSSEG8_FLOAT vsseg8e32_v_f32m1
+#else
+#define VSETVL(n) vsetvl_e64m1(n)
+#define FLOAT_V_T vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m1
+#define VSEV_FLOAT vse64_v_f64m1
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1
+#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1
+#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1
+#define VSSEG2_FLOAT vsseg2e64_v_f64m1
+#define VSSEG4_FLOAT vsseg4e64_v_f64m1
+#define VSSEG8_FLOAT vsseg8e64_v_f64m1
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+
+    BLASLONG i, j;
+
+    IFLOAT *aoffset;
+    IFLOAT *aoffset1;
+
+    IFLOAT *boffset, *boffset1, *boffset2, *boffset3;
+
+    FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
+    size_t vl;
+
+    //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
+
+    aoffset   = a;
+    boffset   = b;
+    boffset2  = b + 2 * m  * (n & ~3);
+    boffset3  = b + 2 * m  * (n & ~1);
+
+    for(j = (m >> 2); j > 0; j--) {
+
+        aoffset1  = aoffset;
+        aoffset += 8 * lda;
+
+        boffset1  = boffset;
+        boffset  += 32;
+
+        for(i = (n >> 2); i > 0; i--) {
+            vl = 4;
+
+            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 2) {
+            vl = 4;
+
+            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+
+            aoffset1 += 4;
+            boffset2 += 16;
+        }
+
+        if (n & 1) {
+            vl = 4;
+
+            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+
+            aoffset1 += 2;
+            boffset3 += 8;
+        }
+    }
+
+    if (m & 2) {
+        aoffset1  = aoffset;
+        aoffset += 4 * lda;
+
+        boffset1  = boffset;
+        boffset  += 16;
+
+        for(i = (n >> 2); i > 0; i--) {
+            vl = 2;
+
+            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 2) {
+            vl = 2;
+
+            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+
+            aoffset1 += 4;
+            boffset2 += 8;
+        }
+
+        if (n & 1) {
+            vl = 2;
+
+            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+
+            //aoffset1 += 2;
+            boffset3 += 4;
+        }
+    }
+
+    if (m & 1) {
+        aoffset1  = aoffset;
+        boffset1  = boffset;
+
+        for(i = (n >> 2); i > 0; i--) {
+            vl = 8;
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset1, v0, vl);
+
+            aoffset1 += 8;
+            boffset1 += 8 * m;
+        }
+
+        if (n & 2) {
+            vl = 4;
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset2, v0, vl);
+
+            aoffset1 += 4;
+            //boffset2 += 4;
+        }
+
+        if (n & 1) {
+           *(boffset3) = *(aoffset1);
+           *(boffset3 + 1) = *(aoffset1 + 1);
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c
new file mode 100644
index 000000000..7622fb810
--- /dev/null
+++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c
@@ -0,0 +1,74 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+    BLASLONG i, j;
+
+    IFLOAT *aoffset;
+    IFLOAT *aoffset1;
+    IFLOAT *boffset;
+
+    FLOAT_V_T v0, v1;
+    size_t vl;
+
+    //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
+
+    aoffset = a;
+    boffset = b;
+
+    for(j = n; j > 0; j -= vl) {
+        vl = VSETVL(j);
+
+        aoffset1 = aoffset;
+        aoffset += vl * 2;
+
+        for(i = m; i > 0; i--) {
+            VLSEG2_FLOAT(&v0, &v1, aoffset1, vl);
+            VSSEG2_FLOAT(boffset, v0, v1, vl);
+
+            aoffset1 += lda * 2;
+            boffset += vl * 2;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c
new file mode 100644
index 000000000..50e29222f
--- /dev/null
+++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c
@@ -0,0 +1,475 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFMACCVF_FLOAT
+#define OP_ii       VFNMSACVF_FLOAT
+#define OP_ri       VFMACCVF_FLOAT
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFMACCVF_FLOAT
+#define OP_ii       VFMACCVF_FLOAT
+#define OP_ri       VFNMSACVF_FLOAT
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFNMSACVF_FLOAT
+#define OP_ii       VFMACCVF_FLOAT
+#define OP_ri       VFMACCVF_FLOAT
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFNMSACVF_FLOAT
+#define OP_ii       VFNMSACVF_FLOAT
+#define OP_ri       VFNMSACVF_FLOAT
+#endif
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
+#ifdef	TRMMKERNEL
+		, BLASLONG offset
+#endif
+		)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb;
+
+    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+
+    //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc); // Debug
+
+    size_t vl;
+    for (j = bn/4; j > 0; j--)
+    {
+        C0 = C;
+        C1 = C0 + 2 * ldc;
+        C2 = C1 + 2 * ldc;
+        C3 = C2 + 2 * ldc;
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+            vres4 = VFMVVF_FLOAT(0.0, vl);
+            vres5 = VFMVVF_FLOAT(0.0, vl);
+            vres6 = VFMVVF_FLOAT(0.0, vl);
+            vres7 = VFMVVF_FLOAT(0.0, vl);
+
+            for (k = bk/4; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+
+                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va0, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va1, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va1, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va0, vl);
+
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va0, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va1, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va1, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+
+                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va2, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va3, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va3, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va2, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va2, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va3, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va3, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va2, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va2, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va3, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va3, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va2, vl);
+                
+                ptrbb += 8;
+
+                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va4, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va5, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va5, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va4, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va4, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va5, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va5, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va4, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va4, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va5, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va5, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va4, vl);
+                ptrbb += 8;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va6, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va7, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va7, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va6, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va6, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va7, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va7, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va6, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va6, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va7, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va7, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va6, vl);
+
+                ptrbb += 8;
+            }
+
+            for (k = (bk & 3); k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va0, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va1, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va1, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va0, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va0, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va1, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va1, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+            }
+
+            VLSEG2_FLOAT(&va0, &va1, C0, vl);
+            VLSEG2_FLOAT(&va2, &va3, C1, vl);
+
+            va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+            VSSEG2_FLOAT(C0, va0, va1, vl);
+
+            va2 =  VFMACCVF_FLOAT(va2, alphar, vres2, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphar, vres3, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
+            VSSEG2_FLOAT(C1, va2, va3, vl);
+
+            VLSEG2_FLOAT(&va0, &va1, C2, vl);
+            VLSEG2_FLOAT(&va2, &va3, C3, vl);
+
+            va0 =  VFMACCVF_FLOAT(va0, alphar, vres4, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphar, vres5, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres4, vl);
+            VSSEG2_FLOAT(C2, va0, va1, vl);
+
+            va2 =  VFMACCVF_FLOAT(va2, alphar, vres6, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphar, vres7, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres6, vl);
+            VSSEG2_FLOAT(C3, va2, va3, vl);
+
+            C0 += vl * 2;
+            C1 += vl * 2;
+            C2 += vl * 2;
+            C3 += vl * 2;
+        }
+
+        bb += (bk << 3);
+        C  += (ldc << 3);
+    }
+
+    if (bn & 2)
+    {
+        C0 = C;
+        C1 = C0 + 2 * ldc;
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+
+            for (k = bk/4; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+
+                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va2, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va3, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va3, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va2, vl);
+                
+                ptrbb += 4;
+
+                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va4, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va5, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va5, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va4, vl);
+                
+                ptrbb += 4;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va6, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va7, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va7, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va6, vl);
+                
+                ptrbb += 4;
+            }
+
+            for (k = (bk & 3); k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+                
+                ptrbb += 4;
+            }
+
+            VLSEG2_FLOAT(&va0, &va1, C0, vl);
+            VLSEG2_FLOAT(&va2, &va3, C1, vl);
+
+            va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+            VSSEG2_FLOAT(C0, va0, va1, vl);
+
+            va2 =  VFMACCVF_FLOAT(va2, alphar, vres2, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphar, vres3, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
+            VSSEG2_FLOAT(C1, va2, va3, vl);
+
+            C0 += vl * 2;
+            C1 += vl * 2;
+        }
+
+        bb += (bk << 2);
+        C  += (ldc << 2);
+    }
+
+    if (bn & 1)
+    {
+        C0 = C;
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+
+            for (k = bk/4; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+                ptrbb += 2;
+
+                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+
+                ptrbb += 2;
+
+                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                ptrbb += 2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                ptrbb += 2;
+            }
+
+            for (k = (bk & 3); k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+                ptrbb += 2;
+            }
+            
+            VLSEG2_FLOAT(&va0, &va1, C0, vl);
+            va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+            VSSEG2_FLOAT(C0, va0, va1, vl);
+            C0 += vl * 2;
+        }
+
+        bb += bk << 1;
+        C  += ldc << 1;
+   }
+   return 0;
+}
+
diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..cf466d3fa
--- /dev/null
+++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
@@ -0,0 +1,124 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define INT_V_T     vint32m2_t
+#define VID_V_INT   vid_v_i32m2
+#define VADD_VX_INT vadd_vx_i32m2
+#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2
+#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
+#define VMSLT_VX_INT vmslt_vx_i32m2_b16
+#define VMSEQ_VX_INT vmseq_vx_i32m2_b16
+#define VBOOL_T     vbool16_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define INT_V_T     vint64m2_t
+#define VID_V_INT   vid_v_i64m2
+#define VADD_VX_INT vadd_vx_i64m2
+#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2
+#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
+#define VMSLT_VX_INT vmslt_vx_i64m2_b32
+#define VMSEQ_VX_INT vmseq_vx_i64m2_b32
+#define VBOOL_T     vbool32_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY);
+
+    BLASLONG i, js, offset;
+
+    FLOAT *ao1, *ao2;
+
+    BLASLONG stride_lda = sizeof(FLOAT) * lda * 2;
+
+    FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero;
+    VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0;
+    INT_V_T vindex_max, vindex;
+
+    size_t vl = VSETVL_MAX;
+    vindex_max   = VID_V_INT(vl);
+    vzero = VFMVVF_FLOAT(ZERO, vl);
+
+    for (js = n; js > 0; js -= vl, posX += vl) {
+        vl = VSETVL(js);
+        offset = posX - posY;
+
+        ao1 = a + posX * 2 + posY * lda * 2;
+        ao2 = a + posY * 2 + posX * lda * 2;
+
+        for (i = m; i > 0; i--, offset--) {
+            VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl);
+            VLSEG2_FLOAT(&va10, &va11, ao1, vl);
+
+            vindex = VADD_VX_INT(vindex_max, offset, vl);
+            vbool_gt0  = VMSGT_VX_INT(vindex, 0, vl);
+            vbool_lt0  = VMSLT_VX_INT(vindex, 0, vl);
+            vbool_eq0  = VMSEQ_VX_INT(vindex, 0, vl);
+
+            vb0 =  VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl);
+
+            vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl);
+
+            vb1 =  VMERGE_VVM_FLOAT(vbool_lt0, vb1, vb2, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl);
+            VSSEG2_FLOAT(b, vb0, vb1, vl);
+
+            b   += vl * 2;
+            ao1 += lda * 2;
+            ao2 += 2;
+        }
+    }
+    
+    return 0;
+}
+
diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..6209f5417
--- /dev/null
+++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c
@@ -0,0 +1,120 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define INT_V_T     vint32m2_t
+#define VID_V_INT   vid_v_i32m2
+#define VADD_VX_INT vadd_vx_i32m2
+#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2
+#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
+#define VMSLT_VX_INT vmslt_vx_i32m2_b16
+#define VMSEQ_VX_INT vmseq_vx_i32m2_b16
+#define VBOOL_T     vbool16_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define INT_V_T     vint64m2_t
+#define VID_V_INT   vid_v_i64m2
+#define VADD_VX_INT vadd_vx_i64m2
+#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2
+#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
+#define VMSLT_VX_INT vmslt_vx_i64m2_b32
+#define VMSEQ_VX_INT vmseq_vx_i64m2_b32
+#define VBOOL_T     vbool32_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    BLASLONG i, js, offset;
+
+    FLOAT *ao1, *ao2;
+    //fprintf(stderr, "%s, %s, m=%ld n=%ld lda=%ld posX=%ld posY=%ld\n", __FUNCTION__, __FILE__, m, n, lda, posX, posY);
+    BLASLONG stride_lda = sizeof(FLOAT) * lda * 2;
+    
+    FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero;
+    VBOOL_T vbool_gt0, vbool_eq0;
+    INT_V_T vindex_max, vindex;
+
+    size_t vl = VSETVL_MAX;
+    vindex_max   = VID_V_INT(vl);
+    vzero = VFMVVF_FLOAT(ZERO, vl);
+
+    for (js = n; js > 0; js -= vl, posX += vl) {
+        vl = VSETVL(js);
+        offset = posX - posY;
+
+        ao1 = a + posY * 2 + posX * lda * 2;
+        ao2 = a + posX * 2 + posY * lda * 2;
+
+        for (i = m; i > 0; i--, offset--) {
+            VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl);
+            VLSEG2_FLOAT(&va20, &va21, ao2, vl);
+
+            vindex = VADD_VX_INT(vindex_max, offset, vl);
+            vbool_gt0  = VMSGT_VX_INT(vindex, 0, vl);
+            vbool_eq0  = VMSEQ_VX_INT(vindex, 0, vl);
+
+            vb0 =  VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl);
+
+            vb2 =  VFRSUB_VF_FLOAT(vb1, ZERO, vl);
+
+            vb1 =  VMERGE_VVM_FLOAT(vbool_gt0, vb1, vb2, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl);
+            VSSEG2_FLOAT(b, vb0, vb1, vl);
+
+            b   += vl * 2;
+            ao1 += 2;
+            ao2 += lda * 2;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c
new file mode 100644
index 000000000..df5c916a5
--- /dev/null
+++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c
@@ -0,0 +1,106 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define INT_V_T     vint32m2_t
+#define VID_V_INT   vid_v_i32m2
+#define VADD_VX_INT vadd_vx_i32m2
+#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
+#define VBOOL_T     vbool16_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define INT_V_T     vint64m2_t
+#define VID_V_INT   vid_v_i64m2
+#define VADD_VX_INT vadd_vx_i64m2
+#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
+#define VBOOL_T     vbool32_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    BLASLONG i, js, offset;
+
+    FLOAT *ao1, *ao2;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
+
+    FLOAT_V_T vb0, vb1, va10, va11, va20, va21;
+    VBOOL_T vbool;
+    INT_V_T vindex_max, vindex;
+
+    size_t vl = VSETVL_MAX;
+    vindex_max   = VID_V_INT(vl);
+
+    for (js = n; js > 0; js -= vl, posX += vl) {
+        vl = VSETVL(js);
+        offset = posX - posY;
+
+        ao1 = a + posX * 2 + posY * lda * 2;
+        ao2 = a + posY * 2 + (posX) * lda * 2;
+
+        for (i = m; i > 0; i--, offset--) {
+
+            VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl);
+            VLSEG2_FLOAT(&va10, &va11, ao1, vl);
+
+            vindex = VADD_VX_INT(vindex_max, offset, vl);
+            vbool  = VMSGT_VX_INT(vindex, 0, vl);
+
+            vb0 =  VMERGE_VVM_FLOAT(vbool, va20, va10, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vbool, va21, va11, vl);
+            VSSEG2_FLOAT(b, vb0, vb1, vl);
+
+            b   += vl * 2;
+            ao1 += lda * 2;
+            ao2 += 2;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c
new file mode 100644
index 000000000..dcf2b081a
--- /dev/null
+++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c
@@ -0,0 +1,106 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define VSETVL_MAX vsetvlmax_e32m2()
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define INT_V_T     vint32m2_t
+#define VID_V_INT   vid_v_i32m2
+#define VADD_VX_INT vadd_vx_i32m2
+#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
+#define VBOOL_T     vbool16_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define VSETVL_MAX vsetvlmax_e64m2()
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define INT_V_T     vint64m2_t
+#define VID_V_INT   vid_v_i64m2
+#define VADD_VX_INT vadd_vx_i64m2
+#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
+#define VBOOL_T     vbool32_t
+#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    BLASLONG i, js, offset;
+
+    FLOAT *ao1, *ao2;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda * 2;
+    
+    FLOAT_V_T vb0, vb1, va10, va11, va20, va21;
+    VBOOL_T vbool;
+    INT_V_T vindex_max, vindex;
+
+
+    size_t vl = VSETVL_MAX;
+    vindex_max   = VID_V_INT(vl);
+
+    for (js = n; js > 0; js -= vl, posX += vl) {
+        vl = VSETVL(js);
+        offset = posX - posY;
+
+        ao1 = a + posY * 2 + (posX + 0) * lda * 2;
+        ao2 = a + posX * 2 + 0 + posY * lda * 2;
+
+        for (i = m; i > 0; i--, offset--) {
+            VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl);
+            VLSEG2_FLOAT(&va20, &va21, ao2, vl);
+
+            vindex = VADD_VX_INT(vindex_max, offset, vl);
+            vbool  = VMSGT_VX_INT(vindex, 0, vl);
+
+            vb0 =  VMERGE_VVM_FLOAT(vbool, va20, va10, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vbool, va21, va11, vl);
+            VSSEG2_FLOAT(b, vb0, vb1, vl);
+
+            b   += vl * 2;
+            ao1 += 2;
+            ao2 += lda * 2;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..afd694408
--- /dev/null
+++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
@@ -0,0 +1,145 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vint32m2_t
+#define VID_V_UINT vid_v_i32m2
+#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+
+    FLOAT *ao;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
+    
+    FLOAT_V_T va0, va1;
+
+    size_t vl;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY) 
+        {
+            ao = a + posY * 2 + posX * lda * 2;
+        } 
+        else 
+        {
+            ao = a + posX * 2 + posY * lda * 2;
+        }
+
+        i = 0;
+        do 
+        {
+            if (X > posY) 
+            {
+                VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                VSSEG2_FLOAT(b, va0, va1, vl);
+
+                ao  += 2;
+                b   += vl * 2;
+
+                X ++;
+                i ++;
+            } 
+            else if (X < posY) 
+            {
+                ao  += lda * 2;
+                b   += vl * 2;
+                X ++;
+                i ++;
+            } 
+            else 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
+                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    va0 =  VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
+                    va1 =  VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
+#endif
+                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    ao  += 2;
+                    b   += vl * 2;
+                }
+
+                X += vl;
+                i += vl;
+            }
+        } while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..c7d593949
--- /dev/null
+++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
@@ -0,0 +1,143 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+
+    FLOAT *ao;
+    
+    FLOAT_V_T va0, va1;
+    size_t vl;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY) 
+        {
+            ao = a + posY * 2 + posX * lda * 2;
+        } 
+        else 
+        {
+            ao = a + posX * 2 + posY * lda * 2;
+        }
+
+        i = 0;
+        do 
+        {
+            if (X > posY) 
+            {
+                ao  += 2;
+                b   += vl * 2;
+                X++;
+                i++;
+            } 
+            else if (X < posY) 
+            {
+                //va1 = VLEV_FLOAT(ao, vl);
+                VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                VSSEG2_FLOAT(b, va0, va1, vl);
+
+                ao  += lda * 2;
+                b   += vl * 2;
+                X ++;
+                i ++;
+            }
+            else
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    //va1 = VLEV_FLOAT(ao, vl);
+                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
+                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    va0 =  VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
+                    va1 =  VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
+#endif
+                    //VSEV_FLOAT(b, vb, vl);
+                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    ao  += lda * 2;
+                    b   += vl * 2;
+                }
+                X += vl;
+                i += vl;
+
+            }
+        } while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..3c70b6385
--- /dev/null
+++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
@@ -0,0 +1,144 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VLSEV_FLOAT vlse32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VLSEV_FLOAT vlse64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T    vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, js, X;
+    BLASLONG stride_lda = sizeof(FLOAT) * lda * 2;
+    FLOAT *ao;
+
+    FLOAT_V_T va0, va1;
+    size_t vl;
+
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+
+        if (posX <= posY) 
+        {
+            ao = a + posX * 2 + posY * lda * 2;
+        } 
+        else 
+        {
+            ao = a + posY * 2 + posX * lda * 2;
+        }
+
+        i = 0;
+        do
+        {
+            if (X < posY) 
+            {
+                VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                VSSEG2_FLOAT(b, va0, va1, vl);
+
+                ao  += 2;
+                b   += vl * 2;
+
+                X++;
+                i++;
+            } 
+            else if (X > posY) 
+            {
+                ao  += lda * 2;
+                b   += vl * 2;
+
+                X++;
+                i++;
+            } 
+            else 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
+                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    va0 =  VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
+                    va1 =  VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
+#endif
+                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    ao  += 2;
+                    b   += vl * 2;
+                }
+
+                X += vl;
+                i += vl;
+            }
+        }while (i < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..706782cf0
--- /dev/null
+++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
@@ -0,0 +1,140 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+    BLASLONG i, j, js, X;
+
+    FLOAT *ao;
+    FLOAT_V_T va0, va1;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+
+        X = posX;
+
+        if (posX <= posY) 
+        {
+            ao = a + posX * 2 + posY * lda * 2;
+        } 
+        else 
+        {
+            ao = a + posY * 2 + posX * lda * 2;
+        }
+
+        i = 0;
+        do
+        {
+            if (X < posY) 
+            {
+                ao  += 2;
+                b   += vl * 2;
+                X++;
+                i++;
+            }
+            else if (X > posY)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                VSSEG2_FLOAT(b, va0, va1, vl);
+                ao  += lda * 2;
+                b   += vl * 2;
+                X++;
+                i++;
+            }
+            else
+            {
+                vindex  = VID_V_UINT(vl);
+                for (j = 0; j < vl; j++) 
+                {
+                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
+                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
+                    va0 =  VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
+                    va1 =  VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
+#endif
+                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    ao += lda * 2;
+                    b += vl * 2;
+                }
+                X += vl;
+                i += vl;
+            }
+        }while (i < m);
+        posY += vl;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
new file mode 100644
index 000000000..27409ec25
--- /dev/null
+++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
@@ -0,0 +1,574 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLEV_FLOAT vle32_v_f32m2
+#define VSEV_FLOAT vse32_v_f32m2
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT vfmul_vf_f32m2
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLEV_FLOAT vle64_v_f64m2
+#define VSEV_FLOAT vse64_v_f64m2
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT vfmul_vf_f64m2
+#endif
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFMACCVF_FLOAT
+#define OP_ii       VFNMSACVF_FLOAT
+#define OP_ri       VFMACCVF_FLOAT
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFMACCVF_FLOAT
+#define OP_ii       VFMACCVF_FLOAT
+#define OP_ri       VFNMSACVF_FLOAT
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFNMSACVF_FLOAT
+#define OP_ii       VFMACCVF_FLOAT
+#define OP_ri       VFMACCVF_FLOAT
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFNMSACVF_FLOAT
+#define OP_ii       VFNMSACVF_FLOAT
+#define OP_ri       VFNMSACVF_FLOAT
+#endif
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb;
+	BLASLONG off, temp;
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+	off = -offset;
+#else
+	off = 0;
+#endif
+
+    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+
+    //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug
+
+    size_t vl;
+    for (j = bn/4; j > 0; j--)
+    {
+        C0 = C;
+        C1 = C0 + 2 * ldc;
+        C2 = C1 + 2 * ldc;
+        C3 = C2 + 2 * ldc;
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl*2;
+            ptrbb = bb + off*4*2;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+            vres4 = VFMVVF_FLOAT(0.0, vl);
+            vres5 = VFMVVF_FLOAT(0.0, vl);
+            vres6 = VFMVVF_FLOAT(0.0, vl);
+            vres7 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+4;   // number of values in B
+#endif
+
+            for (k = temp/4; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+
+                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va0, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va1, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va1, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va0, vl);
+
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va0, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va1, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va1, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+
+                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va2, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va3, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va3, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va2, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va2, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va3, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va3, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va2, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va2, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va3, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va3, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va2, vl);
+                
+                ptrbb += 8;
+
+                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va4, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va5, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va5, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va4, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va4, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va5, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va5, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va4, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va4, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va5, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va5, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va4, vl);
+                
+                ptrbb += 8;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va6, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va7, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va7, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va6, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va6, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va7, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va7, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va6, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va6, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va7, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va7, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va6, vl);
+                
+                ptrbb += 8;
+            }
+
+            for (k = temp & 3; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va0, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va1, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va1, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va0, vl);
+
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va0, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va1, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va1, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+            }
+            va0 =  VFMULVF_FLOAT(vres0, alphar, vl);
+            va1 =  VFMULVF_FLOAT(vres1, alphar, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+            VSSEG2_FLOAT(C0, va0, va1, vl);
+
+            va2 =  VFMULVF_FLOAT(vres2, alphar, vl);
+            va3 =  VFMULVF_FLOAT(vres3, alphar, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
+            VSSEG2_FLOAT(C1, va2, va3, vl);
+
+            va0 =  VFMULVF_FLOAT(vres4, alphar, vl);
+            va1 =  VFMULVF_FLOAT(vres5, alphar, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres4, vl);
+            VSSEG2_FLOAT(C2, va0, va1, vl);
+
+            va2 =  VFMULVF_FLOAT(vres6, alphar, vl);
+            va3 =  VFMULVF_FLOAT(vres7, alphar, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres6, vl);
+            VSSEG2_FLOAT(C3, va2, va3, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*vl*2;
+            ptrbb += temp*4*2;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl * 2;
+            C1 += vl * 2;
+            C2 += vl * 2;
+            C3 += vl * 2;
+        }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4;
+#endif
+
+        bb += (bk << 3);
+        C  += (ldc << 3);
+    }
+
+    if (bn & 2)
+    {
+        C0 = C;
+        C1 = C0 + 2 * ldc;
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl*2;
+            ptrbb = bb + off*2*2;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+2;   // number of values in B
+#endif
+            for (k = temp/4; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+
+                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+
+                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va2, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va3, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va3, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va2, vl);
+
+                ptrbb += 4;
+
+                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va4, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va5, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va5, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va4, vl);
+                
+                ptrbb += 4;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va6, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va7, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va7, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va6, vl);
+                
+                ptrbb += 4;
+            }
+
+            for (k = temp & 3; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+            }
+
+            va0 =  VFMULVF_FLOAT(vres0, alphar, vl);
+            va1 =  VFMULVF_FLOAT(vres1, alphar, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+            VSSEG2_FLOAT(C0, va0, va1, vl);
+
+            va2 =  VFMULVF_FLOAT(vres2, alphar, vl);
+            va3 =  VFMULVF_FLOAT(vres3, alphar, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
+            VSSEG2_FLOAT(C1, va2, va3, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*vl*2;
+            ptrbb += temp*2*2;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+            C0 += vl * 2;
+            C1 += vl * 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+        bb += (bk << 2);
+        C  += (ldc << 2);
+    }
+
+    if (bn & 1)
+    {
+        C0 = C;
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl*2;
+            ptrbb = bb + off*2;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+1;   // number of values in B
+#endif
+            for (k = temp/4; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+
+                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrbb += 2;
+
+                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+
+                ptrbb += 2;
+
+                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                ptrbb += 2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                ptrbb += 2;
+            }
+
+            for (k = temp & 3; k > 0; k--)
+            {
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrbb += 2;
+            }
+
+            va0 =  VFMULVF_FLOAT(vres0, alphar, vl);
+            va1 =  VFMULVF_FLOAT(vres1, alphar, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+            VSSEG2_FLOAT(C0, va0, va1, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*vl*2;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+            C0 += vl * 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1;
+#endif
+        bb += bk << 1;
+        C  += ldc << 1;
+   }
+   return 0;
+}
diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..b7ccb1eb3
--- /dev/null
+++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug
+
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
+
+    FLOAT_V_T va0, va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl);
+
+                    compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1));
+                    ao  += 2;
+                    b   += vl * 2;
+                }
+                i += vl;
+                ii += vl;
+            }
+            else
+            {
+                if (ii > jj)
+                {
+                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    VSSEG2_FLOAT(b, va0, va1, vl);
+                }
+                ao  += 2;
+                b   += vl * 2;
+                i++;
+                ii++;
+            }
+        }
+
+        a += vl * lda * 2;
+        jj += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..911b81de5
--- /dev/null
+++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
@@ -0,0 +1,114 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug
+
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;
+
+    FLOAT_V_T va0, va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1));
+
+                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl);
+
+                    b   += vl * 2;
+                    ao  += lda * 2;
+                }
+                i += vl;
+                ii += vl;
+            }
+            else 
+            {
+                if (ii < jj) 
+                {
+                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    VSSEG2_FLOAT(b, va0, va1, vl);
+                }
+                ao  += lda * 2;
+                b   += vl * 2;
+                i ++;
+                ii ++;
+            }
+        }
+
+        a += vl * 2;
+        jj += vl;
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..db075c29b
--- /dev/null
+++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
@@ -0,0 +1,113 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug
+
+    BLASLONG i, ii, jj, js;
+    BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
+
+    FLOAT *ao;
+    jj = offset;
+
+    FLOAT_V_T va0, va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        i = 0;
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1));
+                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl);
+                    ao  += 2;
+                    b   += vl * 2;
+                }
+                i += vl;
+                ii += vl;
+            } 
+            else
+            {
+                if (ii < jj) 
+                {
+                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    VSSEG2_FLOAT(b, va0, va1, vl);
+                }
+                ao  += 2;
+                b   += vl * 2;
+                i++;
+                ii++;
+            }
+        } 
+
+        a += vl * lda * 2;
+        jj += vl;
+    }
+    return 0;
+}
diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..e121c6273
--- /dev/null
+++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n) vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
+#define VLSEG2_FLOAT vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
+#define VBOOL_T vbool16_t
+#define UINT_V_T vuint32m2_t
+#define VID_V_UINT vid_v_u32m2
+#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#else
+#define VSETVL(n) vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
+#define VLSEG2_FLOAT vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
+#define VBOOL_T     vbool32_t
+#define UINT_V_T     vuint64m2_t
+#define VID_V_UINT   vid_v_u64m2
+#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#endif
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug
+
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;
+    FLOAT_V_T va0, va1;
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+  
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
+                    VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl);
+
+                    compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1));
+
+                    ao  += lda * 2;
+                    b   += vl * 2;
+                }
+                i += vl;
+                ii += vl;
+            } 
+            else 
+            {
+                if (ii > jj) 
+                {
+                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    VSSEG2_FLOAT(b, va0, va1, vl);
+                }
+                ao  += lda * 2;
+                b   += vl * 2;
+                i ++;
+                ii ++;
+            }
+        }
+
+        a += vl * 2;
+        jj += vl;
+    }
+
+    return 0;
+}
diff --git a/param.h b/param.h
index 62b675d6c..236f50075 100644
--- a/param.h
+++ b/param.h
@@ -3055,11 +3055,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_UNROLL_N  8 //2 // 4
 #define DGEMM_DEFAULT_UNROLL_MN  32
 
-#define CGEMM_DEFAULT_UNROLL_M  2
-#define CGEMM_DEFAULT_UNROLL_N  2
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_MN 16
 
-#define ZGEMM_DEFAULT_UNROLL_M  2
-#define ZGEMM_DEFAULT_UNROLL_N  2
+#define ZGEMM_DEFAULT_UNROLL_M  8
+#define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_MN 16
 
 #define SGEMM_DEFAULT_P	160
 #define DGEMM_DEFAULT_P	160

From 240695862984d4de845f1c42821a883946932df7 Mon Sep 17 00:00:00 2001
From: Sergei Lewis <slewis@rivosinc.com>
Date: Fri, 24 Feb 2023 10:44:55 +0000
Subject: [PATCH 010/718] * update intrinsics to match latest spec at
 https://github.com/riscv-non-isa/rvv-intrinsic-doc (in particular, __riscv_
 prefixes for rvv intrinsics) * fix multiple numerical stability and corner
 case issues * add a script to generate arbitrary gemm kernel shapes * add a
 generic zvl256b target to demonstrate large gemm kernel unrolls

---
 common_riscv64.h                           |   15 +-
 cpuid_riscv64.c                            |   10 +-
 kernel/generic/trmmkernel_16x8.c           | 3676 ++++++++++++++++++++
 kernel/generic/zlaswp_ncopy_8.c            | 1051 ++++++
 kernel/riscv64/KERNEL.RISCV64_ZVL256B      |  199 ++
 kernel/riscv64/amax_vector.c               |  231 +-
 kernel/riscv64/amin_vector.c               |  252 +-
 kernel/riscv64/asum_vector.c               |   99 +-
 kernel/riscv64/axpby_vector.c              |   47 +-
 kernel/riscv64/axpy_vector.c               |   42 +-
 kernel/riscv64/cgemm_kernel_8x8_zvl256b.c  | 1931 ++++++++++
 kernel/riscv64/copy_vector.c               |   39 +-
 kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c  | 2007 +++++++++++
 kernel/riscv64/dgemm_kernel_8x8_zvl256b.c  |  860 +++++
 kernel/riscv64/dot.c                       |    2 +-
 kernel/riscv64/dot_vector.c                |   86 +-
 kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c  | 1068 ++++++
 kernel/riscv64/gemv_n_vector.c             |   24 +-
 kernel/riscv64/gemv_t_vector.c             |   91 +-
 kernel/riscv64/generate_kernel.py          |  670 ++++
 kernel/riscv64/iamax_vector.c              |  180 +-
 kernel/riscv64/iamin_vector.c              |  160 +-
 kernel/riscv64/imax_vector.c               |  124 +-
 kernel/riscv64/imin_vector.c               |  185 +-
 kernel/riscv64/izamax_vector.c             |  277 +-
 kernel/riscv64/izamin_vector.c             |  275 +-
 kernel/riscv64/max_vector.c                |   77 +-
 kernel/riscv64/min_vector.c                |   77 +-
 kernel/riscv64/nrm2_vector.c               |  342 +-
 kernel/riscv64/nrm2_vector_dot.c           |    8 +-
 kernel/riscv64/rot_vector.c                |   42 +-
 kernel/riscv64/scal_vector.c               |   83 +-
 kernel/riscv64/sgemm_kernel_16x8_zvl256b.c | 1081 ++++++
 kernel/riscv64/strmm_kernel_16x8_zvl256b.c | 1330 +++++++
 kernel/riscv64/sum_vector.c                |  114 +
 kernel/riscv64/swap_vector.c               |   54 +-
 kernel/riscv64/symv_L_vector.c             |   82 +-
 kernel/riscv64/symv_U_vector.c             |   86 +-
 kernel/riscv64/zamax_vector.c              |   90 +-
 kernel/riscv64/zamin_vector.c              |   89 +-
 kernel/riscv64/zasum_vector.c              |  107 +-
 kernel/riscv64/zaxpby_vector.c             |   32 +-
 kernel/riscv64/zaxpy_vector.c              |   20 +-
 kernel/riscv64/zcopy_vector.c              |   12 +-
 kernel/riscv64/zdot_vector.c               |   60 +-
 kernel/riscv64/zgemm_kernel_8x4_zvl256b.c  | 1253 +++++++
 kernel/riscv64/zgemm_kernel_generic.c      |  140 +
 kernel/riscv64/zgemv_n_vector.c            |   28 +-
 kernel/riscv64/zgemv_t_vector.c            |   88 +-
 kernel/riscv64/zhemv_LM_vector.c           |   60 +-
 kernel/riscv64/zhemv_UV_vector.c           |   60 +-
 kernel/riscv64/znrm2_vector.c              |  365 +-
 kernel/riscv64/zrot_vector.c               |   38 +-
 kernel/riscv64/zscal_vector.c              |   32 +-
 kernel/riscv64/zsum_vector.c               |  131 +
 kernel/riscv64/zswap_vector.c              |   50 +-
 kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c  | 1337 +++++++
 param.h                                    |   39 +
 58 files changed, 18634 insertions(+), 2374 deletions(-)
 create mode 100644 kernel/generic/trmmkernel_16x8.c
 create mode 100644 kernel/generic/zlaswp_ncopy_8.c
 create mode 100644 kernel/riscv64/KERNEL.RISCV64_ZVL256B
 create mode 100644 kernel/riscv64/cgemm_kernel_8x8_zvl256b.c
 create mode 100644 kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c
 create mode 100644 kernel/riscv64/dgemm_kernel_8x8_zvl256b.c
 create mode 100644 kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c
 create mode 100755 kernel/riscv64/generate_kernel.py
 create mode 100644 kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
 create mode 100644 kernel/riscv64/strmm_kernel_16x8_zvl256b.c
 create mode 100644 kernel/riscv64/sum_vector.c
 create mode 100644 kernel/riscv64/zgemm_kernel_8x4_zvl256b.c
 create mode 100644 kernel/riscv64/zgemm_kernel_generic.c
 create mode 100644 kernel/riscv64/zsum_vector.c
 create mode 100644 kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c

diff --git a/common_riscv64.h b/common_riscv64.h
index 2092bd5ab..de79c8cab 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -91,12 +91,15 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS
 
-#if defined(C910V)
-#include <riscv_vector.h>
-#endif
-
-#if defined(x280)
-#include <riscv_vector.h>
+#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v)
+# include <riscv_vector.h>
+# if !defined(DOUBLE)
+#  define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v)
+# else
+#  define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v)
+# endif
+#else
+# define EXTRACT_FLOAT(v) (v[0])
 #endif
 
 #endif
diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 5326787e6..1b6b62f21 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -70,14 +70,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#define CPU_GENERIC   0
-#define CPU_C910V     1
-#define CPU_x280        2
+#define CPU_GENERIC         0
+#define CPU_C910V           1
+#define CPU_RISCV64_ZVL256B 2
 
 static char *cpuname[] = {
   "RISCV64_GENERIC",
-  "C910V"
-  "x280"
+  "C910V",
+  "CPU_RISCV64_ZVL256B"
 };
 
 int detect(void){
diff --git a/kernel/generic/trmmkernel_16x8.c b/kernel/generic/trmmkernel_16x8.c
new file mode 100644
index 000000000..5412eab70
--- /dev/null
+++ b/kernel/generic/trmmkernel_16x8.c
@@ -0,0 +1,3676 @@
+#include "common.h"
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
+
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+   FLOAT res0_4;
+   FLOAT res0_5;
+   FLOAT res0_6;
+   FLOAT res0_7;
+
+   FLOAT res0_8;
+   FLOAT res0_9;
+   FLOAT res0_10;
+   FLOAT res0_11;
+   FLOAT res0_12;
+   FLOAT res0_13;
+   FLOAT res0_14;
+   FLOAT res0_15;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+   FLOAT res1_4;
+   FLOAT res1_5;
+   FLOAT res1_6;
+   FLOAT res1_7;
+
+   FLOAT res1_8;
+   FLOAT res1_9;
+   FLOAT res1_10;
+   FLOAT res1_11;
+   FLOAT res1_12;
+   FLOAT res1_13;
+   FLOAT res1_14;
+   FLOAT res1_15;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+   FLOAT res2_4;
+   FLOAT res2_5;
+   FLOAT res2_6;
+   FLOAT res2_7;
+
+   FLOAT res2_8;
+   FLOAT res2_9;
+   FLOAT res2_10;
+   FLOAT res2_11;
+   FLOAT res2_12;
+   FLOAT res2_13;
+   FLOAT res2_14;
+   FLOAT res2_15;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+   FLOAT res3_4;
+   FLOAT res3_5;
+   FLOAT res3_6;
+   FLOAT res3_7;
+
+   FLOAT res3_8;
+   FLOAT res3_9;
+   FLOAT res3_10;
+   FLOAT res3_11;
+   FLOAT res3_12;
+   FLOAT res3_13;
+   FLOAT res3_14;
+   FLOAT res3_15;
+
+   FLOAT res4_0;
+   FLOAT res4_1;
+   FLOAT res4_2;
+   FLOAT res4_3;
+   FLOAT res4_4;
+   FLOAT res4_5;
+   FLOAT res4_6;
+   FLOAT res4_7;
+
+   FLOAT res4_8;
+   FLOAT res4_9;
+   FLOAT res4_10;
+   FLOAT res4_11;
+   FLOAT res4_12;
+   FLOAT res4_13;
+   FLOAT res4_14;
+   FLOAT res4_15;
+
+   FLOAT res5_0;
+   FLOAT res5_1;
+   FLOAT res5_2;
+   FLOAT res5_3;
+   FLOAT res5_4;
+   FLOAT res5_5;
+   FLOAT res5_6;
+   FLOAT res5_7;
+
+   FLOAT res5_8;
+   FLOAT res5_9;
+   FLOAT res5_10;
+   FLOAT res5_11;
+   FLOAT res5_12;
+   FLOAT res5_13;
+   FLOAT res5_14;
+   FLOAT res5_15;
+
+   FLOAT res6_0;
+   FLOAT res6_1;
+   FLOAT res6_2;
+   FLOAT res6_3;
+   FLOAT res6_4;
+   FLOAT res6_5;
+   FLOAT res6_6;
+   FLOAT res6_7;
+
+   FLOAT res6_8;
+   FLOAT res6_9;
+   FLOAT res6_10;
+   FLOAT res6_11;
+   FLOAT res6_12;
+   FLOAT res6_13;
+   FLOAT res6_14;
+   FLOAT res6_15;
+
+   FLOAT res7_0;
+   FLOAT res7_1;
+   FLOAT res7_2;
+   FLOAT res7_3;
+   FLOAT res7_4;
+   FLOAT res7_5;
+   FLOAT res7_6;
+   FLOAT res7_7;
+
+   FLOAT res7_8;
+   FLOAT res7_9;
+   FLOAT res7_10;
+   FLOAT res7_11;
+   FLOAT res7_12;
+   FLOAT res7_13;
+   FLOAT res7_14;
+   FLOAT res7_15;
+
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+   FLOAT b4;
+   FLOAT b5;
+   FLOAT b6;
+   FLOAT b7;
+
+   BLASLONG off, temp;
+
+#if !defined(LEFT)
+   off = -offset;
+#else
+   off = 0;
+#endif
+
+   for (j=0; j<bn/8; j+=1)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C0+2*ldc;
+        C3 = C0+3*ldc;
+        C4 = C0+4*ldc;
+        C5 = C0+5*ldc;
+        C6 = C0+6*ldc;
+        C7 = C0+7*ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res1_8  = 0;
+		res1_9  = 0;
+		res1_10 = 0;
+		res1_11 = 0;
+		res1_12 = 0;
+		res1_13 = 0;
+		res1_14 = 0;
+		res1_15 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res2_8  = 0;
+		res2_9  = 0;
+		res2_10 = 0;
+		res2_11 = 0;
+		res2_12 = 0;
+		res2_13 = 0;
+		res2_14 = 0;
+		res2_15 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+		res3_8  = 0;
+		res3_9  = 0;
+		res3_10 = 0;
+		res3_11 = 0;
+		res3_12 = 0;
+		res3_13 = 0;
+		res3_14 = 0;
+		res3_15 = 0;
+
+
+		res4_0 = 0;
+		res4_1 = 0;
+		res4_2 = 0;
+		res4_3 = 0;
+		res4_4 = 0;
+		res4_5 = 0;
+		res4_6 = 0;
+		res4_7 = 0;
+
+		res4_8  = 0;
+		res4_9  = 0;
+		res4_10 = 0;
+		res4_11 = 0;
+		res4_12 = 0;
+		res4_13 = 0;
+		res4_14 = 0;
+		res4_15 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+		res5_2 = 0;
+		res5_3 = 0;
+		res5_4 = 0;
+		res5_5 = 0;
+		res5_6 = 0;
+		res5_7 = 0;
+
+		res5_8  = 0;
+		res5_9  = 0;
+		res5_10 = 0;
+		res5_11 = 0;
+		res5_12 = 0;
+		res5_13 = 0;
+		res5_14 = 0;
+		res5_15 = 0;
+
+		res6_0 = 0;
+		res6_1 = 0;
+		res6_2 = 0;
+		res6_3 = 0;
+		res6_4 = 0;
+		res6_5 = 0;
+		res6_6 = 0;
+		res6_7 = 0;
+
+		res6_8  = 0;
+		res6_9  = 0;
+		res6_10 = 0;
+		res6_11 = 0;
+		res6_12 = 0;
+		res6_13 = 0;
+		res6_14 = 0;
+		res6_15 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+		res7_2 = 0;
+		res7_3 = 0;
+		res7_4 = 0;
+		res7_5 = 0;
+		res7_6 = 0;
+		res7_7 = 0;
+
+		res7_8  = 0;
+		res7_9  = 0;
+		res7_10 = 0;
+		res7_11 = 0;
+		res7_12 = 0;
+		res7_13 = 0;
+		res7_14 = 0;
+		res7_15 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+			res4_2 += a0*b4;
+			res5_2 += a0*b5;
+			res6_2 += a0*b6;
+			res7_2 += a0*b7;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			res4_3 += a1*b4;
+			res5_3 += a1*b5;
+			res6_3 += a1*b6;
+			res7_3 += a1*b7;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+			res4_4 += a0*b4;
+			res5_4 += a0*b5;
+			res6_4 += a0*b6;
+			res7_4 += a0*b7;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+			res4_5 += a1*b4;
+			res5_5 += a1*b5;
+			res6_5 += a1*b6;
+			res7_5 += a1*b7;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+			res4_6 += a0*b4;
+			res5_6 += a0*b5;
+			res6_6 += a0*b6;
+			res7_6 += a0*b7;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+			res4_7 += a1*b4;
+			res5_7 += a1*b5;
+			res6_7 += a1*b6;
+			res7_7 += a1*b7;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+			res1_8 += a0*b1;
+			res2_8 += a0*b2;
+			res3_8 += a0*b3;
+			res4_8 += a0*b4;
+			res5_8 += a0*b5;
+			res6_8 += a0*b6;
+			res7_8 += a0*b7;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+			res1_9 += a1*b1;
+			res2_9 += a1*b2;
+			res3_9 += a1*b3;
+			res4_9 += a1*b4;
+			res5_9 += a1*b5;
+			res6_9 += a1*b6;
+			res7_9 += a1*b7;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+			res1_10 += a0*b1;
+			res2_10 += a0*b2;
+			res3_10 += a0*b3;
+			res4_10 += a0*b4;
+			res5_10 += a0*b5;
+			res6_10 += a0*b6;
+			res7_10 += a0*b7;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+			res1_11 += a1*b1;
+			res2_11 += a1*b2;
+			res3_11 += a1*b3;
+			res4_11 += a1*b4;
+			res5_11 += a1*b5;
+			res6_11 += a1*b6;
+			res7_11 += a1*b7;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+			res1_12 += a0*b1;
+			res2_12 += a0*b2;
+			res3_12 += a0*b3;
+			res4_12 += a0*b4;
+			res5_12 += a0*b5;
+			res6_12 += a0*b6;
+			res7_12 += a0*b7;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+			res1_13 += a1*b1;
+			res2_13 += a1*b2;
+			res3_13 += a1*b3;
+			res4_13 += a1*b4;
+			res5_13 += a1*b5;
+			res6_13 += a1*b6;
+			res7_13 += a1*b7;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+			res1_14 += a0*b1;
+			res2_14 += a0*b2;
+			res3_14 += a0*b3;
+			res4_14 += a0*b4;
+			res5_14 += a0*b5;
+			res6_14 += a0*b6;
+			res7_14 += a0*b7;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+			res1_15 += a1*b1;
+			res2_15 += a1*b2;
+			res3_15 += a1*b3;
+			res4_15 += a1*b4;
+			res5_15 += a1*b5;
+			res6_15 += a1*b6;
+			res7_15 += a1*b7;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res1_8  *= alpha;
+		res1_9  *= alpha;
+		res1_10 *= alpha;
+		res1_11 *= alpha;
+		res1_12 *= alpha;
+		res1_13 *= alpha;
+		res1_14 *= alpha;
+		res1_15 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res2_8  *= alpha;
+		res2_9  *= alpha;
+		res2_10 *= alpha;
+		res2_11 *= alpha;
+		res2_12 *= alpha;
+		res2_13 *= alpha;
+		res2_14 *= alpha;
+		res2_15 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		res3_8  *= alpha;
+		res3_9  *= alpha;
+		res3_10 *= alpha;
+		res3_11 *= alpha;
+		res3_12 *= alpha;
+		res3_13 *= alpha;
+		res3_14 *= alpha;
+		res3_15 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+		res4_2 *= alpha;
+		res4_3 *= alpha;
+		res4_4 *= alpha;
+		res4_5 *= alpha;
+		res4_6 *= alpha;
+		res4_7 *= alpha;
+
+		res4_8  *= alpha;
+		res4_9  *= alpha;
+		res4_10 *= alpha;
+		res4_11 *= alpha;
+		res4_12 *= alpha;
+		res4_13 *= alpha;
+		res4_14 *= alpha;
+		res4_15 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		res5_2 *= alpha;
+		res5_3 *= alpha;
+		res5_4 *= alpha;
+		res5_5 *= alpha;
+		res5_6 *= alpha;
+		res5_7 *= alpha;
+
+		res5_8  *= alpha;
+		res5_9  *= alpha;
+		res5_10 *= alpha;
+		res5_11 *= alpha;
+		res5_12 *= alpha;
+		res5_13 *= alpha;
+		res5_14 *= alpha;
+		res5_15 *= alpha;
+		
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+		res6_2 *= alpha;
+		res6_3 *= alpha;
+		res6_4 *= alpha;
+		res6_5 *= alpha;
+		res6_6 *= alpha;
+		res6_7 *= alpha;
+
+		res6_8  *= alpha;
+		res6_9  *= alpha;
+		res6_10 *= alpha;
+		res6_11 *= alpha;
+		res6_12 *= alpha;
+		res6_13 *= alpha;
+		res6_14 *= alpha;
+		res6_15 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+		res7_2 *= alpha;
+		res7_3 *= alpha;
+		res7_4 *= alpha;
+		res7_5 *= alpha;
+		res7_6 *= alpha;
+		res7_7 *= alpha;
+
+		res7_8  *= alpha;
+		res7_9  *= alpha;
+		res7_10 *= alpha;
+		res7_11 *= alpha;
+		res7_12 *= alpha;
+		res7_13 *= alpha;
+		res7_14 *= alpha;
+		res7_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C1[8]  = res1_8;
+		C1[9]  = res1_9;
+		C1[10] = res1_10;
+		C1[11] = res1_11;
+		C1[12] = res1_12;
+		C1[13] = res1_13;
+		C1[14] = res1_14;
+		C1[15] = res1_15;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C2[8]  = res2_8;
+		C2[9]  = res2_9;
+		C2[10] = res2_10;
+		C2[11] = res2_11;
+		C2[12] = res2_12;
+		C2[13] = res2_13;
+		C2[14] = res2_14;
+		C2[15] = res2_15;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+		C3[8]  = res3_8;
+		C3[9]  = res3_9;
+		C3[10] = res3_10;
+		C3[11] = res3_11;
+		C3[12] = res3_12;
+		C3[13] = res3_13;
+		C3[14] = res3_14;
+		C3[15] = res3_15;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+		C4[2] = res4_2;
+		C4[3] = res4_3;
+		C4[4] = res4_4;
+		C4[5] = res4_5;
+		C4[6] = res4_6;
+		C4[7] = res4_7;
+
+		C4[8]  = res4_8;
+		C4[9]  = res4_9;
+		C4[10] = res4_10;
+		C4[11] = res4_11;
+		C4[12] = res4_12;
+		C4[13] = res4_13;
+		C4[14] = res4_14;
+		C4[15] = res4_15;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+		C5[2] = res5_2;
+		C5[3] = res5_3;
+		C5[4] = res5_4;
+		C5[5] = res5_5;
+		C5[6] = res5_6;
+		C5[7] = res5_7;
+
+		C5[8]  = res5_8;
+		C5[9]  = res5_9;
+		C5[10] = res5_10;
+		C5[11] = res5_11;
+		C5[12] = res5_12;
+		C5[13] = res5_13;
+		C5[14] = res5_14;
+		C5[15] = res5_15;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+		C6[2] = res6_2;
+		C6[3] = res6_3;
+		C6[4] = res6_4;
+		C6[5] = res6_5;
+		C6[6] = res6_6;
+		C6[7] = res6_7;
+
+		C6[8]  = res6_8;
+		C6[9]  = res6_9;
+		C6[10] = res6_10;
+		C6[11] = res6_11;
+		C6[12] = res6_12;
+		C6[13] = res6_13;
+		C6[14] = res6_14;
+		C6[15] = res6_15;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+		C7[2] = res7_2;
+		C7[3] = res7_3;
+		C7[4] = res7_4;
+		C7[5] = res7_5;
+		C7[6] = res7_6;
+		C7[7] = res7_7;
+
+		C7[8]  = res7_8;
+		C7[9]  = res7_9;
+		C7[10] = res7_10;
+		C7[11] = res7_11;
+		C7[12] = res7_12;
+		C7[13] = res7_13;
+		C7[14] = res7_14;
+		C7[15] = res7_15;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+		C1 = C1+16;
+		C2 = C2+16;
+		C3 = C3+16;
+		C4 = C4+16;
+		C5 = C5+16;
+		C6 = C6+16;
+		C7 = C7+16;
+	}
+
+
+        if ( bm & 8)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+		res4_0 = 0;
+		res4_1 = 0;
+		res4_2 = 0;
+		res4_3 = 0;
+		res4_4 = 0;
+		res4_5 = 0;
+		res4_6 = 0;
+		res4_7 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+		res5_2 = 0;
+		res5_3 = 0;
+		res5_4 = 0;
+		res5_5 = 0;
+		res5_6 = 0;
+		res5_7 = 0;
+
+		res6_0 = 0;
+		res6_1 = 0;
+		res6_2 = 0;
+		res6_3 = 0;
+		res6_4 = 0;
+		res6_5 = 0;
+		res6_6 = 0;
+		res6_7 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+		res7_2 = 0;
+		res7_3 = 0;
+		res7_4 = 0;
+		res7_5 = 0;
+		res7_6 = 0;
+		res7_7 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+			res4_2 += a0*b4;
+			res5_2 += a0*b5;
+			res6_2 += a0*b6;
+			res7_2 += a0*b7;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			res4_3 += a1*b4;
+			res5_3 += a1*b5;
+			res6_3 += a1*b6;
+			res7_3 += a1*b7;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+			res4_4 += a0*b4;
+			res5_4 += a0*b5;
+			res6_4 += a0*b6;
+			res7_4 += a0*b7;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+			res4_5 += a1*b4;
+			res5_5 += a1*b5;
+			res6_5 += a1*b6;
+			res7_5 += a1*b7;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+			res4_6 += a0*b4;
+			res5_6 += a0*b5;
+			res6_6 += a0*b6;
+			res7_6 += a0*b7;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+			res4_7 += a1*b4;
+			res5_7 += a1*b5;
+			res6_7 += a1*b6;
+			res7_7 += a1*b7;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+		res4_2 *= alpha;
+		res4_3 *= alpha;
+		res4_4 *= alpha;
+		res4_5 *= alpha;
+		res4_6 *= alpha;
+		res4_7 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		res5_2 *= alpha;
+		res5_3 *= alpha;
+		res5_4 *= alpha;
+		res5_5 *= alpha;
+		res5_6 *= alpha;
+		res5_7 *= alpha;
+		
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+		res6_2 *= alpha;
+		res6_3 *= alpha;
+		res6_4 *= alpha;
+		res6_5 *= alpha;
+		res6_6 *= alpha;
+		res6_7 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+		res7_2 *= alpha;
+		res7_3 *= alpha;
+		res7_4 *= alpha;
+		res7_5 *= alpha;
+		res7_6 *= alpha;
+		res7_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+		C4[2] = res4_2;
+		C4[3] = res4_3;
+		C4[4] = res4_4;
+		C4[5] = res4_5;
+		C4[6] = res4_6;
+		C4[7] = res4_7;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+		C5[2] = res5_2;
+		C5[3] = res5_3;
+		C5[4] = res5_4;
+		C5[5] = res5_5;
+		C5[6] = res5_6;
+		C5[7] = res5_7;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+		C6[2] = res6_2;
+		C6[3] = res6_3;
+		C6[4] = res6_4;
+		C6[5] = res6_5;
+		C6[6] = res6_6;
+		C6[7] = res6_7;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+		C7[2] = res7_2;
+		C7[3] = res7_3;
+		C7[4] = res7_4;
+		C7[5] = res7_5;
+		C7[6] = res7_6;
+		C7[7] = res7_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+		C1 = C1+8;
+		C2 = C2+8;
+		C3 = C3+8;
+		C4 = C4+8;
+		C5 = C5+8;
+		C6 = C6+8;
+		C7 = C7+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+		res4_0 = 0;
+		res5_0 = 0;
+		res6_0 = 0;
+		res7_0 = 0;
+
+		res0_1 = 0;
+		res1_1 = 0;
+		res2_1 = 0;
+		res3_1 = 0;
+		res4_1 = 0;
+		res5_1 = 0;
+		res6_1 = 0;
+		res7_1 = 0;
+
+		res0_2 = 0;
+		res1_2 = 0;
+		res2_2 = 0;
+		res3_2 = 0;
+		res4_2 = 0;
+		res5_2 = 0;
+		res6_2 = 0;
+		res7_2 = 0;
+
+		res0_3 = 0;
+		res1_3 = 0;
+		res2_3 = 0;
+		res3_3 = 0;
+		res4_3 = 0;
+		res5_3 = 0;
+		res6_3 = 0;
+		res7_3 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+			res4_2 += a0*b4;
+			res5_2 += a0*b5;
+			res6_2 += a0*b6;
+			res7_2 += a0*b7;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			res4_3 += a1*b4;
+			res5_3 += a1*b5;
+			res6_3 += a1*b6;
+			res7_3 += a1*b7;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+		res4_2 *= alpha;
+		res4_3 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		res5_2 *= alpha;
+		res5_3 *= alpha;
+		
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+		res6_2 *= alpha;
+		res6_3 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+		res7_2 *= alpha;
+		res7_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+		C4[2] = res4_2;
+		C4[3] = res4_3;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+		C5[2] = res5_2;
+		C5[3] = res5_3;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+		C6[2] = res6_2;
+		C6[3] = res6_3;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+		C7[2] = res7_2;
+		C7[3] = res7_3;
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+		C4 = C4+4;
+		C5 = C5+4;
+		C6 = C6+4;
+		C7 = C7+4;
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+		res4_0 = 0;
+		res4_1 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+		
+		res6_0 = 0;
+		res6_1 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+		C4 = C4+2;
+		C5 = C5+2;
+		C6 = C6+2;
+		C7 = C7+2;
+	}
+	
+	if ( bm & 1 )
+	{
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+		res4_0 = 0;
+		res5_0 = 0;
+		res6_0 = 0;
+		res7_0 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+		
+		res2_0 *= alpha;
+
+		res3_0 *= alpha;
+
+		res4_0 *= alpha;
+
+		res5_0 *= alpha;
+		
+		res6_0 *= alpha;
+
+		res7_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+		C2[0] = res2_0;
+
+		C3[0] = res3_0;
+
+		C4[0] = res4_0;
+
+		C5[0] = res5_0;
+
+		C6[0] = res6_0;
+
+		C7[0] = res7_0;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+		C4 = C4+1;
+		C5 = C5+1;
+		C6 = C6+1;
+		C7 = C7+1;
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 8;
+#endif
+
+        k = (bk<<3);
+        bb = bb+k;
+        i = (ldc<<3);
+        C = C+i;
+    }
+
+
+   if( bn & 4 )
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C0+2*ldc;
+        C3 = C0+3*ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res1_8  = 0;
+		res1_9  = 0;
+		res1_10 = 0;
+		res1_11 = 0;
+		res1_12 = 0;
+		res1_13 = 0;
+		res1_14 = 0;
+		res1_15 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res2_8  = 0;
+		res2_9  = 0;
+		res2_10 = 0;
+		res2_11 = 0;
+		res2_12 = 0;
+		res2_13 = 0;
+		res2_14 = 0;
+		res2_15 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+		res3_8  = 0;
+		res3_9  = 0;
+		res3_10 = 0;
+		res3_11 = 0;
+		res3_12 = 0;
+		res3_13 = 0;
+		res3_14 = 0;
+		res3_15 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+			res1_8 += a0*b1;
+			res2_8 += a0*b2;
+			res3_8 += a0*b3;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+			res1_9 += a1*b1;
+			res2_9 += a1*b2;
+			res3_9 += a1*b3;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+			res1_10 += a0*b1;
+			res2_10 += a0*b2;
+			res3_10 += a0*b3;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+			res1_11 += a1*b1;
+			res2_11 += a1*b2;
+			res3_11 += a1*b3;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+			res1_12 += a0*b1;
+			res2_12 += a0*b2;
+			res3_12 += a0*b3;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+			res1_13 += a1*b1;
+			res2_13 += a1*b2;
+			res3_13 += a1*b3;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+			res1_14 += a0*b1;
+			res2_14 += a0*b2;
+			res3_14 += a0*b3;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+			res1_15 += a1*b1;
+			res2_15 += a1*b2;
+			res3_15 += a1*b3;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res1_8  *= alpha;
+		res1_9  *= alpha;
+		res1_10 *= alpha;
+		res1_11 *= alpha;
+		res1_12 *= alpha;
+		res1_13 *= alpha;
+		res1_14 *= alpha;
+		res1_15 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res2_8  *= alpha;
+		res2_9  *= alpha;
+		res2_10 *= alpha;
+		res2_11 *= alpha;
+		res2_12 *= alpha;
+		res2_13 *= alpha;
+		res2_14 *= alpha;
+		res2_15 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		res3_8  *= alpha;
+		res3_9  *= alpha;
+		res3_10 *= alpha;
+		res3_11 *= alpha;
+		res3_12 *= alpha;
+		res3_13 *= alpha;
+		res3_14 *= alpha;
+		res3_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C1[8]  = res1_8;
+		C1[9]  = res1_9;
+		C1[10] = res1_10;
+		C1[11] = res1_11;
+		C1[12] = res1_12;
+		C1[13] = res1_13;
+		C1[14] = res1_14;
+		C1[15] = res1_15;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C2[8]  = res2_8;
+		C2[9]  = res2_9;
+		C2[10] = res2_10;
+		C2[11] = res2_11;
+		C2[12] = res2_12;
+		C2[13] = res2_13;
+		C2[14] = res2_14;
+		C2[15] = res2_15;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+		C3[8]  = res3_8;
+		C3[9]  = res3_9;
+		C3[10] = res3_10;
+		C3[11] = res3_11;
+		C3[12] = res3_12;
+		C3[13] = res3_13;
+		C3[14] = res3_14;
+		C3[15] = res3_15;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+		C1 = C1+16;
+		C2 = C2+16;
+		C3 = C3+16;
+	}
+
+
+        if ( bm & 8)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+4;
+
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+		C1 = C1+8;
+		C2 = C2+8;
+		C3 = C3+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+	}
+	
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+4;
+                }
+		res0_0 *= alpha;
+		res1_0 *= alpha;
+		res2_0 *= alpha;
+		res3_0 *= alpha;
+
+		C0[0] = res0_0;
+		C1[0] = res1_0;
+		C2[0] = res2_0;
+		C3[0] = res3_0;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+
+   if(bn&2)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res1_8  = 0;
+		res1_9  = 0;
+		res1_10 = 0;
+		res1_11 = 0;
+		res1_12 = 0;
+		res1_13 = 0;
+		res1_14 = 0;
+		res1_15 = 0;
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+			res1_8 += a0*b1;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+			res1_9 += a1*b1;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+			res1_10 += a0*b1;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+			res1_11 += a1*b1;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+			res1_12 += a0*b1;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+			res1_13 += a1*b1;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+			res1_14 += a0*b1;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+			res1_15 += a1*b1;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res1_8  *= alpha;
+		res1_9  *= alpha;
+		res1_10 *= alpha;
+		res1_11 *= alpha;
+		res1_12 *= alpha;
+		res1_13 *= alpha;
+		res1_14 *= alpha;
+		res1_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C1[8]  = res1_8;
+		C1[9]  = res1_9;
+		C1[10] = res1_10;
+		C1[11] = res1_11;
+		C1[12] = res1_12;
+		C1[13] = res1_13;
+		C1[14] = res1_14;
+		C1[15] = res1_15;
+
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+		C1 = C1+16;
+	}
+
+
+
+
+        if ( bm & 8)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+		C1 = C1+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+
+	}
+
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+
+		res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+   for (j=0; j<(bn&1); j+=1)
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+	off = offset;
+#endif
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+	}
+
+
+
+
+        if ( bm & 8 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+
+	}
+
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+
+		C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+
+	}
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+
+   return 0;
+}
diff --git a/kernel/generic/zlaswp_ncopy_8.c b/kernel/generic/zlaswp_ncopy_8.c
new file mode 100644
index 000000000..8bd41749d
--- /dev/null
+++ b/kernel/generic/zlaswp_ncopy_8.c
@@ -0,0 +1,1051 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#define a2	(a1 + 2)
+#define a4	(a3 + 2)
+#define a6	(a5 + 2)
+#define a8	(a7 + 2)
+
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){
+
+  BLASLONG i, j, ip1, ip2;
+  blasint *piv;
+  FLOAT *a1, *a3, *a5, *a7;
+  FLOAT *b1, *b2, *b3, *b4;
+  FLOAT *b5, *b6, *b7, *b8;
+  FLOAT A1, A2, A3, A4, A5, A6, A7, A8;
+  FLOAT B1, B2, B3, B4, B5, B6, B7, B8;
+
+  FLOAT A9, A10, A11, A12, A13, A14, A15, A16;
+  FLOAT B9, B10, B11, B12, B13, B14, B15, B16;
+
+  a -= 2;
+  lda *= 2;
+  k1 --;
+
+ ipiv += k1;
+
+  if (n  <= 0) return 0;
+
+  j = (n >> 3);
+  if (j > 0) {
+    do {
+      piv = ipiv;
+
+      a1 = a + (k1 + 1) * 2;
+
+      a3 = a1 + 1 * lda;
+      a5 = a1 + 2 * lda;
+      a7 = a1 + 3 * lda;
+
+      ip1 = *(piv + 0) * 2;
+      ip2 = *(piv + 1) * 2;
+      piv += 2;
+
+      b1 = a + ip1;
+      b2 = a + ip2;
+
+      b3 = b1 + 1 * lda;
+      b4 = b2 + 1 * lda;
+      b5 = b1 + 2 * lda;
+      b6 = b2 + 2 * lda;
+      b7 = b1 + 3 * lda;
+      b8 = b2 + 3 * lda;
+
+      i = ((k2 - k1) >> 1);
+
+      if (i > 0) {
+	do {
+	  ip1 = *(piv + 0) * 2;
+	  ip2 = *(piv + 1) * 2;
+	  piv += 2;
+
+		for( int pass = 0; pass < 2; ++pass ) {
+		  A1  = *(a1 + 0);
+		  A9  = *(a1 + 1);
+		  A2  = *(a2 + 0);
+		  A10 = *(a2 + 1);
+		  A3  = *(a3 + 0);
+		  A11 = *(a3 + 1);
+		  A4  = *(a4 + 0);
+		  A12 = *(a4 + 1);
+		  A5  = *(a5 + 0);
+		  A13 = *(a5 + 1);
+		  A6  = *(a6 + 0);
+		  A14 = *(a6 + 1);
+		  A7  = *(a7 + 0);
+		  A15 = *(a7 + 1);
+		  A8  = *(a8 + 0);
+		  A16 = *(a8 + 1);
+
+		  B1  = *(b1 + 0);
+		  B9  = *(b1 + 1);
+		  B2  = *(b2 + 0);
+		  B10 = *(b2 + 1);
+		  B3  = *(b3 + 0);
+		  B11 = *(b3 + 1);
+		  B4  = *(b4 + 0);
+		  B12 = *(b4 + 1);
+		  B5  = *(b5 + 0);
+		  B13 = *(b5 + 1);
+		  B6  = *(b6 + 0);
+		  B14 = *(b6 + 1);
+		  B7  = *(b7 + 0);
+		  B15 = *(b7 + 1);
+		  B8  = *(b8 + 0);
+		  B16 = *(b8 + 1);
+
+		if (b1 == a1) {
+		    if (b2 == a2) {
+		      *(buffer +  0) = A1;
+		      *(buffer +  1) = A9;
+		      *(buffer +  2) = A3;
+		      *(buffer +  3) = A11;
+		      *(buffer +  4) = A5;
+		      *(buffer +  5) = A13;
+		      *(buffer +  6) = A7;
+		      *(buffer +  7) = A15;
+
+		      *(buffer +  8) = A2;
+		      *(buffer +  9) = A10;
+		      *(buffer + 10) = A4;
+		      *(buffer + 11) = A12;
+		      *(buffer + 12) = A6;
+		      *(buffer + 13) = A14;
+		      *(buffer + 14) = A8;
+		      *(buffer + 15) = A16;
+		    } else {
+		      *(buffer +  0) = A1;
+		      *(buffer +  1) = A9;
+		      *(buffer +  2) = A3;
+		      *(buffer +  3) = A11;
+		      *(buffer +  4) = A5;
+		      *(buffer +  5) = A13;
+		      *(buffer +  6) = A7;
+		      *(buffer +  7) = A15;
+
+		      *(buffer +  8) = B2;
+		      *(buffer +  9) = B10;
+		      *(buffer + 10) = B4;
+		      *(buffer + 11) = B12;
+		      *(buffer + 12) = B6;
+		      *(buffer + 13) = B14;
+		      *(buffer + 14) = B8;
+		      *(buffer + 15) = B16;
+
+		      *(b2 + 0) = A2;
+		      *(b2 + 1) = A10;
+		      *(b4 + 0) = A4;
+		      *(b4 + 1) = A12;
+		      *(b6 + 0) = A6;
+		      *(b6 + 1) = A14;
+		      *(b8 + 0) = A8;
+		      *(b8 + 1) = A16;
+		    }
+		} else
+		  if (b1 == a2) {
+		      if (b2 == a2) {
+			*(buffer +  0) = A2;
+			*(buffer +  1) = A10;
+			*(buffer +  2) = A4;
+			*(buffer +  3) = A12;
+			*(buffer +  4) = A6;
+			*(buffer +  5) = A14;
+			*(buffer +  6) = A8;
+			*(buffer +  7) = A16;
+			*(buffer +  8) = A1;
+			*(buffer +  9) = A9;
+			*(buffer + 10) = A3;
+			*(buffer + 11) = A11;
+			*(buffer + 12) = A5;
+			*(buffer + 13) = A13;
+			*(buffer + 14) = A7;
+			*(buffer + 15) = A15;
+
+		      } else {
+			*(buffer +  0) = A2;
+			*(buffer +  1) = A10;
+			*(buffer +  2) = A4;
+			*(buffer +  3) = A12;
+			*(buffer +  4) = A6;
+			*(buffer +  5) = A14;
+			*(buffer +  6) = A8;
+			*(buffer +  7) = A16;
+			*(buffer +  8) = B2;
+			*(buffer +  9) = B10;
+			*(buffer + 10) = B4;
+			*(buffer + 11) = B12;
+			*(buffer + 12) = B6;
+			*(buffer + 13) = B14;
+			*(buffer + 14) = B8;
+			*(buffer + 15) = B16;
+
+			*(b2 + 0) = A1;
+			*(b2 + 1) = A9;
+			*(b4 + 0) = A3;
+			*(b4 + 1) = A11;
+			*(b6 + 0) = A5;
+			*(b6 + 1) = A13;
+			*(b8 + 0) = A7;
+			*(b8 + 1) = A15;
+		      }
+		  } else {
+		      if (b2 == a2) {
+			*(buffer +  0) = B1;
+			*(buffer +  1) = B9;
+			*(buffer +  2) = B3;
+			*(buffer +  3) = B11;
+			*(buffer +  4) = B5;
+			*(buffer +  5) = B13;
+			*(buffer +  6) = B7;
+			*(buffer +  7) = B15;
+			*(buffer +  8) = A2;
+			*(buffer +  9) = A10;
+			*(buffer + 10) = A4;
+			*(buffer + 11) = A12;
+			*(buffer + 12) = A6;
+			*(buffer + 13) = A14;
+			*(buffer + 14) = A8;
+			*(buffer + 15) = A16;
+
+			*(b1 + 0) = A1;
+			*(b1 + 1) = A9;
+			*(b3 + 0) = A3;
+			*(b3 + 1) = A11;
+			*(b5 + 0) = A5;
+			*(b5 + 1) = A13;
+			*(b7 + 0) = A7;
+			*(b7 + 1) = A15;
+		      } else
+			if (b2 == b1) {
+			  *(buffer +  0) = B1;
+			  *(buffer +  1) = B9;
+			  *(buffer +  2) = B3;
+			  *(buffer +  3) = B11;
+			  *(buffer +  4) = B5;
+			  *(buffer +  5) = B13;
+			  *(buffer +  6) = B7;
+			  *(buffer +  7) = B15;
+			  *(buffer +  8) = A1;
+			  *(buffer +  9) = A9;
+			  *(buffer + 10) = A3;
+			  *(buffer + 11) = A11;
+			  *(buffer + 12) = A5;
+			  *(buffer + 13) = A13;
+			  *(buffer + 14) = A7;
+			  *(buffer + 15) = A15;
+
+			  *(b1 + 0) = A2;
+			  *(b1 + 1) = A10;
+			  *(b3 + 0) = A4;
+			  *(b3 + 1) = A12;
+			  *(b5 + 0) = A6;
+			  *(b5 + 1) = A14;
+			  *(b7 + 0) = A8;
+			  *(b7 + 1) = A16;
+			} else {
+			  *(buffer +  0) = B1;
+			  *(buffer +  1) = B9;
+			  *(buffer +  2) = B3;
+			  *(buffer +  3) = B11;
+			  *(buffer +  4) = B5;
+			  *(buffer +  5) = B13;
+			  *(buffer +  6) = B7;
+			  *(buffer +  7) = B15;
+			  *(buffer +  8) = B2;
+			  *(buffer +  9) = B10;
+			  *(buffer + 10) = B4;
+			  *(buffer + 11) = B12;
+			  *(buffer + 12) = B6;
+			  *(buffer + 13) = B14;
+			  *(buffer + 14) = B8;
+			  *(buffer + 15) = B16;
+
+			  *(b1 + 0) = A1;
+			  *(b1 + 1) = A9;
+			  *(b2 + 0) = A2;
+			  *(b2 + 1) = A10;
+			  *(b3 + 0) = A3;
+			  *(b3 + 1) = A11;
+			  *(b4 + 0) = A4;
+			  *(b4 + 1) = A12;
+			  *(b5 + 0) = A5;
+			  *(b5 + 1) = A13;
+			  *(b6 + 0) = A6;
+			  *(b6 + 1) = A14;
+			  *(b7 + 0) = A7;
+			  *(b7 + 1) = A15;
+			  *(b8 + 0) = A8;
+			  *(b8 + 1) = A16;
+			}
+	  }
+	  b1 += 4*lda;
+	  b2 += 4*lda;
+	  b3 += 4*lda;
+	  b4 += 4*lda;
+	  b5 += 4*lda;
+	  b6 += 4*lda;
+	  b7 += 4*lda;
+	  b8 += 4*lda;
+
+	  a1 += 4;
+	  a3 += 4;
+	  a5 += 4;
+	  a7 += 4;
+
+	  buffer += 16;
+	}
+
+	  b1 = a + ip1;
+	  b2 = a + ip2;
+
+	  b3 = b1 + 1 * lda;
+	  b4 = b2 + 1 * lda;
+	  b5 = b1 + 2 * lda;
+	  b6 = b2 + 2 * lda;
+	  b7 = b1 + 3 * lda;
+	  b8 = b2 + 3 * lda;
+
+	i --;
+	} while (i > 0);
+      }
+
+      i = ((k2 - k1) & 1);
+
+      if (i > 0) {
+	A1  = *(a1 + 0);
+	A9  = *(a1 + 1);
+	B1  = *(b1 + 0);
+	B9  = *(b1 + 1);
+	A3  = *(a3 + 0);
+	A11 = *(a3 + 1);
+	B3  = *(b3 + 0);
+	B11 = *(b3 + 1);
+	A5  = *(a5 + 0);
+	A13 = *(a5 + 1);
+	B5  = *(b5 + 0);
+	B13 = *(b5 + 1);
+	A7  = *(a7 + 0);
+	A15 = *(a7 + 1);
+	B7  = *(b7 + 0);
+	B15 = *(b7 + 1);
+
+	if (a1 == b1) {
+	  *(buffer + 0) = A1;
+	  *(buffer + 1) = A9;
+	  *(buffer + 2) = A3;
+	  *(buffer + 3) = A11;
+	  *(buffer + 4) = A5;
+	  *(buffer + 5) = A13;
+	  *(buffer + 6) = A7;
+	  *(buffer + 7) = A15;
+	} else {
+	  *(buffer + 0) = B1;
+	  *(buffer + 1) = B9;
+	  *(buffer + 2) = B3;
+	  *(buffer + 3) = B11;
+	  *(buffer + 4) = B5;
+	  *(buffer + 5) = B13;
+	  *(buffer + 6) = B7;
+	  *(buffer + 7) = B15;
+
+	  *(b1 + 0) = A1;
+	  *(b1 + 1) = A9;
+	  *(b3 + 0) = A3;
+	  *(b3 + 1) = A11;
+	  *(b5 + 0) = A5;
+	  *(b5 + 1) = A13;
+	  *(b7 + 0) = A7;
+	  *(b7 + 1) = A15;
+	}
+	buffer += 8;
+      }
+
+      a += 4 * lda;
+
+      j --;
+    } while (j > 0);
+  }
+
+
+  if (n & 4) {
+    {
+      piv = ipiv;
+
+      a1 = a + (k1 + 1) * 2;
+
+      a3 = a1 + 1 * lda;
+      a5 = a1 + 2 * lda;
+      a7 = a1 + 3 * lda;
+
+      ip1 = *(piv + 0) * 2;
+      ip2 = *(piv + 1) * 2;
+      piv += 2;
+
+      b1 = a + ip1;
+      b2 = a + ip2;
+
+      b3 = b1 + 1 * lda;
+      b4 = b2 + 1 * lda;
+      b5 = b1 + 2 * lda;
+      b6 = b2 + 2 * lda;
+      b7 = b1 + 3 * lda;
+      b8 = b2 + 3 * lda;
+
+      i = ((k2 - k1) >> 1);
+
+      if (i > 0) {
+	do {
+	  A1  = *(a1 + 0);
+	  A9  = *(a1 + 1);
+	  A2  = *(a2 + 0);
+	  A10 = *(a2 + 1);
+	  A3  = *(a3 + 0);
+	  A11 = *(a3 + 1);
+	  A4  = *(a4 + 0);
+	  A12 = *(a4 + 1);
+	  A5  = *(a5 + 0);
+	  A13 = *(a5 + 1);
+	  A6  = *(a6 + 0);
+	  A14 = *(a6 + 1);
+	  A7  = *(a7 + 0);
+	  A15 = *(a7 + 1);
+	  A8  = *(a8 + 0);
+	  A16 = *(a8 + 1);
+
+	  B1  = *(b1 + 0);
+	  B9  = *(b1 + 1);
+	  B2  = *(b2 + 0);
+	  B10 = *(b2 + 1);
+	  B3  = *(b3 + 0);
+	  B11 = *(b3 + 1);
+	  B4  = *(b4 + 0);
+	  B12 = *(b4 + 1);
+	  B5  = *(b5 + 0);
+	  B13 = *(b5 + 1);
+	  B6  = *(b6 + 0);
+	  B14 = *(b6 + 1);
+	  B7  = *(b7 + 0);
+	  B15 = *(b7 + 1);
+	  B8  = *(b8 + 0);
+	  B16 = *(b8 + 1);
+
+	  ip1 = *(piv + 0) * 2;
+	  ip2 = *(piv + 1) * 2;
+	  piv += 2;
+
+	if (b1 == a1) {
+	    if (b2 == a2) {
+	      *(buffer +  0) = A1;
+	      *(buffer +  1) = A9;
+	      *(buffer +  2) = A3;
+	      *(buffer +  3) = A11;
+	      *(buffer +  4) = A5;
+	      *(buffer +  5) = A13;
+	      *(buffer +  6) = A7;
+	      *(buffer +  7) = A15;
+
+	      *(buffer +  8) = A2;
+	      *(buffer +  9) = A10;
+	      *(buffer + 10) = A4;
+	      *(buffer + 11) = A12;
+	      *(buffer + 12) = A6;
+	      *(buffer + 13) = A14;
+	      *(buffer + 14) = A8;
+	      *(buffer + 15) = A16;
+	    } else {
+	      *(buffer +  0) = A1;
+	      *(buffer +  1) = A9;
+	      *(buffer +  2) = A3;
+	      *(buffer +  3) = A11;
+	      *(buffer +  4) = A5;
+	      *(buffer +  5) = A13;
+	      *(buffer +  6) = A7;
+	      *(buffer +  7) = A15;
+
+	      *(buffer +  8) = B2;
+	      *(buffer +  9) = B10;
+	      *(buffer + 10) = B4;
+	      *(buffer + 11) = B12;
+	      *(buffer + 12) = B6;
+	      *(buffer + 13) = B14;
+	      *(buffer + 14) = B8;
+	      *(buffer + 15) = B16;
+
+	      *(b2 + 0) = A2;
+	      *(b2 + 1) = A10;
+	      *(b4 + 0) = A4;
+	      *(b4 + 1) = A12;
+	      *(b6 + 0) = A6;
+	      *(b6 + 1) = A14;
+	      *(b8 + 0) = A8;
+	      *(b8 + 1) = A16;
+	    }
+	} else
+	  if (b1 == a2) {
+	      if (b2 == a2) {
+		*(buffer +  0) = A2;
+		*(buffer +  1) = A10;
+		*(buffer +  2) = A4;
+		*(buffer +  3) = A12;
+		*(buffer +  4) = A6;
+		*(buffer +  5) = A14;
+		*(buffer +  6) = A8;
+		*(buffer +  7) = A16;
+		*(buffer +  8) = A1;
+		*(buffer +  9) = A9;
+		*(buffer + 10) = A3;
+		*(buffer + 11) = A11;
+		*(buffer + 12) = A5;
+		*(buffer + 13) = A13;
+		*(buffer + 14) = A7;
+		*(buffer + 15) = A15;
+
+	      } else {
+		*(buffer +  0) = A2;
+		*(buffer +  1) = A10;
+		*(buffer +  2) = A4;
+		*(buffer +  3) = A12;
+		*(buffer +  4) = A6;
+		*(buffer +  5) = A14;
+		*(buffer +  6) = A8;
+		*(buffer +  7) = A16;
+		*(buffer +  8) = B2;
+		*(buffer +  9) = B10;
+		*(buffer + 10) = B4;
+		*(buffer + 11) = B12;
+		*(buffer + 12) = B6;
+		*(buffer + 13) = B14;
+		*(buffer + 14) = B8;
+		*(buffer + 15) = B16;
+
+		*(b2 + 0) = A1;
+		*(b2 + 1) = A9;
+		*(b4 + 0) = A3;
+		*(b4 + 1) = A11;
+		*(b6 + 0) = A5;
+		*(b6 + 1) = A13;
+		*(b8 + 0) = A7;
+		*(b8 + 1) = A15;
+	      }
+	  } else {
+	      if (b2 == a2) {
+		*(buffer +  0) = B1;
+		*(buffer +  1) = B9;
+		*(buffer +  2) = B3;
+		*(buffer +  3) = B11;
+		*(buffer +  4) = B5;
+		*(buffer +  5) = B13;
+		*(buffer +  6) = B7;
+		*(buffer +  7) = B15;
+		*(buffer +  8) = A2;
+		*(buffer +  9) = A10;
+		*(buffer + 10) = A4;
+		*(buffer + 11) = A12;
+		*(buffer + 12) = A6;
+		*(buffer + 13) = A14;
+		*(buffer + 14) = A8;
+		*(buffer + 15) = A16;
+
+		*(b1 + 0) = A1;
+		*(b1 + 1) = A9;
+		*(b3 + 0) = A3;
+		*(b3 + 1) = A11;
+		*(b5 + 0) = A5;
+		*(b5 + 1) = A13;
+		*(b7 + 0) = A7;
+		*(b7 + 1) = A15;
+	      } else
+		if (b2 == b1) {
+		  *(buffer +  0) = B1;
+		  *(buffer +  1) = B9;
+		  *(buffer +  2) = B3;
+		  *(buffer +  3) = B11;
+		  *(buffer +  4) = B5;
+		  *(buffer +  5) = B13;
+		  *(buffer +  6) = B7;
+		  *(buffer +  7) = B15;
+		  *(buffer +  8) = A1;
+		  *(buffer +  9) = A9;
+		  *(buffer + 10) = A3;
+		  *(buffer + 11) = A11;
+		  *(buffer + 12) = A5;
+		  *(buffer + 13) = A13;
+		  *(buffer + 14) = A7;
+		  *(buffer + 15) = A15;
+
+		  *(b1 + 0) = A2;
+		  *(b1 + 1) = A10;
+		  *(b3 + 0) = A4;
+		  *(b3 + 1) = A12;
+		  *(b5 + 0) = A6;
+		  *(b5 + 1) = A14;
+		  *(b7 + 0) = A8;
+		  *(b7 + 1) = A16;
+		} else {
+		  *(buffer +  0) = B1;
+		  *(buffer +  1) = B9;
+		  *(buffer +  2) = B3;
+		  *(buffer +  3) = B11;
+		  *(buffer +  4) = B5;
+		  *(buffer +  5) = B13;
+		  *(buffer +  6) = B7;
+		  *(buffer +  7) = B15;
+		  *(buffer +  8) = B2;
+		  *(buffer +  9) = B10;
+		  *(buffer + 10) = B4;
+		  *(buffer + 11) = B12;
+		  *(buffer + 12) = B6;
+		  *(buffer + 13) = B14;
+		  *(buffer + 14) = B8;
+		  *(buffer + 15) = B16;
+
+		  *(b1 + 0) = A1;
+		  *(b1 + 1) = A9;
+		  *(b2 + 0) = A2;
+		  *(b2 + 1) = A10;
+		  *(b3 + 0) = A3;
+		  *(b3 + 1) = A11;
+		  *(b4 + 0) = A4;
+		  *(b4 + 1) = A12;
+		  *(b5 + 0) = A5;
+		  *(b5 + 1) = A13;
+		  *(b6 + 0) = A6;
+		  *(b6 + 1) = A14;
+		  *(b7 + 0) = A7;
+		  *(b7 + 1) = A15;
+		  *(b8 + 0) = A8;
+		  *(b8 + 1) = A16;
+		}
+	  }
+
+	 buffer += 16;
+
+	  b1 = a + ip1;
+	  b2 = a + ip2;
+
+	  b3 = b1 + 1 * lda;
+	  b4 = b2 + 1 * lda;
+	  b5 = b1 + 2 * lda;
+	  b6 = b2 + 2 * lda;
+	  b7 = b1 + 3 * lda;
+	  b8 = b2 + 3 * lda;
+
+	  a1 += 4;
+	  a3 += 4;
+	  a5 += 4;
+	  a7 += 4;
+
+	i --;
+	} while (i > 0);
+      }
+
+      i = ((k2 - k1) & 1);
+
+      if (i > 0) {
+	A1  = *(a1 + 0);
+	A9  = *(a1 + 1);
+	B1  = *(b1 + 0);
+	B9  = *(b1 + 1);
+	A3  = *(a3 + 0);
+	A11 = *(a3 + 1);
+	B3  = *(b3 + 0);
+	B11 = *(b3 + 1);
+	A5  = *(a5 + 0);
+	A13 = *(a5 + 1);
+	B5  = *(b5 + 0);
+	B13 = *(b5 + 1);
+	A7  = *(a7 + 0);
+	A15 = *(a7 + 1);
+	B7  = *(b7 + 0);
+	B15 = *(b7 + 1);
+
+	if (a1 == b1) {
+	  *(buffer + 0) = A1;
+	  *(buffer + 1) = A9;
+	  *(buffer + 2) = A3;
+	  *(buffer + 3) = A11;
+	  *(buffer + 4) = A5;
+	  *(buffer + 5) = A13;
+	  *(buffer + 6) = A7;
+	  *(buffer + 7) = A15;
+	} else {
+	  *(buffer + 0) = B1;
+	  *(buffer + 1) = B9;
+	  *(buffer + 2) = B3;
+	  *(buffer + 3) = B11;
+	  *(buffer + 4) = B5;
+	  *(buffer + 5) = B13;
+	  *(buffer + 6) = B7;
+	  *(buffer + 7) = B15;
+
+	  *(b1 + 0) = A1;
+	  *(b1 + 1) = A9;
+	  *(b3 + 0) = A3;
+	  *(b3 + 1) = A11;
+	  *(b5 + 0) = A5;
+	  *(b5 + 1) = A13;
+	  *(b7 + 0) = A7;
+	  *(b7 + 1) = A15;
+	}
+	buffer += 8;
+      }
+
+      a += 4 * lda;
+    }
+  } //if (n & 4)
+
+  if (n & 2) {
+    piv = ipiv;
+
+    a1 = a + (k1 + 1) * 2;
+    a3 = a1 + lda;
+
+    ip1 = *(piv + 0) * 2;
+    ip2 = *(piv + 1) * 2;
+    piv += 2;
+
+    b1 = a + ip1;
+    b2 = a + ip2;
+
+    b3 = b1 + lda;
+    b4 = b2 + lda;
+
+    i = ((k2 - k1) >> 1);
+
+    if (i > 0) {
+      do {
+	A1 = *(a1 + 0);
+	A2 = *(a1 + 1);
+	A3 = *(a2 + 0);
+	A4 = *(a2 + 1);
+	A5 = *(a3 + 0);
+	A6 = *(a3 + 1);
+	A7 = *(a4 + 0);
+	A8 = *(a4 + 1);
+
+	B1 = *(b1 + 0);
+	B2 = *(b1 + 1);
+	B3 = *(b2 + 0);
+	B4 = *(b2 + 1);
+	B5 = *(b3 + 0);
+	B6 = *(b3 + 1);
+	B7 = *(b4 + 0);
+	B8 = *(b4 + 1);
+
+	ip1 = *(piv + 0) * 2;
+	ip2 = *(piv + 1) * 2;
+	piv += 2;
+
+	if (b1 == a1) {
+	  if (b2 == a2) {
+	    *(buffer + 0) = A1;
+	    *(buffer + 1) = A2;
+	    *(buffer + 2) = A5;
+	    *(buffer + 3) = A6;
+	    *(buffer + 4) = A3;
+	    *(buffer + 5) = A4;
+	    *(buffer + 6) = A7;
+	    *(buffer + 7) = A8;
+	  } else {
+	    *(buffer + 0) = A1;
+	    *(buffer + 1) = A2;
+	    *(buffer + 2) = A5;
+	    *(buffer + 3) = A6;
+	    *(buffer + 4) = B3;
+	    *(buffer + 5) = B4;
+	    *(buffer + 6) = B7;
+	    *(buffer + 7) = B8;
+
+	    *(b2 + 0) = A3;
+	    *(b2 + 1) = A4;
+	    *(b4 + 0) = A7;
+	    *(b4 + 1) = A8;
+	  }
+	} else {
+	  if (b1 == a2) {
+	    if (b2 == a2) {
+	      *(buffer + 0) = A3;
+	      *(buffer + 1) = A4;
+	      *(buffer + 2) = A7;
+	      *(buffer + 3) = A8;
+	      *(buffer + 4) = A1;
+	      *(buffer + 5) = A2;
+	      *(buffer + 6) = A5;
+	      *(buffer + 7) = A6;
+	    } else {
+	      *(buffer + 0) = A3;
+	      *(buffer + 1) = A4;
+	      *(buffer + 2) = A7;
+	      *(buffer + 3) = A8;
+	      *(buffer + 4) = B3;
+	      *(buffer + 5) = B4;
+	      *(buffer + 6) = B7;
+	      *(buffer + 7) = B8;
+
+	      *(b2 + 0) = A1;
+	      *(b2 + 1) = A2;
+	      *(b4 + 0) = A5;
+	      *(b4 + 1) = A6;
+	    }
+	  } else {
+	    if (b2 == a2) {
+	      *(buffer + 0) = B1;
+	      *(buffer + 1) = B2;
+	      *(buffer + 2) = B5;
+	      *(buffer + 3) = B6;
+	      *(buffer + 4) = A3;
+	      *(buffer + 5) = A4;
+	      *(buffer + 6) = A7;
+	      *(buffer + 7) = A8;
+
+	      *(b1 + 0) = A1;
+	      *(b1 + 1) = A2;
+	      *(b3 + 0) = A5;
+	      *(b3 + 1) = A6;
+	    } else {
+	      if (b2 == b1) {
+		*(buffer + 0) = B1;
+		*(buffer + 1) = B2;
+		*(buffer + 2) = B5;
+		*(buffer + 3) = B6;
+		*(buffer + 4) = A1;
+		*(buffer + 5) = A2;
+		*(buffer + 6) = A5;
+		*(buffer + 7) = A6;
+
+		*(b1 + 0) = A3;
+		*(b1 + 1) = A4;
+		*(b3 + 0) = A7;
+		*(b3 + 1) = A8;
+	      } else {
+		*(buffer + 0) = B1;
+		*(buffer + 1) = B2;
+		*(buffer + 2) = B5;
+		*(buffer + 3) = B6;
+		*(buffer + 4) = B3;
+		*(buffer + 5) = B4;
+		*(buffer + 6) = B7;
+		*(buffer + 7) = B8;
+		*(b1 + 0) = A1;
+		*(b1 + 1) = A2;
+		*(b2 + 0) = A3;
+		*(b2 + 1) = A4;
+		*(b3 + 0) = A5;
+		*(b3 + 1) = A6;
+		*(b4 + 0) = A7;
+		*(b4 + 1) = A8;
+	      }
+	    }
+	  }
+	  }
+
+	  buffer += 8;
+
+	  b1 = a + ip1;
+	  b2 = a + ip2;
+
+	  b3 = b1 + lda;
+	  b4 = b2 + lda;
+
+	  a1 += 4;
+	  a3 += 4;
+
+	  i --;
+      } while (i > 0);
+    }
+
+    i = ((k2 - k1) & 1);
+
+    if (i > 0) {
+      A1 = *(a1 + 0);
+      A2 = *(a1 + 1);
+      B1 = *(b1 + 0);
+      B2 = *(b1 + 1);
+      A3 = *(a3 + 0);
+      A4 = *(a3 + 1);
+      B3 = *(b3 + 0);
+      B4 = *(b3 + 1);
+
+      if (a1 == b1) {
+	*(buffer + 0) = A1;
+	*(buffer + 1) = A2;
+	*(buffer + 2) = A3;
+	*(buffer + 3) = A4;
+
+      } else {
+	*(buffer + 0) = B1;
+	*(buffer + 1) = B2;
+	*(buffer + 2) = B3;
+	*(buffer + 3) = B4;
+	*(b1 + 0) = A1;
+	*(b1 + 1) = A2;
+	*(b3 + 0) = A3;
+	*(b3 + 1) = A4;
+      }
+      buffer += 4;
+    }
+
+    a += 2 * lda;
+  }
+
+  if (n & 1) {
+    piv = ipiv;
+
+    a1 = a + (k1 + 1) * 2;
+
+    ip1 = *(piv + 0) * 2;
+    ip2 = *(piv + 1) * 2;
+    piv += 2;
+
+    b1 = a + ip1;
+    b2 = a + ip2;
+
+    i = ((k2 - k1) >> 1);
+
+    if (i > 0) {
+      do {
+	A1 = *(a1 + 0);
+	A2 = *(a1 + 1);
+	A3 = *(a2 + 0);
+	A4 = *(a2 + 1);
+	B1 = *(b1 + 0);
+	B2 = *(b1 + 1);
+	B3 = *(b2 + 0);
+	B4 = *(b2 + 1);
+
+	ip1 = *(piv + 0) * 2;
+	ip2 = *(piv + 1) * 2;
+	piv += 2;
+
+	if (b1 == a1) {
+	  if (b2 == a2) {
+	    *(buffer + 0) = A1;
+	    *(buffer + 1) = A2;
+	    *(buffer + 2) = A3;
+	    *(buffer + 3) = A4;
+	  } else {
+	    *(buffer + 0) = A1;
+	    *(buffer + 1) = A2;
+	    *(buffer + 2) = B3;
+	    *(buffer + 3) = B4;
+
+	    *(b2 + 0) = A3;
+	    *(b2 + 1) = A4;
+	  }
+	} else
+	  if (b1 == a2) {
+	    if (b2 == a2) {
+	      *(buffer + 0) = A3;
+	      *(buffer + 1) = A4;
+	      *(buffer + 2) = A1;
+	      *(buffer + 3) = A2;
+	    } else {
+	      *(buffer + 0) = A3;
+	      *(buffer + 1) = A4;
+	      *(buffer + 2) = B3;
+	      *(buffer + 3) = B4;
+	      *(b2 + 0) = A1;
+	      *(b2 + 1) = A2;
+	    }
+	  } else {
+	    if (b2 == a2) {
+	      *(buffer + 0) = B1;
+	      *(buffer + 1) = B2;
+	      *(buffer + 2) = A3;
+	      *(buffer + 3) = A4;
+	      *(b1 + 0) = A1;
+	      *(b1 + 1) = A2;
+	    } else
+	      if (b2 == b1) {
+		*(buffer + 0) = B1;
+		*(buffer + 1) = B2;
+		*(buffer + 2) = A1;
+		*(buffer + 3) = A2;
+		*(b1 + 0) = A3;
+		*(b1 + 1) = A4;
+	      } else {
+		*(buffer + 0) = B1;
+		*(buffer + 1) = B2;
+		*(buffer + 2) = B3;
+		*(buffer + 3) = B4;
+		*(b1 + 0) = A1;
+		*(b1 + 1) = A2;
+		*(b2 + 0) = A3;
+		*(b2 + 1) = A4;
+	      }
+	  }
+
+	buffer += 4;
+
+	b1 = a + ip1;
+	b2 = a + ip2;
+
+	a1 += 4;
+
+	i --;
+      } while (i > 0);
+    }
+
+    i = ((k2 - k1) & 1);
+
+    if (i > 0) {
+      A1 = *(a1 + 0);
+      A2 = *(a1 + 1);
+      B1 = *(b1 + 0);
+      B2 = *(b1 + 1);
+
+      if (a1 == b1) {
+	*(buffer + 0) = A1;
+	*(buffer + 1) = A2;
+      } else {
+	*(buffer + 0) = B1;
+	*(buffer + 1) = B2;
+	*(b1 + 0) = A1;
+	*(b1 + 1) = A2;
+      }
+      // buffer += 2;
+    }
+  }
+
+  return 0;
+}
+
diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B
new file mode 100644
index 000000000..d8690682f
--- /dev/null
+++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B
@@ -0,0 +1,199 @@
+SAMAXKERNEL  = amax_vector.c
+DAMAXKERNEL  = amax_vector.c
+CAMAXKERNEL  = zamax_vector.c
+ZAMAXKERNEL  = zamax_vector.c
+
+SAMINKERNEL  = amin_vector.c
+DAMINKERNEL  = amin_vector.c
+CAMINKERNEL  = zamin_vector.c
+ZAMINKERNEL  = zamin_vector.c
+
+SMAXKERNEL   = max_vector.c
+DMAXKERNEL   = max_vector.c
+
+SMINKERNEL   = min_vector.c
+DMINKERNEL   = min_vector.c
+
+ISAMAXKERNEL = iamax_vector.c
+IDAMAXKERNEL = iamax_vector.c
+ICAMAXKERNEL = izamax_vector.c
+IZAMAXKERNEL = izamax_vector.c
+
+ISAMINKERNEL = iamin_vector.c
+IDAMINKERNEL = iamin_vector.c
+ICAMINKERNEL = izamin_vector.c
+IZAMINKERNEL = izamin_vector.c
+
+ISMAXKERNEL  = imax_vector.c
+IDMAXKERNEL  = imax_vector.c
+
+ISMINKERNEL  = imin_vector.c
+IDMINKERNEL  = imin_vector.c
+
+SASUMKERNEL  = asum_vector.c
+DASUMKERNEL  = asum_vector.c
+CASUMKERNEL  = zasum_vector.c
+ZASUMKERNEL  = zasum_vector.c
+
+SSUMKERNEL  = sum_vector.c
+DSUMKERNEL  = sum_vector.c
+CSUMKERNEL  = zsum_vector.c
+ZSUMKERNEL  = zsum_vector.c
+
+SAXPYKERNEL  = axpy_vector.c
+DAXPYKERNEL  = axpy_vector.c
+CAXPYKERNEL  = zaxpy_vector.c
+ZAXPYKERNEL  = zaxpy_vector.c
+
+SCOPYKERNEL  = copy_vector.c
+DCOPYKERNEL  = copy_vector.c
+CCOPYKERNEL  = zcopy_vector.c
+ZCOPYKERNEL  = zcopy_vector.c
+
+SDOTKERNEL   = dot_vector.c
+DDOTKERNEL   = dot_vector.c
+CDOTKERNEL   = zdot_vector.c
+ZDOTKERNEL   = zdot_vector.c
+DSDOTKERNEL  = ../generic/dot.c
+
+SNRM2KERNEL  = nrm2_vector.c
+DNRM2KERNEL  = nrm2_vector.c
+CNRM2KERNEL  = znrm2_vector.c
+ZNRM2KERNEL  = znrm2_vector.c
+
+SROTKERNEL   = rot_vector.c
+DROTKERNEL   = rot_vector.c
+CROTKERNEL   = zrot_vector.c
+ZROTKERNEL   = zrot_vector.c
+
+SSCALKERNEL  = scal_vector.c
+DSCALKERNEL  = scal_vector.c
+CSCALKERNEL  = zscal_vector.c
+ZSCALKERNEL  = zscal_vector.c
+
+SSWAPKERNEL  = swap_vector.c
+DSWAPKERNEL  = swap_vector.c
+CSWAPKERNEL  = zswap_vector.c
+ZSWAPKERNEL  = zswap_vector.c
+
+SGEMVNKERNEL = gemv_n_vector.c
+DGEMVNKERNEL = gemv_n_vector.c
+CGEMVNKERNEL = zgemv_n_vector.c
+ZGEMVNKERNEL = zgemv_n_vector.c
+
+SGEMVTKERNEL = gemv_t_vector.c
+DGEMVTKERNEL = gemv_t_vector.c
+CGEMVTKERNEL = zgemv_t_vector.c
+ZGEMVTKERNEL = zgemv_t_vector.c
+
+STRMMKERNEL	= strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
+DTRMMKERNEL	= dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
+
+SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
+DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
+CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
+ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+SSYMV_U_KERNEL =  symv_U_vector.c
+SSYMV_L_KERNEL =  symv_L_vector.c
+DSYMV_U_KERNEL =  symv_U_vector.c
+DSYMV_L_KERNEL =  symv_L_vector.c
+CSYMV_U_KERNEL =  ../generic/zsymv_k.c
+CSYMV_L_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
+
+CHEMV_L_KERNEL =  zhemv_LM_vector.c
+CHEMV_M_KERNEL =  zhemv_LM_vector.c
+CHEMV_U_KERNEL =  zhemv_UV_vector.c
+CHEMV_V_KERNEL =  zhemv_UV_vector.c
+ZHEMV_L_KERNEL =  zhemv_LM_vector.c
+ZHEMV_M_KERNEL =  zhemv_LM_vector.c
+ZHEMV_U_KERNEL =  zhemv_UV_vector.c
+ZHEMV_V_KERNEL =  zhemv_UV_vector.c
+
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL	= ../generic/cabs.c
+DCABS_KERNEL	= ../generic/cabs.c
+QCABS_KERNEL	= ../generic/cabs.c
+
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c
index 1b7799340..81a39af32 100644
--- a/kernel/riscv64/amax_vector.c
+++ b/kernel/riscv64/amax_vector.c
@@ -28,36 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFABS_FLOAT     JOIN(__riscv_vfabs,      _v_f,  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -65,103 +66,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT maxf=0.0;
 	if (n <= 0 || inc_x <= 0) return(maxf);
         unsigned int gvl = 0;
-        FLOAT_V_T v0, v1, v_max;
-        FLOAT_V_T_M1 v_res, v_zero;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_zero = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T v0, v1;
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);
 
-        MASK_T mask0, mask1;
-        FLOAT zero = 0.0;
         if(inc_x == 1){
                 gvl = VSETVL(n);
                 if(gvl <= n/2){
-                        v_max = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
                                 v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
-
-                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
+                                v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
+                                v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
                                 j += gvl*2;
                         }
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
-                        maxf = *((FLOAT*)&v_res);
-                        //maxf = v_res[0];
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                        v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
-                        if(*((FLOAT*)&v_res) > maxf)
-                                maxf = *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
@@ -169,94 +95,27 @@ asm volatile(
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/2){
                         BLASLONG inc_xv = inc_x * gvl;
-                        v_max = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
-
                                 v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
+                                v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
+                                v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
                                 j += gvl*2;
                                 ix += inc_xv*2;
                         }
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
-                        maxf = *((FLOAT*)&v_res);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                        v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
-                        if(*((FLOAT*)&v_res) > maxf)
-                                maxf = *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
+
+        maxf = EXTRACT_FLOAT(v_res);
 	return(maxf);
 }
 
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
index f9b7defae..c8ba75f4a 100644
--- a/kernel/riscv64/amin_vector.c
+++ b/kernel/riscv64/amin_vector.c
@@ -26,232 +26,100 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#include <math.h>
-#include <float.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFABS_FLOAT     JOIN(__riscv_vfabs,      _v_f,  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	if (n <= 0 || inc_x <= 0) return(0.0);
-	FLOAT minf=FLT_MAX;
+        BLASLONG i=0, j=0;
+        BLASLONG ix=0;
+        FLOAT minf=0.0;
+        if (n <= 0 || inc_x <= 0) return(minf);
+
+        minf = *x;
+        x += inc_x;
+        --n;
+        if (n == 0) return(minf);
+
         unsigned int gvl = 0;
-        FLOAT_V_T v0, v1, v_min;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T v0, v1;
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(minf, 1);
 
-        MASK_T mask0, mask1;
-	    FLOAT zero = 0.0;
         if(inc_x == 1){
                 gvl = VSETVL(n);
                 if(gvl <= n/2){
-                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-                                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
-
                                 v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
+                                v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
+                                v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
                                 j += gvl*2;
                         }
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        minf = *((FLOAT*)&v_res);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-                        v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-                        if(*((FLOAT*)&v_res) < minf)
-                                minf = *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/2){
-                        BLASLONG idx = 0, inc_xv = inc_x * gvl;
-                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        BLASLONG inc_xv = inc_x * gvl;
                         for(i=0,j=0; i<n/(gvl*2); i++){
-                                v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-                                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
-
-                                v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
+                                v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
+                                v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
                                 j += gvl*2;
-                                idx += inc_xv*2;
+                                ix += inc_xv*2;
                         }
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        minf = *((FLOAT*)&v_res);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-                        v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-                        if(*((FLOAT*)&v_res) < minf)
-                                minf = *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
-	return(minf);
-}
-
 
+        minf = EXTRACT_FLOAT(v_res);
+        return(minf);
+}
diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c
index fc73362bc..d10bf99e6 100644
--- a/kernel/riscv64/asum_vector.c
+++ b/kernel/riscv64/asum_vector.c
@@ -28,35 +28,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFABS_FLOAT     JOIN(__riscv_vfabs,     _v_f,   ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -64,75 +67,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT asumf=0.0;
 	if (n <= 0 || inc_x <= 0) return(asumf);
         unsigned int gvl = 0;
-        FLOAT_V_T v0, v1, v_zero,v_sum;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T v0, v1, v_sum;
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);
 
-        MASK_T mask0, mask1;
         if(inc_x == 1){
                 gvl = VSETVL(n);
-                v_zero = VFMVVF_FLOAT(0, gvl);
                 if(gvl <= n/2){
                         v_sum = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
                                 v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                 j += gvl * 2;
                         }
-                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                        asumf += *((FLOAT*)&v_res);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-                        asumf += *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
                 gvl = VSETVL(n);
                 unsigned int stride_x = inc_x * sizeof(FLOAT);
-                v_zero = VFMVVF_FLOAT(0, gvl);
                 if(gvl <= n/2){
                         v_sum = VFMVVF_FLOAT(0, gvl);
                         BLASLONG inc_xv = inc_x * gvl;
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
                                 v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                 j += gvl * 2;
                                 inc_xv += inc_xv * 2;
                         }
-                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                        asumf += *((FLOAT*)&v_res);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-                        asumf += *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
+        asumf = EXTRACT_FLOAT(v_res);
 	return(asumf);
 }
 
diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c
index 676dfd474..b77cb58fb 100644
--- a/kernel/riscv64/axpby_vector.c
+++ b/kernel/riscv64/axpby_vector.c
@@ -27,28 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
+#       define LMUL m4
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMACCVF_FLOAT  JOIN(__riscv_vfmacc,    _vf_f,  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMULVF_FLOAT   JOIN(__riscv_vfmul,     _vf_f,  ELEN,   LMUL,   _)
+
 int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
 {
 	if (n < 0)  return(0);
diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c
index 6f921f2d6..3447107a6 100644
--- a/kernel/riscv64/axpy_vector.c
+++ b/kernel/riscv64/axpy_vector.c
@@ -25,26 +25,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
+
 #include "common.h"
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#       define LMUL m4
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,    	ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMACCVF_FLOAT  JOIN(__riscv_vfmacc,    _vf_f, 	ELEN,   LMUL,   _)
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG i=0, j=0, jx=0, jy=0;
diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c
new file mode 100644
index 000000000..7980c029a
--- /dev/null
+++ b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c
@@ -0,0 +1,1931 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=1
+ N=8
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='float'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f32m1'
+ VFMUL='__riscv_vfmul_vf_f32m1'
+ VLEV='__riscv_vle32_v_f32m1'
+ VLSEV='__riscv_vlse32_v_f32m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
+ VSETVL='__riscv_vsetvl_e32m1'
+ VSEV='__riscv_vse32_v_f32m1'
+ VSSEV='__riscv_vsse32_v_f32m1'
+ acc_vector_t='vfloat32m1_t'
+ output='cgemm_kernel_8x8_zvl256b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    #define S0  1
+    #define S1 -1
+    #define S2  1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmsac
+    #define VFMACC_RI __riscv_vfmacc
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc
+    #define VFMACC_RI __riscv_vfmsac
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc
+    #define VFMACC_RI __riscv_vfnmsac
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac
+    #define VFMACC_RI __riscv_vfnmacc
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m1(8);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C4r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C4i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C5r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C5i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C6r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C6i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C7r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C7i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
+            C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
+            C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
+            C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
+            C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
+            C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
+            C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
+            C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C4r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C4i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C5r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C5i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C6r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C6i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C7r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C7i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
+            C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
+            C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
+            C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
+            C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
+            C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
+            C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
+            C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C4r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C4i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C5r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C5i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C6r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C6i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C7r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C7i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
+            C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
+            C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
+            C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
+            C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
+            C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
+            C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
+            C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                result8+=S0*A[ai+0+0]*B[bi+8+0] + S1*A[ai+0+1]*B[bi+8+1];
+                result9+=S2*A[ai+0+1]*B[bi+8+0] + S3*A[ai+0+0]*B[bi+8+1];
+                result10+=S0*A[ai+0+0]*B[bi+10+0] + S1*A[ai+0+1]*B[bi+10+1];
+                result11+=S2*A[ai+0+1]*B[bi+10+0] + S3*A[ai+0+0]*B[bi+10+1];
+                result12+=S0*A[ai+0+0]*B[bi+12+0] + S1*A[ai+0+1]*B[bi+12+1];
+                result13+=S2*A[ai+0+1]*B[bi+12+0] + S3*A[ai+0+0]*B[bi+12+1];
+                result14+=S0*A[ai+0+0]*B[bi+14+0] + S1*A[ai+0+1]*B[bi+14+1];
+                result15+=S2*A[ai+0+1]*B[bi+14+0] + S3*A[ai+0+0]*B[bi+14+1];
+                ai+=1*2;
+                bi+=8*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+2*ldc+0)*2+0];
+            Ci = C[(ci+2*ldc+0)*2+1];
+            Cr += result4*alphar;
+            Ci += result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+3*ldc+0)*2+0];
+            Ci = C[(ci+3*ldc+0)*2+1];
+            Cr += result6*alphar;
+            Ci += result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+4*ldc+0)*2+0];
+            Ci = C[(ci+4*ldc+0)*2+1];
+            Cr += result8*alphar;
+            Ci += result9*alphar;
+            Cr -= result9*alphai;
+            Ci += result8*alphai;
+            C[(ci+4*ldc+0)*2+0] = Cr;
+            C[(ci+4*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+5*ldc+0)*2+0];
+            Ci = C[(ci+5*ldc+0)*2+1];
+            Cr += result10*alphar;
+            Ci += result11*alphar;
+            Cr -= result11*alphai;
+            Ci += result10*alphai;
+            C[(ci+5*ldc+0)*2+0] = Cr;
+            C[(ci+5*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+6*ldc+0)*2+0];
+            Ci = C[(ci+6*ldc+0)*2+1];
+            Cr += result12*alphar;
+            Ci += result13*alphar;
+            Cr -= result13*alphai;
+            Ci += result12*alphai;
+            C[(ci+6*ldc+0)*2+0] = Cr;
+            C[(ci+6*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+7*ldc+0)*2+0];
+            Ci = C[(ci+7*ldc+0)*2+1];
+            Cr += result14*alphar;
+            Ci += result15*alphar;
+            Cr -= result15*alphai;
+            Ci += result14*alphai;
+            C[(ci+7*ldc+0)*2+0] = Cr;
+            C[(ci+7*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                ai+=1*2;
+                bi+=4*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+2*ldc+0)*2+0];
+            Ci = C[(ci+2*ldc+0)*2+1];
+            Cr += result4*alphar;
+            Ci += result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+3*ldc+0)*2+0];
+            Ci = C[(ci+3*ldc+0)*2+1];
+            Cr += result6*alphar;
+            Ci += result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                ai+=1*2;
+                bi+=2*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                ai+=1*2;
+                bi+=1*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c
index fee5e195d..710e8670a 100644
--- a/kernel/riscv64/copy_vector.c
+++ b/kernel/riscv64/copy_vector.c
@@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG i=0, j=0;
diff --git a/kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c
new file mode 100644
index 000000000..b4f111a69
--- /dev/null
+++ b/kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c
@@ -0,0 +1,2007 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=1
+ N=8
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='float'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f32m1'
+ VFMUL='__riscv_vfmul_vf_f32m1'
+ VLEV='__riscv_vle32_v_f32m1'
+ VLSEV='__riscv_vlse32_v_f32m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
+ VSETVL='__riscv_vsetvl_e32m1'
+ VSEV='__riscv_vse32_v_f32m1'
+ VSSEV='__riscv_vsse32_v_f32m1'
+ acc_vector_t='vfloat32m1_t'
+ output='ctrmm_kernel_8x8_zvl256b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    #define S0  1
+    #define S1 -1
+    #define S2  1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmsac
+    #define VFMACC_RI __riscv_vfmacc
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc
+    #define VFMACC_RI __riscv_vfmsac
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc
+    #define VFMACC_RI __riscv_vfnmsac
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac
+    #define VFMACC_RI __riscv_vfnmacc
+#endif
+
+
+#if defined(LEFT) != defined(TRANSA)
+    #define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m1(8);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*8*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
+            vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
+            vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
+            vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
+            vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
+            vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
+            vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
+            vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*8*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
+            vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
+            vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
+            vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
+            vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
+            vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
+            vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
+            vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*8*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
+            vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
+            vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
+            vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
+            vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
+            vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
+            vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
+            vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*8*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                result8+=S0*A[ai+0+0]*B[bi+8+0] + S1*A[ai+0+1]*B[bi+8+1];
+                result9+=S2*A[ai+0+1]*B[bi+8+0] + S3*A[ai+0+0]*B[bi+8+1];
+                result10+=S0*A[ai+0+0]*B[bi+10+0] + S1*A[ai+0+1]*B[bi+10+1];
+                result11+=S2*A[ai+0+1]*B[bi+10+0] + S3*A[ai+0+0]*B[bi+10+1];
+                result12+=S0*A[ai+0+0]*B[bi+12+0] + S1*A[ai+0+1]*B[bi+12+1];
+                result13+=S2*A[ai+0+1]*B[bi+12+0] + S3*A[ai+0+0]*B[bi+12+1];
+                result14+=S0*A[ai+0+0]*B[bi+14+0] + S1*A[ai+0+1]*B[bi+14+1];
+                result15+=S2*A[ai+0+1]*B[bi+14+0] + S3*A[ai+0+0]*B[bi+14+1];
+                ai+=1*2;
+                bi+=8*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = result4*alphar;
+            Ci = result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = result6*alphar;
+            Ci = result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            Cr = result8*alphar;
+            Ci = result9*alphar;
+            Cr -= result9*alphai;
+            Ci += result8*alphai;
+            C[(ci+4*ldc+0)*2+0] = Cr;
+            C[(ci+4*ldc+0)*2+1] = Ci;
+            Cr = result10*alphar;
+            Ci = result11*alphar;
+            Cr -= result11*alphai;
+            Ci += result10*alphai;
+            C[(ci+5*ldc+0)*2+0] = Cr;
+            C[(ci+5*ldc+0)*2+1] = Ci;
+            Cr = result12*alphar;
+            Ci = result13*alphar;
+            Cr -= result13*alphai;
+            Ci += result12*alphai;
+            C[(ci+6*ldc+0)*2+0] = Cr;
+            C[(ci+6*ldc+0)*2+1] = Ci;
+            Cr = result14*alphar;
+            Ci = result15*alphar;
+            Cr -= result15*alphai;
+            Ci += result14*alphai;
+            C[(ci+7*ldc+0)*2+0] = Cr;
+            C[(ci+7*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                ai+=1*2;
+                bi+=4*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = result4*alphar;
+            Ci = result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = result6*alphar;
+            Ci = result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                ai+=1*2;
+                bi+=2*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                ai+=1*2;
+                bi+=1*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/dgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/dgemm_kernel_8x8_zvl256b.c
new file mode 100644
index 000000000..760bfc893
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_8x8_zvl256b.c
@@ -0,0 +1,860 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='double'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f64m1'
+ VFMUL='__riscv_vfmul_vf_f64m1'
+ VLEV='__riscv_vle64_v_f64m1'
+ VLSEV='__riscv_vlse64_v_f64m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+ VSETVL='__riscv_vsetvl_e64m1'
+ VSEV='__riscv_vse64_v_f64m1'
+ VSSEV='__riscv_vsse64_v_f64m1'
+ acc_vector_t='vfloat64m1_t'
+ output='dgemm_kernel_8x8_zvl256b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m1_t'
+
+*/
+
+#include "common.h"
+
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+            vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl);
+            vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl);
+            vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl);
+            vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+            vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c8 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c9 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c10 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c11 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c12 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c13 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c14 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c15 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
+            c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, gvl );
+            c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, gvl );
+            c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, gvl );
+            c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, gvl );
+            c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, gvl );
+            c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, gvl );
+            c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, gvl );
+            c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c15, gvl);
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            C[ci+4*ldc+0] += alpha * result8;
+            C[ci+4*ldc+1] += alpha * result9;
+            C[ci+5*ldc+0] += alpha * result10;
+            C[ci+5*ldc+1] += alpha * result11;
+            C[ci+6*ldc+0] += alpha * result12;
+            C[ci+6*ldc+1] += alpha * result13;
+            C[ci+7*ldc+0] += alpha * result14;
+            C[ci+7*ldc+1] += alpha * result15;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            C[ci+4*ldc+0] += alpha * result4;
+            C[ci+5*ldc+0] += alpha * result5;
+            C[ci+6*ldc+0] += alpha * result6;
+            C[ci+7*ldc+0] += alpha * result7;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                ai+=2;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                ai+=1;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            bi += 2;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            bi += 2;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                ai+=2;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                ai+=1;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            bi += 1;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            bi += 1;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                ai+=2;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                ai+=1;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c
index 46a84ad18..bf55998ca 100644
--- a/kernel/riscv64/dot.c
+++ b/kernel/riscv64/dot.c
@@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG ix=0,iy=0;
 	double dot = 0.0 ;
 
-	if ( n < 0 )  return(dot);
+	if ( n < 1 )  return(dot);
 
 	while(i < n)
 	{
diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c
index f47e0c0b5..a1841edb7 100644
--- a/kernel/riscv64/dot_vector.c
+++ b/kernel/riscv64/dot_vector.c
@@ -27,31 +27,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
 #endif
 
 #if defined(DSDOT)
@@ -63,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG i=0, j=0;
 	double dot = 0.0 ;
 
-	if ( n < 0 )  return(dot);
+	if ( n < 1 )  return(dot);
 
         FLOAT_V_T vr, vx, vy;
         unsigned int gvl = 0;
@@ -82,8 +80,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
                 //tail
                 if(j < n){
@@ -93,13 +91,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
                         //vr = VFDOTVV_FLOAT(vx, vy, gvl);
                         vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
         }else if(inc_y == 1){
                 gvl = VSETVL(n);
                 vr = VFMVVF_FLOAT(0, gvl);
-                 int stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
                         vy = VLEV_FLOAT(&y[j], gvl);
@@ -107,9 +105,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
                 //tail
                 if(j < n){
@@ -119,14 +116,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
                         //vr = VFDOTVV_FLOAT(vx, vy, gvl);
                         vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
         }else if(inc_x == 1){
                 gvl = VSETVL(n);
                 vr = VFMVVF_FLOAT(0, gvl);
-                 int stride_y = inc_y * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
                         vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
@@ -134,9 +130,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
                 //tail
                 if(j < n){
@@ -146,15 +141,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
                         //vr = VFDOTVV_FLOAT(vx, vy, gvl);
                         vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
         }else{
                 gvl = VSETVL(n);
                 vr = VFMVVF_FLOAT(0, gvl);
-                 int stride_x = inc_x * sizeof(FLOAT);
-                 int stride_y = inc_y * sizeof(FLOAT);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
                         vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
@@ -162,9 +156,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
                 //tail
                 if(j < n){
@@ -174,9 +167,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
                         //vr = VFDOTVV_FLOAT(vx, vy, gvl);
                         vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
         }
 	return(dot);
diff --git a/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c
new file mode 100644
index 000000000..b1739f248
--- /dev/null
+++ b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c
@@ -0,0 +1,1068 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='double'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f64m1'
+ VFMUL='__riscv_vfmul_vf_f64m1'
+ VLEV='__riscv_vle64_v_f64m1'
+ VLSEV='__riscv_vlse64_v_f64m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+ VSETVL='__riscv_vsetvl_e64m1'
+ VSEV='__riscv_vse64_v_f64m1'
+ VSSEV='__riscv_vsse64_v_f64m1'
+ acc_vector_t='vfloat64m1_t'
+ output='dtrmm_kernel_8x8_zvl256b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if defined(LEFT) != defined(TRANSA)
+    #define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+            vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl);
+            vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl);
+            vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl);
+            vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+            vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
+            vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
+            vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
+            vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
+            vfloat64m1_t c8 = __riscv_vfmul_vf_f64m1( result8, alpha, gvl );
+            vfloat64m1_t c9 = __riscv_vfmul_vf_f64m1( result9, alpha, gvl );
+            vfloat64m1_t c10 = __riscv_vfmul_vf_f64m1( result10, alpha, gvl );
+            vfloat64m1_t c11 = __riscv_vfmul_vf_f64m1( result11, alpha, gvl );
+            vfloat64m1_t c12 = __riscv_vfmul_vf_f64m1( result12, alpha, gvl );
+            vfloat64m1_t c13 = __riscv_vfmul_vf_f64m1( result13, alpha, gvl );
+            vfloat64m1_t c14 = __riscv_vfmul_vf_f64m1( result14, alpha, gvl );
+            vfloat64m1_t c15 = __riscv_vfmul_vf_f64m1( result15, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c15, gvl);
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
+            vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
+            vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
+            vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            C[ci+2*ldc+0] = alpha * result4;
+            C[ci+2*ldc+1] = alpha * result5;
+            C[ci+3*ldc+0] = alpha * result6;
+            C[ci+3*ldc+1] = alpha * result7;
+            C[ci+4*ldc+0] = alpha * result8;
+            C[ci+4*ldc+1] = alpha * result9;
+            C[ci+5*ldc+0] = alpha * result10;
+            C[ci+5*ldc+1] = alpha * result11;
+            C[ci+6*ldc+0] = alpha * result12;
+            C[ci+6*ldc+1] = alpha * result13;
+            C[ci+7*ldc+0] = alpha * result14;
+            C[ci+7*ldc+1] = alpha * result15;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            C[ci+2*ldc+0] = alpha * result2;
+            C[ci+3*ldc+0] = alpha * result3;
+            C[ci+4*ldc+0] = alpha * result4;
+            C[ci+5*ldc+0] = alpha * result5;
+            C[ci+6*ldc+0] = alpha * result6;
+            C[ci+7*ldc+0] = alpha * result7;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
+            vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
+            vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
+            vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                ai+=2;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            C[ci+2*ldc+0] = alpha * result4;
+            C[ci+2*ldc+1] = alpha * result5;
+            C[ci+3*ldc+0] = alpha * result6;
+            C[ci+3*ldc+1] = alpha * result7;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                ai+=1;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            C[ci+2*ldc+0] = alpha * result2;
+            C[ci+3*ldc+0] = alpha * result3;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            bi += 2;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            bi += 2;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                ai+=2;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                ai+=1;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            bi += 1;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            bi += 1;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                ai+=2;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                ai+=1;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c
index bb9ab8e5a..34e21ccda 100644
--- a/kernel/riscv64/gemv_n_vector.c
+++ b/kernel/riscv64/gemv_n_vector.c
@@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c
index 7d0b70cbb..3764c3b64 100644
--- a/kernel/riscv64/gemv_t_vector.c
+++ b/kernel/riscv64/gemv_t_vector.c
@@ -27,107 +27,102 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
+#define VSETVL(n) __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T vfloat32m2_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m2
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2
+#define xint_t int
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
+#define VSETVL(n) __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T vfloat64m2_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m2
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2
+#define xint_t long long
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
-	    BLASLONG i = 0, j = 0, k = 0;
-	    BLASLONG ix = 0, iy = 0;
-	    FLOAT *a_ptr = a;
+	BLASLONG i = 0, j = 0, k = 0;
+	BLASLONG ix = 0, iy = 0;
+	FLOAT *a_ptr = a;
         FLOAT temp;
 
         FLOAT_V_T va, vr, vx;
-        unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        BLASLONG gvl = 0;
+        FLOAT_V_T_M1 v_res;
+
 
         if(inc_x == 1){
                 for(i = 0; i < n; i++){
+                        v_res = VFMVVF_FLOAT_M1(0, 1);
                         gvl = VSETVL(m);
                         j = 0;
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < m/gvl; k++){
                                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                                 vx = VLEV_FLOAT(&x[j], gvl);
-                                vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
+                                vr = VFMULVV_FLOAT(va, vx, gvl);                // could vfmacc here and reduce outside loop
+                                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);         // but that reordering diverges far enough from scalar path to make tests fail
                                 j += gvl;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        temp = (FLOAT)VFMVFS_FLOAT(v_res);
                         if(j < m){
                                 gvl = VSETVL(m-j);
                                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                                 vx = VLEV_FLOAT(&x[j], gvl);
                                 vr = VFMULVV_FLOAT(va, vx, gvl);
-
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp += (FLOAT)VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
                         }
+                        temp = (FLOAT)EXTRACT_FLOAT(v_res);
                         y[iy] += alpha * temp;
+
+
                         iy += inc_y;
                         a_ptr += lda;
                 }
         }else{
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
-                
                 for(i = 0; i < n; i++){
+                        v_res = VFMVVF_FLOAT_M1(0, 1);
                         gvl = VSETVL(m);
-						BLASLONG inc_xv = inc_x * gvl;
                         j = 0;
                         ix = 0;
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < m/gvl; k++){
                                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                                vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
+                                vr = VFMULVV_FLOAT(va, vx, gvl);
+                                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
                                 j += gvl;
-                                ix += inc_xv;
+                                ix += inc_x * gvl;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        temp = (FLOAT)VFMVFS_FLOAT(v_res);
                         if(j < m){
                                 gvl = VSETVL(m-j);
                                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                 vr = VFMULVV_FLOAT(va, vx, gvl);
-
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp += (FLOAT)VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
                         }
+                        temp = (FLOAT)EXTRACT_FLOAT(v_res);
                         y[iy] += alpha * temp;
+
+
                         iy += inc_y;
                         a_ptr += lda;
                 }
         }
+
+
 	return(0);
 }
-
diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py
new file mode 100755
index 000000000..e2ce97971
--- /dev/null
+++ b/kernel/riscv64/generate_kernel.py
@@ -0,0 +1,670 @@
+#!/usr/bin/python3
+
+import sys, os
+import contextlib
+
+#-----------------------------------------------------------------------
+def ERROR(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+    sys.exit(-1)
+
+class Target(object):
+    def __init__( self, out, mappings, initial_level=0, tab_width=4 ):
+        self._level = initial_level
+        self._tab_width = tab_width
+        self._out = out
+        self._mappings = mappings
+
+    @contextlib.contextmanager
+    def map( self, **items ):
+        old_mappings = self._mappings
+        self._mappings = dict(old_mappings, **items)
+        yield self._mappings
+        self._mappings = old_mappings
+
+    @contextlib.contextmanager
+    def block( self, start=None, end=None, **args ):
+        with self.map(**args):
+            if start is not None:
+                self.write();
+                self.write(start)
+            self._level += 1
+            yield self._level
+            self._level -= 1
+            if end is not None:
+                self.write(end)
+                self.write()
+
+    def write( self, fmt=None, *args, **kwargs ):
+        if fmt is not None:
+            mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings
+            self._out(self._indent_str() + fmt.format(*args, **mappings))
+        else:
+            self._out("")
+
+    def _indent_str( self ):
+        return ' ' * (self._level * self._tab_width)
+
+#-----------------------------------------------------------------------
+def generate_trmm_block( dest ):
+    dest.write("{index_type} pass_K = K;")
+    dest.write("#ifdef LEFT")
+    with dest.block():
+        dest.write("{index_type} off = offset + m_top;")
+    dest.write("#else")
+    with dest.block():
+        dest.write("{index_type} off = -offset + n_top;")
+    dest.write("#endif")
+
+    dest.write("#ifdef BACKWARDS")
+    with dest.block():
+        dest.write("ai += off*{M}{elt_size};")
+        dest.write("bi += off*{N}{elt_size};")
+        dest.write("pass_K -= off;")
+    dest.write("#else")
+    with dest.block():
+        dest.write("#ifdef LEFT")
+        with dest.block():
+            dest.write("pass_K = off + {M};")
+        dest.write("#else")
+        with dest.block():
+            dest.write("pass_K = off + {N};")
+        dest.write("#endif")
+    dest.write("#endif")
+
+#-----------------------------------------------------------------------
+def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ):
+    TRMM           = (settings['op'].value == 'trmm')
+    narrow_result  = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value
+
+    with dest.map( 
+        M=M, 
+        N=N, 
+    ):
+        dest.write("{index_type} ai=m_top*K{elt_size};")
+        dest.write("{index_type} bi=n_top*K{elt_size};")
+        if TRMM:
+            generate_trmm_block( dest )
+
+        for i in range(N):
+            dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i)
+        dest.write("bi += {N};")
+        dest.write()
+
+        for i in range(a_regs):
+            dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i)
+        dest.write("ai += {M};")
+        dest.write()
+
+        for j in range(N):
+            for i in range(a_regs):
+                dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j)
+
+        with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
+            for i in range(N):
+                dest.write("B{i} = B[bi+{i}];", i=i )
+            dest.write("bi += {N};")
+            dest.write()
+
+            for i in range(a_regs):
+                dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i)
+
+            dest.write("ai += {M};")
+            dest.write()
+
+
+            for j in range(N):
+                for i in range(a_regs):
+                    dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i )
+
+        dest.write()
+        dest.write("{index_type} ci=n_top*ldc+m_top;")
+        dest.write()
+
+        if narrow_result:
+            for j in range(N):
+                for i in range(a_regs):
+                    dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i)
+
+        if not TRMM:
+            for j in range(N):
+                for i in range(a_regs):
+                    idx = j*a_regs+i
+                    increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
+                    if idx == N*a_regs-1:
+                        increment = ''
+                    dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment)
+
+        if narrow_result:
+            for j in range(N):
+                for i in range(a_regs):
+                    idx = j*a_regs+i
+                    if TRMM:
+                        dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx)
+                    else:
+                        dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx)
+        else:
+            for j in range(N):
+                for i in range(a_regs):
+                    idx = j*a_regs+i
+                    if TRMM:
+                        dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx)
+                    else:
+                        dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx)
+            
+
+        if not TRMM:
+            dest.write()
+            dest.write("ci=n_top*ldc+m_top;")
+            dest.write()
+
+        for j in range(N):
+            for i in range(a_regs):
+                idx = j*a_regs+i
+                increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
+                if idx == N*a_regs-1:
+                    increment = ''
+                dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment)
+
+
+#-----------------------------------------------------------------------
+def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ):
+    TRMM           = (settings['op'].value == 'trmm')
+    narrow_result  = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value
+
+    if narrow_result:
+        raise RuntimeError("wide accumulator not supported for generated complex kernels")
+        # we could, but we run out of registers really really fast
+
+    with dest.map( 
+        M=M, 
+        N=N, 
+    ):
+        dest.write("{index_type} ai=m_top*K*2;")
+        dest.write("{index_type} bi=n_top*K*2;")
+        if TRMM:
+            generate_trmm_block( dest )
+
+        for i in range(N):
+            dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i)
+            dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i)
+        dest.write("bi += {N}*2;")
+        dest.write()
+
+        for i in range(a_regs):
+            dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
+            dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)
+        dest.write("ai += {M}*2;")
+        dest.write()
+
+
+        accumulation_regs = a_regs * N * settings['LMUL_ACC'].value
+        dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k",
+                a_regs=a_regs*2, accumulation_regs=accumulation_regs*2
+            )
+        pass_regs = (accumulation_regs + a_regs)*2
+        tmp_regs = 32-pass_regs
+        if tmp_regs < 2:
+            raise RuntimeError("Complex kernel would use too many registers!")
+
+        dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs)
+
+        tmp_unroll_i = min(tmp_regs, a_regs)
+        tmp_unroll_j = N
+        while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j:
+            tmp_unroll_j = int(tmp_unroll_j / 2)
+
+        if tmp_unroll_i < a_regs or tmp_unroll_j < N:
+            dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i)
+
+        for tj in range(0, N, tmp_unroll_j):
+            for ti in range(0, a_regs, tmp_unroll_i):
+                for j in range(tj, tj+tmp_unroll_j):
+                    for i in range(ti, ti+tmp_unroll_i):
+                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                            if ti == 0 and tj==0:
+                                dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
+                                dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
+                            else:
+                                dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
+                                dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
+                for j in range(tj, tj+tmp_unroll_j):
+                    for i in range(ti, ti+tmp_unroll_i):
+                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                            dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
+                            dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")
+
+                for j in range(tj, tj+tmp_unroll_j):
+                    for i in range(ti, ti+tmp_unroll_i):
+                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                            dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;")
+                            dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;")
+
+        with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
+            for i in range(N):
+                dest.write("B{i}r = B[bi+{i}*2+0];", i=i)
+                dest.write("B{i}i = B[bi+{i}*2+1];", i=i)
+            dest.write("bi += {N}*2;")
+            dest.write()
+
+            for i in range(a_regs):
+                dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
+                dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)
+
+            dest.write("ai += {M}*2;")
+            dest.write()
+
+
+            for tj in range(0, N, tmp_unroll_j):
+                for ti in range(0, a_regs, tmp_unroll_i):
+                    # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign
+                    # so accumulating them directly to ACC would lose precision when ACC is larger
+
+                    for j in range(tj, tj+tmp_unroll_j):
+                        for i in range(ti, ti+tmp_unroll_i):
+                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                                dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
+                                dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
+                    for j in range(tj, tj+tmp_unroll_j):
+                        for i in range(ti, ti+tmp_unroll_i):
+                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                                dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
+                                dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")
+                    for j in range(tj, tj+tmp_unroll_j):
+                        for i in range(ti, ti+tmp_unroll_i):
+                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                                dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);")
+                                dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);")
+
+        dest.write()
+        dest.write("{index_type} ci=n_top*ldc+m_top;")
+        dest.write()
+
+        for j in range(N):
+            if TRMM:
+                for i in range(a_regs):
+                    with dest.map(idx=j*a_regs+i):
+                        dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );")
+                        dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );")
+            else:
+                for i in range(a_regs):
+                    idx = j*a_regs+i
+                    increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
+                    if idx == N*a_regs-1:
+                        increment = ''                    
+                    with dest.map(idx=j*a_regs+i, increment=increment):
+                        dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );")
+                        dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );")
+                        dest.write("{increment}")
+
+        if not TRMM:
+            for j in range(N):
+                for i in range(a_regs):
+                    with dest.map(idx=j*a_regs+i):
+                        dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );")
+                        dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );")
+
+        for j in range(N):
+            for i in range(a_regs):
+                with dest.map(idx=j*a_regs+i):
+                    dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );")
+                    dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );")
+
+        if not TRMM:
+            dest.write()
+            dest.write("ci=n_top*ldc+m_top;")
+            dest.write()
+
+        for j in range(N):
+            for i in range(a_regs):
+                idx = j*a_regs+i
+                increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
+                if idx == N*a_regs-1:
+                    increment = ''                    
+                with dest.map(idx=j*a_regs+i, increment=increment):
+                    dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);")
+                    dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);")
+                    dest.write("{increment}")
+
+#-----------------------------------------------------------------------
+def generate_gemm_kernel( settings, OUTPUT ):
+    if settings['conjugate'].value:
+        ERROR('conjugate gemm not yet supported')
+
+    is_complex = settings['complex'].value
+    generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real
+    dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings })
+
+    M = settings['M'].value
+    N = settings['N'].value
+    vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value )
+    a_regs = max(int(M/vlenmax), 1)
+
+    accumulation_regs = a_regs * N * settings['LMUL_ACC'].value
+    required_regs = accumulation_regs + a_regs
+    if is_complex:
+        required_regs = required_regs * 2 + 2
+        dest.write('''
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    #define S0  1
+    #define S1 -1
+    #define S2  1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmsac{tail_policy}
+    #define VFMACC_RI __riscv_vfmacc{tail_policy}
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc{tail_policy}
+    #define VFMACC_RI __riscv_vfmsac{tail_policy}
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc{tail_policy}
+    #define VFMACC_RI __riscv_vfnmsac{tail_policy}
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac{tail_policy}
+    #define VFMACC_RI __riscv_vfnmacc{tail_policy}
+#endif
+'''.format(tail_policy=settings['tail_policy'].value))
+
+
+    if required_regs > 32:
+        raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format(
+            required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '')
+            ))
+
+    TRMM = (settings['op'].value == 'trmm')
+    if TRMM:
+        with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"):
+            dest.write("#define BACKWARDS")
+
+    dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})",
+            alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'),
+            trmm = (', BLASLONG offset' if TRMM else '')
+        )
+
+    with dest.block("{{", "}}", elt_size='*2' if is_complex else ''):
+        if settings['trace'].value:
+            dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);")
+        dest.write("{index_type} gvl = 0;")
+        dest.write("{index_type} m_top = 0;")
+        dest.write("{index_type} n_top = 0;")
+
+        dest.write()
+        dest.write()
+        dest.write("// -- MAIN PASS")
+
+        with dest.block("for ({index_type} j=0; j<N/{N}; j+=1) {{", "}}"):
+            dest.write("m_top = 0;")
+            dest.write("{index_type} gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1)))
+            dest.write()
+            with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
+                generate_gemm_kernel_inner( settings, dest, M, N, vlenmax, a_regs )
+                dest.write( "m_top += {M};" )
+
+            dest.write()
+            dest.write()
+            dest.write("// -- tails for main pass")
+            generate_M_tails( dest, settings, M, N )
+
+            dest.write( "n_top += {N};" )
+
+
+        N_tail = int(N/2)
+        while( N_tail > 0 ):
+            with dest.map(N=N_tail):
+                dest.write()
+                dest.write()
+                dest.write("// -- tails for N={N}")
+                with dest.block("if( N & {N} ) {{", "}}" ):
+                    if settings['trace'].value:
+                        dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
+                    dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1)))
+                    dest.write("m_top = 0;")
+                    with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
+                        generate_gemm_kernel_inner( settings, dest, M, N_tail, vlenmax, a_regs )
+                        dest.write("m_top += {M};")
+
+                    generate_M_tails( dest, settings, M, N_tail )
+                    dest.write("n_top += {N};")
+            N_tail = int(N_tail/2)
+
+        dest.write("return 0;");
+
+
+#-----------------------------------------------------------------------
+def generate_M_tails( dest, settings, M, N ):
+    M_tail = int(M/2)
+    M_tail_min = settings['M_tail_scalar_from'].value
+    vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value )
+    TRMM           = (settings['op'].value == 'trmm')
+    is_complex = settings['complex'].value
+    generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real
+
+    while( M_tail > M_tail_min ):
+        with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ):
+            if settings['trace'].value:
+                dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
+            a_regs = max( 1, int(M_tail/vlenmax) )
+            vlen = int(M_tail/a_regs)
+            dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen)
+
+            generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs )
+            dest.write( "m_top += {M_tail};" )
+
+        M_tail = int( M_tail / 2 )
+
+    while( M_tail > 0 ):
+        with dest.block("if( M & {M_tail} ) {{", "}}", 
+                M_tail=M_tail, 
+                N=N, 
+                result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value) 
+        ):
+            if settings['trace'].value:
+                dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
+            for r in range(M_tail * N * (2 if is_complex else 1)):
+                dest.write("{result_t} result{r} = 0;",
+                    r=r
+                )
+
+            dest.write("{index_type} ai=m_top*K{elt_size};")
+            dest.write("{index_type} bi=n_top*K{elt_size};")
+
+            if TRMM:
+                with dest.map(M=M_tail, N=N):
+                    generate_trmm_block( dest )
+
+            with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ):
+                for ki in range( N ):
+                    for kj in range( M_tail ):
+                        if is_complex:
+                            dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format(
+                                        dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2
+                                    ))                            
+                            dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format(
+                                        dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2
+                                    ))                            
+                        else:
+                            dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format(
+                                    dest=ki*M_tail+kj, kj=kj, ki=ki
+                                ))
+                dest.write("ai+={M_tail}{elt_size};")
+                dest.write("bi+={N}{elt_size};")
+
+            dest.write("{index_type} ci=n_top*ldc+m_top;")
+            if is_complex:
+                dest.write("{result_t} Cr, Ci;")
+            for ki in range( N ):
+                for kj in range( M_tail ):
+                    if is_complex:
+                        if TRMM:
+                            dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0)
+                            dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1)
+                        else:
+                            dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj)
+                            dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj)
+                            dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0)
+                            dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1)
+                        dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1)
+                        dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0)
+                        dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj )
+                        dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj )
+                    else:
+                        op = '' if TRMM else '+'
+                        dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};",
+                                ki=ki, kj=kj, op=op, dest=ki*M_tail+kj
+                            )
+            dest.write("m_top+={M_tail};")
+
+        M_tail = int(M_tail/2)
+
+
+#-----------------------------------------------------------------------
+class Setting(object):
+    def __init__( self, value, convert = None ):
+        self._value = value
+        self._convert = convert
+
+    @classmethod
+    def ENUM( cls, *values ):
+        def closure( values ):
+            return lambda value: values[value.lower()]
+        return closure( { v.lower():v for v in values } )
+
+    @classmethod
+    def BOOL( cls, value ):
+        return value.lower().startswith('t') or value == '1'
+
+    @property
+    def value( self ):
+        return self._value
+
+    @property
+    def configurable( self ):
+        return self._convert is not None
+
+    @value.setter
+    def value( self, value ):
+        self._value = self._convert( value )
+
+    def __str__( self ):
+        return str(self._value)
+
+#-----------------------------------------------------------------------
+def main():
+    settings = {
+        'op':               Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ),
+        'M':                Setting( 16, int ),
+        'N':                Setting( 4, int ),
+        'reg_width_bits':   Setting( 256, int ),
+        'LMUL':             Setting( 1, int ),
+        'M_tail_scalar_from':Setting( 2, int ),
+        'cpu':              Setting( 'zvl256b', str ),
+        'param_precision':  Setting( 'float', Setting.ENUM( 'float', 'double' ) ),
+        'force_acc_double': Setting( False, Setting.BOOL ),
+        'complex':          Setting( False, Setting.BOOL ),
+        'conjugate':        Setting( False, Setting.BOOL ),
+        'index_type':       Setting( 'BLASLONG', str ),
+        'trace':            Setting( False, Setting.BOOL ),
+        'output':           Setting( None, str ),
+        'tail_policy':      Setting( '', str ), # _ta, if toolchain supports it
+        '__riscv_':         Setting( '__riscv_', str),
+    }
+
+    for item in sys.argv[1:]:
+        try:
+            name, value = tuple(item.split( '=', 1 ))
+        except:
+            ERROR("couldn't parse {}, expected arguments of the form name=value".format(item))
+
+        if name not in settings:
+            ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name )
+                  +"options (and current defaults) are\n{}".format(
+                   " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()]))
+                )
+
+        try:
+            settings[name].value = value
+        except:
+            import traceback
+            traceback.print_exc()
+            ERROR("couldn't parse {}".format(item))
+
+    if settings['output'].value is None:
+        if settings['complex'].value:
+            prefix = 'z' if settings['param_precision'].value == 'double' else 'c'
+        else:
+            prefix = 'd' if settings['param_precision'].value == 'double' else 's'
+        settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format(
+                prefix,
+                settings['op'],
+                settings['M'],
+                settings['N'],
+                settings['cpu']
+            ))
+
+    if settings['param_precision'].value == 'double':
+        settings['param_scalar_t'] = Setting( 'double' )
+        settings['ELEN_PARAM'] = Setting(64)
+    else:
+        settings['param_scalar_t'] = Setting( 'float' )
+        settings['ELEN_PARAM'] = Setting(32)
+
+    settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )
+    settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )
+
+    settings['ELEN_ACC'] = settings['ELEN_PARAM']
+    settings['LMUL_ACC'] = Setting(settings['LMUL'].value)
+    widen = ''
+
+    if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'):
+        settings['ELEN_ACC'] = Setting(64)
+        settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2)
+        settings['VFNCVT']   = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']))
+        widen = 'w'
+
+    settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )
+    settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )
+
+    settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL']))
+    settings['acc_vector_t']  =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC']))
+    settings['VLEV']          =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
+    settings['VSEV']          =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
+    settings['VLSEV']         =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
+    settings['VSSEV']         =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
+    settings['VSETVL']        =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL']))
+
+
+    to_stdout = (settings['output'].value == '-')
+    if not to_stdout:
+        print("Writing {}".format(settings['output'].value), file=sys.stderr)
+
+    with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file:
+        def OUTPUT(*args, **kwargs):
+            print(*args, file=destination_file, **kwargs)
+
+        OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable])))
+        OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable])))
+
+        OUTPUT('#include "common.h"')
+        OUTPUT("\n")
+
+        if settings['op'].value in ('gemm', 'trmm'):
+            generate_gemm_kernel(settings, OUTPUT)
+        else:
+            ERROR("unsupported kernel type {}".format(settings['op']))
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c
index 9fea522f7..92880fbcf 100644
--- a/kernel/riscv64/iamax_vector.c
+++ b/kernel/riscv64/iamax_vector.c
@@ -27,118 +27,111 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #include <math.h>
+#include <float.h>
 
 #if defined(DOUBLE)
 
-#define ABS fabs
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1
+#define MASK_T vbool16_t
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4
+#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16
+#define VMFIRSTM __riscv_vfirst_m_b16
+#define UINT_V_T vuint64m4_t
+#define VIDV_UINT __riscv_vid_v_u64m4
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu
+#define VADDVX_UINT __riscv_vadd_vx_u64m4
+#define VMVVX_UINT __riscv_vmv_v_x_u64m4
+#define VFABS_FLOAT __riscv_vfabs_v_f64m4
+#define VCOMPRESS __riscv_vcompress_vm_u64m4
+#define VMV_X __riscv_vmv_x_s_u64m4_u64
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1
+#define MASK_T vbool8_t
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4
+#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8
+#define VMFIRSTM __riscv_vfirst_m_b8
+#define UINT_V_T vuint32m4_t
+#define VIDV_UINT __riscv_vid_v_u32m4
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu
+#define VADDVX_UINT __riscv_vadd_vx_u32m4
+#define VMVVX_UINT __riscv_vmv_v_x_u32m4
+#define VFABS_FLOAT __riscv_vfabs_v_f32m4
+#define VCOMPRESS __riscv_vcompress_vm_u32m4
+#define VMV_X __riscv_vmv_x_s_u32m4_u32
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT maxf=0.0;
+        BLASLONG i=0, j=0;
         unsigned int max_index = 0;
-	if (n <= 0 || inc_x <= 0) return(max_index);
+        if (n <= 0 || inc_x <= 0) return(max_index);
+        FLOAT maxf=-FLT_MAX;
 
         FLOAT_V_T vx, v_max;
         UINT_V_T v_max_index;
         MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
+
+        gvl = VSETVL(n);
+        UINT_V_T vid = VIDV_UINT(gvl);
 
         if(inc_x == 1){
-                gvl = VSETVL(n);
                 v_max_index = VMVVX_UINT(0, gvl);
-                v_max = VFMVVF_FLOAT(-1, gvl);
+                v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        vx = VFABS_FLOAT(vx, gvl);
 
                         //index where element greater than v_max
                         mask = VMFLTVV_FLOAT(v_max, vx, gvl);
-                        v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
-                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
+                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl);
 
                         //update v_max and start_index j
                         v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                         j += gvl;
                 }
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                maxf = *((FLOAT*)&v_res);
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                maxf = EXTRACT_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
-                max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                max_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
-                        vx = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        v_max = VLEV_FLOAT(&x[j], gvl);
+                        v_max = VFABS_FLOAT(v_max, gvl);
 
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                        FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
                         if(cur_maxf > maxf){
                                 //tail index
-                                v_max_index = VIDV_UINT(gvl);
-                                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+                                v_max_index = VADDVX_UINT(vid, j, gvl);
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                                max_index = VMFIRSTM(mask,gvl);
-                                max_index = *((unsigned int*)&v_max_index+max_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                                max_index = VMV_X(compressed);
                         }
                 }
         }else{
@@ -146,51 +139,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 unsigned int stride_x = inc_x * sizeof(FLOAT);
                 unsigned int idx = 0, inc_v = gvl * inc_x;
 
+                v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
                 v_max_index = VMVVX_UINT(0, gvl);
-                v_max = VFMVVF_FLOAT(-1, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        vx = VFABS_FLOAT(vx, gvl);
 
                         //index where element greater than v_max
                         mask = VMFLTVV_FLOAT(v_max, vx, gvl);
-                        v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
-                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
+                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl);
 
                         //update v_max and start_index j
                         v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                         j += gvl;
                         idx += inc_v;
                 }
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                maxf = *((FLOAT*)&v_res);
+
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                maxf = EXTRACT_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
-                max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                max_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
-                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        v_max = VFABS_FLOAT(v_max, gvl);
+
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                        FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
 
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
                         if(cur_maxf > maxf){
                                 //tail index
-                                v_max_index = VIDV_UINT(gvl);
-                                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+                                v_max_index = VADDVX_UINT(vid, j, gvl);
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                                max_index = VMFIRSTM(mask,gvl);
-                                max_index = *((unsigned int*)&v_max_index+max_index);
+
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                                max_index = VMV_X(compressed);
                         }
                 }
         }
-	return(max_index+1);
+        return(max_index+1);
 }
-
-
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
index 4e81e7848..0503f9948 100644
--- a/kernel/riscv64/iamin_vector.c
+++ b/kernel/riscv64/iamin_vector.c
@@ -31,85 +31,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define ABS fabs
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
+#define VMFIRSTM __riscv_vfirst_m_b8
 #define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
+#define VIDV_UINT __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
+#define VADDVX_UINT __riscv_vadd_vx_u64m8
+#define VMVVX_UINT __riscv_vmv_v_x_u64m8
+#define VFABS_FLOAT __riscv_vfabs_v_f64m8
+#define VCOMPRESS __riscv_vcompress_vm_u64m8
+#define VMV_X __riscv_vmv_x_s_u64m8_u64
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
+#define VMFIRSTM __riscv_vfirst_m_b4
 #define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
+#define VIDV_UINT __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
+#define VADDVX_UINT __riscv_vadd_vx_u32m8
+#define VMVVX_UINT __riscv_vmv_v_x_u32m8
+#define VFABS_FLOAT __riscv_vfabs_v_f32m8
+#define VCOMPRESS __riscv_vcompress_vm_u32m8
+#define VMV_X __riscv_vmv_x_s_u32m8_u32
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT minf=FLT_MAX;
+        BLASLONG i=0, j=0;
         unsigned int min_index = 0;
-	if (n <= 0 || inc_x <= 0) return(min_index);
+        if (n <= 0 || inc_x <= 0) return(min_index);
+        FLOAT minf=FLT_MAX;
 
         FLOAT_V_T vx, v_min;
         UINT_V_T v_min_index;
         MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
-                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                 v_min_index = VMVVX_UINT(0, gvl);
+                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        vx = VFABS_FLOAT(vx, gvl);
 
-                        //index where element less than v_min
-                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+                        //index where element greater than v_min
+                        mask = VMFGTVV_FLOAT(v_min, vx, gvl);
                         v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
                         v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
 
@@ -117,29 +111,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         v_min = VFMINVV_FLOAT(v_min, vx, gvl);
                         j += gvl;
                 }
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                minf = *((FLOAT*)&v_res);
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                minf = EXTRACT_FLOAT(v_res);
                 mask = VMFLEVF_FLOAT(v_min, minf, gvl);
-                min_index = VMFIRSTM(mask,gvl);
-                min_index = *((unsigned int*)&v_min_index+min_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                min_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
-                        vx = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        v_min = VLEV_FLOAT(&x[j], gvl);
+                        v_min = VFABS_FLOAT(v_min, gvl);
 
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        FLOAT cur_minf = *((FLOAT*)&v_res);
-                        if(cur_minf < minf){
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                        FLOAT cur_minf = EXTRACT_FLOAT(v_res);
+                        if(cur_minf > minf){
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
 
                                 mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                                min_index = VMFIRSTM(mask,gvl);
-                                min_index = *((unsigned int*)&v_min_index+min_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                                min_index = VMV_X(compressed);
                         }
                 }
         }else{
@@ -151,12 +145,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 v_min_index = VMVVX_UINT(0, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        vx = VFABS_FLOAT(vx, gvl);
 
-                        //index where element less than v_min
-                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+                        //index where element greater than v_min
+                        mask = VMFGTVV_FLOAT(v_min, vx, gvl);
                         v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
                         v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
 
@@ -165,33 +157,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         j += gvl;
                         idx += inc_v;
                 }
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                minf = *((FLOAT*)&v_res);
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                minf = EXTRACT_FLOAT(v_res);
                 mask = VMFLEVF_FLOAT(v_min, minf, gvl);
-                min_index = VMFIRSTM(mask,gvl);
-                min_index = *((unsigned int*)&v_min_index+min_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                min_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
-                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        v_min = VFABS_FLOAT(v_min, gvl);
 
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        FLOAT cur_minf = *((FLOAT*)&v_res);
-                        if(cur_minf < minf){
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                        FLOAT cur_minf = EXTRACT_FLOAT(v_res);
+                        if(cur_minf > minf){
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
 
                                 mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                                min_index = VMFIRSTM(mask,gvl);
-                                min_index = *((unsigned int*)&v_min_index+min_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                                min_index = VMV_X(compressed);
                         }
                 }
         }
-	return(min_index+1);
+        return(min_index+1);
 }
-
-
diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c
index ca48a3c48..e24f9fd48 100644
--- a/kernel/riscv64/imax_vector.c
+++ b/kernel/riscv64/imax_vector.c
@@ -31,68 +31,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define ABS fabs
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8
+#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8
+#define VMFIRSTM __riscv_vfirst_m_b8
 #define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
+#define VIDV_UINT __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
+#define VADDVX_UINT __riscv_vadd_vx_u64m8
+#define VMVVX_UINT __riscv_vmv_v_x_u64m8
+#define VCOMPRESS __riscv_vcompress_vm_u64m8
+#define VMV_X __riscv_vmv_x_s_u64m8_u64
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8
+#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4
+#define VMFIRSTM __riscv_vfirst_m_b4
 #define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
+#define VIDV_UINT __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
+#define VADDVX_UINT __riscv_vadd_vx_u32m8
+#define VMVVX_UINT __riscv_vmv_v_x_u32m8
+#define VCOMPRESS __riscv_vcompress_vm_u32m8
+#define VMV_X __riscv_vmv_x_s_u32m8_u32
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
+        BLASLONG i=0, j=0;
         unsigned int max_index = 0;
-	if (n <= 0 || inc_x <= 0) return(max_index);
-	FLOAT maxf=-FLT_MAX;
+        if (n <= 0 || inc_x <= 0) return(max_index);
+        FLOAT maxf=-FLT_MAX;
 
         FLOAT_V_T vx, v_max;
         UINT_V_T v_max_index;
         MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_min;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
@@ -104,32 +102,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         //index where element greater than v_max
                         mask = VMFLTVV_FLOAT(v_max, vx, gvl);
                         v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
-                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
+                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
 
                         //update v_max and start_index j
                         v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                         j += gvl;
                 }
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                maxf = *((FLOAT*)&v_res);
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                maxf = EXTRACT_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
-                max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                max_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v_max = VLEV_FLOAT(&x[j], gvl);
 
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                        FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
                         if(cur_maxf > maxf){
                                 //tail index
                                 v_max_index = VIDV_UINT(gvl);
                                 v_max_index = VADDVX_UINT(v_max_index, j, gvl);
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                                max_index = VMFIRSTM(mask,gvl);
-                                max_index = *((unsigned int*)&v_max_index+max_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                                max_index = VMV_X(compressed);
                         }
                 }
         }else{
@@ -145,37 +145,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         //index where element greater than v_max
                         mask = VMFLTVV_FLOAT(v_max, vx, gvl);
                         v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
-                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
+                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
 
                         //update v_max and start_index j
                         v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                         j += gvl;
                         idx += inc_v;
                 }
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                maxf = *((FLOAT*)&v_res);
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                maxf = EXTRACT_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
-                max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                max_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
 
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                        FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
                         if(cur_maxf > maxf){
                                 //tail index
                                 v_max_index = VIDV_UINT(gvl);
                                 v_max_index = VADDVX_UINT(v_max_index, j, gvl);
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                                max_index = VMFIRSTM(mask,gvl);
-                                max_index = *((unsigned int*)&v_max_index+max_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                                max_index = VMV_X(compressed);
                         }
                 }
         }
-	return(max_index+1);
+        return(max_index+1);
 }
-
-
diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c
index 2a677098d..a60bd3d07 100644
--- a/kernel/riscv64/imin_vector.c
+++ b/kernel/riscv64/imin_vector.c
@@ -31,122 +31,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define ABS fabs
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
+#define VMFIRSTM __riscv_vfirst_m_b8
 #define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m
+#define VIDV_UINT __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m
+#define VADDVX_UINT __riscv_vadd_vx_u64m8
+#define VMVVX_UINT __riscv_vmv_v_x_u64m8
+#define VCOMPRESS __riscv_vcompress_vm_u64m8
+#define VMV_X __riscv_vmv_x_s_u64m8_u64
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
+#define VMFIRSTM __riscv_vfirst_m_b4
 #define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VIDV_MASK_UINT __riscv_vid_v_u32m8_m
+#define VIDV_UINT __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m
+#define VADDVX_UINT __riscv_vadd_vx_u32m8
+#define VMVVX_UINT __riscv_vmv_v_x_u32m8
+#define VCOMPRESS __riscv_vcompress_vm_u32m8
+#define VMV_X __riscv_vmv_x_s_u32m8_u32
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT minf=FLT_MAX;
+        BLASLONG i=0, j=0;
         unsigned int min_index = 0;
-	if (n <= 0 || inc_x <= 0) return(min_index);
+        if (n <= 0 || inc_x <= 0) return(min_index);
+        FLOAT minf=FLT_MAX;
 
         FLOAT_V_T vx, v_min;
         UINT_V_T v_min_index;
         MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
-                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                 v_min_index = VMVVX_UINT(0, gvl);
+                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
-                        //index where element less than v_min
-                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
-                        v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e64,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e32,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask), "r"(gvl)
-        :"v0");
-#endif
-*/
-                        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
+
+                        //index where element greater than v_min
+                        mask = VMFGTVV_FLOAT(v_min, vx, gvl);
+                        v_min_index = VIDV_MASK_UINT(mask, gvl);
+                        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl);
 
                         //update v_min and start_index j
                         v_min = VFMINVV_FLOAT(v_min, vx, gvl);
                         j += gvl;
                 }
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                minf = *((FLOAT*)&v_res);
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                minf = EXTRACT_FLOAT(v_res);
                 mask = VMFLEVF_FLOAT(v_min, minf, gvl);
-                min_index = VMFIRSTM(mask,gvl);
-                min_index = *((unsigned int*)&v_min_index+min_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                min_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v_min = VLEV_FLOAT(&x[j], gvl);
 
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        FLOAT cur_minf = *((FLOAT*)&v_res);
-                        if(cur_minf < minf){
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                        FLOAT cur_minf = EXTRACT_FLOAT(v_res);
+                        if(cur_minf > minf){
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
                                 mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                                min_index = VMFIRSTM(mask,gvl);
-                                min_index = *((unsigned int*)&v_min_index+min_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                                min_index = VMV_X(compressed);
                         }
                 }
         }else{
@@ -159,59 +142,39 @@ asm volatile(
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
 
-                        //index where element less than v_min
-                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
-                        v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e64,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e32,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask), "r"(gvl)
-        :"v0");
-#endif
-*/
-
-                        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
+                        //index where element greater than v_min
+                        mask = VMFGTVV_FLOAT(v_min, vx, gvl);
+                        v_min_index = VIDV_MASK_UINT(mask, gvl);
+                        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl);
 
                         //update v_min and start_index j
                         v_min = VFMINVV_FLOAT(v_min, vx, gvl);
                         j += gvl;
                         idx += inc_v;
                 }
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                minf = *((FLOAT*)&v_res);
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                minf = EXTRACT_FLOAT(v_res);
                 mask = VMFLEVF_FLOAT(v_min, minf, gvl);
-                min_index = VMFIRSTM(mask,gvl);
-                min_index = *((unsigned int*)&v_min_index+min_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                min_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        FLOAT cur_minf = *((FLOAT*)&v_res);
-                        if(cur_minf < minf){
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                        FLOAT cur_minf = EXTRACT_FLOAT(v_res);
+                        if(cur_minf > minf){
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
                                 mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                                min_index = VMFIRSTM(mask,gvl);
-                                min_index = *((unsigned int*)&v_min_index+min_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                                min_index = VMV_X(compressed);
                         }
                 }
         }
-	return(min_index+1);
+        return(min_index+1);
 }
-
-
diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c
index 66a101566..89cd510c1 100644
--- a/kernel/riscv64/izamax_vector.c
+++ b/kernel/riscv64/izamax_vector.c
@@ -27,241 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #include <math.h>
+#include <float.h>
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8
+#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8
+#define VMFIRSTM __riscv_vfirst_m_b8
 #define UINT_V_T vuint64m8_t
-#define VSEVU_UINT vse64_v_u64m8
+#define VSEVU_UINT __riscv_vse64_v_u64m8
 #define UINT_T long unsigned int
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
+#define VIDV_UINT __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
+#define VADDVX_UINT __riscv_vadd_vx_u64m8
+#define VMVVX_UINT __riscv_vmv_v_x_u64m8
+#define VFABS_FLOAT __riscv_vfabs_v_f64m8
+#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
+#define VCOMPRESS __riscv_vcompress_vm_u64m8
+#define VMV_X __riscv_vmv_x_s_u64m8_u64
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8
+#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4
+#define VMFIRSTM __riscv_vfirst_m_b4
 #define UINT_V_T vuint32m8_t
 #define UINT_T unsigned int
-#define VSEVU_UINT vse32_v_u32m8
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VSEVU_UINT __riscv_vse32_v_u32m8
+#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
+#define VIDV_UINT __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
+#define VADDVX_UINT __riscv_vadd_vx_u32m8
+#define VMVVX_UINT __riscv_vmv_v_x_u32m8
+#define VFABS_FLOAT __riscv_vfabs_v_f32m8
+#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
+#define VCOMPRESS __riscv_vcompress_vm_u32m8
+#define VMV_X __riscv_vmv_x_s_u32m8_u32
 #endif
 
-#define RVV_M RVV_M8
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT maxf=0.0;
+        BLASLONG i=0, j=0;
         unsigned int max_index = 0;
-	if (n <= 0 || inc_x <= 0) return(max_index);
+        if (n <= 0 || inc_x <= 0) return(max_index);
+        FLOAT maxf=-FLT_MAX;
 
-        FLOAT_V_T vx0, vx1, v_max;
+        FLOAT_V_T vx, vx2, v_max;
         UINT_V_T v_max_index;
-        MASK_T mask0, mask1;
+        MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
 
         gvl = VSETVL(n);
-                UINT_T temp_uint[gvl];
+        unsigned int stride_x = inc_x * 2 * sizeof(FLOAT);
+        unsigned int idx = 0, inc_v = gvl * inc_x * 2;
+
+        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
         v_max_index = VMVVX_UINT(0, gvl);
-        v_max = VFMVVF_FLOAT(-1, gvl);
-        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-        BLASLONG inc_xv = gvl * inc_x * 2;
-        BLASLONG ix = 0;
         for(i=0,j=0; i < n/gvl; i++){
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                //fabs(vector)
-                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
-                vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                //fabs(vector)
-                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
-                vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
+                vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                vx = VFABS_FLOAT(vx, gvl);
+                vx2 = VFABS_FLOAT(vx2, gvl);
+                vx = VFADDVV_FLOAT(vx, vx2, gvl);
+
 
                 //index where element greater than v_max
-                mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl);
-                v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e64,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_max_index)
-        :"v"(mask0), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e32,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_max_index)
-        :"v"(mask0), "r"(gvl)
-        :"v0");
-#endif
-*/
-                v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl);
+                mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+                v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
+                v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
 
                 //update v_max and start_index j
-                v_max = VFMAXVV_FLOAT(v_max, vx0, gvl);
+                v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                 j += gvl;
-                ix += inc_xv;
+                idx += inc_v;
         }
-        vx0 = VFMVVF_FLOAT(0, gvl);
-        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-        maxf = VFMVFS_FLOAT(v_res);
-        mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl);
-        max_index = VMFIRSTM(mask0,gvl);
-        VSEVU_UINT(temp_uint,v_max_index,gvl);
-        max_index = temp_uint[max_index];
-
+        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+        maxf = EXTRACT_FLOAT(v_res);
+        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        UINT_V_T compressed;
+        compressed = VCOMPRESS(v_max_index, mask, gvl);
+        max_index = VMV_X(compressed);
 
         if(j < n){
                 gvl = VSETVL(n-j);
-                v_max_index = VMVVX_UINT(0, gvl);
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                //fabs(vector)
-                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
-                vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                //fabs(vector)
-                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
-                vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                v_max = VFADDVV_FLOAT(vx0, vx1, gvl);
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
+                v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                v_max = VFABS_FLOAT(v_max, gvl);
+                vx2 = VFABS_FLOAT(vx2, gvl);
+                v_max = VFADDVV_FLOAT(v_max, vx2, gvl);
+
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
+
                 if(cur_maxf > maxf){
                         //tail index
                         v_max_index = VIDV_UINT(gvl);
                         v_max_index = VADDVX_UINT(v_max_index, j, gvl);
 
-                        mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                        max_index = VMFIRSTM(mask0,gvl);
-                        VSEVU_UINT(temp_uint,v_max_index,gvl);
-                                         max_index = temp_uint[max_index];
-
+                        mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                        UINT_V_T compressed;
+                        compressed = VCOMPRESS(v_max_index, mask, gvl);
+                        max_index = VMV_X(compressed);
                 }
         }
-	return(max_index+1);
-}
-
 
+        return(max_index+1);
+}
diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c
index 818193a9e..74daf32b8 100644
--- a/kernel/riscv64/izamin_vector.c
+++ b/kernel/riscv64/izamin_vector.c
@@ -31,235 +31,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
+#define VMFIRSTM __riscv_vfirst_m_b8
 #define UINT_V_T vuint64m8_t
 #define VSEVU_UINT vse64_v_u64m8
 #define UINT_T long unsigned int
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
+#define VIDV_UINT __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
+#define VADDVX_UINT __riscv_vadd_vx_u64m8
+#define VMVVX_UINT __riscv_vmv_v_x_u64m8
+#define VFABS_FLOAT __riscv_vfabs_v_f64m8
+#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
+#define VCOMPRESS __riscv_vcompress_vm_u64m8
+#define VMV_X __riscv_vmv_x_s_u64m8_u64
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
+#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
+#define VMFIRSTM __riscv_vfirst_m_b4
 #define UINT_V_T vuint32m8_t
 #define UINT_T unsigned int
-#define VSEVU_UINT vse32_v_u32m8
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VSEVU_UINT __riscv_vse32_v_u32m8
+#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
+#define VIDV_UINT __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
+#define VADDVX_UINT __riscv_vadd_vx_u32m8
+#define VMVVX_UINT __riscv_vmv_v_x_u32m8
+#define VFABS_FLOAT __riscv_vfabs_v_f32m8
+#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
+#define VCOMPRESS __riscv_vcompress_vm_u32m8
+#define VMV_X __riscv_vmv_x_s_u32m8_u32
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT minf=FLT_MAX;
+        BLASLONG i=0, j=0;
         unsigned int min_index = 0;
-	if (n <= 0 || inc_x <= 0) return(min_index);
+        if (n <= 0 || inc_x <= 0) return(min_index);
+        FLOAT minf=FLT_MAX;
 
-        FLOAT_V_T vx0, vx1, v_min;
+        FLOAT_V_T vx, vx2, v_min;
         UINT_V_T v_min_index;
-        MASK_T mask0, mask1;
+        MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         gvl = VSETVL(n);
-		UINT_T temp_uint[gvl];
-        v_min_index = VMVVX_UINT(0, gvl);
+        unsigned int stride_x = inc_x * 2 * sizeof(FLOAT);
+        unsigned int idx = 0, inc_v = gvl * inc_x * 2;
+
         v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
-        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-        BLASLONG inc_xv = gvl * inc_x * 2;
-        BLASLONG ix = 0;
+        v_min_index = VMVVX_UINT(0, gvl);
         for(i=0,j=0; i < n/gvl; i++){
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                //fabs(vector)
-                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
-                vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                //fabs(vector)
-                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
-                vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
+                vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                vx = VFABS_FLOAT(vx, gvl);
+                vx2 = VFABS_FLOAT(vx2, gvl);
+                vx = VFADDVV_FLOAT(vx, vx2, gvl);
 
-                //index where element less than v_min
-                mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl);
-                v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e64,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask0), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e32,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask0), "r"(gvl)
-        :"v0");
-#endif
-*/
-                v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl);
+
+                //index where element greater than v_min
+                mask = VMFGTVV_FLOAT(v_min, vx, gvl);
+                v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
+                v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
 
                 //update v_min and start_index j
-                v_min = VFMINVV_FLOAT(v_min, vx0, gvl);
+                v_min = VFMINVV_FLOAT(v_min, vx, gvl);
                 j += gvl;
-                ix += inc_xv;
+                idx += inc_v;
         }
-        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-        minf = VFMVFS_FLOAT(v_res);
-        mask0 = VMFLEVF_FLOAT(v_min, minf, gvl);
-        min_index = VMFIRSTM(mask0,gvl);
-                 VSEVU_UINT(temp_uint,v_min_index,gvl);
-        min_index = temp_uint[min_index];
+
+        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+        minf = EXTRACT_FLOAT(v_res);
+        mask = VMFLEVF_FLOAT(v_min, minf, gvl);
+        UINT_V_T compressed;
+        compressed = VCOMPRESS(v_min_index, mask, gvl);
+        min_index = VMV_X(compressed);
 
         if(j < n){
                 gvl = VSETVL(n-j);
-                v_min_index = VMVVX_UINT(0, gvl);
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                //fabs(vector)
-                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
-                vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                //fabs(vector)
-                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
-                vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                v_min = VFADDVV_FLOAT(vx0, vx1, gvl);
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                FLOAT cur_minf = VFMVFS_FLOAT(v_res);
-                if(cur_minf < minf){
+                v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                v_min = VFABS_FLOAT(v_min, gvl);
+                vx2 = VFABS_FLOAT(vx2, gvl);
+                v_min = VFADDVV_FLOAT(v_min, vx2, gvl);
+
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                FLOAT cur_minf = EXTRACT_FLOAT(v_res);
+                if(cur_minf > minf){
                         //tail index
                         v_min_index = VIDV_UINT(gvl);
                         v_min_index = VADDVX_UINT(v_min_index, j, gvl);
 
-                        mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                        min_index = VMFIRSTM(mask0,gvl);
-                              VSEVU_UINT(temp_uint,v_min_index,gvl);
-                                       min_index = temp_uint[min_index];
-
+                        mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                        UINT_V_T compressed;
+                        compressed = VCOMPRESS(v_min_index, mask, gvl);
+                        min_index = VMV_X(compressed);
                 }
         }
-	return(min_index+1);
-}
-
 
+        return(min_index+1);
+}
diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c
index 7f31e9a53..97f602e51 100644
--- a/kernel/riscv64/max_vector.c
+++ b/kernel/riscv64/max_vector.c
@@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 #include <float.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define MASK_T          JOIN(vbool,     MLEN,   _t,     _,      _)
+#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt_vf_f, ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VFMAXVV_FLOAT   JOIN(__riscv_vfmax,     _vv_f,  ELEN,   LMUL,   _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT maxf=-FLT_MAX;
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_max;
-        FLOAT_V_T_M1 v_res, v_min;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
@@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                                 v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
                                 j += gvl * 2;
                         }
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                        maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
-                        if(*((FLOAT*)&v_res) > maxf)
-                                maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
@@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                                 j += gvl * 2;
                                 idx += inc_xv * 2;
                         }
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                        maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
-                        if(*((FLOAT*)&v_res) > maxf)
-                                maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
+        maxf = EXTRACT_FLOAT(v_res);
 	return(maxf);
 }
 
diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c
index 14b7e01ed..77bf19b9d 100644
--- a/kernel/riscv64/min_vector.c
+++ b/kernel/riscv64/min_vector.c
@@ -28,30 +28,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 #include <float.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,    ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,    ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define MASK_T          JOIN(vbool,     MLEN,   _t,     _,      _)
+#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt_vf_f, ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VFMINVV_FLOAT   JOIN(__riscv_vfmin,     _vv_f,  ELEN,   LMUL,   _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -59,10 +73,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT minf=FLT_MAX;
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_min;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
@@ -76,15 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                                 v_min = VFMINVV_FLOAT(v_min, v1, gvl);
                                 j += gvl * 2;
                         }
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-                        if(*((FLOAT*)&v_res) < minf)
-                                minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
@@ -102,18 +111,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                                 j += gvl * 2;
                                 idx += inc_xv * 2;
                         }
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-                        if(*((FLOAT*)&v_res) < minf)
-                                minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
+        minf = EXTRACT_FLOAT(v_res);
 	return(minf);
 }
 
diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c
index cf6fdb741..205f07eb7 100644
--- a/kernel/riscv64/nrm2_vector.c
+++ b/kernel/riscv64/nrm2_vector.c
@@ -26,207 +26,185 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define ABS fabsf
-#define MASK_T vbool8_t
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
-#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
-#define VMFIRSTM vmfirst_m_b8
-#define VFDIVVF_FLOAT vfdiv_vf_f32m4
-#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m1
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 64
+#       else
+#               define ELEN 32
+#               define MLEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
+#       define LMUL m4
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 16
+#       else
+#               define ELEN 32
+#               define MLEN 8
+#       endif
+#endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVSF_FLOAT    JOIN(__riscv_vfmv,      _s_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
+#define VFABS           JOIN(__riscv_vfabs,     _v_f,   ELEN,   LMUL,   _)
+#define VMFNE           JOIN(__riscv_vmfne_vf_f,ELEN,   LMUL,   _b,     MLEN)
+#define VMFGT           JOIN(__riscv_vmfgt_vv_f,ELEN,   LMUL,   _b,     MLEN)
+#define VMFEQ           JOIN(__riscv_vmfeq_vf_f,ELEN,   LMUL,   _b,     MLEN)
+#define VCPOP           JOIN(__riscv_vcpop,     _m_b,   MLEN,   _,      _)
+#define VFREDMAX        JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL,   JOIN2(_f, ELEN), m1)
+#define VFREDMIN        JOIN(__riscv_vfredmin_vs_f,ELEN,LMUL,   JOIN2(_f, ELEN), m1)
+#define VFIRST          JOIN(__riscv_vfirst,    _m_b,   MLEN,   _,      _)
+#define VRGATHER        JOIN(__riscv_vrgather,  _vx_f,  ELEN,   LMUL,   _)
+#define VFDIV           JOIN(__riscv_vfdiv,     _vv_f,  ELEN,   LMUL,   _)
+#define VFDIV_M         JOIN(__riscv_vfdiv,     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMUL           JOIN(__riscv_vfmul,     _vv_f,  ELEN,   LMUL,   _)
+#define VFMUL_M         JOIN(__riscv_vfmul,     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMACC          JOIN(__riscv_vfmacc,    _vv_f,  ELEN,   LMUL,   _)
+#define VFMACC_M        JOIN(__riscv_vfmacc,    _vv_f,  ELEN,   LMUL,   _mu)
+#define VMSBF           JOIN(__riscv_vmsbf,     _m_b,   MLEN,   _,      _)
+#define VMSOF           JOIN(__riscv_vmsof,     _m_b,   MLEN,   _,      _)
+#define VMAND           JOIN(__riscv_vmand,     _mm_b,  MLEN,   _,      _)
+#define VMANDN          JOIN(__riscv_vmandn,    _mm_b,  MLEN,   _,      _)
+#define VFREDSUM        JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL,  JOIN2(_f, ELEN), m1)
+#define VMERGE          JOIN(__riscv_vmerge,    _vvm_f, ELEN,   LMUL,   _)
+
+#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
+
+#if defined(DOUBLE)
 #define ABS fabs
-#define MASK_T vbool16_t
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
-#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
-#define VMFIRSTM vmfirst_m_b16
-#define VFDIVVF_FLOAT vfdiv_vf_f64m4
-#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
+#else
+#define ABS fabsf
 #endif
 
+#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v)
+
+//#define DUMP( label, v0, gvl )
+#define DUMP( label, v0, gvl ) do{ FLOAT x[16]; VSEV_FLOAT( x, v0, gvl ); printf ("%s(%d): %s [ ", __FILE__, __LINE__, label); for( int xxx = 0; xxx < gvl; ++xxx ) { printf("%f, ", x[xxx]); } printf(" ]\n"); } while(0)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
+	BLASLONG i=0;
 
-	if ( n < 0 )  return(0.0);
+	if(n <= 0)  return(0.0);
         if(n == 1) return (ABS(x[0]));
 
-        FLOAT_V_T vr, v0, v_zero;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
-
-        FLOAT scale = 0.0, ssq = 0.0;
-        MASK_T mask;
-        BLASLONG index = 0;
-        if(inc_x == 1){
-                gvl = VSETVL(n);
-                vr = VFMVVF_FLOAT(0, gvl);
-                v_zero = VFMVVF_FLOAT(0, gvl);
-                for(i=0,j=0; i<n/gvl; i++){
-                        v0 = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+
+        MASK_T nonzero_mask;
+        MASK_T scale_mask;
+
+        gvl = VSETVL(n);
+        FLOAT_V_T v0;
+        FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
+        FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
+
+        FLOAT scale = 0;
+        FLOAT ssq = 0;
+        unsigned int stride_x = inc_x * sizeof(FLOAT);
+        int idx = 0;
+
+        if( n >= gvl ) // don't pay overheads if we're not doing useful work
+        {
+                for(i=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
+                        nonzero_mask = VMFNE( v0, 0, gvl );
+                        v0 = VFABS( v0, gvl );
+                        scale_mask = VMFGT( v0, v_scale, gvl );
+
+                        // assume scale changes are relatively infrequent
+
+                        // unclear if the vcpop+branch is actually a win
+                        // since the operations being skipped are predicated anyway
+                        // need profiling to confirm
+                        if( VCPOP(scale_mask, gvl) ) 
+                        {
+                                v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl );
+                                v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl );
+                                v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl );
+                                v_scale = VMERGE( v_scale, v0, scale_mask, gvl );
                         }
-                        j += gvl;
+                        v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl );
+                        v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
+                        idx += inc_x * gvl;
                 }
-                //ssq in vector vr: vr[0]
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                //total ssq now
-                ssq += VFMVFS_FLOAT(v_res);
-
-                //tail
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0)
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                        }else{//found greater element
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+
+                // we have gvl elements which we accumulated independently, with independent scales
+                // we need to combine these
+                // naive sort so we process small values first to avoid losing information
+                // could use vector sort extensions where available, but we're dealing with gvl elts at most
+
+                FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
+                FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
+                VSEV_FLOAT( out_ssq, v_ssq, gvl );
+                VSEV_FLOAT( out_scale, v_scale, gvl );
+                for( int a = 0; a < (gvl-1); ++a )
+                {
+                        int smallest = a;
+                        for( size_t b = a+1; b < gvl; ++b )
+                                if( out_scale[b] < out_scale[smallest] )
+                                        smallest = b;
+                        if( smallest != a )
+                        {
+                                FLOAT tmp1 = out_ssq[a];
+                                FLOAT tmp2 = out_scale[a];
+                                out_ssq[a] = out_ssq[smallest];
+                                out_scale[a] = out_scale[smallest];
+                                out_ssq[smallest] = tmp1;
+                                out_scale[smallest] = tmp2;
                         }
-                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        //ssq in vector vr: vr[0]
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        //total ssq now
-                        ssq += VFMVFS_FLOAT(v_res);
                 }
-        }else{
-                gvl = VSETVL(n);
-                vr = VFMVVF_FLOAT(0, gvl);
-                v_zero = VFMVVF_FLOAT(0, gvl);
-                unsigned int stride_x = inc_x * sizeof(FLOAT);
-                int idx = 0, inc_v = inc_x * gvl;
-                for(i=0,j=0; i<n/gvl; i++){
-                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+
+                int a = 0;
+                while( a<gvl && out_scale[a] == 0 )
+                        ++a;
+
+                if( a < gvl ) 
+                {
+                        ssq = out_ssq[a];
+                        scale = out_scale[a];
+                        ++a;
+                        for( ; a < gvl; ++a ) 
+                        {
+                                ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
+                                scale = out_scale[a];
                         }
-                        j += gvl;
-                        idx += inc_v;
                 }
-                //ssq in vector vr: vr[0]
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                //total ssq now
-                ssq += VFMVFS_FLOAT(v_res);
-
-                //tail
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0)
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                        }else{//found greater element
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOATM4(vr);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+        }
+
+        //finish any tail using scalar ops
+        i*=gvl*inc_x;
+        n*=inc_x;
+        while(i < n){
+                if ( x[i] != 0.0 ){
+                        FLOAT absxi = ABS( x[i] );
+                        if ( scale < absxi ){
+                                ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
+                                scale = absxi ;
+                        }
+                        else{
+                                ssq += ( absxi/scale ) * ( absxi/scale );
                         }
-                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        //ssq in vector vr: vr[0]
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        //total ssq now
-                        ssq += VFMVFS_FLOAT(v_res);
+
                 }
+
+                i += inc_x;
         }
+
 	return(scale * sqrt(ssq));
 }
 
diff --git a/kernel/riscv64/nrm2_vector_dot.c b/kernel/riscv64/nrm2_vector_dot.c
index a8b2b7c66..06e61d695 100644
--- a/kernel/riscv64/nrm2_vector_dot.c
+++ b/kernel/riscv64/nrm2_vector_dot.c
@@ -31,9 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
 #define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m8
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
@@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
 #define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m8
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c
index 43a65e552..f7aa6c3c2 100644
--- a/kernel/riscv64/rot_vector.c
+++ b/kernel/riscv64/rot_vector.c
@@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFMSACVF_FLOAT vfmsac_vf_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
+#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFMSACVF_FLOAT vfmsac_vf_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
+#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 	BLASLONG ix=0,iy=0;
 
 	if(n <= 0)  return(0);
-        unsigned int gvl = 0;
+        unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
         FLOAT_V_T v0, v1, vx, vy;
 
         if(inc_x == 1 && inc_y == 1){
-                gvl = VSETVL(n);
                 for(i=0,j=0; i<n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
                         vy = VLEV_FLOAT(&y[j], gvl);
@@ -90,7 +89,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                         VSEV_FLOAT(&y[j], v1, gvl);
                 }
         }else if(inc_y == 1){
-                gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * gvl;
                 for(i=0,j=0; i<n/gvl; i++){
@@ -122,7 +120,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                         VSEV_FLOAT(&y[j], v1, gvl);
                 }
         }else if(inc_x == 1){
-                gvl = VSETVL(n);
                 BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 BLASLONG inc_yv = inc_y * gvl;
                 for(i=0,j=0; i<n/gvl; i++){
@@ -154,7 +151,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                         VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl);
                 }
         }else{
-                gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * gvl;
diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c
index 8b9ef5a3e..f6b87d4f9 100644
--- a/kernel/riscv64/scal_vector.c
+++ b/kernel/riscv64/scal_vector.c
@@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
-#define VFMULVF_FLOAT vfmul_vf_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
-#define VFMULVF_FLOAT vfmul_vf_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMULVF_FLOAT   JOIN(__riscv_vfmul,     _vf_f,  ELEN,   LMUL,   _)
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG i=0,j=0;
@@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
                 }
         }else{
                 if(da == 0.0){
+                        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                        BLASLONG ix = 0;
                         gvl = VSETVL(n);
-						BLASLONG stride_x = inc_x * sizeof(FLOAT);
-						BLASLONG ix = 0;
-                        if(gvl <= n / 2){
-							    long int inc_xv = gvl * inc_x;
-                                v0 = VFMVVF_FLOAT(0, gvl);
-                                for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
-									VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
-									VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl);
-									ix += inc_xv * 2;
-                                }
+                        v0 = VFMVVF_FLOAT(0, gvl);
+
+                        for(i = 0; i < n/(gvl*2); ++i ){
+                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+                                ix += inc_x * gvl;
+                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+                                ix += inc_x * gvl;
                         }
-                        //tail
-                        for(; j <n; ){
-                                gvl = VSETVL(n-j);
+
+                        i *= gvl*2;
+                        while( i < n ){
+                                gvl = VSETVL(n-i);
                                 v0 = VFMVVF_FLOAT(0, gvl);
-								VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
-                                j += gvl;
-								ix += inc_x * gvl;
+                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+                                i += gvl;
+                                ix += inc_x * gvl;
                         }
                 }else{
                         gvl = VSETVL(n);
diff --git a/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
new file mode 100644
index 000000000..e22df34f9
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
@@ -0,0 +1,1081 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=16
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='float'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f32m1'
+ VFMUL='__riscv_vfmul_vf_f32m1'
+ VLEV='__riscv_vle32_v_f32m1'
+ VLSEV='__riscv_vlse32_v_f32m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
+ VSETVL='__riscv_vsetvl_e32m1'
+ VSEV='__riscv_vse32_v_f32m1'
+ VSSEV='__riscv_vsse32_v_f32m1'
+ acc_vector_t='vfloat32m1_t'
+ output='sgemm_kernel_16x8_zvl256b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m1_t'
+
+*/
+
+#include "common.h"
+
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m1(8);
+
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A1, B2, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A1, B3, gvl);
+            vfloat32m1_t result8 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result9 = __riscv_vfmul_vf_f32m1( A1, B4, gvl);
+            vfloat32m1_t result10 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result11 = __riscv_vfmul_vf_f32m1( A1, B5, gvl);
+            vfloat32m1_t result12 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result13 = __riscv_vfmul_vf_f32m1( A1, B6, gvl);
+            vfloat32m1_t result14 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+            vfloat32m1_t result15 = __riscv_vfmul_vf_f32m1( A1, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f32m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f32m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f32m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f32m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f32m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f32m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f32m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f32m1( result15, B7, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c8 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c9 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c10 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c11 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c12 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c13 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c14 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c15 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, gvl );
+            c8 = __riscv_vfmacc_vf_f32m1( c8, alpha, result8, gvl );
+            c9 = __riscv_vfmacc_vf_f32m1( c9, alpha, result9, gvl );
+            c10 = __riscv_vfmacc_vf_f32m1( c10, alpha, result10, gvl );
+            c11 = __riscv_vfmacc_vf_f32m1( c11, alpha, result11, gvl );
+            c12 = __riscv_vfmacc_vf_f32m1( c12, alpha, result12, gvl );
+            c13 = __riscv_vfmacc_vf_f32m1( c13, alpha, result13, gvl );
+            c14 = __riscv_vfmacc_vf_f32m1( c14, alpha, result14, gvl );
+            c15 = __riscv_vfmacc_vf_f32m1( c15, alpha, result15, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c15, gvl);
+            m_top += 16;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            C[ci+4*ldc+0] += alpha * result8;
+            C[ci+4*ldc+1] += alpha * result9;
+            C[ci+5*ldc+0] += alpha * result10;
+            C[ci+5*ldc+1] += alpha * result11;
+            C[ci+6*ldc+0] += alpha * result12;
+            C[ci+6*ldc+1] += alpha * result13;
+            C[ci+7*ldc+0] += alpha * result14;
+            C[ci+7*ldc+1] += alpha * result15;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            C[ci+4*ldc+0] += alpha * result4;
+            C[ci+5*ldc+0] += alpha * result5;
+            C[ci+6*ldc+0] += alpha * result6;
+            C[ci+7*ldc+0] += alpha * result7;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A1, B2, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A1, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B3, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                ai+=2;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                ai+=1;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                ai+=2;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                ai+=1;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                ai+=2;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                ai+=1;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/strmm_kernel_16x8_zvl256b.c b/kernel/riscv64/strmm_kernel_16x8_zvl256b.c
new file mode 100644
index 000000000..f70157d61
--- /dev/null
+++ b/kernel/riscv64/strmm_kernel_16x8_zvl256b.c
@@ -0,0 +1,1330 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=16
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='float'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f32m1'
+ VFMUL='__riscv_vfmul_vf_f32m1'
+ VLEV='__riscv_vle32_v_f32m1'
+ VLSEV='__riscv_vlse32_v_f32m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
+ VSETVL='__riscv_vsetvl_e32m1'
+ VSEV='__riscv_vse32_v_f32m1'
+ VSSEV='__riscv_vsse32_v_f32m1'
+ acc_vector_t='vfloat32m1_t'
+ output='strmm_kernel_16x8_zvl256b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if defined(LEFT) != defined(TRANSA)
+    #define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m1(8);
+
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*16;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 16;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A1, B2, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A1, B3, gvl);
+            vfloat32m1_t result8 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result9 = __riscv_vfmul_vf_f32m1( A1, B4, gvl);
+            vfloat32m1_t result10 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result11 = __riscv_vfmul_vf_f32m1( A1, B5, gvl);
+            vfloat32m1_t result12 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result13 = __riscv_vfmul_vf_f32m1( A1, B6, gvl);
+            vfloat32m1_t result14 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+            vfloat32m1_t result15 = __riscv_vfmul_vf_f32m1( A1, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f32m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f32m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f32m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f32m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f32m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f32m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f32m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f32m1( result15, B7, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            vfloat32m1_t c4 = __riscv_vfmul_vf_f32m1( result4, alpha, gvl );
+            vfloat32m1_t c5 = __riscv_vfmul_vf_f32m1( result5, alpha, gvl );
+            vfloat32m1_t c6 = __riscv_vfmul_vf_f32m1( result6, alpha, gvl );
+            vfloat32m1_t c7 = __riscv_vfmul_vf_f32m1( result7, alpha, gvl );
+            vfloat32m1_t c8 = __riscv_vfmul_vf_f32m1( result8, alpha, gvl );
+            vfloat32m1_t c9 = __riscv_vfmul_vf_f32m1( result9, alpha, gvl );
+            vfloat32m1_t c10 = __riscv_vfmul_vf_f32m1( result10, alpha, gvl );
+            vfloat32m1_t c11 = __riscv_vfmul_vf_f32m1( result11, alpha, gvl );
+            vfloat32m1_t c12 = __riscv_vfmul_vf_f32m1( result12, alpha, gvl );
+            vfloat32m1_t c13 = __riscv_vfmul_vf_f32m1( result13, alpha, gvl );
+            vfloat32m1_t c14 = __riscv_vfmul_vf_f32m1( result14, alpha, gvl );
+            vfloat32m1_t c15 = __riscv_vfmul_vf_f32m1( result15, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c15, gvl);
+            m_top += 16;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            vfloat32m1_t c4 = __riscv_vfmul_vf_f32m1( result4, alpha, gvl );
+            vfloat32m1_t c5 = __riscv_vfmul_vf_f32m1( result5, alpha, gvl );
+            vfloat32m1_t c6 = __riscv_vfmul_vf_f32m1( result6, alpha, gvl );
+            vfloat32m1_t c7 = __riscv_vfmul_vf_f32m1( result7, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            vfloat32m1_t c4 = __riscv_vfmul_vf_f32m1( result4, alpha, gvl );
+            vfloat32m1_t c5 = __riscv_vfmul_vf_f32m1( result5, alpha, gvl );
+            vfloat32m1_t c6 = __riscv_vfmul_vf_f32m1( result6, alpha, gvl );
+            vfloat32m1_t c7 = __riscv_vfmul_vf_f32m1( result7, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            C[ci+2*ldc+0] = alpha * result4;
+            C[ci+2*ldc+1] = alpha * result5;
+            C[ci+3*ldc+0] = alpha * result6;
+            C[ci+3*ldc+1] = alpha * result7;
+            C[ci+4*ldc+0] = alpha * result8;
+            C[ci+4*ldc+1] = alpha * result9;
+            C[ci+5*ldc+0] = alpha * result10;
+            C[ci+5*ldc+1] = alpha * result11;
+            C[ci+6*ldc+0] = alpha * result12;
+            C[ci+6*ldc+1] = alpha * result13;
+            C[ci+7*ldc+0] = alpha * result14;
+            C[ci+7*ldc+1] = alpha * result15;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            C[ci+2*ldc+0] = alpha * result2;
+            C[ci+3*ldc+0] = alpha * result3;
+            C[ci+4*ldc+0] = alpha * result4;
+            C[ci+5*ldc+0] = alpha * result5;
+            C[ci+6*ldc+0] = alpha * result6;
+            C[ci+7*ldc+0] = alpha * result7;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*16;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 16;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A1, B2, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A1, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B3, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            vfloat32m1_t c4 = __riscv_vfmul_vf_f32m1( result4, alpha, gvl );
+            vfloat32m1_t c5 = __riscv_vfmul_vf_f32m1( result5, alpha, gvl );
+            vfloat32m1_t c6 = __riscv_vfmul_vf_f32m1( result6, alpha, gvl );
+            vfloat32m1_t c7 = __riscv_vfmul_vf_f32m1( result7, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                ai+=2;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            C[ci+2*ldc+0] = alpha * result4;
+            C[ci+2*ldc+1] = alpha * result5;
+            C[ci+3*ldc+0] = alpha * result6;
+            C[ci+3*ldc+1] = alpha * result7;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                ai+=1;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            C[ci+2*ldc+0] = alpha * result2;
+            C[ci+3*ldc+0] = alpha * result3;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*16;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 16;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                ai+=2;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                ai+=1;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*16;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 16;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                ai+=2;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                ai+=1;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/sum_vector.c b/kernel/riscv64/sum_vector.c
new file mode 100644
index 000000000..3fd217375
--- /dev/null
+++ b/kernel/riscv64/sum_vector.c
@@ -0,0 +1,114 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
+#endif
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0;
+	FLOAT asumf=0.0;
+	if (n <= 0 || inc_x <= 0) return(asumf);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_sum;
+        FLOAT_V_T_M1 v_res;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+
+        if(inc_x == 1){
+                gvl = VSETVL(n);
+                if(gvl <= n/2){
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                        }
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                }
+                for(;j<n;){
+                        gvl = VSETVL(n-j);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
+                        j += gvl;
+                }
+        }else{
+                gvl = VSETVL(n);
+                unsigned int stride_x = inc_x * sizeof(FLOAT);
+                if(gvl <= n/2){
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        BLASLONG inc_xv = inc_x * gvl;
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                                inc_xv += inc_xv * 2;
+                        }
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                }
+                for(;j<n;){
+                        gvl = VSETVL(n-j);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
+                        j += gvl;
+                }
+        }
+        asumf = EXTRACT_FLOAT(v_res);
+	return(asumf);
+}
+
+
diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c
index b16592808..baf3d8f69 100644
--- a/kernel/riscv64/swap_vector.c
+++ b/kernel/riscv64/swap_vector.c
@@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #include <stdio.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG i = 0, j = 0;
 	BLASLONG ix = 0,iy = 0;
         BLASLONG stride_x, stride_y;
         FLOAT_V_T vx0, vx1, vy0, vy1;
-        unsigned int gvl = 0;
 
 	if (n < 0)  return(0);
+
+        unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
+        if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
+
         if(inc_x == 1 && inc_y == 1){
-                gvl = VSETVL(n);
                 if(gvl <= n/2){
                         for(i=0,j=0; i<n/(2*gvl); i++){
                                 vx0 = VLEV_FLOAT(&x[j], gvl);
@@ -79,7 +96,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                         j+=gvl;
                 }
         }else if (inc_y == 1){
-                gvl = VSETVL(n);
                 stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/2){
                         BLASLONG inc_xv = inc_x * gvl;
@@ -107,7 +123,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                         ix += inc_x * gvl;
                 }
         }else if(inc_x == 1){
-                gvl = VSETVL(n);
                 stride_y = inc_y * sizeof(FLOAT);
                 if(gvl <= n/2){
                         BLASLONG inc_yv = inc_y * gvl;
@@ -135,7 +150,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                         iy += inc_y * gvl;
                 }
         }else{
-                gvl = VSETVL(n);
                 stride_x = inc_x * sizeof(FLOAT);
                 stride_y = inc_y * sizeof(FLOAT);
                 if(gvl <= n/2){
diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c
index 58ec17b03..f3b922195 100644
--- a/kernel/riscv64/symv_L_vector.c
+++ b/kernel/riscv64/symv_L_vector.c
@@ -27,37 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -99,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         i += gvl;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < m){
                                         gvl = VSETVL(m-i);
                                         vy = VLEV_FLOAT(&y[i], gvl);
@@ -110,8 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLEV_FLOAT(&x[i], gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
 			}
                         y[j] += alpha * temp2;
@@ -144,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         i += gvl;
                                         iy += inc_yv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < m){
                                         gvl = VSETVL(m-i);
                                         vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -155,8 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLEV_FLOAT(&x[i], gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
 			}
                         y[jy] += alpha * temp2;
@@ -190,8 +188,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         i += gvl;
                                         ix += inc_xv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < m){
                                         gvl = VSETVL(m-i);
                                         vy = VLEV_FLOAT(&y[i], gvl);
@@ -201,8 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
 			}
                         y[j] += alpha * temp2;
@@ -241,8 +239,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         ix += inc_xv;
                                         iy += inc_yv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < m){
                                         gvl = VSETVL(m-i);
                                         vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -252,8 +250,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
 			}
                         y[jy] += alpha * temp2;
diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c
index 34ff0e30a..9977e2741 100644
--- a/kernel/riscv64/symv_U_vector.c
+++ b/kernel/riscv64/symv_U_vector.c
@@ -27,39 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -101,8 +99,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         i += gvl;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < j){
                                         gvl = VSETVL(j-i);
                                         vy = VLEV_FLOAT(&y[i], gvl);
@@ -112,8 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLEV_FLOAT(&x[i], gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
                         }
                         y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -145,8 +143,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         i += gvl;
                                         iy += inc_yv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < j){
                                         gvl = VSETVL(j-i);
                                         vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -156,8 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLEV_FLOAT(&x[i], gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
                         }
                         y[jy] += temp1 * a_ptr[j] + alpha * temp2;
@@ -190,8 +188,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         i += gvl;
                                         ix += inc_xv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < j){
                                         gvl = VSETVL(j-i);
                                         vy = VLEV_FLOAT(&y[i], gvl);
@@ -201,8 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
                         }
                         y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -240,8 +238,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         ix += inc_xv;
                                         iy += inc_yv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < j){
                                         gvl = VSETVL(j-i);
                                         vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -251,8 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
                         }
                         y[jy] += temp1 * a_ptr[j] + alpha * temp2;
diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c
index bfb282ae0..4301528bd 100644
--- a/kernel/riscv64/zamax_vector.c
+++ b/kernel/riscv64/zamax_vector.c
@@ -28,40 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
-
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
+#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt_vf_f, ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f,  ELEN,   LMUL,   _m)
+#define VFMAXVV_FLOAT   JOIN(__riscv_vfmax,     _vv_f,  ELEN,   LMUL,   _)
+#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -70,10 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if (n <= 0 || inc_x <= 0) return(maxf);
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_max;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);
 
         MASK_T mask0, mask1;
         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@@ -84,9 +87,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
                 mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
                 mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
 
                 v0 = VFADDVV_FLOAT(v0, v1, gvl);
                 v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
@@ -94,22 +97,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 j += gvl;
                 ix += inc_xv;
         }
-        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-        maxf = VFMVFS_FLOAT(v_res);
+        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
 
         if(j<n){
                 gvl = VSETVL(n-j);
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
                 mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
                 mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
                 v1 = VFADDVV_FLOAT(v0, v1, gvl);
-                v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl);
-                             
-                if(VFMVFS_FLOAT(v_res)> maxf)
-                        maxf = VFMVFS_FLOAT(v_res);
+                v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
         }
+        maxf = EXTRACT_FLOAT(v_res);
         return(maxf);
 }
diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c
index d9eca7f10..095b1c3df 100644
--- a/kernel/riscv64/zamin_vector.c
+++ b/kernel/riscv64/zamin_vector.c
@@ -29,38 +29,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <math.h>
 #include <float.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
+#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt_vf_f, ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f,  ELEN,   LMUL,   _m)
+#define VFMINVV_FLOAT   JOIN(__riscv_vfmin,     _vv_f,  ELEN,   LMUL,   _)
+#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -69,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT minf=FLT_MAX;
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_min;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         MASK_T mask0, mask1;
         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@@ -83,9 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
                 mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
                 mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
 
                 v0 = VFADDVV_FLOAT(v0, v1, gvl);
                 v_min = VFMINVV_FLOAT(v_min, v0, gvl);
@@ -93,21 +99,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 j += gvl;
                 ix += inc_xv;
         }
-        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-        minf = VFMVFS_FLOAT(v_res);
+        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
 
         if(j<n){
                 gvl = VSETVL(n-j);
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
                 mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
                 mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
                 v1 = VFADDVV_FLOAT(v0, v1, gvl);
-                v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl);
-                if(VFMVFS_FLOAT(v_res) < minf)
-                        minf = VFMVFS_FLOAT(v_res);
+                v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
         }
+
+        minf = EXTRACT_FLOAT(v_res);
         return(minf);
 }
diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c
index 0d1cc42f1..9136f0037 100644
--- a/kernel/riscv64/zasum_vector.c
+++ b/kernel/riscv64/zasum_vector.c
@@ -28,37 +28,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN _b32
+#       else
+#               define ELEN 32
+#               define MLEN _b16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN _b8
+#       else
+#               define ELEN 32
+#               define MLEN _b4
+#       endif
 #endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFABS_FLOAT     JOIN(__riscv_vfabs,     _v_f,   ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt,     _vf_f,  ELEN,   LMUL,   MLEN)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -67,12 +73,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if (n <= 0 || inc_x <= 0) return(asumf);
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_zero,v_sum;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);
 
-        MASK_T mask0, mask1;
         if(inc_x == 1){
                 BLASLONG n2 = n * 2;
                 gvl = VSETVL(n2);
@@ -81,26 +84,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         v_sum = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i<n2/(gvl*2); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
                                 v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                 j += gvl * 2;
                         }
-                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                        asumf += VFFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 }
                 for(;j<n2;){
                         gvl = VSETVL(n2-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-                        asumf += VFFMVFS_FLOAT(v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
@@ -112,34 +110,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 v_sum = VFMVVF_FLOAT(0, gvl);
                 for(i=0,j=0; i<n/gvl; i++){
                         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                        v0 = VFABS_FLOAT(v0, gvl);
                         v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
                         v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                        mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                        v1 = VFABS_FLOAT(v1, gvl);
                         v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
 
                         j += gvl;
                         ix += inc_xv;
                 }
-                v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                asumf += VFFMVFS_FLOAT(v_res);
+                v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 if(j<n){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        v0 = VFABS_FLOAT(v0, gvl);
 
-                        mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        v1 = VFABS_FLOAT(v1, gvl);
                         v_sum = VFADDVV_FLOAT(v0, v1, gvl);
-                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                        asumf += VFFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 }
         }
+        asumf = EXTRACT_FLOAT(v_res);
 	return(asumf);
 }
 
diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c
index 5e6034ac5..404f51fb3 100644
--- a/kernel/riscv64/zaxpby_vector.c
+++ b/kernel/riscv64/zaxpby_vector.c
@@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFMSACVF_FLOAT vfmsac_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
+#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFMSACVF_FLOAT vfmsac_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
+#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #endif
 
 int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
index 4ccfe4a81..20bfe74ec 100644
--- a/kernel/riscv64/zaxpy_vector.c
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c
index 55a480a35..9da60acb0 100644
--- a/kernel/riscv64/zcopy_vector.c
+++ b/kernel/riscv64/zcopy_vector.c
@@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
 #endif
 
 
diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c
index 0900206b3..57542714a 100644
--- a/kernel/riscv64/zdot_vector.c
+++ b/kernel/riscv64/zdot_vector.c
@@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFMSACVV_FLOAT vfmsac_vv_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFMSACVV_FLOAT vfmsac_vv_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
 #endif
 
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
@@ -109,9 +109,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
                 ix += inc_xv;
                 iy += inc_yv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
         dot[0] += VFMVFS_FLOAT(v_res);
-        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
         dot[1] += VFMVFS_FLOAT(v_res);
         //tail
         if(j < n){
@@ -132,9 +132,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
                 vr1 = VFMULVV_FLOAT(vx1, vy0, gvl);
                 vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl);
 #endif
-                v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                 dot[0] += VFMVFS_FLOAT(v_res);
-                v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                 dot[1] += VFMVFS_FLOAT(v_res);
         }
         CREAL(result) = dot[0];
diff --git a/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c
new file mode 100644
index 000000000..ca33368f0
--- /dev/null
+++ b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c
@@ -0,0 +1,1253 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=1
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='double'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f64m1'
+ VFMUL='__riscv_vfmul_vf_f64m1'
+ VLEV='__riscv_vle64_v_f64m1'
+ VLSEV='__riscv_vlse64_v_f64m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+ VSETVL='__riscv_vsetvl_e64m1'
+ VSEV='__riscv_vse64_v_f64m1'
+ VSSEV='__riscv_vsse64_v_f64m1'
+ acc_vector_t='vfloat64m1_t'
+ output='zgemm_kernel_8x4_zvl256b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    #define S0  1
+    #define S1 -1
+    #define S2  1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmsac
+    #define VFMACC_RI __riscv_vfmacc
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc
+    #define VFMACC_RI __riscv_vfmsac
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc
+    #define VFMACC_RI __riscv_vfnmsac
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac
+    #define VFMACC_RI __riscv_vfnmacc
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/4; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
+            tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
+            tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
+            tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
+            vfloat64m1_t ACC4r = tmp0r;
+            vfloat64m1_t ACC4i = tmp0i;
+            vfloat64m1_t ACC5r = tmp1r;
+            vfloat64m1_t ACC5i = tmp1i;
+            vfloat64m1_t ACC6r = tmp2r;
+            vfloat64m1_t ACC6i = tmp2i;
+            vfloat64m1_t ACC7r = tmp3r;
+            vfloat64m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*1;
+            vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*1;
+            vfloat64m1_t C4r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C4i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C5r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C5i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*1;
+            vfloat64m1_t C6r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C6i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C7r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C7i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
+            C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
+            C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
+            C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
+            C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
+            C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
+            C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
+            C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                ai+=1*2;
+                bi+=4*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+2*ldc+0)*2+0];
+            Ci = C[(ci+2*ldc+0)*2+1];
+            Cr += result4*alphar;
+            Ci += result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+3*ldc+0)*2+0];
+            Ci = C[(ci+3*ldc+0)*2+1];
+            Cr += result6*alphar;
+            Ci += result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 20 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*1;
+            vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                ai+=1*2;
+                bi+=2*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 24 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                ai+=1*2;
+                bi+=1*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/zgemm_kernel_generic.c b/kernel/riscv64/zgemm_kernel_generic.c
new file mode 100644
index 000000000..a46dc9171
--- /dev/null
+++ b/kernel/riscv64/zgemm_kernel_generic.c
@@ -0,0 +1,140 @@
+#include "common.h"
+
+
+/* for debugging/unit tests
+ * this is a drop-in replacement for zgemm/cgemm/ztrmm/ctrmm kernels that supports arbitrary combinations of unroll values
+ */
+
+#ifdef TRMMKERNEL
+    #if defined(LEFT) != defined(TRANSA)
+    #define BACKWARDS
+    #endif
+#endif
+
+#ifdef DOUBLE
+
+#define UNROLL_M ZGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N ZGEMM_DEFAULT_UNROLL_N
+
+#else
+
+#define UNROLL_M CGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N CGEMM_DEFAULT_UNROLL_N
+
+#endif
+
+int CNAME(BLASLONG M,BLASLONG N,BLASLONG K,FLOAT alphar,FLOAT alphai,FLOAT* A,FLOAT* B,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+    ,BLASLONG offset
+#endif    
+    )
+{
+    FLOAT res[UNROLL_M*UNROLL_N*2];
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    FLOAT sign[4] = { 1, -1,  1,  1};
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    FLOAT sign[4] = { 1,  1,  1, -1};
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    FLOAT sign[4] = { 1,  1, -1,  1};
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    FLOAT sign[4] = { 1, -1, -1, -1};
+#endif
+
+    BLASLONG n_packing = UNROLL_N;
+    BLASLONG n_top = 0;
+
+    while(n_top < N)
+    {
+        while( n_top+n_packing > N )
+            n_packing >>= 1;
+
+        BLASLONG m_packing = UNROLL_M;
+        BLASLONG m_top = 0;
+        while (m_top < M)
+        {
+            while( m_top+m_packing > M )
+                m_packing >>= 1;
+
+            BLASLONG ai = K*m_top*2;
+            BLASLONG bi = K*n_top*2;
+
+            BLASLONG pass_K = K;
+
+
+            #ifdef TRMMKERNEL
+                #ifdef LEFT
+                    BLASLONG off = offset + m_top;
+                #else
+                    BLASLONG off = -offset + n_top;
+                #endif
+                #ifdef BACKWARDS
+                    ai += off * m_packing*2;
+                    bi += off * n_packing*2;
+                    pass_K -= off; 
+                #else
+                    #ifdef LEFT
+                        pass_K = off + m_packing;
+                    #else
+                        pass_K = off + n_packing;
+                    #endif
+                #endif
+            #endif
+
+            memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) );
+
+            for (BLASLONG k=0; k<pass_K; k+=1)
+            {
+                for( BLASLONG ki = 0; ki < n_packing; ++ki )
+                {
+                    FLOAT B0 = B[bi+ki*2+0];
+                    FLOAT B1 = B[bi+ki*2+1];
+
+                    for( BLASLONG kj = 0; kj < m_packing; ++kj )
+                    {
+                        FLOAT A0 = A[ai+kj*2+0];
+                        FLOAT A1 = A[ai+kj*2+1];
+
+                        res[(ki*UNROLL_M+kj)*2+0] += sign[0]*A0*B0 +sign[1]*A1*B1;
+                        res[(ki*UNROLL_M+kj)*2+1] += sign[2]*A1*B0 +sign[3]*A0*B1;
+                    }
+                }
+
+                ai += m_packing*2;
+                bi += n_packing*2;
+            }
+
+            BLASLONG cofs = ldc * n_top + m_top;
+            for( BLASLONG ki = 0; ki < n_packing; ++ki )
+            {
+                for( BLASLONG kj = 0; kj < m_packing; ++kj )
+                {
+                    #ifdef TRMMKERNEL
+                    FLOAT Cr = 0;
+                    FLOAT Ci = 0;
+                    #else
+                    FLOAT Cr = C[(cofs+ki*ldc+kj)*2+0];
+                    FLOAT Ci = C[(cofs+ki*ldc+kj)*2+1];
+                    #endif
+
+                    Cr += res[(ki*UNROLL_M+kj)*2+0]*alphar;
+                    Cr += -res[(ki*UNROLL_M+kj)*2+1]*alphai;
+                    Ci += res[(ki*UNROLL_M+kj)*2+1]*alphar;
+                    Ci += res[(ki*UNROLL_M+kj)*2+0]*alphai;
+
+                    C[(cofs+ki*ldc+kj)*2+0] = Cr; 
+                    C[(cofs+ki*ldc+kj)*2+1] = Ci;
+                }
+            }
+
+            m_top += m_packing;
+        }
+
+        n_top += n_packing;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c
index 3095c28f9..f4acad770 100644
--- a/kernel/riscv64/zgemv_n_vector.c
+++ b/kernel/riscv64/zgemv_n_vector.c
@@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c
index a7a8a5279..179454094 100644
--- a/kernel/riscv64/zgemv_t_vector.c
+++ b/kernel/riscv64/zgemv_t_vector.c
@@ -27,31 +27,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
+#define VSETVL(n) __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m2_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
+#define VSETVL(n) __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m2_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -62,49 +62,43 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
         FLOAT temp_r, temp_i;
 
         FLOAT_V_T va0, va1, vx0, vx1, vr, vi;
-        unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
-
+        unsigned int gvl = VSETVL(m);
+        FLOAT_V_T_M1 v_res_r, v_res_i;
         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
         BLASLONG stride_a = sizeof(FLOAT) * 2;
-        gvl = VSETVL(m);
         BLASLONG inc_xv = inc_x * gvl * 2;
         BLASLONG inc_av = gvl * 2;
         BLASLONG inc_y2 = inc_y * 2;
         BLASLONG lda2 = lda * 2;
+
         for(i = 0; i < n; i++){
+                v_res_r = VFMVVF_FLOAT_M1(0, 1);
+                v_res_i = VFMVVF_FLOAT_M1(0, 1);
                 gvl = VSETVL(m);
                 j = 0;
                 ix = 0;
-                vr = VFMVVF_FLOAT(0, gvl);
-                vi = VFMVVF_FLOAT(0, gvl);
                 for(k = 0; k < m/gvl; k++){
                         va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
                         va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
                         vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                         vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-                        vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
+                        vr = VFMULVV_FLOAT(va0, vx0, gvl);
+                        vi = VFMULVV_FLOAT(va0, vx1, gvl);
                         vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
-                        vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
                         vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
 #else
-                        vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
+                        vr = VFMULVV_FLOAT(va0, vx0, gvl);
+                        vi = VFMULVV_FLOAT(va0, vx1, gvl);
                         vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
-                        vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
                         vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
-
 #endif
+                        v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl);
+                        v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl);
+
                         j += inc_av;
                         ix += inc_xv;
                 }
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                temp_r = VFMVFS_FLOAT(v_res);
-                v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
-                temp_i = VFMVFS_FLOAT(v_res);
                 if(j/2 < m){
                         gvl = VSETVL(m-j/2);
                         va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
@@ -113,21 +107,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                         vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                         vr = VFMULVV_FLOAT(va0, vx0, gvl);
-                        vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFMULVV_FLOAT(va0, vx1, gvl);
+                        vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
 #else
                         vr = VFMULVV_FLOAT(va0, vx0, gvl);
-                        vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFMULVV_FLOAT(va0, vx1, gvl);
+                        vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
 
 #endif
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        temp_r += VFMVFS_FLOAT(v_res);
-                        v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
-                        temp_i += VFMVFS_FLOAT(v_res);
+                        v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl);
+                        v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl);
                 }
+
+                temp_r = VFMVFS_FLOAT(v_res_r);
+                temp_i = VFMVFS_FLOAT(v_res_i);
+
 #if !defined(XCONJ)
                 y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
                 y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c
index 0a284a999..e025120e5 100644
--- a/kernel/riscv64/zhemv_LM_vector.c
+++ b/kernel/riscv64/zhemv_LM_vector.c
@@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -143,9 +143,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 iy += inc_yv;
                                 ia += inc_av;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                         temp_r2 = VFMVFS_FLOAT(v_res);
-                        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                         temp_i2 = VFMVFS_FLOAT(v_res);
                         if(i < m){
 				                gvl = VSETVL(m-i);
@@ -181,9 +181,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
 #endif
 
-                                v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                                 temp_r2 += VFMVFS_FLOAT(v_res);
-                                v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                                 temp_i2 += VFMVFS_FLOAT(v_res);
                         }
                 }
diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c
index 33b7c9c25..0e1ea5436 100644
--- a/kernel/riscv64/zhemv_UV_vector.c
+++ b/kernel/riscv64/zhemv_UV_vector.c
@@ -27,37 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -142,9 +142,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 iy += inc_yv;
                                 ia += inc_av;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                         temp_r2 = VFMVFS_FLOAT(v_res);
-                        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                         temp_i2 = VFMVFS_FLOAT(v_res);
                         if(i < j){
 				                gvl = VSETVL(j-i);
@@ -180,9 +180,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
 #endif
 
-                                v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                                 temp_r2 += VFMVFS_FLOAT(v_res);
-                                v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                                 temp_i2 += VFMVFS_FLOAT(v_res);
                         }
                 }
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
index cadabdb75..437bf4246 100644
--- a/kernel/riscv64/znrm2_vector.c
+++ b/kernel/riscv64/znrm2_vector.c
@@ -26,264 +26,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define ABS fabsf
-#define MASK_T vbool8_t
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
-#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
-#define VMFIRSTM vmfirst_m_b8
-#define VFDIVVF_FLOAT vfdiv_vf_f32m4
-#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m1
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 64
+#       else
+#               define ELEN 32
+#               define MLEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
+#       define LMUL m4
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 16
+#       else
+#               define ELEN 32
+#               define MLEN 8
+#       endif
+#endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
+#define VFABS           JOIN(__riscv_vfabs,     _v_f,   ELEN,   LMUL,   _)
+#define VMFNE           JOIN(__riscv_vmfne_vf_f,ELEN,   LMUL,   _b,     MLEN)
+#define VMFGT           JOIN(__riscv_vmfgt_vv_f,ELEN,   LMUL,   _b,     MLEN)
+#define VMFEQ           JOIN(__riscv_vmfeq_vv_f,ELEN,   LMUL,   _b,     MLEN)
+#define VCPOP           JOIN(__riscv_vcpop,     _m_b,   MLEN,   _,      _)
+#define VFREDMAX        JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL,   JOIN2(_f, ELEN), m1)
+#define VFIRST          JOIN(__riscv_vfirst,    _m_b,   MLEN,   _,      _)
+#define VRGATHER        JOIN(__riscv_vrgather,  _vx_f,  ELEN,   LMUL,   _)
+#define VFDIV           JOIN(__riscv_vfdiv,     _vf_f,  ELEN,   LMUL,   _)
+#define VFDIV_M         JOIN(__riscv_vfdiv,     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMUL           JOIN(__riscv_vfmul,     _vv_f,  ELEN,   LMUL,   _)
+#define VFMACC          JOIN(__riscv_vfmacc,    _vv_f,  ELEN,   LMUL,   _)
+#define VFMACC_M        JOIN(__riscv_vfmacc,    _vv_f,  ELEN,   LMUL,   _mu)
+#define VMSOF           JOIN(__riscv_vmsof,     _m_b,   MLEN,   _,      _)
+#define VMANDN          JOIN(__riscv_vmandn,    _mm_b,  MLEN,   _,      _)
+#define VFREDUSUM       JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL,  JOIN2(_f, ELEN), m1)
+#if defined(DOUBLE)
 #define ABS fabs
-#define MASK_T vbool16_t
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
-#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
-#define VMFIRSTM vmfirst_m_b16
-#define VFDIVVF_FLOAT vfdiv_vf_f64m4
-#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
+#else
+#define ABS fabsf
 #endif
 
+#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v)
+
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
+        BLASLONG i=0;
 
-	if ( n < 0 )  return(0.0);
-//        if(n == 1) return (ABS(x[0]));
+        if(n < 0)  return(0.0);
 
-        FLOAT_V_T vr, v0, v_zero;
+        FLOAT_V_T v_ssq, v_scale, v0, v1, v_zero;
         unsigned int gvl = 0;
         FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
 
-        FLOAT scale = 0.0, ssq = 0.0;
-        MASK_T mask;
-        BLASLONG index = 0;
-        if(inc_x == 1){
-                BLASLONG n2 = n * 2;
-                gvl = VSETVL(n2);
-                vr = VFMVVF_FLOAT(0, gvl);
-                v_zero = VFMVVF_FLOAT(0, gvl);
-                for(i=0,j=0; i<n2/gvl; i++){
-                        v0 = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        }
-                        j += gvl;
-                }
-                //ssq in vector vr: vr[0]
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                //total ssq now
-                ssq += VFMVFS_FLOAT(v_res);
+        v_res = VFMVVF_FLOAT_M1(0, 1);
+        v_z0 = VFMVVF_FLOAT_M1(0, 1);
 
-                //tail
-                if(j < n2){
-                        gvl = VSETVL(n2-j);
-                        v0 = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0)
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                        }else{//found greater element
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                        }
-                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        //ssq in vector vr: vr[0]
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        //total ssq now
-                        ssq += VFMVFS_FLOAT(v_res);
-                }
-        }else{
-                gvl = VSETVL(n);
-                vr = VFMVVF_FLOAT(0, gvl);
-                v_zero = VFMVVF_FLOAT(0, gvl);
-                unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
-                int idx = 0, inc_v = inc_x * gvl * 2;
-                for(i=0,j=0; i<n/gvl; i++){
-                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        }
+        gvl = VSETVL(n);
+        v_ssq = VFMVVF_FLOAT(0, gvl);
+        v_scale = VFMVVF_FLOAT(0, gvl);
+        v_zero = VFMVVF_FLOAT(0, gvl);
 
-                        v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        }
-                        j += gvl;
-                        idx += inc_v;
+        unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
+        int idx = 0;
+
+        for(i=0; i<n/gvl; i++){
+                v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
+                v1 = VLSEV_FLOAT( &x[idx+1], stride_x, gvl );
+                v0 = VFABS( v0, gvl );
+                v1 = VFABS( v1, gvl );
+
+                MASK_T scale_mask0 = VMFGT( v0, v_scale, gvl );
+                MASK_T scale_mask1 = VMFGT( v1, v_scale, gvl );
+                if( VCPOP( scale_mask0, gvl ) + VCPOP( scale_mask1, gvl ) > 0 ){ // scale change?
+                        // find largest element in v0 and v1
+                        v_res = VFREDMAX( v0, v_z0, gvl );
+                        v_res = VFREDMAX( v1, v_res, gvl );
+                        FLOAT const largest_elt = EXTRACT_FLOAT( v_res );
+
+                        v_scale = VFDIV( v_scale, largest_elt, gvl );   // scale/largest_elt
+                        v_scale = VFMUL( v_scale, v_scale, gvl );       // (scale/largest_elt)*(scale/largest_elt)
+                        v_ssq = VFMUL( v_scale, v_ssq, gvl );           // ssq*(scale/largest_elt)*(scale/largest_elt)
+
+                        v_scale = VFMVVF_FLOAT( largest_elt, gvl );     // splated largest_elt becomes new scale
                 }
-                //ssq in vector vr: vr[0]
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                //total ssq now
-                ssq += VFMVFS_FLOAT(v_res);
 
-                //tail
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl );
+                MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl );
+                v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl );
+                v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl );
+                v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl );
+                v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl );
+
+                idx += inc_x * gvl * 2;
+        }
+
+        v_res = VFREDUSUM(v_ssq, v_z0, gvl);
+        FLOAT ssq = EXTRACT_FLOAT(v_res);
+        FLOAT scale = EXTRACT_FLOAT0_V(v_scale);
+
+        //finish any tail using scalar ops
+        i*=gvl;
+        if(i<n){
+                i *= inc_x*2;
+                n *= inc_x*2;
+                FLOAT temp;
+                do{
+                        if ( x[i] != 0.0 ){
+                                temp = ABS( x[i] );
+                                if ( scale < temp ){
+                                        ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
+                                        scale = temp ;
+                                }else{
+                                        ssq += ( temp / scale ) * ( temp / scale );
                                 }
-                        }else{//found greater element
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
                         }
 
-                        v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                        if ( x[i+1] != 0.0 ){
+                                temp = ABS( x[i+1] );
+                                if ( scale < temp ){
+                                        ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
+                                        scale = temp ;
+                                }else{
+                                        ssq += ( temp / scale ) * ( temp / scale );
                                 }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
                         }
-                        //ssq in vector vr: vr[0]
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        //total ssq now
-                        ssq += VFMVFS_FLOAT(v_res);
-                }
-        }
-	return(scale * sqrt(ssq));
-}
 
+                        i += inc_x*2;
+                }while(i<n);
+        }
 
+        return(scale * sqrt(ssq));
+}
diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c
index 858dfd173..c3afbc7cc 100644
--- a/kernel/riscv64/zrot_vector.c
+++ b/kernel/riscv64/zrot_vector.c
@@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
         unsigned int gvl = 0;
 
         FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
-        gvl = VSETVL(n);
+        gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
         BLASLONG inc_xv = inc_x * 2 * gvl;
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index d275b75f8..5d9ab7b28 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c
new file mode 100644
index 000000000..7aab15105
--- /dev/null
+++ b/kernel/riscv64/zsum_vector.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN _b32
+#       else
+#               define ELEN 32
+#               define MLEN _b16
+#       endif
+#else
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN _b8
+#       else
+#               define ELEN 32
+#               define MLEN _b4
+#       endif
+#endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt,     _vf_f,  ELEN,   LMUL,   MLEN)
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0;
+	FLOAT asumf=0.0;
+	if (n <= 0 || inc_x <= 0) return(asumf);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_zero,v_sum;
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);
+
+        if(inc_x == 1){
+                BLASLONG n2 = n * 2;
+                gvl = VSETVL(n2);
+                v_zero = VFMVVF_FLOAT(0, gvl);
+                if(gvl <= n2/2){
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n2/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                        }
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                }
+                for(;j<n2;){
+                        gvl = VSETVL(n2-j);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
+                        j += gvl;
+                }
+        }else{
+                gvl = VSETVL(n);
+                unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
+                v_zero = VFMVVF_FLOAT(0, gvl);
+
+                BLASLONG inc_xv = inc_x * 2 * gvl;
+                v_sum = VFMVVF_FLOAT(0, gvl);
+                for(i=0,j=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                }
+                v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                if(j<n){
+                        gvl = VSETVL(n-j);
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        v_sum = VFADDVV_FLOAT(v0, v1, gvl);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                }
+        }
+        asumf = EXTRACT_FLOAT(v_res);
+	return(asumf);
+}
+
+
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
index c1dcaccab..d8980602d 100644
--- a/kernel/riscv64/zswap_vector.c
+++ b/kernel/riscv64/zswap_vector.c
@@ -27,35 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #include <stdio.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 64
+#       else
+#               define ELEN 32
+#               define MLEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 16
+#       else
+#               define ELEN 32
+#               define MLEN 8
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG i = 0, j = 0;
 	BLASLONG ix = 0,iy = 0;
         BLASLONG stride_x, stride_y;
         FLOAT_V_T vx0, vx1, vy0, vy1;
-        unsigned int gvl = 0;
+        unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
+        if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
 
 	if (n < 0)  return(0);
         if(inc_x == 1 && inc_y == 1){
-                gvl = VSETVL(n);
                 BLASLONG n2 = n * 2;
                 if(gvl <= n2/2){
                         for(i=0,j=0; i<n2/(2*gvl); i++){
@@ -80,7 +95,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
                         j += gvl;
                 }
         }else{
-                gvl = VSETVL(n);
                 stride_x = inc_x * 2 * sizeof(FLOAT);
                 stride_y = inc_y * 2 * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * gvl * 2;
diff --git a/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c
new file mode 100644
index 000000000..de7622b89
--- /dev/null
+++ b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c
@@ -0,0 +1,1337 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=1
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='double'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f64m1'
+ VFMUL='__riscv_vfmul_vf_f64m1'
+ VLEV='__riscv_vle64_v_f64m1'
+ VLSEV='__riscv_vlse64_v_f64m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+ VSETVL='__riscv_vsetvl_e64m1'
+ VSEV='__riscv_vse64_v_f64m1'
+ VSSEV='__riscv_vsse64_v_f64m1'
+ acc_vector_t='vfloat64m1_t'
+ output='ztrmm_kernel_8x4_zvl256b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    #define S0  1
+    #define S1 -1
+    #define S2  1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmsac
+    #define VFMACC_RI __riscv_vfmacc
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc
+    #define VFMACC_RI __riscv_vfmsac
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc
+    #define VFMACC_RI __riscv_vfnmsac
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac
+    #define VFMACC_RI __riscv_vfnmacc
+#endif
+
+
+#if defined(LEFT) != defined(TRANSA)
+    #define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/4; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
+            tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
+            tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
+            tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
+            vfloat64m1_t ACC4r = tmp0r;
+            vfloat64m1_t ACC4i = tmp0i;
+            vfloat64m1_t ACC5r = tmp1r;
+            vfloat64m1_t ACC5i = tmp1i;
+            vfloat64m1_t ACC6r = tmp2r;
+            vfloat64m1_t ACC6i = tmp2i;
+            vfloat64m1_t ACC7r = tmp3r;
+            vfloat64m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            vfloat64m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
+            vfloat64m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
+            vfloat64m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
+            vfloat64m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
+            vfloat64m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
+            vfloat64m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
+            vfloat64m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
+            vfloat64m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                ai+=1*2;
+                bi+=4*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = result4*alphar;
+            Ci = result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = result6*alphar;
+            Ci = result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 20 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                ai+=1*2;
+                bi+=2*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 24 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                ai+=1*2;
+                bi+=1*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/param.h b/param.h
index ede9fd1d3..4ece0b5a6 100644
--- a/param.h
+++ b/param.h
@@ -3121,6 +3121,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
+#ifdef RISCV64_ZVL256B
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  16
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  8
+
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  8
+
+#define ZGEMM_DEFAULT_UNROLL_M  8
+#define ZGEMM_DEFAULT_UNROLL_N  4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 64
+#define CGEMM_DEFAULT_P 64
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 128
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 64
+
+#define SGEMM_DEFAULT_R 16384
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 8192
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
 #ifdef ARMV7
 #define SNUMOPT		2
 #define DNUMOPT		2

From 9b61be4545201c3f3d112018a9189c2d163d0718 Mon Sep 17 00:00:00 2001
From: Sergei Lewis <slewis@rivosinc.com>
Date: Wed, 1 Mar 2023 17:40:42 +0000
Subject: [PATCH 011/718] factoring riscv64/dot.c fix into separate PR as
 requested

---
 kernel/riscv64/dot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c
index bf55998ca..46a84ad18 100644
--- a/kernel/riscv64/dot.c
+++ b/kernel/riscv64/dot.c
@@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG ix=0,iy=0;
 	double dot = 0.0 ;
 
-	if ( n < 1 )  return(dot);
+	if ( n < 0 )  return(dot);
 
 	while(i < n)
 	{

From 1374a2d08b078451dcfaf723614ea13e441e1d06 Mon Sep 17 00:00:00 2001
From: Heller Zheng <heller.zheng@microchip.com>
Date: Sun, 19 Mar 2023 23:59:03 -0700
Subject: [PATCH 012/718] This PR adapts latest spec changes Add prefix
 (_riscv) for all riscv intrinsics Update some intrinsics' parameter, like
 vfredxxxx, vmerge

---
 kernel/riscv64/amax_rvv.c              |  54 +++++------
 kernel/riscv64/amin_rvv.c              |  54 +++++------
 kernel/riscv64/asum_rvv.c              |  54 +++++------
 kernel/riscv64/axpby_rvv.c             |  36 +++----
 kernel/riscv64/axpy_rvv.c              |  28 +++---
 kernel/riscv64/copy_rvv.c              |  24 ++---
 kernel/riscv64/dot_rvv.c               |  66 ++++++-------
 kernel/riscv64/gemm_beta_rvv.c         |  24 ++---
 kernel/riscv64/gemm_ncopy_8_rvv.c      |  28 +++---
 kernel/riscv64/gemm_ncopy_rvv_v1.c     |  20 ++--
 kernel/riscv64/gemm_tcopy_8_rvv.c      |  44 ++++-----
 kernel/riscv64/gemm_tcopy_rvv_v1.c     |  16 ++--
 kernel/riscv64/gemmkernel_rvv_v1x8.c   |  24 ++---
 kernel/riscv64/gemv_n_rvv.c            |  28 +++---
 kernel/riscv64/gemv_t_rvv.c            |  53 +++++------
 kernel/riscv64/iamax_rvv.c             | 115 +++++++++++-----------
 kernel/riscv64/iamin_rvv.c             | 117 +++++++++++------------
 kernel/riscv64/imax_rvv.c              | 113 +++++++++++-----------
 kernel/riscv64/imin_rvv.c              | 113 +++++++++++-----------
 kernel/riscv64/izamax_rvv.c            | 127 ++++++++++++-------------
 kernel/riscv64/izamin_rvv.c            | 121 ++++++++++++-----------
 kernel/riscv64/max_rvv.c               |  50 +++++-----
 kernel/riscv64/min_rvv.c               |  50 +++++-----
 kernel/riscv64/nrm2_rvv.c              |  46 ++++-----
 kernel/riscv64/rot_rvv.c               |  36 +++----
 kernel/riscv64/scal_rvv.c              |  32 +++----
 kernel/riscv64/sum_rvv.c               |  50 +++++-----
 kernel/riscv64/swap_rvv.c              |  28 +++---
 kernel/riscv64/symm_lcopy_rvv_v1.c     |  50 +++++-----
 kernel/riscv64/symm_ucopy_rvv_v1.c     |  50 +++++-----
 kernel/riscv64/symv_L_rvv.c            |  81 ++++++++--------
 kernel/riscv64/symv_U_rvv.c            |  81 ++++++++--------
 kernel/riscv64/trmm_lncopy_rvv_v1.c    |  48 +++++-----
 kernel/riscv64/trmm_ltcopy_rvv_v1.c    |  44 ++++-----
 kernel/riscv64/trmm_uncopy_rvv_v1.c    |  48 +++++-----
 kernel/riscv64/trmm_utcopy_rvv_v1.c    |  44 ++++-----
 kernel/riscv64/trmmkernel_rvv_v1x8.c   |  28 +++---
 kernel/riscv64/trsm_kernel_LN_rvv_v1.c |  54 +++++------
 kernel/riscv64/trsm_kernel_LT_rvv_v1.c |  54 +++++------
 kernel/riscv64/trsm_kernel_RN_rvv_v1.c |  54 +++++------
 kernel/riscv64/trsm_kernel_RT_rvv_v1.c |  42 ++++----
 kernel/riscv64/trsm_lncopy_rvv_v1.c    |  40 ++++----
 kernel/riscv64/trsm_ltcopy_rvv_v1.c    |  40 ++++----
 kernel/riscv64/trsm_uncopy_rvv_v1.c    |  40 ++++----
 kernel/riscv64/trsm_utcopy_rvv_v1.c    |  40 ++++----
 kernel/riscv64/zamax_rvv.c             |  58 +++++------
 kernel/riscv64/zamin_rvv.c             |  58 +++++------
 kernel/riscv64/zasum_rvv.c             |  51 +++++-----
 kernel/riscv64/zaxpby_rvv.c            |  52 +++++-----
 kernel/riscv64/zaxpy_rvv.c             |  32 +++----
 kernel/riscv64/zcopy_rvv.c             |  44 ++++-----
 kernel/riscv64/zdot_rvv.c              |  65 +++++++------
 kernel/riscv64/zgemm_beta_rvv.c        |  32 +++----
 kernel/riscv64/zgemm_ncopy_4_rvv.c     |  24 ++---
 kernel/riscv64/zgemm_ncopy_rvv_v1.c    |  16 ++--
 kernel/riscv64/zgemm_tcopy_4_rvv.c     |  40 ++++----
 kernel/riscv64/zgemm_tcopy_rvv_v1.c    |  16 ++--
 kernel/riscv64/zgemmkernel_rvv_v1x4.c  |  36 +++----
 kernel/riscv64/zgemv_n_rvv.c           |  48 +++++-----
 kernel/riscv64/zgemv_t_rvv.c           |  61 ++++++------
 kernel/riscv64/zhemm_ltcopy_rvv_v1.c   |  84 ++++++++--------
 kernel/riscv64/zhemm_utcopy_rvv_v1.c   |  84 ++++++++--------
 kernel/riscv64/znrm2_rvv.c             |  66 ++++++-------
 kernel/riscv64/zrot_rvv.c              |  52 +++++-----
 kernel/riscv64/zscal_rvv.c             |  44 ++++-----
 kernel/riscv64/zsum_rvv.c              |  47 +++++----
 kernel/riscv64/zswap_rvv.c             |  24 ++---
 kernel/riscv64/zsymm_lcopy_rvv_v1.c    |  64 ++++++-------
 kernel/riscv64/zsymm_ucopy_rvv_v1.c    |  64 ++++++-------
 kernel/riscv64/ztrmm_lncopy_rvv_v1.c   |  64 ++++++-------
 kernel/riscv64/ztrmm_ltcopy_rvv_v1.c   |  61 ++++++------
 kernel/riscv64/ztrmm_uncopy_rvv_v1.c   |  64 ++++++-------
 kernel/riscv64/ztrmm_utcopy_rvv_v1.c   |  60 ++++++------
 kernel/riscv64/ztrmmkernel_2x2_rvv.c   |  60 ++++++------
 kernel/riscv64/ztrmmkernel_rvv_v1x4.c  |  40 ++++----
 kernel/riscv64/ztrsm_lncopy_rvv_v1.c   |  36 +++----
 kernel/riscv64/ztrsm_ltcopy_rvv_v1.c   |  36 +++----
 kernel/riscv64/ztrsm_uncopy_rvv_v1.c   |  36 +++----
 kernel/riscv64/ztrsm_utcopy_rvv_v1.c   |  36 +++----
 79 files changed, 2013 insertions(+), 2031 deletions(-)

diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c
index c9c6e7f73..be0bdbea0 100644
--- a/kernel/riscv64/amax_rvv.c
+++ b/kernel/riscv64/amax_rvv.c
@@ -29,33 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VFABSV_FLOAT vfabs_v_f32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VFABSV_FLOAT vfabs_v_f64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
+    v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax);
     maxf = VFMVFS_FLOAT_M1(v_res);
 
     return(maxf);
diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c
index 370b6c338..d4926084b 100644
--- a/kernel/riscv64/amin_rvv.c
+++ b/kernel/riscv64/amin_rvv.c
@@ -29,33 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFABSV_FLOAT vfabs_v_f32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFABSV_FLOAT vfabs_v_f64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
+    v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
     minf = VFMVFS_FLOAT_M1(v_res);
 
     return(minf);
diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c
index 4f711c9be..691591e22 100644
--- a/kernel/riscv64/asum_rvv.c
+++ b/kernel/riscv64/asum_rvv.c
@@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-#define VFABSV_FLOAT vfabs_v_f32m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m8
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
-#define VFABSV_FLOAT vfabs_v_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m8
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -93,7 +93,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax); 
+    v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax); 
     asumf = VFMVFS_FLOAT_M1(v_res);
     return(asumf);
 }
diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c
index 7c35c563d..a1dbdb0e4 100644
--- a/kernel/riscv64/axpby_rvv.c
+++ b/kernel/riscv64/axpby_rvv.c
@@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
-#define VFMACCVF_FLOAT vfmacc_vf_f32m8
-#define VFMULVF_FLOAT vfmul_vf_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
-#define VFMACCVF_FLOAT vfmacc_vf_f64m8
-#define VFMULVF_FLOAT vfmul_vf_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #endif
 
 int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c
index 3986f4e21..8bc2f30de 100644
--- a/kernel/riscv64/axpy_rvv.c
+++ b/kernel/riscv64/axpy_rvv.c
@@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
-#define VFMACCVF_FLOAT vfmacc_vf_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
-#define VFMACCVF_FLOAT vfmacc_vf_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c
index 5d5a8bd04..041fd2dae 100644
--- a/kernel/riscv64/copy_rvv.c
+++ b/kernel/riscv64/copy_rvv.c
@@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c
index 60dcac2f5..3276695b6 100644
--- a/kernel/riscv64/dot_rvv.c
+++ b/kernel/riscv64/dot_rvv.c
@@ -37,24 +37,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 
     if ( n <= 0 ) return(dot);
 
-    size_t vlmax = vsetvlmax_e64m8();
-    vfloat64m8_t vr = vfmv_v_f_f64m8(0, vlmax);
+    size_t vlmax = __riscv_vsetvlmax_e64m8();
+    vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax);
 
     if(inc_x == 1 && inc_y == 1) {
 
         for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
-            vl = vsetvl_e64m8(n);
+            vl = __riscv_vsetvl_e64m8(n);
 
 #if !defined(DOUBLE)
-            vfloat32m4_t vx = vle32_v_f32m4(x, vl);
-            vfloat32m4_t vy = vle32_v_f32m4(y, vl);
+            vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
+            vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
 
-            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl);
 #else
-            vfloat64m8_t vx = vle64_v_f64m8(x, vl);
-            vfloat64m8_t vy = vle64_v_f64m8(y, vl);
+            vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);
+            vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);
 
-            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl);
 #endif
         }
 
@@ -63,18 +63,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
         BLASLONG stride_y = inc_y * sizeof(FLOAT);
 
         for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
-            vl = vsetvl_e64m8(n);
+            vl = __riscv_vsetvl_e64m8(n);
 
 #if !defined(DOUBLE)
-            vfloat32m4_t vx = vle32_v_f32m4(x, vl);
-            vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl);
+            vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
+            vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl);
 
-            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl);
 #else
-            vfloat64m8_t vx = vle64_v_f64m8(x, vl);
-            vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl);
+            vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);
+            vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl);
 
-            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl);
 #endif
         }
     } else if (1 == inc_y) {
@@ -82,18 +82,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
         BLASLONG stride_x = inc_x * sizeof(FLOAT);
 
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
-            vl = vsetvl_e64m8(n);
+            vl = __riscv_vsetvl_e64m8(n);
 
 #if !defined(DOUBLE)
-            vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl);
-            vfloat32m4_t vy = vle32_v_f32m4(y, vl);
+            vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl);
+            vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
 
-            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl);
 #else
-            vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl);
-            vfloat64m8_t vy = vle64_v_f64m8(y, vl);
+            vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl);
+            vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);
 
-            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl);
 #endif
         }
     } else {
@@ -102,25 +102,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
         BLASLONG stride_y = inc_y * sizeof(FLOAT);
 
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
-            vl = vsetvl_e64m8(n);
+            vl = __riscv_vsetvl_e64m8(n);
 
 #if !defined(DOUBLE)
-            vfloat32m4_t vx = vlse32_v_f32m4(x, stride_x, vl);
-            vfloat32m4_t vy = vlse32_v_f32m4(y, stride_y, vl);
+            vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl);
+            vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl);
 
-            vr = vfwmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl);
 #else
-            vfloat64m8_t vx = vlse64_v_f64m8(x, stride_x, vl);
-            vfloat64m8_t vy = vlse64_v_f64m8(y, stride_y, vl);
+            vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl);
+            vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl);
 
-            vr = vfmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl);
 #endif
         }
     }
 
-    vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax);
-    vfloat64m1_t vec_sum = vfredusum_vs_f64m8_f64m1(vec_zero, vr, vec_zero, vlmax);
-    dot = vfmv_f_s_f64m1_f64(vec_sum);
+    vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax);
+    vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax);
+    dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum);
 
     return(dot);
 }
diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c
index 34d1ea078..f3cf6491d 100644
--- a/kernel/riscv64/gemm_beta_rvv.c
+++ b/kernel/riscv64/gemm_beta_rvv.c
@@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMULVF_FLOAT vfmul_vf_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMULVF_FLOAT vfmul_vf_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
 #endif
 
 // Optimizes the implementation in ../generic/gemm_beta.c
diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c
index 525b223c2..3030d67fb 100644
--- a/kernel/riscv64/gemm_ncopy_8_rvv.c
+++ b/kernel/riscv64/gemm_ncopy_8_rvv.c
@@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m1(n)
-#define FLOAT_V_T vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m1
-#define VSEV_FLOAT vse32_v_f32m1
-#define VSSEG2_FLOAT vsseg2e32_v_f32m1
-#define VSSEG4_FLOAT vsseg4e32_v_f32m1
-#define VSSEG8_FLOAT vsseg8e32_v_f32m1
+#define VSETVL(n)               __riscv_vsetvl_e32m1(n)
+#define FLOAT_V_T               vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m1
+#define VSEV_FLOAT              __riscv_vse32_v_f32m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1
 #else
-#define VSETVL(n) vsetvl_e64m1(n)
-#define FLOAT_V_T vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m1
-#define VSEV_FLOAT vse64_v_f64m1
-#define VSSEG2_FLOAT vsseg2e64_v_f64m1
-#define VSSEG4_FLOAT vsseg4e64_v_f64m1
-#define VSSEG8_FLOAT vsseg8e64_v_f64m1
+#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
+#define FLOAT_V_T               vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m1
+#define VSEV_FLOAT              __riscv_vse64_v_f64m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1
 #endif
 
 // Optimizes the implementation in ../generic/gemm_ncopy_8.c
diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c
index 2c5230752..2d6db15e5 100644
--- a/kernel/riscv64/gemm_ncopy_rvv_v1.c
+++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c
@@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c
index 81c1f962b..080a87312 100644
--- a/kernel/riscv64/gemm_tcopy_8_rvv.c
+++ b/kernel/riscv64/gemm_tcopy_8_rvv.c
@@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m1(n)
-#define FLOAT_V_T vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m1
-#define VLSEV_FLOAT vlse32_v_f32m1
-#define VSEV_FLOAT vse32_v_f32m1
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1
-#define VSSEG2_FLOAT vsseg2e32_v_f32m1
-#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1
-#define VSSEG4_FLOAT vsseg4e32_v_f32m1
-#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1
-#define VSSEG8_FLOAT vsseg8e32_v_f32m1
+#define VSETVL(n)               __riscv_vsetvl_e32m1(n)
+#define FLOAT_V_T               vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m1
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m1
+#define VSEV_FLOAT              __riscv_vse32_v_f32m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1
 #else
-#define VSETVL(n) vsetvl_e64m1(n)
-#define FLOAT_V_T vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m1
-#define VLSEV_FLOAT vlse64_v_f64m1
-#define VSEV_FLOAT vse64_v_f64m1
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1
-#define VSSEG2_FLOAT vsseg2e64_v_f64m1
-#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1
-#define VSSEG4_FLOAT vsseg4e64_v_f64m1
-#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1
-#define VSSEG8_FLOAT vsseg8e64_v_f64m1
+#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
+#define FLOAT_V_T               vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m1
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m1
+#define VSEV_FLOAT              __riscv_vse64_v_f64m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c
index a291b70b8..c5fb6479f 100644
--- a/kernel/riscv64/gemm_tcopy_rvv_v1.c
+++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c
@@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c
index 5cd509f93..471b3158f 100644
--- a/kernel/riscv64/gemmkernel_rvv_v1x8.c
+++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c
@@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VFMVVF_FLOAT vfmv_v_f_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
 #endif
 
 int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c
index 9d2dee615..1366eb5ad 100644
--- a/kernel/riscv64/gemv_n_rvv.c
+++ b/kernel/riscv64/gemv_n_rvv.c
@@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
-#define VFMACCVF_FLOAT vfmacc_vf_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
-#define VFMACCVF_FLOAT vfmacc_vf_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c
index a80af81b6..f0c834866 100644
--- a/kernel/riscv64/gemv_t_rvv.c
+++ b/kernel/riscv64/gemv_t_rvv.c
@@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -63,7 +63,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
     FLOAT_V_T va, vx, vr;
     FLOAT_V_T_M1 v_res, v_z0;
     size_t vlmax = VSETVL_MAX_M1;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax);
     v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
     vlmax = VSETVL_MAX;
 
@@ -83,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
                 vr = VFMACCVV_FLOAT(vr, va, vx, vl);
             }
 
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
             *y += alpha * VFMVFS_FLOAT_M1(v_res);
             y += inc_y;
             a += lda;
@@ -107,7 +106,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
                 vr = VFMACCVV_FLOAT(vr, va, vx, vl);
             }
 
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
             *y += alpha * VFMVFS_FLOAT_M1(v_res);
             y += inc_y;
             a += lda;
diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c
index 8b33b3bcb..ef7850a55 100644
--- a/kernel/riscv64/iamax_rvv.c
+++ b/kernel/riscv64/iamax_rvv.c
@@ -28,57 +28,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFABSV_FLOAT vfabs_v_f64m8
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
-#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f64m8_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint64m8_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_m
+#define VIDV_UINT               __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_m
+#define VADDVX_UINT             __riscv_vadd_vx_u64m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64
 #else
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFABSV_FLOAT vfabs_v_f32m8
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VFIRSTM vfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
-#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
+#define MASK_T                  vbool4_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f32m8_b4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
+#define VFIRSTM                 __riscv_vfirst_m_b4
+#define UINT_V_T                vuint32m8_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_m
+#define VIDV_UINT               __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_m
+#define VADDVX_UINT             __riscv_vadd_vx_u32m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32
 #endif
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -106,8 +106,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
 
             //update v_max
             v_max = VFMAXVV_FLOAT(v_max, vx, vl);
@@ -125,8 +125,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
 
             //update v_max
             v_max = VFMAXVV_FLOAT(v_max, vx, vl);
@@ -134,16 +134,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
   
     }
 
-    FLOAT_V_T_M1 v_res, v_z0;
+    FLOAT_V_T_M1 v_res;
     v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
 
-    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax);
+    v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
     maxf = VFMVFS_FLOAT_M1(v_res);
     mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
     max_index = VFIRSTM(mask, vlmax);
     
-    v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax);
     max_index = VMVVXS_UINT(v_max_index);
 
     return(max_index+1);
diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c
index 585b37186..56a086fed 100644
--- a/kernel/riscv64/iamin_rvv.c
+++ b/kernel/riscv64/iamin_rvv.c
@@ -29,57 +29,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFABSV_FLOAT vfabs_v_f64m8
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
-#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f64m8_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint64m8_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_m
+#define VIDV_UINT               __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_m
+#define VADDVX_UINT             __riscv_vadd_vx_u64m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64
 #else
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFABSV_FLOAT vfabs_v_f32m8
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFIRSTM vfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
-#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
+#define MASK_T                  vbool4_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f32m8_b4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
+#define VFIRSTM                 __riscv_vfirst_m_b4
+#define UINT_V_T                vuint32m8_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_m
+#define VIDV_UINT               __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_m
+#define VADDVX_UINT             __riscv_vadd_vx_u32m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32
 #endif
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -107,8 +107,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx, vl);
@@ -126,8 +126,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx, vl);
@@ -135,16 +135,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
   
     }
 
-    FLOAT_V_T_M1 v_res, v_max;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
 
-    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
+    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
     minf = VFMVFS_FLOAT_M1(v_res);
     mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
     min_index = VFIRSTM(mask, vlmax);
 
-    v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax);
     min_index = VMVVXS_UINT(v_min_index);
 
     return(min_index+1);
diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c
index d84ad968e..5b60a56f7 100644
--- a/kernel/riscv64/imax_rvv.c
+++ b/kernel/riscv64/imax_rvv.c
@@ -29,55 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
-#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f64m8_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint64m8_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_m
+#define VIDV_UINT               __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_m
+#define VADDVX_UINT             __riscv_vadd_vx_u64m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64
 #else
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VFIRSTM vfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
-#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
+#define MASK_T                  vbool4_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f32m8_b4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
+#define VFIRSTM                 __riscv_vfirst_m_b4
+#define UINT_V_T                vuint32m8_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_m
+#define VIDV_UINT               __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_m
+#define VADDVX_UINT             __riscv_vadd_vx_u32m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32
 #endif
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
 
             //update v_max and start_index j
             v_max = VFMAXVV_FLOAT(v_max, vx, vl);
@@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
 
             //update v_max and start_index j
             v_max = VFMAXVV_FLOAT(v_max, vx, vl);
@@ -131,16 +131,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
   
     }
 
-    FLOAT_V_T_M1 v_res, v_min;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_min = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax);
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax);
 
-    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, vlmax);
+    v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
     maxf = VFMVFS_FLOAT_M1(v_res);
     mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
     max_index = VFIRSTM(mask, vlmax);
     
-    v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax);
     max_index = VMVVXS_UINT(v_max_index);
 
     return(max_index+1);
diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c
index fb734f6f8..b49544a1b 100644
--- a/kernel/riscv64/imin_rvv.c
+++ b/kernel/riscv64/imin_rvv.c
@@ -29,55 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m8
-#define VMVVXS_UINT vmv_x_s_u64m8_u64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f64m8_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint64m8_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_m
+#define VIDV_UINT               __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_m
+#define VADDVX_UINT             __riscv_vadd_vx_u64m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64
 #else
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFIRSTM vfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m8
-#define VMVVXS_UINT vmv_x_s_u32m8_u32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
+#define MASK_T                  vbool4_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f32m8_b4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
+#define VFIRSTM                 __riscv_vfirst_m_b4
+#define UINT_V_T                vuint32m8_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_m
+#define VIDV_UINT               __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_m
+#define VADDVX_UINT             __riscv_vadd_vx_u32m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32
 #endif
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx, vl);
@@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx, vl);
@@ -131,16 +131,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
   
     }
 
-    FLOAT_V_T_M1 v_res, v_max;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
 
-    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
+    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
     minf = VFMVFS_FLOAT_M1(v_res);
     mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
     min_index = VFIRSTM(mask, vlmax);
 
-    v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax);
     min_index = VMVVXS_UINT(v_min_index);
 
     return(min_index+1);
diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c
index 9cb332cbb..e61d0cbec 100644
--- a/kernel/riscv64/izamax_rvv.c
+++ b/kernel/riscv64/izamax_rvv.c
@@ -28,63 +28,63 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
-#define MASK_T vbool16_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
-#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
-#define VMFGEVF_FLOAT vmfge_vf_f64m4_b16
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFABSV_FLOAT vfabs_v_f64m4
-#define VFMAXVV_FLOAT vfmax_vv_f64m4
-#define VFADDVV_FLOAT vfadd_vv_f64m4
-#define VFIRSTM vfirst_m_b16
-#define UINT_V_T vuint64m4_t
-#define VIDV_MASK_UINT vid_v_u64m4_m
-#define VIDV_UINT vid_v_u64m4
-#define VADDVX_MASK_UINT vadd_vx_u64m4_m
-#define VADDVX_UINT vadd_vx_u64m4
-#define VMVVX_UINT vmv_v_x_u64m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
-#define VMVVXS_UINT vmv_x_s_u64m4_u64
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
+#define MASK_T                  vbool16_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m4_b16
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m4_b16
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f64m4_b16
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFIRSTM                 __riscv_vfirst_m_b16
+#define UINT_V_T                vuint64m4_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m4_m
+#define VIDV_UINT               __riscv_vid_v_u64m4
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m4_m
+#define VADDVX_UINT             __riscv_vadd_vx_u64m4
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m4
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m4_u64
 #else
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
-#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
-#define VMFGEVF_FLOAT vmfge_vf_f32m4_b8
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFABSV_FLOAT vfabs_v_f32m4
-#define VFMAXVV_FLOAT vfmax_vv_f32m4
-#define VFADDVV_FLOAT vfadd_vv_f32m4
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint32m4_t
-#define VIDV_MASK_UINT vid_v_u32m4_m
-#define VIDV_UINT vid_v_u32m4
-#define VADDVX_MASK_UINT vadd_vx_u32m4_m
-#define VADDVX_UINT vadd_vx_u32m4
-#define VMVVX_UINT vmv_v_x_u32m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
-#define VMVVXS_UINT vmv_x_s_u32m4_u32
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m4_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m4_b8
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f32m4_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint32m4_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m4_m
+#define VIDV_UINT               __riscv_vid_v_u32m4
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m4_m
+#define VADDVX_UINT             __riscv_vadd_vx_u32m4
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m4
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m4_u32
 #endif
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -116,8 +116,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx0, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
 
             //update v_max and start_index j
             v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
@@ -138,24 +138,23 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx0, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
         
             //update v_max and start_index j
             v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
         }
 
     }
-    FLOAT_V_T_M1 v_res, v_z0;
+    FLOAT_V_T_M1 v_res;
     v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
 
-    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, vlmax);
+    v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
     maxf = VFMVFS_FLOAT_M1(v_res);
     mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
     max_index = VFIRSTM(mask, vlmax);
     
-    v_max_index = VSLIDEDOWN_UINT(v_max_index, v_max_index, max_index, vlmax);
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax);
     max_index = VMVVXS_UINT(v_max_index);
 
     return(max_index+1);
diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c
index 69771e5aa..297b3c99a 100644
--- a/kernel/riscv64/izamin_rvv.c
+++ b/kernel/riscv64/izamin_rvv.c
@@ -29,59 +29,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if defined(DOUBLE)
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
-#define MASK_T vbool16_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
-#define VMFLTVV_FLOAT vmflt_vv_f64m4_b16
-#define VMFLEVF_FLOAT vmfle_vf_f64m4_b16
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFABSV_FLOAT vfabs_v_f64m4
-#define VFMINVV_FLOAT vfmin_vv_f64m4
-#define VFADDVV_FLOAT vfadd_vv_f64m4
-#define VFIRSTM vfirst_m_b16
-#define UINT_V_T vuint64m4_t
-#define VIDV_MASK_UINT vid_v_u64m4_m
-#define VIDV_UINT vid_v_u64m4
-#define VADDVX_MASK_UINT vadd_vx_u64m4_m
-#define VADDVX_UINT vadd_vx_u64m4
-#define VMVVX_UINT vmv_v_x_u64m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VSLIDEDOWN_UINT vslidedown_vx_u64m4
-#define VMVVXS_UINT vmv_x_s_u64m4_u64
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m4_f64m1
+#define MASK_T                  vbool16_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m4_b16
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m4_b16
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f64m4_b16
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFIRSTM                 __riscv_vfirst_m_b16
+#define UINT_V_T                vuint64m4_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m4_m
+#define VIDV_UINT               __riscv_vid_v_u64m4
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m4_m
+#define VADDVX_UINT             __riscv_vadd_vx_u64m4
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m4
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m4_u64
 #else
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
-#define VMFLTVV_FLOAT vmflt_vv_f32m4_b8
-#define VMFLEVF_FLOAT vmfle_vf_f32m4_b8
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFABSV_FLOAT vfabs_v_f32m4
-#define VFMINVV_FLOAT vfmin_vv_f32m4
-#define VFADDVV_FLOAT vfadd_vv_f32m4
-#define VFIRSTM vfirst_m_b8
-#define UINT_V_T vuint32m4_t
-#define VIDV_MASK_UINT vid_v_u32m4_m
-#define VIDV_UINT vid_v_u32m4
-#define VADDVX_MASK_UINT vadd_vx_u32m4_m
-#define VADDVX_UINT vadd_vx_u32m4
-#define VMVVX_UINT vmv_v_x_u32m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VSLIDEDOWN_UINT vslidedown_vx_u32m4
-#define VMVVXS_UINT vmv_x_s_u32m4_u32
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m4_f32m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m4_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m4_b8
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f32m4_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint32m4_t
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m4_m
+#define VIDV_UINT               __riscv_vid_v_u32m4
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m4_m
+#define VADDVX_UINT             __riscv_vadd_vx_u32m4
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m4
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m4_u32
 #endif
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -113,8 +113,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx0, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx0, vl);
@@ -136,8 +136,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx0, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx0, vl);
@@ -145,16 +145,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    FLOAT_V_T_M1 v_res, v_max;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_max = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
 
-    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, vlmax);
+    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
     minf = VFMVFS_FLOAT_M1(v_res);
     mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
     min_index = VFIRSTM(mask, vlmax);
 
-    v_min_index = VSLIDEDOWN_UINT(v_min_index, v_min_index, min_index, vlmax);
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax);
     min_index = VMVVXS_UINT(v_min_index);
 
     return(min_index+1);
diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c
index 5b1380d2b..9315321f4 100644
--- a/kernel/riscv64/max_rvv.c
+++ b/kernel/riscv64/max_rvv.c
@@ -29,31 +29,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
+    v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax);
     maxf = VFMVFS_FLOAT_M1(v_res);
 
     return(maxf);
diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c
index bddcc0ba7..158b682fd 100644
--- a/kernel/riscv64/min_rvv.c
+++ b/kernel/riscv64/min_rvv.c
@@ -29,31 +29,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
+    v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
     minf = VFMVFS_FLOAT_M1(v_res);
 
     return(minf);
diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c
index 979c31648..42abfa119 100644
--- a/kernel/riscv64/nrm2_rvv.c
+++ b/kernel/riscv64/nrm2_rvv.c
@@ -29,30 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <math.h>
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #define ABS fabsf
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #define ABS fabs
 #endif
 
@@ -95,7 +95,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         }
     }
 
-    v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);
+    v_res = VFREDSUM_FLOAT(vr, v_res, vlmax);
 
     ssq = VFMVFS_FLOAT_M1(v_res);
 
diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c
index 7bf5e4270..90f81d5e2 100644
--- a/kernel/riscv64/rot_rvv.c
+++ b/kernel/riscv64/rot_rvv.c
@@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
-#define VFMACCVF_FLOAT vfmacc_vf_f32m8
-#define VFMULVF_FLOAT vfmul_vf_f32m8
-#define VFMSACVF_FLOAT vfmsac_vf_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
-#define VFMACCVF_FLOAT vfmacc_vf_f64m8
-#define VFMULVF_FLOAT vfmul_vf_f64m8
-#define VFMSACVF_FLOAT vfmsac_vf_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m8
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c
index d2c0378bf..2e2cfd31e 100644
--- a/kernel/riscv64/scal_rvv.c
+++ b/kernel/riscv64/scal_rvv.c
@@ -28,23 +28,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
-#define VFMULVF_FLOAT vfmul_vf_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
-#define VFMULVF_FLOAT vfmul_vf_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c
index 1db0d09dd..9715faf22 100644
--- a/kernel/riscv64/sum_rvv.c
+++ b/kernel/riscv64/sum_rvv.c
@@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -89,7 +89,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    v_res = VFREDSUMVS_FLOAT(v_res, vsum, v_res, vlmax);
+    v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax);
     sumf = VFMVFS_FLOAT_M1(v_res);
     return(sumf);
 }
diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c
index 2cf92f6ad..893d70554 100644
--- a/kernel/riscv64/swap_rvv.c
+++ b/kernel/riscv64/swap_rvv.c
@@ -28,23 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VSEV_FLOAT vse32_v_f32m8
-#define VSSEV_FLOAT vsse32_v_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VSEV_FLOAT vse64_v_f64m8
-#define VSSEV_FLOAT vsse64_v_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c
index f0def9617..a615db44d 100644
--- a/kernel/riscv64/symm_lcopy_rvv_v1.c
+++ b/kernel/riscv64/symm_lcopy_rvv_v1.c
@@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define INT_V_T     vint32m2_t
-#define VID_V_INT   vid_v_i32m2
-#define VADD_VX_INT vadd_vx_i32m2
-#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
-#define VBOOL_T     vbool16_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define INT_V_T     vint64m2_t
-#define VID_V_INT   vid_v_i64m2
-#define VADD_VX_INT vadd_vx_i64m2
-#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
-#define VBOOL_T     vbool32_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
 #endif
 
 // Optimizes the implementation in ../generic/symm_lcopy_4.c
@@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
             vindex = VADD_VX_INT(vindex_max, offset, vl);
             vbool  = VMSGT_VX_INT(vindex, 0, vl);
 
-            vb =  VMERGE_VVM_FLOAT(vbool, va2, va1, vl);
+            vb =  VMERGE_VVM_FLOAT(va2, va1, vbool, vl);
             VSEV_FLOAT(b, vb, vl);
 
             b += vl;
diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c
index 958506df3..464f97b3a 100644
--- a/kernel/riscv64/symm_ucopy_rvv_v1.c
+++ b/kernel/riscv64/symm_ucopy_rvv_v1.c
@@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define INT_V_T     vint32m2_t
-#define VID_V_INT   vid_v_i32m2
-#define VADD_VX_INT vadd_vx_i32m2
-#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
-#define VBOOL_T     vbool16_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define INT_V_T     vint64m2_t
-#define VID_V_INT   vid_v_i64m2
-#define VADD_VX_INT vadd_vx_i64m2
-#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
-#define VBOOL_T     vbool32_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
 #endif
 
 // Optimizes the implementation in ../generic/symm_ucopy_4.c
@@ -87,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
             vindex = VADD_VX_INT(vindex_max, offset, vl);
             vbool  = VMSGT_VX_INT(vindex, 0, vl);
 
-            vb =  VMERGE_VVM_FLOAT(vbool, va2, va1, vl);
+            vb =  VMERGE_VVM_FLOAT(va2, va1, vbool, vl);
             VSEV_FLOAT(b, vb, vl);
 
             b += vl;
diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c
index 737abaae3..e87ab22ae 100644
--- a/kernel/riscv64/symv_L_rvv.c
+++ b/kernel/riscv64/symv_L_rvv.c
@@ -28,43 +28,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL_MAX_M1       vsetvlmax_e32m1()
-#define VSETVL(n)           vsetvl_e32m8(n)
-#define VSETVL_MAX          vsetvlmax_e32m8()
-#define FLOAT_V_T_M1        vfloat32m1_t
-#define FLOAT_V_T           vfloat32m8_t
-#define VLEV_FLOAT          vle32_v_f32m8
-#define VSEV_FLOAT          vse32_v_f32m8
-#define VLSEV_FLOAT         vlse32_v_f32m8
-#define VSSEV_FLOAT         vsse32_v_f32m8
-#define VFMACCVV_FLOAT      vfmacc_vv_f32m8
-#define VFMACCVF_FLOAT      vfmacc_vf_f32m8
-#define VFNMSACVF_FLOAT     vfnmsac_vf_f32m8
-#define VFMULVF_FLOAT       vfmul_vf_f32m8
-#define VFMVVF_FLOAT        vfmv_v_f_f32m8
-#define VFMSACVF_FLOAT      vfmsac_vf_f32m8
-#define VFMVVF_FLOAT_M1     vfmv_v_f_f32m1
-#define VFREDSUM_FLOAT      vfredusum_vs_f32m8_f32m1
-#define VFMVFS_FLOAT_M1     vfmv_f_s_f32m1_f32
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL_MAX_M1       vsetvlmax_e64m1()
-#define VSETVL(n)           vsetvl_e64m8(n)
-#define VSETVL_MAX          vsetvlmax_e64m8()
-#define FLOAT_V_T_M1        vfloat64m1_t
-#define FLOAT_V_T           vfloat64m8_t
-#define VLEV_FLOAT          vle64_v_f64m8
-#define VSEV_FLOAT          vse64_v_f64m8
-#define VLSEV_FLOAT         vlse64_v_f64m8
-#define VSSEV_FLOAT         vsse64_v_f64m8
-#define VFMACCVV_FLOAT      vfmacc_vv_f64m8
-#define VFMACCVF_FLOAT      vfmacc_vf_f64m8
-#define VFNMSACVF_FLOAT     vfnmsac_vf_f64m8
-#define VFMULVF_FLOAT       vfmul_vf_f64m8
-#define VFMVVF_FLOAT        vfmv_v_f_f64m8
-#define VFMSACVF_FLOAT      vfmsac_vf_f64m8
-#define VFMVVF_FLOAT_M1     vfmv_v_f_f64m1
-#define VFREDSUM_FLOAT      vfredusum_vs_f64m8_f64m1
-#define VFMVFS_FLOAT_M1     vfmv_f_s_f64m1_f64
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -77,7 +77,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
         FLOAT_V_T_M1 v_res, v_z0;
         size_t vlmax = VSETVL_MAX_M1, vl;
-        v_res = VFMVVF_FLOAT_M1(0, vlmax);
         v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
         vlmax = VSETVL_MAX;
 
@@ -105,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vr = VFMACCVV_FLOAT(vr, vx, va, vl);
 
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
 
                         y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
                         a_ptr += lda;
@@ -137,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                 iy += inc_yv;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
 
                         y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
                         jy    += inc_y;
@@ -172,7 +171,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 ix += inc_xv;
                         }
 
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
 
                         y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
                         jx    += inc_x;
@@ -211,7 +210,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 ix += inc_xv;
                                 iy += inc_yv;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
 
                         y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
                         jx    += inc_x;
diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c
index cb923be5d..3fbc33c89 100644
--- a/kernel/riscv64/symv_U_rvv.c
+++ b/kernel/riscv64/symv_U_rvv.c
@@ -29,43 +29,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL_MAX_M1       vsetvlmax_e32m1()
-#define VSETVL(n)           vsetvl_e32m8(n)
-#define VSETVL_MAX          vsetvlmax_e32m8()
-#define FLOAT_V_T_M1        vfloat32m1_t
-#define FLOAT_V_T           vfloat32m8_t
-#define VLEV_FLOAT          vle32_v_f32m8
-#define VSEV_FLOAT          vse32_v_f32m8
-#define VLSEV_FLOAT         vlse32_v_f32m8
-#define VSSEV_FLOAT         vsse32_v_f32m8
-#define VFMACCVV_FLOAT      vfmacc_vv_f32m8
-#define VFMACCVF_FLOAT      vfmacc_vf_f32m8
-#define VFNMSACVF_FLOAT     vfnmsac_vf_f32m8
-#define VFMULVF_FLOAT       vfmul_vf_f32m8
-#define VFMVVF_FLOAT        vfmv_v_f_f32m8
-#define VFMSACVF_FLOAT      vfmsac_vf_f32m8
-#define VFMVVF_FLOAT_M1     vfmv_v_f_f32m1
-#define VFREDSUM_FLOAT      vfredusum_vs_f32m8_f32m1
-#define VFMVFS_FLOAT_M1     vfmv_f_s_f32m1_f32
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL_MAX_M1       vsetvlmax_e64m1()
-#define VSETVL(n)           vsetvl_e64m8(n)
-#define VSETVL_MAX          vsetvlmax_e64m8()
-#define FLOAT_V_T_M1        vfloat64m1_t
-#define FLOAT_V_T           vfloat64m8_t
-#define VLEV_FLOAT          vle64_v_f64m8
-#define VSEV_FLOAT          vse64_v_f64m8
-#define VLSEV_FLOAT         vlse64_v_f64m8
-#define VSSEV_FLOAT         vsse64_v_f64m8
-#define VFMACCVV_FLOAT      vfmacc_vv_f64m8
-#define VFMACCVF_FLOAT      vfmacc_vf_f64m8
-#define VFNMSACVF_FLOAT     vfnmsac_vf_f64m8
-#define VFMULVF_FLOAT       vfmul_vf_f64m8
-#define VFMVVF_FLOAT        vfmv_v_f_f64m8
-#define VFMSACVF_FLOAT      vfmsac_vf_f64m8
-#define VFMVVF_FLOAT_M1     vfmv_v_f_f64m1
-#define VFREDSUM_FLOAT      vfredusum_vs_f64m8_f64m1
-#define VFMVFS_FLOAT_M1     vfmv_f_s_f64m1_f64
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -77,7 +77,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
         FLOAT *a_ptr = a;
         FLOAT_V_T_M1 v_res, v_z0;
         size_t vl_max = VSETVL_MAX_M1, vl;
-        v_res = VFMVVF_FLOAT_M1(0, vl_max);
         v_z0 = VFMVVF_FLOAT_M1(0, vl_max);
         vl_max = VSETVL_MAX;
 
@@ -105,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vx = VLEV_FLOAT(&x[i], vl);
                                 vr = VFMACCVV_FLOAT(vr, vx, va, vl);
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
 
                         y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
                         a_ptr += lda;
@@ -137,7 +136,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                 iy += inc_yv;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
 
                         y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
                         a_ptr += lda;
@@ -171,7 +170,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                 ix += inc_xv;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
 
                         y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
                         a_ptr += lda;
@@ -209,7 +208,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 ix += inc_xv;
                                 iy += inc_yv;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vl_max);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
 
                         y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
                         a_ptr += lda;
diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c
index 3457ca3e1..4135a9b62 100644
--- a/kernel/riscv64/trmm_lncopy_rvv_v1.c
+++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c
@@ -30,29 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
-#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
 #endif
 
 // Optimizes the implementation in ../arm64/tmmm_lncopy_sve_v1.c
@@ -116,10 +116,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 {
                     va1 = VLSEV_FLOAT(ao, stride_lda, vl);
                     vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
-                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+                    vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
 #ifdef UNIT
                     vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
-                    vb =  VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+                    vb =  VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl);
 #endif
                     VSEV_FLOAT(b, vb, vl);
                     ao++;
diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c
index 2fe8cf79e..580714fde 100644
--- a/kernel/riscv64/trmm_ltcopy_rvv_v1.c
+++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c
@@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
-#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
 #endif
 
 // Optimizes the implementation in ../arm64/tmmm_ltcopy_sve_v1.c
@@ -111,10 +111,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 {
                     va1 = VLEV_FLOAT(ao, vl);
                     vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
-                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+                    vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
 #ifdef UNIT
                     vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
-                    vb =  VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+                    vb =  VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl);
 #endif
                     VSEV_FLOAT(b, vb, vl);
                     ao += lda;
diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c
index b64cd840d..852ab7f11 100644
--- a/kernel/riscv64/trmm_uncopy_rvv_v1.c
+++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c
@@ -30,29 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T    vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
-#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
 #endif
 
 // Optimizes the implementation in ../arm64/tmmm_uncopy_sve_v1.c
@@ -114,10 +114,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 {
                     va1 = VLSEV_FLOAT(ao, stride_lda, vl);
                     vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
-                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+                    vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
 #ifdef UNIT
                     vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
-                    vb =  VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+                    vb =  VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl);
 #endif
                     VSEV_FLOAT(b, vb, vl);
                     ao++;
diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c
index b96daae5b..e0b6d362d 100644
--- a/kernel/riscv64/trmm_utcopy_rvv_v1.c
+++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c
@@ -32,27 +32,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
-#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
 #endif
 
 // Optimizes the implementation in ../arm64/tmmm_utcopy_sve_v1.c
@@ -113,10 +113,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 {
                     va1 = VLEV_FLOAT(ao, vl);
                     vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
-                    vb = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+                    vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
 #ifdef UNIT
                     vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
-                    vb =  VFMERGE_VFM_FLOAT(vbool_eq, vb, ONE, vl);
+                    vb =  VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl);
 #endif
                     VSEV_FLOAT(b, vb, vl);
                     ao += lda;
diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c
index 97b14650c..393b24bce 100644
--- a/kernel/riscv64/trmmkernel_rvv_v1x8.c
+++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c
@@ -28,21 +28,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VFMVVF_FLOAT vfmv_v_f_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
 #endif
 
 
diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
index 2cba06b38..886af0c3b 100644
--- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
@@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VSSEV_FLOAT vsse32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
-#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VSSEV_FLOAT vsse64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
-#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
 #endif
 
 
diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
index 492a5631f..ddeef966c 100644
--- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
@@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VSSEV_FLOAT vsse32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
-#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VSSEV_FLOAT vsse64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
-#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
 #endif
 
 
diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
index 4751ae012..4c83bbaa3 100644
--- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
@@ -28,34 +28,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSSEV_FLOAT vsse32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
-#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSSEV_FLOAT vsse64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
-#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
 #endif
 
 static FLOAT dm1 = -1.;
diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
index 93a9e6916..b368eefb9 100644
--- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
@@ -28,28 +28,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
-#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
-#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
 #endif
 
 
diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c
index bacfb2b08..41c84be25 100644
--- a/kernel/riscv64/trsm_lncopy_rvv_v1.c
+++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c
@@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VSEV_FLOAT_M vse32_v_f32m2_m
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_M            __riscv_vse32_v_f32m2_m
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VSEV_FLOAT_M vse64_v_f64m2_m
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSEV_FLOAT_M            __riscv_vse64_v_f64m2_m
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
 
 #endif
 
diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c
index 0fc7c9f24..003bd3465 100644
--- a/kernel/riscv64/trsm_ltcopy_rvv_v1.c
+++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c
@@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VSEV_FLOAT_M vse32_v_f32m2_m
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_M            __riscv_vse32_v_f32m2_m
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VSEV_FLOAT_M vse64_v_f64m2_m
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSEV_FLOAT_M            __riscv_vse64_v_f64m2_m
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
 #endif
 
 #ifndef UNIT
diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c
index ee869a795..6cca5d49c 100644
--- a/kernel/riscv64/trsm_uncopy_rvv_v1.c
+++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c
@@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VSEV_FLOAT_M vse32_v_f32m2_m
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_M            __riscv_vse32_v_f32m2_m
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VSEV_FLOAT_M vse64_v_f64m2_m
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSEV_FLOAT_M            __riscv_vse64_v_f64m2_m
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
 #endif
 
 
diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c
index a324b0fa6..bc058525f 100644
--- a/kernel/riscv64/trsm_utcopy_rvv_v1.c
+++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c
@@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VSEV_FLOAT_M vse32_v_f32m2_m
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_M            __riscv_vse32_v_f32m2_m
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VSEV_FLOAT_M vse64_v_f64m2_m
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSEV_FLOAT_M            __riscv_vse64_v_f64m2_m
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
 #endif
 
 
diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c
index 1917042be..615b7519c 100644
--- a/kernel/riscv64/zamax_rvv.c
+++ b/kernel/riscv64/zamax_rvv.c
@@ -29,35 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m4
-#define VFADDVV_FLOAT vfadd_vv_f32m4
-#define VFABSV_FLOAT vfabs_v_f32m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m4
-#define VFADDVV_FLOAT vfadd_vv_f64m4
-#define VFABSV_FLOAT vfabs_v_f64m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -106,7 +106,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    v_res = VFREDMAXVS_FLOAT(v_res, vmax, v_res, vlmax);
+    v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax);
     maxf = VFMVFS_FLOAT_M1(v_res);
 
     return(maxf);
diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c
index 3f027383a..a0d36d46f 100644
--- a/kernel/riscv64/zamin_rvv.c
+++ b/kernel/riscv64/zamin_rvv.c
@@ -29,35 +29,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <float.h>
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m4_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m4
-#define VFADDVV_FLOAT vfadd_vv_f32m4
-#define VFABSV_FLOAT vfabs_v_f32m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m4_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m4_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m4
-#define VFADDVV_FLOAT vfadd_vv_f64m4
-#define VFABSV_FLOAT vfabs_v_f64m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m4_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -105,7 +105,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    v_res = VFREDMINVS_FLOAT(v_res, vmin, v_res, vlmax);
+    v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
     minf = VFMVFS_FLOAT_M1(v_res);
 
     return(minf);
diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c
index 7876646b3..1d2f0e1fe 100644
--- a/kernel/riscv64/zasum_rvv.c
+++ b/kernel/riscv64/zasum_rvv.c
@@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m8()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m8
-#define VLSEV_FLOAT vlse32_v_f32m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-#define VFABSV_FLOAT vfabs_v_f32m8
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m8
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m8()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m8
-#define VLSEV_FLOAT vlse64_v_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VFADDVV_FLOAT vfadd_vv_f64m8
-#define VFABSV_FLOAT vfabs_v_f64m8
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m8
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -99,9 +99,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
     FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax);
+    v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax);
     asumf += VFMVFS_FLOAT_M1(v_res);
 
     return(asumf);
diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c
index 66f52d9d0..e0da55311 100644
--- a/kernel/riscv64/zaxpby_rvv.c
+++ b/kernel/riscv64/zaxpby_rvv.c
@@ -33,33 +33,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n)       vsetvl_e32m4(n)
-#define FLOAT_V_T       vfloat32m4_t
-#define VLSEV_FLOAT     vlse32_v_f32m4
-#define VSSEV_FLOAT     vsse32_v_f32m4
-#define VFMACCVF_FLOAT  vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
-#define VFMVVF_FLOAT    vfmv_v_f_f32m4
-#define VFMULVF_FLOAT   vfmul_vf_f32m4
-#define VFMSACVF_FLOAT  vfmsac_vf_f32m4
-#define VLSEG_FLOAT     vlseg2e32_v_f32m4
-#define VSSEG_FLOAT     vsseg2e32_v_f32m4
-#define VLSSEG_FLOAT    vlsseg2e32_v_f32m4
-#define VSSSEG_FLOAT    vssseg2e32_v_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
 #else
-#define VSETVL(n)       vsetvl_e64m4(n)
-#define FLOAT_V_T       vfloat64m4_t
-#define VLSEV_FLOAT     vlse64_v_f64m4
-#define VSSEV_FLOAT     vsse64_v_f64m4
-#define VFMACCVF_FLOAT  vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
-#define VFMVVF_FLOAT    vfmv_v_f_f64m4
-#define VFMULVF_FLOAT   vfmul_vf_f64m4
-#define VFMSACVF_FLOAT  vfmsac_vf_f64m4
-#define VLSEG_FLOAT     vlseg2e64_v_f64m4
-#define VSSEG_FLOAT     vsseg2e64_v_f64m4
-#define VLSSEG_FLOAT    vlsseg2e64_v_f64m4
-#define VSSSEG_FLOAT    vssseg2e64_v_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
 #endif
 
 int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c
index 777bcb728..3f75898e0 100644
--- a/kernel/riscv64/zaxpy_rvv.c
+++ b/kernel/riscv64/zaxpy_rvv.c
@@ -28,23 +28,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VSSEG_FLOAT vsseg2e32_v_f32m4
-#define VSSSEG_FLOAT vssseg2e32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VSSEG_FLOAT vsseg2e64_v_f64m4
-#define VSSSEG_FLOAT vssseg2e64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c
index 5d8322bbb..bd94810ce 100644
--- a/kernel/riscv64/zcopy_rvv.c
+++ b/kernel/riscv64/zcopy_rvv.c
@@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL_M8(n) vsetvl_e32m8(n)
-#define FLOAT_V_T_M8 vfloat32m8_t
-#define VLEV_FLOAT_M8 vle32_v_f32m8
-#define VSEV_FLOAT_M8 vse32_v_f32m8
-
-#define VSETVL_M4(n) vsetvl_e32m4(n)
-#define FLOAT_V_T_M4 vfloat32m4_t
-#define VLSEG_FLOAT_M4 vlseg2e32_v_f32m4
-#define VSSEG_FLOAT_M4 vsseg2e32_v_f32m4
-#define VLSSEG_FLOAT_M4 vlsseg2e32_v_f32m4
-#define VSSSEG_FLOAT_M4 vssseg2e32_v_f32m4
+#define VSETVL_M8(n)            __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T_M8            vfloat32m8_t
+#define VLEV_FLOAT_M8           __riscv_vle32_v_f32m8
+#define VSEV_FLOAT_M8           __riscv_vse32_v_f32m8
+
+#define VSETVL_M4(n)            __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T_M4            vfloat32m4_t
+#define VLSEG_FLOAT_M4          __riscv_vlseg2e32_v_f32m4
+#define VSSEG_FLOAT_M4          __riscv_vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT_M4         __riscv_vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT_M4         __riscv_vssseg2e32_v_f32m4
 #else
-#define VSETVL_M8(n) vsetvl_e64m8(n)
-#define FLOAT_V_T_M8 vfloat64m8_t
-#define VLEV_FLOAT_M8 vle64_v_f64m8
-#define VSEV_FLOAT_M8 vse64_v_f64m8
-
-#define VSETVL_M4(n) vsetvl_e64m4(n)
-#define FLOAT_V_T_M4 vfloat64m4_t
-#define VLSEG_FLOAT_M4 vlseg2e64_v_f64m4
-#define VSSEG_FLOAT_M4 vsseg2e64_v_f64m4
-#define VLSSEG_FLOAT_M4 vlsseg2e64_v_f64m4
-#define VSSSEG_FLOAT_M4 vssseg2e64_v_f64m4
+#define VSETVL_M8(n)            __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T_M8            vfloat64m8_t
+#define VLEV_FLOAT_M8           __riscv_vle64_v_f64m8
+#define VSEV_FLOAT_M8           __riscv_vse64_v_f64m8
+
+#define VSETVL_M4(n)            __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T_M4            vfloat64m4_t
+#define VLSEG_FLOAT_M4          __riscv_vlseg2e64_v_f64m4
+#define VSSEG_FLOAT_M4          __riscv_vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT_M4         __riscv_vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT_M4         __riscv_vssseg2e64_v_f64m4
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c
index 7eae6f608..1543c513d 100644
--- a/kernel/riscv64/zdot_rvv.c
+++ b/kernel/riscv64/zdot_rvv.c
@@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFMSACVV_FLOAT vfmsac_vv_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT           __riscv_vfmul_vv_f32m4
+#define VFMSACVV_FLOAT          __riscv_vfmsac_vv_f32m4
+#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFMSACVV_FLOAT vfmsac_vv_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT           __riscv_vfmul_vv_f64m4
+#define VFMSACVV_FLOAT          __riscv_vfmsac_vv_f64m4
+#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
@@ -72,7 +72,6 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
     FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1;
     FLOAT_V_T_M1 v_res, v_z0;
     size_t vlmax_m1 = VSETVL_MAX_M1;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax_m1);
     v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1);
 
     size_t vlmax = VSETVL_MAX;
@@ -161,9 +160,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
         }
     }
 
-    v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, vlmax);
+    v_res = VFREDSUM_FLOAT(vr0, v_z0, vlmax);
     CREAL(result) = VFMVFS_FLOAT_M1(v_res);
-    v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, vlmax);
+    v_res = VFREDSUM_FLOAT(vr1, v_z0, vlmax);
     CIMAG(result) = VFMVFS_FLOAT_M1(v_res);
  
    return(result);
diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c
index a89752d18..b94b5f4bf 100644
--- a/kernel/riscv64/zgemm_beta_rvv.c
+++ b/kernel/riscv64/zgemm_beta_rvv.c
@@ -39,23 +39,23 @@
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VSSEG_FLOAT vsseg2e32_v_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFADDVV_FLOAT vfadd_vv_f32m4
-#define VFSUBVV_FLOAT vfsub_vv_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFSUBVV_FLOAT           __riscv_vfsub_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VSSEG_FLOAT vsseg2e64_v_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFADDVV_FLOAT vfadd_vv_f64m4
-#define VFSUBVV_FLOAT vfsub_vv_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFSUBVV_FLOAT           __riscv_vfsub_vv_f64m4
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c
index 389ee5d57..d50a4b8d5 100644
--- a/kernel/riscv64/zgemm_ncopy_4_rvv.c
+++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c
@@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m1(n)
-#define FLOAT_V_T vfloat32m1_t
-#define VLSEG2_FLOAT vlseg2e32_v_f32m1
-#define VSSEG2_FLOAT vsseg2e32_v_f32m1
-#define VSSEG4_FLOAT vsseg4e32_v_f32m1
-#define VSSEG8_FLOAT vsseg8e32_v_f32m1
+#define VSETVL(n)               __riscv_vsetvl_e32m1(n)
+#define FLOAT_V_T               vfloat32m1_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1
 #else
-#define VSETVL(n) vsetvl_e64m1(n)
-#define FLOAT_V_T vfloat64m1_t
-#define VLSEG2_FLOAT vlseg2e64_v_f64m1
-#define VSSEG2_FLOAT vsseg2e64_v_f64m1
-#define VSSEG4_FLOAT vsseg4e64_v_f64m1
-#define VSSEG8_FLOAT vsseg8e64_v_f64m1
+#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
+#define FLOAT_V_T               vfloat64m1_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1
 #endif
 
 // Optimizes the implementation in ../generic/zgemm_ncopy_4.c
diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c
index df039bab6..1d3b8d3b7 100644
--- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c
+++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c
@@ -29,15 +29,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c
index 1b34039c8..8c35b5616 100644
--- a/kernel/riscv64/zgemm_tcopy_4_rvv.c
+++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c
@@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m1(n)
-#define FLOAT_V_T vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m1
-#define VSEV_FLOAT vse32_v_f32m1
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m1
-#define VLSSEG4_FLOAT vlsseg4e32_v_f32m1
-#define VLSSEG8_FLOAT vlsseg8e32_v_f32m1
-#define VSSEG2_FLOAT vsseg2e32_v_f32m1
-#define VSSEG4_FLOAT vsseg4e32_v_f32m1
-#define VSSEG8_FLOAT vsseg8e32_v_f32m1
+#define VSETVL(n)               __riscv_vsetvl_e32m1(n)
+#define FLOAT_V_T               vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m1
+#define VSEV_FLOAT              __riscv_vse32_v_f32m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1
 #else
-#define VSETVL(n) vsetvl_e64m1(n)
-#define FLOAT_V_T vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m1
-#define VSEV_FLOAT vse64_v_f64m1
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m1
-#define VLSSEG4_FLOAT vlsseg4e64_v_f64m1
-#define VLSSEG8_FLOAT vlsseg8e64_v_f64m1
-#define VSSEG2_FLOAT vsseg2e64_v_f64m1
-#define VSSEG4_FLOAT vsseg4e64_v_f64m1
-#define VSSEG8_FLOAT vsseg8e64_v_f64m1
+#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
+#define FLOAT_V_T               vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m1
+#define VSEV_FLOAT              __riscv_vse64_v_f64m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c
index 7622fb810..7a085269c 100644
--- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c
+++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c
@@ -28,15 +28,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c
index 50e29222f..41399cf79 100644
--- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c
+++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c
@@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VFMVVF_FLOAT vfmv_v_f_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
 #endif
 
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c
index 2eeb61b45..4a40c30a7 100644
--- a/kernel/riscv64/zgemv_n_rvv.c
+++ b/kernel/riscv64/zgemv_n_rvv.c
@@ -28,31 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VSSEG_FLOAT vsseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VSSSEG_FLOAT vssseg2e32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT              __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VSSEG_FLOAT vsseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VSSSEG_FLOAT vssseg2e64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT              __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c
index b682d5cd8..15795cc3a 100644
--- a/kernel/riscv64/zgemv_t_rvv.c
+++ b/kernel/riscv64/zgemv_t_rvv.c
@@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m4
+#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f32m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT           __riscv_vfmul_vv_f32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m4
+#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f64m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT           __riscv_vfmul_vv_f64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -73,7 +73,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
     BLASLONG lda2 = lda * 2;
 
     size_t vlmax = VSETVL_MAX_M1;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax);
     v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
     vlmax = VSETVL(m);
 
@@ -105,9 +104,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 ix += vl * inc_x * 2;
             }
             
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
             temp_r = VFMVFS_FLOAT_M1(v_res);
-            v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax);
             temp_i = VFMVFS_FLOAT_M1(v_res);
 
 #if !defined(XCONJ)
@@ -149,9 +148,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 ix += vl * inc_x * 2;
             }
             
-            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
             temp_r = VFMVFS_FLOAT_M1(v_res);
-            v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax);
             temp_i = VFMVFS_FLOAT_M1(v_res);
     
 #if !defined(XCONJ)
diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
index cf466d3fa..79b20a646 100644
--- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
+++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
@@ -28,45 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define INT_V_T     vint32m2_t
-#define VID_V_INT   vid_v_i32m2
-#define VADD_VX_INT vadd_vx_i32m2
-#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2
-#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
-#define VMSLT_VX_INT vmslt_vx_i32m2_b16
-#define VMSEQ_VX_INT vmseq_vx_i32m2_b16
-#define VBOOL_T     vbool16_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
-#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VFRSUB_VF_FLOAT         __riscv_vfrsub_vf_f32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VMSLT_VX_INT            __riscv_vmslt_vx_i32m2_b16
+#define VMSEQ_VX_INT            __riscv_vmseq_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define INT_V_T     vint64m2_t
-#define VID_V_INT   vid_v_i64m2
-#define VADD_VX_INT vadd_vx_i64m2
-#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2
-#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
-#define VMSLT_VX_INT vmslt_vx_i64m2_b32
-#define VMSEQ_VX_INT vmseq_vx_i64m2_b32
-#define VBOOL_T     vbool32_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VFRSUB_VF_FLOAT         __riscv_vfrsub_vf_f64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VMSLT_VX_INT            __riscv_vmslt_vx_i64m2_b32
+#define VMSEQ_VX_INT            __riscv_vmseq_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
 #endif
 
 
@@ -104,13 +104,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
             vbool_lt0  = VMSLT_VX_INT(vindex, 0, vl);
             vbool_eq0  = VMSEQ_VX_INT(vindex, 0, vl);
 
-            vb0 =  VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl);
-            vb1 =  VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl);
+            vb0 =  VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl);
+            vb1 =  VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl);
 
             vb2 = VFRSUB_VF_FLOAT(vb1, ZERO, vl);
 
-            vb1 =  VMERGE_VVM_FLOAT(vbool_lt0, vb1, vb2, vl);
-            vb1 =  VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl);
             VSSEG2_FLOAT(b, vb0, vb1, vl);
 
             b   += vl * 2;
diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c
index 6209f5417..a86815275 100644
--- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c
+++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c
@@ -28,45 +28,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define INT_V_T     vint32m2_t
-#define VID_V_INT   vid_v_i32m2
-#define VADD_VX_INT vadd_vx_i32m2
-#define VFRSUB_VF_FLOAT vfrsub_vf_f32m2
-#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
-#define VMSLT_VX_INT vmslt_vx_i32m2_b16
-#define VMSEQ_VX_INT vmseq_vx_i32m2_b16
-#define VBOOL_T     vbool16_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
-#define VFMVVF_FLOAT vfmv_v_f_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VFRSUB_VF_FLOAT         __riscv_vfrsub_vf_f32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VMSLT_VX_INT            __riscv_vmslt_vx_i32m2_b16
+#define VMSEQ_VX_INT            __riscv_vmseq_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define INT_V_T     vint64m2_t
-#define VID_V_INT   vid_v_i64m2
-#define VADD_VX_INT vadd_vx_i64m2
-#define VFRSUB_VF_FLOAT vfrsub_vf_f64m2
-#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
-#define VMSLT_VX_INT vmslt_vx_i64m2_b32
-#define VMSEQ_VX_INT vmseq_vx_i64m2_b32
-#define VBOOL_T     vbool32_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VFRSUB_VF_FLOAT         __riscv_vfrsub_vf_f64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VMSLT_VX_INT            __riscv_vmslt_vx_i64m2_b32
+#define VMSEQ_VX_INT            __riscv_vmseq_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
 #endif
 
 
@@ -101,13 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
             vbool_gt0  = VMSGT_VX_INT(vindex, 0, vl);
             vbool_eq0  = VMSEQ_VX_INT(vindex, 0, vl);
 
-            vb0 =  VMERGE_VVM_FLOAT(vbool_gt0, va20, va10, vl);
-            vb1 =  VMERGE_VVM_FLOAT(vbool_gt0, va21, va11, vl);
+            vb0 =  VMERGE_VVM_FLOAT(va20, va10, vbool_gt0, vl);
+            vb1 =  VMERGE_VVM_FLOAT(va21, va11, vbool_gt0, vl);
 
             vb2 =  VFRSUB_VF_FLOAT(vb1, ZERO, vl);
 
-            vb1 =  VMERGE_VVM_FLOAT(vbool_gt0, vb1, vb2, vl);
-            vb1 =  VMERGE_VVM_FLOAT(vbool_eq0, vb1, vzero, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl);
+            vb1 =  VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl);
             VSSEG2_FLOAT(b, vb0, vb1, vl);
 
             b   += vl * 2;
diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c
index 921ddb8cb..5f7873b5a 100644
--- a/kernel/riscv64/znrm2_rvv.c
+++ b/kernel/riscv64/znrm2_rvv.c
@@ -28,35 +28,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VFABSV_FLOAT vfabs_v_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VFABSV_FLOAT vfabs_v_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
 #endif
 
 // TODO: Should single precision use the widening MAC, or perhaps all should be double? 
@@ -85,10 +85,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
 
-            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
+            v_max = VFREDMAXVS_FLOAT(v0, v_max, vl);
             vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
 
-            v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl);
+            v_max = VFREDMAXVS_FLOAT(v1, v_max, vl);
             vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
         }
 
@@ -103,16 +103,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
 
-            v_max = VFREDMAXVS_FLOAT(v_max, v0, v_max, vl);
+            v_max = VFREDMAXVS_FLOAT(v0, v_max, vl);
             vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
 
-            v_max = VFREDMAXVS_FLOAT(v_max, v1, v_max, vl);
+            v_max = VFREDMAXVS_FLOAT(v1, v_max, vl);
             vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
         }
 
     }
 
-    v_res = VFREDSUM_FLOAT(v_res, vr, v_res, vlmax);
+    v_res = VFREDSUM_FLOAT(vr, v_res, vlmax);
 
     ssq = VFMVFS_FLOAT_M1(v_res);
     scale = VFMVFS_FLOAT_M1(v_max);
diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c
index 68066a00b..ee81bfe91 100644
--- a/kernel/riscv64/zrot_rvv.c
+++ b/kernel/riscv64/zrot_rvv.c
@@ -28,33 +28,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle32_v_f32m4
-#define VLSEV_FLOAT vlse32_v_f32m4
-#define VSEV_FLOAT vse32_v_f32m4
-#define VSSEV_FLOAT vsse32_v_f32m4
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VSSEG_FLOAT vsseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VSSSEG_FLOAT vssseg2e32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT              __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle64_v_f64m4
-#define VLSEV_FLOAT vlse64_v_f64m4
-#define VSEV_FLOAT vse64_v_f64m4
-#define VSSEV_FLOAT vsse64_v_f64m4
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VSSEG_FLOAT vsseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VSSSEG_FLOAT vssseg2e64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT              __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c
index 079c36a2d..779fab68c 100644
--- a/kernel/riscv64/zscal_rvv.c
+++ b/kernel/riscv64/zscal_rvv.c
@@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define FLOAT_V_T vfloat32m4_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VSSEG_FLOAT vsseg2e32_v_f32m4
-#define VSSSEG_FLOAT vssseg2e32_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T               vfloat32m4_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define FLOAT_V_T vfloat64m4_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VSSEG_FLOAT vsseg2e64_v_f64m4
-#define VSSSEG_FLOAT vssseg2e64_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T               vfloat64m4_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c
index 3928fbe27..44df112c6 100644
--- a/kernel/riscv64/zsum_rvv.c
+++ b/kernel/riscv64/zsum_rvv.c
@@ -28,29 +28,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m4()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
-#define VFADDVV_FLOAT vfadd_vv_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m4()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
-#define VFADDVV_FLOAT vfadd_vv_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -88,9 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     }
 
-    FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
     FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, vlmax);
+    v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax);
     sumf += VFMVFS_FLOAT_M1(v_res);
 
     return(sumf);
diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c
index 86f9103d3..17b7b9f43 100644
--- a/kernel/riscv64/zswap_rvv.c
+++ b/kernel/riscv64/zswap_rvv.c
@@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLSEG_FLOAT vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT vlsseg2e32_v_f32m4
-#define VSSEG_FLOAT vsseg2e32_v_f32m4
-#define VSSSEG_FLOAT vssseg2e32_v_f32m4
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLSEG_FLOAT vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT vlsseg2e64_v_f64m4
-#define VSSEG_FLOAT vsseg2e64_v_f64m4
-#define VSSSEG_FLOAT vssseg2e64_v_f64m4
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c
index df5c916a5..0f9e04869 100644
--- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c
+++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c
@@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define INT_V_T     vint32m2_t
-#define VID_V_INT   vid_v_i32m2
-#define VADD_VX_INT vadd_vx_i32m2
-#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
-#define VBOOL_T     vbool16_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define INT_V_T     vint64m2_t
-#define VID_V_INT   vid_v_i64m2
-#define VADD_VX_INT vadd_vx_i64m2
-#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
-#define VBOOL_T     vbool32_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
@@ -91,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
             vindex = VADD_VX_INT(vindex_max, offset, vl);
             vbool  = VMSGT_VX_INT(vindex, 0, vl);
 
-            vb0 =  VMERGE_VVM_FLOAT(vbool, va20, va10, vl);
-            vb1 =  VMERGE_VVM_FLOAT(vbool, va21, va11, vl);
+            vb0 =  VMERGE_VVM_FLOAT(va20, va10, vbool, vl);
+            vb1 =  VMERGE_VVM_FLOAT(va21, va11, vbool, vl);
             VSSEG2_FLOAT(b, vb0, vb1, vl);
 
             b   += vl * 2;
diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c
index dcf2b081a..fdc693700 100644
--- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c
+++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c
@@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define INT_V_T     vint32m2_t
-#define VID_V_INT   vid_v_i32m2
-#define VADD_VX_INT vadd_vx_i32m2
-#define VMSGT_VX_INT vmsgt_vx_i32m2_b16
-#define VBOOL_T     vbool16_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define INT_V_T     vint64m2_t
-#define VID_V_INT   vid_v_i64m2
-#define VADD_VX_INT vadd_vx_i64m2
-#define VMSGT_VX_INT vmsgt_vx_i64m2_b32
-#define VBOOL_T     vbool32_t
-#define VMERGE_VVM_FLOAT  vmerge_vvm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
 #endif
 
 
@@ -92,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
             vindex = VADD_VX_INT(vindex_max, offset, vl);
             vbool  = VMSGT_VX_INT(vindex, 0, vl);
 
-            vb0 =  VMERGE_VVM_FLOAT(vbool, va20, va10, vl);
-            vb1 =  VMERGE_VVM_FLOAT(vbool, va21, va11, vl);
+            vb0 =  VMERGE_VVM_FLOAT(va20, va10, vbool, vl);
+            vb1 =  VMERGE_VVM_FLOAT(va21, va11, vbool, vl);
             VSSEG2_FLOAT(b, vb0, vb1, vl);
 
             b   += vl * 2;
diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
index afd694408..7276618c5 100644
--- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
+++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
@@ -30,35 +30,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vint32m2_t
-#define VID_V_UINT vid_v_i32m2
-#define VMSGTU_VX_UINT vmsgt_vx_i32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_i32m2_b16
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vint32m2_t
+#define VID_V_UINT              __riscv_vid_v_i32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgt_vx_i32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_i32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
-#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
@@ -121,12 +121,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 {
                     VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
                     vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
-                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
-                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+                    va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
+                    va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
 #ifdef UNIT
                     vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
-                    va0 =  VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
-                    va1 =  VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
+                    va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);
+                    va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);
 #endif
                     VSSEG2_FLOAT(b, va0, va1, vl);
                     ao  += 2;
diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
index c7d593949..72e8f2ce2 100644
--- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
+++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
@@ -30,33 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
-#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
@@ -117,14 +117,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                     //va1 = VLEV_FLOAT(ao, vl);
                     VLSEG2_FLOAT(&va0, &va1, ao, vl);
                     vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
-                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
-                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+                    va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
+                    va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
 #ifdef UNIT
                     vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
-                    va0 =  VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
-                    va1 =  VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
+                    va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);
+                    va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);
 #endif
-                    //VSEV_FLOAT(b, vb, vl);
                     VSSEG2_FLOAT(b, va0, va1, vl);
                     ao  += lda * 2;
                     b   += vl * 2;
diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
index 3c70b6385..e6d36c86d 100644
--- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
+++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
@@ -30,35 +30,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VLSEV_FLOAT vlse32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VLSEV_FLOAT vlse64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T    vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
-#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
@@ -120,12 +120,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 {
                     VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
                     vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
-                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
-                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+                    va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
+                    va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
 #ifdef UNIT
                     vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
-                    va0 =  VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
-                    va1 =  VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
+                    va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);
+                    va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);
 #endif
                     VSSEG2_FLOAT(b, va0, va1, vl);
                     ao  += 2;
diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
index 706782cf0..7085cfc37 100644
--- a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
+++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
@@ -32,33 +32,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
-#define VMSEQ_VX_UINT vmseq_vx_u32m2_b16
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
-#define VMSEQ_VX_UINT vmseq_vx_u64m2_b32
-#define VFMERGE_VFM_FLOAT  vfmerge_vfm_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
@@ -117,12 +117,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 {
                     VLSEG2_FLOAT(&va0, &va1, ao, vl);
                     vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
-                    va0 = VFMERGE_VFM_FLOAT(vbool_cmp, va0, ZERO, vl);
-                    va1 = VFMERGE_VFM_FLOAT(vbool_cmp, va1, ZERO, vl);
+                    va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
+                    va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
 #ifdef UNIT
                     vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);
-                    va0 =  VFMERGE_VFM_FLOAT(vbool_eq, va0, ONE, vl);
-                    va1 =  VFMERGE_VFM_FLOAT(vbool_eq, va1, ZERO, vl);
+                    va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);
+                    va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);
 #endif
                     VSSEG2_FLOAT(b, va0, va1, vl);
                     ao += lda * 2;
diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c
index 3486a4648..399124d2e 100644
--- a/kernel/riscv64/ztrmmkernel_2x2_rvv.c
+++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c
@@ -28,37 +28,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define VSETVL_MAX vsetvlmax_e32m2()
-#define VSETVL_MAX_M1 vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m2_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VLSEG4_FLOAT vlseg4e32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VFMVVF_FLOAT vfmv_v_f_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFMACCVV_FLOAT vfmacc_vv_f32m2
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m2
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VLSEG4_FLOAT            __riscv_vlseg4e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m2
+#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f32m2
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m2_f32m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define VSETVL_MAX vsetvlmax_e64m2()
-#define VSETVL_MAX_M1 vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m2_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VLSEG4_FLOAT vlseg4e64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFMACCVV_FLOAT vfmacc_vv_f64m2
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m2
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VLSEG4_FLOAT            __riscv_vlseg4e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m2
+#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f64m2
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m2_f64m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
 // Optimizes the implementation in ../generic/ztrmmkernel_2x2.c
diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
index 27409ec25..92b4b855b 100644
--- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
+++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
@@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLEV_FLOAT vle32_v_f32m2
-#define VSEV_FLOAT vse32_v_f32m2
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VFMVVF_FLOAT vfmv_v_f_f32m2
-#define VFMACCVF_FLOAT vfmacc_vf_f32m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m2
-#define VFMULVF_FLOAT vfmul_vf_f32m2
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLEV_FLOAT vle64_v_f64m2
-#define VSEV_FLOAT vse64_v_f64m2
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VFMVVF_FLOAT vfmv_v_f_f64m2
-#define VFMACCVF_FLOAT vfmacc_vf_f64m2
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m2
-#define VFMULVF_FLOAT vfmul_vf_f64m2
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
 #endif
 
 #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
index b7ccb1eb3..383cb883f 100644
--- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
+++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
@@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2_m
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2_m
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
 
 #endif
 
diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
index 911b81de5..f57e9f1de 100644
--- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
+++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
@@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2_m
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2_m
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
index db075c29b..be3613429 100644
--- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
+++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
@@ -30,25 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLSSEG2_FLOAT vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u32m2_b16
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2_m
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLSSEG2_FLOAT vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSGTU_VX_UINT vmsgtu_vx_u64m2_b32
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2_m
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
 #endif
 
 
diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
index e121c6273..b1f5ef8f0 100644
--- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
+++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
@@ -29,25 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m2(n)
-#define FLOAT_V_T vfloat32m2_t
-#define VLSEG2_FLOAT vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT vsseg2e32_v_f32m2
-#define VSSEG2_FLOAT_M vsseg2e32_v_f32m2_m
-#define VBOOL_T vbool16_t
-#define UINT_V_T vuint32m2_t
-#define VID_V_UINT vid_v_u32m2
-#define VMSLTU_VX_UINT vmsltu_vx_u32m2_b16
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2_m
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
 #else
-#define VSETVL(n) vsetvl_e64m2(n)
-#define FLOAT_V_T vfloat64m2_t
-#define VLSEG2_FLOAT vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT vsseg2e64_v_f64m2
-#define VSSEG2_FLOAT_M vsseg2e64_v_f64m2_m
-#define VBOOL_T     vbool32_t
-#define UINT_V_T     vuint64m2_t
-#define VID_V_UINT   vid_v_u64m2
-#define VMSLTU_VX_UINT vmsltu_vx_u64m2_b32
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2_m
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
 #endif
 
 

From 6b74bee2f9d7272f1932a9ba9bbd1bda6c122fbf Mon Sep 17 00:00:00 2001
From: Heller Zheng <heller.zheng@microchip.com>
Date: Mon, 27 Mar 2023 18:59:24 -0700
Subject: [PATCH 013/718] Update TARGET=x280 description.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6ecb46178..1f1c0f3ed 100644
--- a/README.md
+++ b/README.md
@@ -186,7 +186,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   ```
   (also known to work on C906)
 
-- **x280**: LLVM auto-vectorization using RISC-V Vector extension 1.0.
+- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
   ```sh
   make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
   ```

From 281e834566a06f1c756d262dc31e809faaf8933f Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Thu, 30 Mar 2023 15:15:25 -0400
Subject: [PATCH 014/718] do not pass -j flag to the MAKE variable

---
 getarch.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/getarch.c b/getarch.c
index 937a8db68..87384c084 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n");
 
 #ifdef MAKE_NB_JOBS
   #if MAKE_NB_JOBS > 0
-    printf("MAKE += -j %d\n", MAKE_NB_JOBS);
+    printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS);
   #else
     // Let make use parent -j argument or -j1 if there
     // is no make parent
   #endif
 #elif NO_PARALLEL_MAKE==1
-    printf("MAKE += -j 1\n");
+    printf("MAKEFLAGS += -j 1\n");
 #else
-    printf("MAKE += -j %d\n", get_num_cores());
+    printf("MAKEFLAGS += -j %d\n", get_num_cores());
 #endif
 
     break;

From 397108fba299c87ce17957452d57469af914f516 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Fri, 31 Mar 2023 09:22:40 -0400
Subject: [PATCH 015/718] serialize shared prerequisites

---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 144b3400d..3c4b8948a 100644
--- a/Makefile
+++ b/Makefile
@@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
 
 .PHONY : all libs netlib $(RELA) test ctest shared install
-.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
+.NOTPARALLEL : shared
 
-all :: libs netlib $(RELA) tests shared
+all :: tests
 	@echo
 	@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
 	@echo
@@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT)
 endif
 endif
 
-tests : libs netlib $(RELA) shared
+tests : shared
 ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	touch $(LIBNAME)
 ifndef NO_FBLAS

From 3e8f51e7cf49903216e8f92ff6c9cdf7bcf4886f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 1 Apr 2023 22:25:07 +0200
Subject: [PATCH 016/718] Update version to 0.3.23.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 502bf7a9d..35fd830ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 23)
+set(OpenBLAS_PATCH_VERSION 23.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

From 516f22b8ca61ff9976afa7cf07c79ebca9b94fa8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 1 Apr 2023 22:25:55 +0200
Subject: [PATCH 017/718] Update version to 0.3.23.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index ab46fd075..e210e49e8 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.23
+VERSION = 0.3.23.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From 65b7bf9f3e3437e8887630e477c2abd749a7f0ac Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 3 Apr 2023 10:51:38 +0200
Subject: [PATCH 018/718] Add Apple M1 testing via Cirrus CI

---
 cirrus.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 cirrus.yml

diff --git a/cirrus.yml b/cirrus.yml
new file mode 100644
index 000000000..d16eb811a
--- /dev/null
+++ b/cirrus.yml
@@ -0,0 +1,10 @@
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+
+task:
+  compile_script:
+  - brew install llvm
+  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+  - make TARGET=VORTEX USE_OPENMP=1 CC=clang

From 5f1fb27c40baa7b2ae105baf933cb6a86670aac2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 3 Apr 2023 11:00:17 +0200
Subject: [PATCH 019/718] Rename cirrus.yml to .cirrus.yml

---
 cirrus.yml => .cirrus.yml | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename cirrus.yml => .cirrus.yml (100%)

diff --git a/cirrus.yml b/.cirrus.yml
similarity index 100%
rename from cirrus.yml
rename to .cirrus.yml

From d175b8f56fc770d888f515b1a79417ec6ca9ff85 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 3 Apr 2023 15:02:10 +0200
Subject: [PATCH 020/718] Refactor ?GEBAL (Reference-LAPACK PR 808)

---
 lapack-netlib/SRC/cgebal.f | 312 ++++++++++++++++++++-----------------
 lapack-netlib/SRC/dgebal.f | 300 ++++++++++++++++++-----------------
 lapack-netlib/SRC/sgebal.f | 307 +++++++++++++++++++-----------------
 lapack-netlib/SRC/zgebal.f | 309 +++++++++++++++++++-----------------
 4 files changed, 654 insertions(+), 574 deletions(-)

diff --git a/lapack-netlib/SRC/cgebal.f b/lapack-netlib/SRC/cgebal.f
index 5d1ebb026..3f54d3937 100644
--- a/lapack-netlib/SRC/cgebal.f
+++ b/lapack-netlib/SRC/cgebal.f
@@ -85,6 +85,7 @@
 *> \verbatim
 *>          ILO is INTEGER
 *> \endverbatim
+*>
 *> \param[out] IHI
 *> \verbatim
 *>          IHI is INTEGER
@@ -154,6 +155,9 @@
 *>
 *>  Modified by Tzu-Yi Chen, Computer Science Division, University of
 *>    California at Berkeley, USA
+*>
+*>  Refactored by Evert Provoost, Department of Computer Science,
+*>    KU Leuven, Belgium
 *> \endverbatim
 *>
 *  =====================================================================
@@ -183,8 +187,8 @@
       PARAMETER          ( FACTOR = 0.95E+0 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            NOCONV
-      INTEGER            I, ICA, IEXC, IRA, J, K, L, M
+      LOGICAL            NOCONV, CANSWAP
+      INTEGER            I, ICA, IRA, J, K, L
       REAL               C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
      $                   SFMIN2
 *     ..
@@ -195,10 +199,10 @@
       EXTERNAL           SISNAN, LSAME, ICAMAX, SLAMCH, SCNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CSSCAL, CSWAP, XERBLA
+      EXTERNAL           XERBLA, CSSCAL, CSWAP
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          ABS, AIMAG, MAX, MIN, REAL
+      INTRINSIC          ABS, REAL, AIMAG, MAX, MIN
 *
 *     Test the input parameters
 *
@@ -216,176 +220,194 @@
          RETURN
       END IF
 *
-      K = 1
-      L = N
+*     Quick returns.
 *
-      IF( N.EQ.0 )
-     $   GO TO 210
+      IF( N.EQ.0 ) THEN
+         ILO = 1
+         IHI = 0
+         RETURN
+      END IF
 *
       IF( LSAME( JOB, 'N' ) ) THEN
-         DO 10 I = 1, N
+         DO I = 1, N
             SCALE( I ) = ONE
-   10    CONTINUE
-         GO TO 210
+         END DO
+         ILO = 1
+         IHI = N
+         RETURN
       END IF
 *
-      IF( LSAME( JOB, 'S' ) )
-     $   GO TO 120
-*
-*     Permutation to isolate eigenvalues if possible
-*
-      GO TO 50
-*
-*     Row and column exchange.
-*
-   20 CONTINUE
-      SCALE( M ) = J
-      IF( J.EQ.M )
-     $   GO TO 30
-*
-      CALL CSWAP( L, A( 1, J ), 1, A( 1, M ), 1 )
-      CALL CSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA )
-*
-   30 CONTINUE
-      GO TO ( 40, 80 )IEXC
-*
-*     Search for rows isolating an eigenvalue and push them down.
-*
-   40 CONTINUE
-      IF( L.EQ.1 )
-     $   GO TO 210
-      L = L - 1
-*
-   50 CONTINUE
-      DO 70 J = L, 1, -1
+*     Permutation to isolate eigenvalues if possible.
 *
-         DO 60 I = 1, L
-            IF( I.EQ.J )
-     $         GO TO 60
-            IF( REAL( A( J, I ) ).NE.ZERO .OR. AIMAG( A( J, I ) ).NE.
-     $          ZERO )GO TO 70
-   60    CONTINUE
-*
-         M = L
-         IEXC = 1
-         GO TO 20
-   70 CONTINUE
-*
-      GO TO 90
+      K = 1
+      L = N
 *
-*     Search for columns isolating an eigenvalue and push them left.
+      IF( .NOT.LSAME( JOB, 'S' ) ) THEN
 *
-   80 CONTINUE
-      K = K + 1
+*        Row and column exchange.
 *
-   90 CONTINUE
-      DO 110 J = K, L
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for rows isolating an eigenvalue and push them down.
+*
+            NOCONV = .FALSE.
+            DO I = L, 1, -1
+               CANSWAP = .TRUE.
+               DO J = 1, L
+                  IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR.
+     $                AIMAG( A( I, J ) ).NE.ZERO ) ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( L ) = I
+                  IF( I.NE.L ) THEN
+                     CALL CSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
+                     CALL CSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  IF( L.EQ.1 ) THEN
+                     ILO = 1
+                     IHI = 1
+                     RETURN
+                  END IF
+*
+                  L = L - 1
+               END IF
+            END DO
+*
+         END DO
+
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for columns isolating an eigenvalue and push them left.
+*
+            NOCONV = .FALSE.
+            DO J = K, L
+               CANSWAP = .TRUE.
+               DO I = K, L
+                  IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR.
+     $                AIMAG( A( I, J ) ).NE.ZERO ) ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( K ) = J
+                  IF( J.NE.K ) THEN
+                     CALL CSWAP( L, A( 1, J ), 1, A( 1, K ), 1 )
+                     CALL CSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  K = K + 1
+               END IF
+            END DO
+*
+         END DO
 *
-         DO 100 I = K, L
-            IF( I.EQ.J )
-     $         GO TO 100
-            IF( REAL( A( I, J ) ).NE.ZERO .OR. AIMAG( A( I, J ) ).NE.
-     $          ZERO )GO TO 110
-  100    CONTINUE
+      END IF
 *
-         M = K
-         IEXC = 2
-         GO TO 20
-  110 CONTINUE
+*     Initialize SCALE for non-permuted submatrix.
 *
-  120 CONTINUE
-      DO 130 I = K, L
+      DO I = K, L
          SCALE( I ) = ONE
-  130 CONTINUE
+      END DO
 *
-      IF( LSAME( JOB, 'P' ) )
-     $   GO TO 210
+*     If we only had to permute, we are done.
+*
+      IF( LSAME( JOB, 'P' ) ) THEN
+         ILO = K
+         IHI = L
+         RETURN
+      END IF
 *
 *     Balance the submatrix in rows K to L.
 *
-*     Iterative loop for norm reduction
+*     Iterative loop for norm reduction.
 *
       SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' )
       SFMAX1 = ONE / SFMIN1
       SFMIN2 = SFMIN1*SCLFAC
       SFMAX2 = ONE / SFMIN2
-  140 CONTINUE
-      NOCONV = .FALSE.
-*
-      DO 200 I = K, L
-*
-         C = SCNRM2( L-K+1, A( K, I ), 1 )
-         R = SCNRM2( L-K+1, A( I , K ), LDA )
-         ICA = ICAMAX( L, A( 1, I ), 1 )
-         CA = ABS( A( ICA, I ) )
-         IRA = ICAMAX( N-K+1, A( I, K ), LDA )
-         RA = ABS( A( I, IRA+K-1 ) )
-*
-*        Guard against zero C or R due to underflow.
-*
-         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
-     $      GO TO 200
-         G = R / SCLFAC
-         F = ONE
-         S = C + R
-  160    CONTINUE
-         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
-     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
-            IF( SISNAN( C+F+CA+R+G+RA ) ) THEN
 *
-*           Exit if NaN to avoid infinite loop
+      NOCONV = .TRUE.
+      DO WHILE( NOCONV )
+         NOCONV = .FALSE.
 *
-            INFO = -3
-            CALL XERBLA( 'CGEBAL', -INFO )
-            RETURN
-         END IF
-         F = F*SCLFAC
-         C = C*SCLFAC
-         CA = CA*SCLFAC
-         R = R / SCLFAC
-         G = G / SCLFAC
-         RA = RA / SCLFAC
-         GO TO 160
-*
-  170    CONTINUE
-         G = C / SCLFAC
-  180    CONTINUE
-         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
-     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
-         F = F / SCLFAC
-         C = C / SCLFAC
-         G = G / SCLFAC
-         CA = CA / SCLFAC
-         R = R*SCLFAC
-         RA = RA*SCLFAC
-         GO TO 180
-*
-*        Now balance.
-*
-  190    CONTINUE
-         IF( ( C+R ).GE.FACTOR*S )
-     $      GO TO 200
-         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
-            IF( F*SCALE( I ).LE.SFMIN1 )
-     $         GO TO 200
-         END IF
-         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
-            IF( SCALE( I ).GE.SFMAX1 / F )
-     $         GO TO 200
-         END IF
-         G = ONE / F
-         SCALE( I ) = SCALE( I )*F
-         NOCONV = .TRUE.
+         DO I = K, L
 *
-         CALL CSSCAL( N-K+1, G, A( I, K ), LDA )
-         CALL CSSCAL( L, F, A( 1, I ), 1 )
+            C = SCNRM2( L-K+1, A( K, I ), 1 )
+            R = SCNRM2( L-K+1, A( I, K ), LDA )
+            ICA = ICAMAX( L, A( 1, I ), 1 )
+            CA = ABS( A( ICA, I ) )
+            IRA = ICAMAX( N-K+1, A( I, K ), LDA )
+            RA = ABS( A( I, IRA+K-1 ) )
 *
-  200 CONTINUE
+*           Guard against zero C or R due to underflow.
+*
+            IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE
+*
+*           Exit if NaN to avoid infinite loop
 *
-      IF( NOCONV )
-     $   GO TO 140
+            IF( SISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'CGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL CSSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL CSSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
 *
-  210 CONTINUE
       ILO = K
       IHI = L
 *
diff --git a/lapack-netlib/SRC/dgebal.f b/lapack-netlib/SRC/dgebal.f
index 821c7704a..f7b38b378 100644
--- a/lapack-netlib/SRC/dgebal.f
+++ b/lapack-netlib/SRC/dgebal.f
@@ -153,6 +153,9 @@
 *>
 *>  Modified by Tzu-Yi Chen, Computer Science Division, University of
 *>    California at Berkeley, USA
+*>
+*>  Refactored by Evert Provoost, Department of Computer Science,
+*>    KU Leuven, Belgium
 *> \endverbatim
 *>
 *  =====================================================================
@@ -181,8 +184,8 @@
       PARAMETER          ( FACTOR = 0.95D+0 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            NOCONV
-      INTEGER            I, ICA, IEXC, IRA, J, K, L, M
+      LOGICAL            NOCONV, CANSWAP
+      INTEGER            I, ICA, IRA, J, K, L
       DOUBLE PRECISION   C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
      $                   SFMIN2
 *     ..
@@ -214,177 +217,192 @@
          RETURN
       END IF
 *
-      K = 1
-      L = N
+*     Quick returns.
 *
-      IF( N.EQ.0 )
-     $   GO TO 210
+      IF( N.EQ.0 ) THEN
+         ILO = 1
+         IHI = 0
+         RETURN
+      END IF
 *
       IF( LSAME( JOB, 'N' ) ) THEN
-         DO 10 I = 1, N
+         DO I = 1, N
             SCALE( I ) = ONE
-   10    CONTINUE
-         GO TO 210
+         END DO
+         ILO = 1
+         IHI = N
+         RETURN
       END IF
 *
-      IF( LSAME( JOB, 'S' ) )
-     $   GO TO 120
-*
-*     Permutation to isolate eigenvalues if possible
-*
-      GO TO 50
-*
-*     Row and column exchange.
-*
-   20 CONTINUE
-      SCALE( M ) = J
-      IF( J.EQ.M )
-     $   GO TO 30
-*
-      CALL DSWAP( L, A( 1, J ), 1, A( 1, M ), 1 )
-      CALL DSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA )
-*
-   30 CONTINUE
-      GO TO ( 40, 80 )IEXC
-*
-*     Search for rows isolating an eigenvalue and push them down.
-*
-   40 CONTINUE
-      IF( L.EQ.1 )
-     $   GO TO 210
-      L = L - 1
-*
-   50 CONTINUE
-      DO 70 J = L, 1, -1
+*     Permutation to isolate eigenvalues if possible.
 *
-         DO 60 I = 1, L
-            IF( I.EQ.J )
-     $         GO TO 60
-            IF( A( J, I ).NE.ZERO )
-     $         GO TO 70
-   60    CONTINUE
-*
-         M = L
-         IEXC = 1
-         GO TO 20
-   70 CONTINUE
-*
-      GO TO 90
+      K = 1
+      L = N
 *
-*     Search for columns isolating an eigenvalue and push them left.
+      IF( .NOT.LSAME( JOB, 'S' ) ) THEN
 *
-   80 CONTINUE
-      K = K + 1
+*        Row and column exchange.
 *
-   90 CONTINUE
-      DO 110 J = K, L
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for rows isolating an eigenvalue and push them down.
+*
+            NOCONV = .FALSE.
+            DO I = L, 1, -1
+               CANSWAP = .TRUE.
+               DO J = 1, L
+                  IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( L ) = I
+                  IF( I.NE.L ) THEN
+                     CALL DSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
+                     CALL DSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  IF( L.EQ.1 ) THEN
+                     ILO = 1
+                     IHI = 1
+                     RETURN
+                  END IF
+*
+                  L = L - 1
+               END IF
+            END DO
+*
+         END DO
+
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for columns isolating an eigenvalue and push them left.
+*
+            NOCONV = .FALSE.
+            DO J = K, L
+               CANSWAP = .TRUE.
+               DO I = K, L
+                  IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( K ) = J
+                  IF( J.NE.K ) THEN
+                     CALL DSWAP( L, A( 1, J ), 1, A( 1, K ), 1 )
+                     CALL DSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  K = K + 1
+               END IF
+            END DO
+*
+         END DO
 *
-         DO 100 I = K, L
-            IF( I.EQ.J )
-     $         GO TO 100
-            IF( A( I, J ).NE.ZERO )
-     $         GO TO 110
-  100    CONTINUE
+      END IF
 *
-         M = K
-         IEXC = 2
-         GO TO 20
-  110 CONTINUE
+*     Initialize SCALE for non-permuted submatrix.
 *
-  120 CONTINUE
-      DO 130 I = K, L
+      DO I = K, L
          SCALE( I ) = ONE
-  130 CONTINUE
+      END DO
 *
-      IF( LSAME( JOB, 'P' ) )
-     $   GO TO 210
+*     If we only had to permute, we are done.
+*
+      IF( LSAME( JOB, 'P' ) ) THEN
+         ILO = K
+         IHI = L
+         RETURN
+      END IF
 *
 *     Balance the submatrix in rows K to L.
 *
-*     Iterative loop for norm reduction
+*     Iterative loop for norm reduction.
 *
       SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' )
       SFMAX1 = ONE / SFMIN1
       SFMIN2 = SFMIN1*SCLFAC
       SFMAX2 = ONE / SFMIN2
 *
-  140 CONTINUE
-      NOCONV = .FALSE.
+      NOCONV = .TRUE.
+      DO WHILE( NOCONV )
+         NOCONV = .FALSE.
 *
-      DO 200 I = K, L
+         DO I = K, L
 *
-         C = DNRM2( L-K+1, A( K, I ), 1 )
-         R = DNRM2( L-K+1, A( I, K ), LDA )
-         ICA = IDAMAX( L, A( 1, I ), 1 )
-         CA = ABS( A( ICA, I ) )
-         IRA = IDAMAX( N-K+1, A( I, K ), LDA )
-         RA = ABS( A( I, IRA+K-1 ) )
+            C = DNRM2( L-K+1, A( K, I ), 1 )
+            R = DNRM2( L-K+1, A( I, K ), LDA )
+            ICA = IDAMAX( L, A( 1, I ), 1 )
+            CA = ABS( A( ICA, I ) )
+            IRA = IDAMAX( N-K+1, A( I, K ), LDA )
+            RA = ABS( A( I, IRA+K-1 ) )
 *
-*        Guard against zero C or R due to underflow.
+*           Guard against zero C or R due to underflow.
 *
-         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
-     $      GO TO 200
-         G = R / SCLFAC
-         F = ONE
-         S = C + R
-  160    CONTINUE
-         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
-     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
-            IF( DISNAN( C+F+CA+R+G+RA ) ) THEN
+            IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE
 *
 *           Exit if NaN to avoid infinite loop
 *
-            INFO = -3
-            CALL XERBLA( 'DGEBAL', -INFO )
-            RETURN
-         END IF
-         F = F*SCLFAC
-         C = C*SCLFAC
-         CA = CA*SCLFAC
-         R = R / SCLFAC
-         G = G / SCLFAC
-         RA = RA / SCLFAC
-         GO TO 160
-*
-  170    CONTINUE
-         G = C / SCLFAC
-  180    CONTINUE
-         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
-     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
-         F = F / SCLFAC
-         C = C / SCLFAC
-         G = G / SCLFAC
-         CA = CA / SCLFAC
-         R = R*SCLFAC
-         RA = RA*SCLFAC
-         GO TO 180
-*
-*        Now balance.
-*
-  190    CONTINUE
-         IF( ( C+R ).GE.FACTOR*S )
-     $      GO TO 200
-         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
-            IF( F*SCALE( I ).LE.SFMIN1 )
-     $         GO TO 200
-         END IF
-         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
-            IF( SCALE( I ).GE.SFMAX1 / F )
-     $         GO TO 200
-         END IF
-         G = ONE / F
-         SCALE( I ) = SCALE( I )*F
-         NOCONV = .TRUE.
-*
-         CALL DSCAL( N-K+1, G, A( I, K ), LDA )
-         CALL DSCAL( L, F, A( 1, I ), 1 )
-*
-  200 CONTINUE
-*
-      IF( NOCONV )
-     $   GO TO 140
+            IF( DISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'DGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL DSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL DSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
 *
-  210 CONTINUE
       ILO = K
       IHI = L
 *
diff --git a/lapack-netlib/SRC/sgebal.f b/lapack-netlib/SRC/sgebal.f
index f519c8c57..7c115fb6c 100644
--- a/lapack-netlib/SRC/sgebal.f
+++ b/lapack-netlib/SRC/sgebal.f
@@ -153,6 +153,9 @@
 *>
 *>  Modified by Tzu-Yi Chen, Computer Science Division, University of
 *>    California at Berkeley, USA
+*>
+*>  Refactored by Evert Provoost, Department of Computer Science,
+*>    KU Leuven, Belgium
 *> \endverbatim
 *>
 *  =====================================================================
@@ -181,8 +184,8 @@
       PARAMETER          ( FACTOR = 0.95E+0 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            NOCONV
-      INTEGER            I, ICA, IEXC, IRA, J, K, L, M
+      LOGICAL            NOCONV, CANSWAP
+      INTEGER            I, ICA, IRA, J, K, L
       REAL               C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
      $                   SFMIN2
 *     ..
@@ -197,7 +200,7 @@
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN
-*
+*     ..
 *     Test the input parameters
 *
       INFO = 0
@@ -214,176 +217,192 @@
          RETURN
       END IF
 *
-      K = 1
-      L = N
+*     Quick returns.
 *
-      IF( N.EQ.0 )
-     $   GO TO 210
+      IF( N.EQ.0 ) THEN
+         ILO = 1
+         IHI = 0
+         RETURN
+      END IF
 *
       IF( LSAME( JOB, 'N' ) ) THEN
-         DO 10 I = 1, N
+         DO I = 1, N
             SCALE( I ) = ONE
-   10    CONTINUE
-         GO TO 210
+         END DO
+         ILO = 1
+         IHI = N
+         RETURN
       END IF
 *
-      IF( LSAME( JOB, 'S' ) )
-     $   GO TO 120
-*
-*     Permutation to isolate eigenvalues if possible
-*
-      GO TO 50
-*
-*     Row and column exchange.
-*
-   20 CONTINUE
-      SCALE( M ) = J
-      IF( J.EQ.M )
-     $   GO TO 30
-*
-      CALL SSWAP( L, A( 1, J ), 1, A( 1, M ), 1 )
-      CALL SSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA )
-*
-   30 CONTINUE
-      GO TO ( 40, 80 )IEXC
-*
-*     Search for rows isolating an eigenvalue and push them down.
-*
-   40 CONTINUE
-      IF( L.EQ.1 )
-     $   GO TO 210
-      L = L - 1
+*     Permutation to isolate eigenvalues if possible.
 *
-   50 CONTINUE
-      DO 70 J = L, 1, -1
-*
-         DO 60 I = 1, L
-            IF( I.EQ.J )
-     $         GO TO 60
-            IF( A( J, I ).NE.ZERO )
-     $         GO TO 70
-   60    CONTINUE
-*
-         M = L
-         IEXC = 1
-         GO TO 20
-   70 CONTINUE
-*
-      GO TO 90
+      K = 1
+      L = N
 *
-*     Search for columns isolating an eigenvalue and push them left.
+      IF( .NOT.LSAME( JOB, 'S' ) ) THEN
 *
-   80 CONTINUE
-      K = K + 1
+*        Row and column exchange.
 *
-   90 CONTINUE
-      DO 110 J = K, L
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for rows isolating an eigenvalue and push them down.
+*
+            NOCONV = .FALSE.
+            DO I = L, 1, -1
+               CANSWAP = .TRUE.
+               DO J = 1, L
+                  IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( L ) = I
+                  IF( I.NE.L ) THEN
+                     CALL SSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
+                     CALL SSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  IF( L.EQ.1 ) THEN
+                     ILO = 1
+                     IHI = 1
+                     RETURN
+                  END IF
+*
+                  L = L - 1
+               END IF
+            END DO
+*
+         END DO
+
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for columns isolating an eigenvalue and push them left.
+*
+            NOCONV = .FALSE.
+            DO J = K, L
+               CANSWAP = .TRUE.
+               DO I = K, L
+                  IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( K ) = J
+                  IF( J.NE.K ) THEN
+                     CALL SSWAP( L, A( 1, J ), 1, A( 1, K ), 1 )
+                     CALL SSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  K = K + 1
+               END IF
+            END DO
+*
+         END DO
 *
-         DO 100 I = K, L
-            IF( I.EQ.J )
-     $         GO TO 100
-            IF( A( I, J ).NE.ZERO )
-     $         GO TO 110
-  100    CONTINUE
+      END IF
 *
-         M = K
-         IEXC = 2
-         GO TO 20
-  110 CONTINUE
+*     Initialize SCALE for non-permuted submatrix.
 *
-  120 CONTINUE
-      DO 130 I = K, L
+      DO I = K, L
          SCALE( I ) = ONE
-  130 CONTINUE
+      END DO
 *
-      IF( LSAME( JOB, 'P' ) )
-     $   GO TO 210
+*     If we only had to permute, we are done.
+*
+      IF( LSAME( JOB, 'P' ) ) THEN
+         ILO = K
+         IHI = L
+         RETURN
+      END IF
 *
 *     Balance the submatrix in rows K to L.
 *
-*     Iterative loop for norm reduction
+*     Iterative loop for norm reduction.
 *
       SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' )
       SFMAX1 = ONE / SFMIN1
       SFMIN2 = SFMIN1*SCLFAC
       SFMAX2 = ONE / SFMIN2
-  140 CONTINUE
-      NOCONV = .FALSE.
-*
-      DO 200 I = K, L
-*
-         C = SNRM2( L-K+1, A( K, I ), 1 )
-         R = SNRM2( L-K+1, A( I, K ), LDA )
-         ICA = ISAMAX( L, A( 1, I ), 1 )
-         CA = ABS( A( ICA, I ) )
-         IRA = ISAMAX( N-K+1, A( I, K ), LDA )
-         RA = ABS( A( I, IRA+K-1 ) )
-*
-*        Guard against zero C or R due to underflow.
-*
-         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
-     $      GO TO 200
-         G = R / SCLFAC
-         F = ONE
-         S = C + R
-  160    CONTINUE
-         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
-     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
-         F = F*SCLFAC
-         C = C*SCLFAC
-         CA = CA*SCLFAC
-         R = R / SCLFAC
-         G = G / SCLFAC
-         RA = RA / SCLFAC
-         GO TO 160
-*
-  170    CONTINUE
-         G = C / SCLFAC
-  180    CONTINUE
-         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
-     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
-            IF( SISNAN( C+F+CA+R+G+RA ) ) THEN
 *
-*           Exit if NaN to avoid infinite loop
+      NOCONV = .TRUE.
+      DO WHILE( NOCONV )
+         NOCONV = .FALSE.
 *
-            INFO = -3
-            CALL XERBLA( 'SGEBAL', -INFO )
-            RETURN
-         END IF
-         F = F / SCLFAC
-         C = C / SCLFAC
-         G = G / SCLFAC
-         CA = CA / SCLFAC
-         R = R*SCLFAC
-         RA = RA*SCLFAC
-         GO TO 180
-*
-*        Now balance.
-*
-  190    CONTINUE
-         IF( ( C+R ).GE.FACTOR*S )
-     $      GO TO 200
-         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
-            IF( F*SCALE( I ).LE.SFMIN1 )
-     $         GO TO 200
-         END IF
-         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
-            IF( SCALE( I ).GE.SFMAX1 / F )
-     $         GO TO 200
-         END IF
-         G = ONE / F
-         SCALE( I ) = SCALE( I )*F
-         NOCONV = .TRUE.
+         DO I = K, L
+*
+            C = SNRM2( L-K+1, A( K, I ), 1 )
+            R = SNRM2( L-K+1, A( I, K ), LDA )
+            ICA = ISAMAX( L, A( 1, I ), 1 )
+            CA = ABS( A( ICA, I ) )
+            IRA = ISAMAX( N-K+1, A( I, K ), LDA )
+            RA = ABS( A( I, IRA+K-1 ) )
 *
-         CALL SSCAL( N-K+1, G, A( I, K ), LDA )
-         CALL SSCAL( L, F, A( 1, I ), 1 )
+*           Guard against zero C or R due to underflow.
 *
-  200 CONTINUE
+            IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE
+*
+*           Exit if NaN to avoid infinite loop
 *
-      IF( NOCONV )
-     $   GO TO 140
+            IF( SISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'SGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL SSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL SSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
 *
-  210 CONTINUE
       ILO = K
       IHI = L
 *
diff --git a/lapack-netlib/SRC/zgebal.f b/lapack-netlib/SRC/zgebal.f
index d4a9e39f1..a467991d4 100644
--- a/lapack-netlib/SRC/zgebal.f
+++ b/lapack-netlib/SRC/zgebal.f
@@ -89,7 +89,7 @@
 *> \param[out] IHI
 *> \verbatim
 *>          IHI is INTEGER
-*>          ILO and IHI are set to INTEGER such that on exit
+*>          ILO and IHI are set to integers such that on exit
 *>          A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N.
 *>          If JOB = 'N' or 'S', ILO = 1 and IHI = N.
 *> \endverbatim
@@ -155,6 +155,9 @@
 *>
 *>  Modified by Tzu-Yi Chen, Computer Science Division, University of
 *>    California at Berkeley, USA
+*>
+*>  Refactored by Evert Provoost, Department of Computer Science,
+*>    KU Leuven, Belgium
 *> \endverbatim
 *>
 *  =====================================================================
@@ -184,8 +187,8 @@
       PARAMETER          ( FACTOR = 0.95D+0 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            NOCONV
-      INTEGER            I, ICA, IEXC, IRA, J, K, L, M
+      LOGICAL            NOCONV, CANSWAP
+      INTEGER            I, ICA, IRA, J, K, L
       DOUBLE PRECISION   C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
      $                   SFMIN2
 *     ..
@@ -217,176 +220,194 @@
          RETURN
       END IF
 *
-      K = 1
-      L = N
+*     Quick returns.
 *
-      IF( N.EQ.0 )
-     $   GO TO 210
+      IF( N.EQ.0 ) THEN
+         ILO = 1
+         IHI = 0
+         RETURN
+      END IF
 *
       IF( LSAME( JOB, 'N' ) ) THEN
-         DO 10 I = 1, N
+         DO I = 1, N
             SCALE( I ) = ONE
-   10    CONTINUE
-         GO TO 210
+         END DO
+         ILO = 1
+         IHI = N
+         RETURN
       END IF
 *
-      IF( LSAME( JOB, 'S' ) )
-     $   GO TO 120
-*
-*     Permutation to isolate eigenvalues if possible
-*
-      GO TO 50
-*
-*     Row and column exchange.
-*
-   20 CONTINUE
-      SCALE( M ) = J
-      IF( J.EQ.M )
-     $   GO TO 30
-*
-      CALL ZSWAP( L, A( 1, J ), 1, A( 1, M ), 1 )
-      CALL ZSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA )
-*
-   30 CONTINUE
-      GO TO ( 40, 80 )IEXC
-*
-*     Search for rows isolating an eigenvalue and push them down.
-*
-   40 CONTINUE
-      IF( L.EQ.1 )
-     $   GO TO 210
-      L = L - 1
-*
-   50 CONTINUE
-      DO 70 J = L, 1, -1
+*     Permutation to isolate eigenvalues if possible.
 *
-         DO 60 I = 1, L
-            IF( I.EQ.J )
-     $         GO TO 60
-            IF( DBLE( A( J, I ) ).NE.ZERO .OR. DIMAG( A( J, I ) ).NE.
-     $          ZERO )GO TO 70
-   60    CONTINUE
-*
-         M = L
-         IEXC = 1
-         GO TO 20
-   70 CONTINUE
-*
-      GO TO 90
+      K = 1
+      L = N
 *
-*     Search for columns isolating an eigenvalue and push them left.
+      IF( .NOT.LSAME( JOB, 'S' ) ) THEN
 *
-   80 CONTINUE
-      K = K + 1
+*        Row and column exchange.
 *
-   90 CONTINUE
-      DO 110 J = K, L
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for rows isolating an eigenvalue and push them down.
+*
+            NOCONV = .FALSE.
+            DO I = L, 1, -1
+               CANSWAP = .TRUE.
+               DO J = 1, L
+                  IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR.
+     $                DIMAG( A( I, J ) ).NE.ZERO ) ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( L ) = I
+                  IF( I.NE.L ) THEN
+                     CALL ZSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
+                     CALL ZSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  IF( L.EQ.1 ) THEN
+                     ILO = 1
+                     IHI = 1
+                     RETURN
+                  END IF
+*
+                  L = L - 1
+               END IF
+            END DO
+*
+         END DO
+
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for columns isolating an eigenvalue and push them left.
+*
+            NOCONV = .FALSE.
+            DO J = K, L
+               CANSWAP = .TRUE.
+               DO I = K, L
+                  IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR.
+     $                DIMAG( A( I, J ) ).NE.ZERO ) ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( K ) = J
+                  IF( J.NE.K ) THEN
+                     CALL ZSWAP( L, A( 1, J ), 1, A( 1, K ), 1 )
+                     CALL ZSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  K = K + 1
+               END IF
+            END DO
+*
+         END DO
 *
-         DO 100 I = K, L
-            IF( I.EQ.J )
-     $         GO TO 100
-            IF( DBLE( A( I, J ) ).NE.ZERO .OR. DIMAG( A( I, J ) ).NE.
-     $          ZERO )GO TO 110
-  100    CONTINUE
+      END IF
 *
-         M = K
-         IEXC = 2
-         GO TO 20
-  110 CONTINUE
+*     Initialize SCALE for non-permuted submatrix.
 *
-  120 CONTINUE
-      DO 130 I = K, L
+      DO I = K, L
          SCALE( I ) = ONE
-  130 CONTINUE
+      END DO
 *
-      IF( LSAME( JOB, 'P' ) )
-     $   GO TO 210
+*     If we only had to permute, we are done.
+*
+      IF( LSAME( JOB, 'P' ) ) THEN
+         ILO = K
+         IHI = L
+         RETURN
+      END IF
 *
 *     Balance the submatrix in rows K to L.
 *
-*     Iterative loop for norm reduction
+*     Iterative loop for norm reduction.
 *
       SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' )
       SFMAX1 = ONE / SFMIN1
       SFMIN2 = SFMIN1*SCLFAC
       SFMAX2 = ONE / SFMIN2
-  140 CONTINUE
-      NOCONV = .FALSE.
-*
-      DO 200 I = K, L
-*
-         C = DZNRM2( L-K+1, A( K, I ), 1 )
-         R = DZNRM2( L-K+1, A( I, K ), LDA )
-         ICA = IZAMAX( L, A( 1, I ), 1 )
-         CA = ABS( A( ICA, I ) )
-         IRA = IZAMAX( N-K+1, A( I, K ), LDA )
-         RA = ABS( A( I, IRA+K-1 ) )
-*
-*        Guard against zero C or R due to underflow.
-*
-         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
-     $      GO TO 200
-         G = R / SCLFAC
-         F = ONE
-         S = C + R
-  160    CONTINUE
-         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
-     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
-            IF( DISNAN( C+F+CA+R+G+RA ) ) THEN
 *
-*           Exit if NaN to avoid infinite loop
+      NOCONV = .TRUE.
+      DO WHILE( NOCONV )
+         NOCONV = .FALSE.
 *
-            INFO = -3
-            CALL XERBLA( 'ZGEBAL', -INFO )
-            RETURN
-         END IF
-         F = F*SCLFAC
-         C = C*SCLFAC
-         CA = CA*SCLFAC
-         R = R / SCLFAC
-         G = G / SCLFAC
-         RA = RA / SCLFAC
-         GO TO 160
-*
-  170    CONTINUE
-         G = C / SCLFAC
-  180    CONTINUE
-         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
-     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
-         F = F / SCLFAC
-         C = C / SCLFAC
-         G = G / SCLFAC
-         CA = CA / SCLFAC
-         R = R*SCLFAC
-         RA = RA*SCLFAC
-         GO TO 180
-*
-*        Now balance.
-*
-  190    CONTINUE
-         IF( ( C+R ).GE.FACTOR*S )
-     $      GO TO 200
-         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
-            IF( F*SCALE( I ).LE.SFMIN1 )
-     $         GO TO 200
-         END IF
-         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
-            IF( SCALE( I ).GE.SFMAX1 / F )
-     $         GO TO 200
-         END IF
-         G = ONE / F
-         SCALE( I ) = SCALE( I )*F
-         NOCONV = .TRUE.
+         DO I = K, L
 *
-         CALL ZDSCAL( N-K+1, G, A( I, K ), LDA )
-         CALL ZDSCAL( L, F, A( 1, I ), 1 )
+            C = DZNRM2( L-K+1, A( K, I ), 1 )
+            R = DZNRM2( L-K+1, A( I, K ), LDA )
+            ICA = IZAMAX( L, A( 1, I ), 1 )
+            CA = ABS( A( ICA, I ) )
+            IRA = IZAMAX( N-K+1, A( I, K ), LDA )
+            RA = ABS( A( I, IRA+K-1 ) )
 *
-  200 CONTINUE
+*           Guard against zero C or R due to underflow.
+*
+            IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE
+*
+*           Exit if NaN to avoid infinite loop
 *
-      IF( NOCONV )
-     $   GO TO 140
+            IF( DISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'ZGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL ZDSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL ZDSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
 *
-  210 CONTINUE
       ILO = K
       IHI = L
 *

From a4ee1c84f020e7450dd6e27e5909d0493724cf7e Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Fri, 7 Apr 2023 12:50:36 +0100
Subject: [PATCH 021/718] Export `ssyconvf` symbol

This was apparently missed in commit a836fe8ec when adding the
LAPACK 3.7.0 symbols. We noticed when adding wrappers for 3.7.0
routines in SciPy. For more details, see
https://github.com/rgommers/scipy/issues/143
---
 exports/gensymbol | 1 +
 1 file changed, 1 insertion(+)

diff --git a/exports/gensymbol b/exports/gensymbol
index f05de626f..5823c0b3b 100755
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -716,6 +716,7 @@ lapackobjs2z="$lapackobjs2z
 # functions added for lapack-3.7.0
 lapackobjs2s="$lapackobjs2s
     slarfy
+    ssyconvf
     strevc3
     sgelqt
     sgelqt3

From d677214570b3aa024eb74f0da7e7995b5b7e9bc2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 7 Apr 2023 14:11:16 +0200
Subject: [PATCH 022/718] Remove the badge for the dead drone.io service and
 add Cirrus CI in its place

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6ce85e08e..a2eac07be 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,8 @@ Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=dev
 
 AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
 
-Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)
+Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
+<!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->
 
 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
 

From 3effdc15053a53acc36c421ee8df78cf78879fa9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 7 Apr 2023 19:32:22 +0200
Subject: [PATCH 023/718] Protect CROSS_PATH against spurious addition of
 linebreaks from isolated dashes

fix for #3989
---
 c_check | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/c_check b/c_check
index e8f90e18a..7173968e5 100755
--- a/c_check
+++ b/c_check
@@ -35,9 +35,12 @@ if [ "`dirname \"$compiler_name\"`" != '.' ]; then
     cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
 fi
 
-bn=`basename $compiler_name`
+bn=`basename \"$compiler_name\"`
+
 case "$bn" in
-    *-*) cross_suffix="$cross_suffix${bn%-*}-"
+    *-*) if [ "$bn" != '-']; then
+           cross_suffix="$cross_suffix${bn%-*}-"
+         fi
 esac
 
 compiler=""

From fd20a2e8c6c4aa2f47cd1e7019fd9b51176d393e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 10 Apr 2023 22:28:00 +0200
Subject: [PATCH 024/718] Convert CMAKE booleans to 0/1 values for gensymbol

---
 CMakeLists.txt | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 35fd830ee..8ecd95a95 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -398,15 +398,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
     message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
   endif()
 
+  if (${BUILD_LAPACK_DEPRECATED}) 
+	  set (BLD 1)
+  else ()
+	  set (BLD 0)
+  endif()
+  if (${BUILD_BFLOAT16}) 
+	  set (BBF16 1)
+  else ()
+	  set (BBF16 0)
+  endif()
+  if (${BUILD_SINGLE})
+	  set (BS 1)
+  else ()
+	  set (BS 0)
+  endif()
+  if (${BUILD_DOUBLE})
+	  set (BD 1)
+  else ()
+	  set (BD 0)
+  endif()
+  if (${BUILD_COMPLEX})
+	  set (BC 1)
+  else ()
+	  set (BC 0)
+  endif()
+  if (${BUILD_COMPLEX16})
+	  set (BZ 1)
+  else ()
+	  set (BZ 0)
+  endif()
   if (NOT USE_PERL)
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
-    COMMAND  ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
+	  COMMAND  ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def 
     COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )
   else()
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
-    COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
+    COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
     COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )

From d5fbec7c20e2bcc6c088cf62029ff5e9a879fcd9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 10 Apr 2023 23:49:35 +0200
Subject: [PATCH 025/718] Export ?MIN/?MAX, ?AMIN/?AMAX, CDOT/ZDOT and ?GEMMT

---
 exports/gensymbol | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index 5823c0b3b..7ce85fd7d 100755
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -17,11 +17,11 @@
 # removed blas_thread_shutdown_
 #
 blasobjsc="
-    caxpy caxpby ccopy cdotc cdotu cgbmv cgemm cgemv cgerc cgeru
+    camin camax caxpy caxpby ccopy cdot cdotc cdotu cgbmv cgemm cgemv cgerc cgeru
     chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
     chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
     csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
-    ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum"
+    ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt"
 
 blasobjsd="
     damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
@@ -29,7 +29,7 @@ blasobjsd="
     dscal dsdot dspmv dspr2 dimatcopy domatcopy
     dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
     dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
-        idamax idamin idmax idmin dgeadd dsum"
+        idamax idamin idmax idmin dgeadd dsum dgemmt"
 
 blasobjss="
     isamax isamin ismax ismin
@@ -38,58 +38,58 @@ blasobjss="
     smax smin snrm2 simatcopy somatcopy
     srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
     ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
-    strmm strmv strsm strsv  sgeadd ssum"
+    strmm strmv strsm strsv  sgeadd ssum sgemmt"
 
 blasobjsz="
-    izamax izamin
-    zaxpy zaxpby zcopy zdotc zdotu zdrot
+    zamin zamax izamax izamin
+    zaxpy zaxpby zcopy zdot zdotc zdotu zdrot
     zdscal zgbmv zgemm zgemv zgerc zgeru
     zhbmv zhemm zhemv zher2 zher2k zher zherk zhpmv zhpr2
     zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
     ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
     zomatcopy  zimatcopy dzamax dzamin dzasum dznrm2
-    zgeadd  dzsum"
+    zgeadd  dzsum zgemmt"
 
 blasobjs="lsame xerbla"
 bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
 cblasobjsc="
-    cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv
+    cblas_caxpy cblas_ccopy cblas_cdot cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv
     cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k
     cblas_cher cblas_cherk  cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
     cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
     cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
-    cblas_scnrm2 cblas_scasum
+    cblas_scnrm2 cblas_scasum cblas_camin cblas_camax cblas_cgemmt cblas_cmin cblas_cmax
     cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
     "
 cblasobjsd="
-    cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
-    cblas_dgbmv cblas_dgemm cblas_dgemv cblas_dger cblas_dnrm2
+    cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot cblas_damin cblas_damax
+    cblas_dgbmv cblas_dgemm cblas_dgemv cblas_dger cblas_dnrm2 cblas_dmin cblas_dmax
     cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
     cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
     cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
-    cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd
+    cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
     cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
     "
 
 cblasobjss="
-    cblas_sasum cblas_saxpy cblas_saxpby
+    cblas_sasum cblas_saxpy cblas_saxpby cblas_samin cblas_samax cblas_smax cblas_smin
     cblas_scopy cblas_sdot cblas_sdsdot cblas_sgbmv cblas_sgemm
     cblas_sgemv cblas_sger cblas_snrm2 cblas_srot cblas_srotg
     cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
     cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
     cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
-    cblas_strsv cblas_sgeadd
+    cblas_strsv cblas_sgeadd cblas_sgemmt 
     cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
     "
 
 cblasobjsz="
-    cblas_dzasum cblas_dznrm2 cblas_zaxpy cblas_zcopy cblas_zdotc cblas_zdotu cblas_zdscal
+    cblas_dzasum cblas_dznrm2 cblas_zaxpy cblas_zcopy cblas_zdot cblas_zdotc cblas_zdotu cblas_zdscal
     cblas_zgbmv cblas_zgemm cblas_zgemv cblas_zgerc cblas_zgeru cblas_zhbmv cblas_zhemm
     cblas_zhemv cblas_zher2 cblas_zher2k cblas_zher cblas_zherk cblas_zhpmv cblas_zhpr2
     cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
     cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
     cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
-    cblas_zaxpby cblas_zgeadd
+    cblas_zaxpby cblas_zgeadd cblas_zamin cblas_zamax cblas_zgemmt cblas_zmin cblas_zmax
     cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
 "
 

From caa2945138f3c8a6f3f0dacbaf653c283e3cd2cb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Apr 2023 00:04:09 +0200
Subject: [PATCH 026/718] Support Apple A15/M2 cpus through the existing VORTEX
 target

---
 cpuid_arm64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index 1080ea974..809f48e95 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -268,7 +268,8 @@ int detect(void)
 #else
 #ifdef __APPLE__
 	sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
-	if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
+	if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1
+	if (value == 3660830781) return CPU_VORTEX; //A15/M2
 #endif
 	return CPU_ARMV8;	
 #endif

From 57bdc36c846cd44396e1d39a5f7a191bda363503 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Apr 2023 22:38:38 +0200
Subject: [PATCH 027/718] add conditionals for BUILD_LAPACK_DEPRECATED

---
 cmake/lapack.cmake  |  4 ++++
 cmake/lapacke.cmake | 30 ++++++------------------------
 2 files changed, 10 insertions(+), 24 deletions(-)

diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 45dda8686..544e226ab 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -436,6 +436,7 @@ if(USE_XBLAS)
   set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
 endif()
 
+if(BUILD_LAPACK_DEPRECATED)
 list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
   DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
   DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
@@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f
   DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
   DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
 message(STATUS "Building deprecated routines")
+endif()
 
 set(DSLASRC spotrs.f)
 
@@ -930,6 +932,7 @@ if(USE_XBLAS)
   set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
 endif()
 
+if(BUILD_LAPACK_DEPRECATED)
 list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
   DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
   DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
@@ -943,6 +946,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
   DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
   DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
 message(STATUS "Building deprecated routines")
+endif()
 
 set(DSLASRC spotrs.c)
 
diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake
index 3a9352197..be6a286fe 100644
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@@ -70,8 +70,6 @@ set(CSRC
   lapacke_cgeqlf_work.c
   lapacke_cgeqp3.c
   lapacke_cgeqp3_work.c
-  lapacke_cgeqpf.c
-  lapacke_cgeqpf_work.c
   lapacke_cgeqr.c
   lapacke_cgeqr_work.c
   lapacke_cgeqr2.c
@@ -144,12 +142,8 @@ set(CSRC
   lapacke_cggqrf_work.c
   lapacke_cggrqf.c
   lapacke_cggrqf_work.c
-  lapacke_cggsvd.c
-  lapacke_cggsvd_work.c
   lapacke_cggsvd3.c
   lapacke_cggsvd3_work.c
-  lapacke_cggsvp.c
-  lapacke_cggsvp_work.c
   lapacke_cggsvp3.c
   lapacke_cggsvp3_work.c
   lapacke_cgtcon.c
@@ -695,8 +689,6 @@ set(DSRC
   lapacke_dgeqlf_work.c
   lapacke_dgeqp3.c
   lapacke_dgeqp3_work.c
-  lapacke_dgeqpf.c
-  lapacke_dgeqpf_work.c
   lapacke_dgeqr.c
   lapacke_dgeqr_work.c
   lapacke_dgeqr2.c
@@ -771,12 +763,8 @@ set(DSRC
   lapacke_dggqrf_work.c
   lapacke_dggrqf.c
   lapacke_dggrqf_work.c
-  lapacke_dggsvd.c
-  lapacke_dggsvd_work.c
   lapacke_dggsvd3.c
   lapacke_dggsvd3_work.c
-  lapacke_dggsvp.c
-  lapacke_dggsvp_work.c
   lapacke_dggsvp3.c
   lapacke_dggsvp3_work.c
   lapacke_dgtcon.c
@@ -1275,8 +1263,6 @@ set(SSRC
   lapacke_sgeqlf_work.c
   lapacke_sgeqp3.c
   lapacke_sgeqp3_work.c
-  lapacke_sgeqpf.c
-  lapacke_sgeqpf_work.c
   lapacke_sgeqr.c
   lapacke_sgeqr_work.c
   lapacke_sgeqr2.c
@@ -1351,12 +1337,8 @@ set(SSRC
   lapacke_sggqrf_work.c
   lapacke_sggrqf.c
   lapacke_sggrqf_work.c
-  lapacke_sggsvd.c
-  lapacke_sggsvd_work.c
   lapacke_sggsvd3.c
   lapacke_sggsvd3_work.c
-  lapacke_sggsvp.c
-  lapacke_sggsvp_work.c
   lapacke_sggsvp3.c
   lapacke_sggsvp3_work.c
   lapacke_sgtcon.c
@@ -1849,8 +1831,6 @@ set(ZSRC
   lapacke_zgeqlf_work.c
   lapacke_zgeqp3.c
   lapacke_zgeqp3_work.c
-  lapacke_zgeqpf.c
-  lapacke_zgeqpf_work.c
   lapacke_zgeqr.c
   lapacke_zgeqr_work.c
   lapacke_zgeqr2.c
@@ -1925,12 +1905,8 @@ set(ZSRC
   lapacke_zggqrf_work.c
   lapacke_zggrqf.c
   lapacke_zggrqf_work.c
-  lapacke_zggsvd.c
-  lapacke_zggsvd_work.c
   lapacke_zggsvd3.c
   lapacke_zggsvd3_work.c
-  lapacke_zggsvp.c
-  lapacke_zggsvp_work.c
   lapacke_zggsvp3.c
   lapacke_zggsvp3_work.c
   lapacke_zgtcon.c
@@ -2401,6 +2377,12 @@ set(ZSRC
   lapacke_csyr_work.c
   lapacke_ilaver.c
 )
+if (BUILD_LAPACK_DEPRECATED)
+set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
+set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
+set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
+set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
+endif()
 
 set(SRCX
   lapacke_cgbrfsx.c       lapacke_cporfsx.c       lapacke_dgerfsx.c       lapacke_sgbrfsx.c       lapacke_ssyrfsx.c       lapacke_zherfsx.c

From cd8eb33a9c989b479367c2bdd33d7c843c27e3fb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Apr 2023 22:39:53 +0200
Subject: [PATCH 028/718] Expose BUILD_LAPACK_DEPRECATED

---
 CMakeLists.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8ecd95a95..d59290c90 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers)
 #######
 option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
 
+option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
+
 option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
 
 option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
@@ -398,12 +400,12 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
     message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
   endif()
 
-  if (${BUILD_LAPACK_DEPRECATED}) 
+  if (${BUILD_LAPACK_DEPRECATED})
 	  set (BLD 1)
   else ()
 	  set (BLD 0)
   endif()
-  if (${BUILD_BFLOAT16}) 
+  if (${BUILD_BFLOAT16})
 	  set (BBF16 1)
   else ()
 	  set (BBF16 0)

From 6c45c980835cc0fc0d6fc8751349af54bdaa8426 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Apr 2023 22:41:18 +0200
Subject: [PATCH 029/718] Add (only) the GEMMT functions

---
 exports/gensymbol | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index 7ce85fd7d..b584167a4 100755
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -17,7 +17,7 @@
 # removed blas_thread_shutdown_
 #
 blasobjsc="
-    camin camax caxpy caxpby ccopy cdot cdotc cdotu cgbmv cgemm cgemv cgerc cgeru
+    caxpy caxpby ccopy cdotc cdotu cgbmv cgemm cgemv cgerc cgeru
     chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
     chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
     csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
@@ -41,8 +41,8 @@ blasobjss="
     strmm strmv strsm strsv  sgeadd ssum sgemmt"
 
 blasobjsz="
-    zamin zamax izamax izamin
-    zaxpy zaxpby zcopy zdot zdotc zdotu zdrot
+    izamax izamin
+    zaxpy zaxpby zcopy zdotc zdotu zdrot
     zdscal zgbmv zgemm zgemv zgerc zgeru
     zhbmv zhemm zhemv zher2 zher2k zher zherk zhpmv zhpr2
     zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
@@ -53,17 +53,17 @@ blasobjsz="
 blasobjs="lsame xerbla"
 bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
 cblasobjsc="
-    cblas_caxpy cblas_ccopy cblas_cdot cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv
+    cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv
     cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k
     cblas_cher cblas_cherk  cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
     cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
     cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
-    cblas_scnrm2 cblas_scasum cblas_camin cblas_camax cblas_cgemmt cblas_cmin cblas_cmax
+    cblas_scnrm2 cblas_scasum cblas_cgemmt
     cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
     "
 cblasobjsd="
-    cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot cblas_damin cblas_damax
-    cblas_dgbmv cblas_dgemm cblas_dgemv cblas_dger cblas_dnrm2 cblas_dmin cblas_dmax
+    cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
+    cblas_dgbmv cblas_dgemm cblas_dgemv cblas_dger cblas_dnrm2
     cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
     cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
     cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
@@ -72,24 +72,24 @@ cblasobjsd="
     "
 
 cblasobjss="
-    cblas_sasum cblas_saxpy cblas_saxpby cblas_samin cblas_samax cblas_smax cblas_smin
+    cblas_sasum cblas_saxpy cblas_saxpby
     cblas_scopy cblas_sdot cblas_sdsdot cblas_sgbmv cblas_sgemm
     cblas_sgemv cblas_sger cblas_snrm2 cblas_srot cblas_srotg
     cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
     cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
     cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
-    cblas_strsv cblas_sgeadd cblas_sgemmt 
+    cblas_strsv cblas_sgeadd cblas_sgemmt
     cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
     "
 
 cblasobjsz="
-    cblas_dzasum cblas_dznrm2 cblas_zaxpy cblas_zcopy cblas_zdot cblas_zdotc cblas_zdotu cblas_zdscal
+    cblas_dzasum cblas_dznrm2 cblas_zaxpy cblas_zcopy cblas_zdotc cblas_zdotu cblas_zdscal
     cblas_zgbmv cblas_zgemm cblas_zgemv cblas_zgerc cblas_zgeru cblas_zhbmv cblas_zhemm
     cblas_zhemv cblas_zher2 cblas_zher2k cblas_zher cblas_zherk cblas_zhpmv cblas_zhpr2
     cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
     cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
     cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
-    cblas_zaxpby cblas_zgeadd cblas_zamin cblas_zamax cblas_zgemmt cblas_zmin cblas_zmax
+    cblas_zaxpby cblas_zgeadd cblas_zgemmt
     cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
 "
 

From 2ea00788c271c7a5727b13f7a90433ce21639042 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Apr 2023 22:46:51 +0200
Subject: [PATCH 030/718] Add ?GEMMT

---
 exports/gensymbol.pl | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl
index e38a3cc89..dd79e924d 100644
--- a/exports/gensymbol.pl
+++ b/exports/gensymbol.pl
@@ -21,7 +21,7 @@
     chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
     chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
     csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
-    ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum);
+    ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt);
     
 @blasobjsd = (
     damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
@@ -29,7 +29,7 @@
     dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
     dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
     dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
-        idamax,idamin,idmax,idmin,dgeadd,dsum);
+        idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt);
     
 @blasobjss = (
     isamax,isamin,ismax,ismin,
@@ -38,7 +38,7 @@
     smax,smin,snrm2,simatcopy,somatcopy,
     srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
     ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
-    strmm,strmv,strsm,strsv, sgeadd,ssum);
+    strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt);
      
 @blasobjsz = (
     izamax,izamin,,
@@ -48,7 +48,7 @@
     zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
     ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
     zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
-    zgeadd, dzsum);
+    zgeadd, dzsum, zgemmt);
 
 @blasobjs = (lsame, xerbla);
 @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@@ -60,7 +60,7 @@
     cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, 
     cblas_scnrm2, cblas_scasum,
     cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
-    );
+    cblas_cgemmt);
 @cblasobjsd = (
     cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
     cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
@@ -69,7 +69,7 @@
     cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
     cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
     cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
-    );
+    cblas_dgemmt);
     
 @cblasobjss = (
     cblas_sasum, cblas_saxpy, cblas_saxpby,
@@ -80,7 +80,7 @@
     cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
     cblas_strsv, cblas_sgeadd,
     cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
-    );
+    cblas_sgemmt);
 @cblasobjsz = (
     cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
     cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
@@ -90,7 +90,7 @@
     cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
     cblas_zaxpby, cblas_zgeadd,
     cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
-);
+    cblas_zgemmt);
 
 @cblasobjs = (  cblas_xerbla );
 

From ac650225c1cba69b3aba0d52031af838e3b6d1dc Mon Sep 17 00:00:00 2001
From: Honglin Zhu <zhuhonglin.zhl@alibaba-inc.com>
Date: Thu, 13 Apr 2023 00:08:27 +0800
Subject: [PATCH 031/718] Fix x86 detection error

---
 cpuid_x86.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index ad13a8c8c..69cbba90e 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1936,7 +1936,8 @@ static char *corename[] = {
   "ZEN",
   "SKYLAKEX",
   "DHYANA",
-  "COOPERLAKE"
+  "COOPERLAKE",
+  "SAPPHIRERAPIDS",
 };
 
 static char *corename_lower[] = {
@@ -1970,7 +1971,8 @@ static char *corename_lower[] = {
   "zen",
   "skylakex",
   "dhyana",
-  "cooperlake"
+  "cooperlake",
+  "sapphirerapids",
 };
 
 
@@ -2276,16 +2278,18 @@ int get_coretype(void){
             return CORE_NEHALEM;
 	}
         if (model == 15) { // Sapphire Rapids
+	  if(support_amx_bf16())
+	    return CORE_SAPPHIRERAPIDS;
 	  if(support_avx512_bf16())
-            return CPUTYPE_COOPERLAKE;	
+            return CORE_COOPERLAKE;	
           if(support_avx512())
-            return CPUTYPE_SKYLAKEX;
+            return CORE_SKYLAKEX;
           if(support_avx2())
-            return CPUTYPE_HASWELL;
+            return CORE_HASWELL;
           if(support_avx())
-	    return CPUTYPE_SANDYBRIDGE;
+	    return CORE_SANDYBRIDGE;
 	  else
-	  return CPUTYPE_NEHALEM;	
+	  return CORE_NEHALEM;
         }
       break;
 

From cda29633a30bf7ecbc64f85e4bcc6517ad954f1c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Apr 2023 17:59:48 +0200
Subject: [PATCH 032/718] move ALPHA_I out of register 18 (reserved on OSX)

---
 kernel/arm64/cgemm_kernel_8x4.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S
index 24e08a646..f100adc7a 100644
--- a/kernel/arm64/cgemm_kernel_8x4.S
+++ b/kernel/arm64/cgemm_kernel_8x4.S
@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow3		x15
 #define pA		x16
 #define alphaR		w17
-#define alphaI		w18
+#define alphaI		w19
 
 #define alpha0_R	s10
 #define alphaV0_R	v10.s[0]

From c7bbad09adf8cdd2fa4b8709ea669e530a0136a4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Apr 2023 18:00:47 +0200
Subject: [PATCH 033/718] Move ALPHA_I out of register 18 (reserved on OSX)

---
 kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
index 29a68ff22..2c63925be 100644
--- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
+++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow3		x15
 #define pA		x16
 #define alphaR		w17
-#define alphaI		w18
+#define alphaI		w19
 
 #define alpha0_R	s10
 #define alphaV0_R	v10.s[0]

From 0b1acb0ba3aa327fee65bc6bcf596080dfc39f4b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Apr 2023 18:03:35 +0200
Subject: [PATCH 034/718] Move ALPHA_I out of register 18 (reserved on OSX)

---
 kernel/arm64/ctrmm_kernel_8x4.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S
index 5c0827397..e8f1d8cf3 100644
--- a/kernel/arm64/ctrmm_kernel_8x4.S
+++ b/kernel/arm64/ctrmm_kernel_8x4.S
@@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow3		x15
 #define pA		x16
 #define alphaR		w17
-#define alphaI		w18
-#define temp		x19
-#define tempOffset	x20
-#define tempK		x21
+#define alphaI		w19
+#define temp		x20
+#define tempOffset	x21
+#define tempK		x22
 
 #define alpha0_R	s10
 #define alphaV0_R	v10.s[0]

From 108a21e47a754032a9fb5477afcb76c6c158a146 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Apr 2023 18:05:14 +0200
Subject: [PATCH 035/718] Move ALPHA out of register 18 (reserved on OSX)

---
 kernel/arm64/sgemm_kernel_sve_v2x8.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S
index c969ed4db..60e1f347b 100644
--- a/kernel/arm64/sgemm_kernel_sve_v2x8.S
+++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S
@@ -55,8 +55,8 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */
 #define lanes		x15
 #define pA1	    	x16
 #define pA2	    	x17
-#define alpha		w18
-#define vec_len		x19
+#define alpha		w19
+#define vec_len		x20
 #define vec_lenx2   x20
 
 #define alpha0		s10

From 3727672a74c18938230c3a2db012a5693688bfd6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Apr 2023 18:07:52 +0200
Subject: [PATCH 036/718] Improve workaround and keep compilers from optimizing
 it out

---
 kernel/arm64/dznrm2_thunderx2t99.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c
index e342b0b63..0bd274b3f 100644
--- a/kernel/arm64/dznrm2_thunderx2t99.c
+++ b/kernel/arm64/dznrm2_thunderx2t99.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "common.h"
-
+#include <float.h>
 #include <arm_neon.h>
 
 #if defined(SMP)
@@ -344,6 +344,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT dummy_alpha[2];
 #endif
 	FLOAT ssq, scale;
+	volatile FLOAT sca;
 
 	if (n <= 0 || inc_x <= 0) return 0.0;
 
@@ -404,7 +405,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 #else
 	nrm2_compute(n, x, inc_x, &ssq, &scale);
 #endif
-	if (fabs(scale) <1.e-300) return 0.;
+	sca = fabs(scale);
+	if (sca < DBL_MIN) return 0.;
 	ssq = sqrt(ssq) * scale;
 
 	return ssq;

From f096a339e4a22f4bc6dc454640e5d4007b07368b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Apr 2023 18:16:09 +0200
Subject: [PATCH 037/718] Use long value fields for cpu ident on OSX

---
 cpuid_arm64.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index 809f48e95..e586f9a3c 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -267,9 +267,9 @@ int detect(void)
 	}
 #else
 #ifdef __APPLE__
-	sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
-	if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1
-	if (value == 3660830781) return CPU_VORTEX; //A15/M2
+	sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
+	if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
+	if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
 #endif
 	return CPU_ARMV8;	
 #endif

From 970e611e007eeb180ce963af89746d9e84f90e8f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 14 Apr 2023 19:42:34 +0200
Subject: [PATCH 038/718] fix missing blank in test

---
 c_check | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c_check b/c_check
index 7173968e5..9be152b12 100755
--- a/c_check
+++ b/c_check
@@ -38,7 +38,7 @@ fi
 bn=`basename \"$compiler_name\"`
 
 case "$bn" in
-    *-*) if [ "$bn" != '-']; then
+    *-*) if [ "$bn" != '-' ]; then
            cross_suffix="$cross_suffix${bn%-*}-"
          fi
 esac

From 8be68fa7f4edfa0c65949faf67f8feea2c7f0f43 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 15 Apr 2023 12:02:39 +0200
Subject: [PATCH 039/718] move declaration of sca to really keep the compiler
 from throwing it out (for now)

---
 kernel/arm64/dznrm2_thunderx2t99.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c
index 0bd274b3f..6077c85dd 100644
--- a/kernel/arm64/dznrm2_thunderx2t99.c
+++ b/kernel/arm64/dznrm2_thunderx2t99.c
@@ -344,7 +344,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT dummy_alpha[2];
 #endif
 	FLOAT ssq, scale;
-	volatile FLOAT sca;
 
 	if (n <= 0 || inc_x <= 0) return 0.0;
 
@@ -405,7 +404,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 #else
 	nrm2_compute(n, x, inc_x, &ssq, &scale);
 #endif
-	sca = fabs(scale);
+	volatile FLOAT sca = fabs(scale);
 	if (sca < DBL_MIN) return 0.;
 	ssq = sqrt(ssq) * scale;
 

From 38d7a7b562860555a6440953124eb47cf7d5b506 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 16 Apr 2023 00:07:58 +0200
Subject: [PATCH 040/718] Fix ?GEMMT

---
 interface/gemmt.c | 100 +++++++++++++++++++++++-----------------------
 1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/interface/gemmt.c b/interface/gemmt.c
index 3eed1dfe4..d35406411 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -35,29 +35,26 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "common.h"
-#ifdef FUNCTION_PROFILE
-#include "functable.h"
-#endif
 
 #ifndef COMPLEX
 #define SMP_THRESHOLD_MIN 65536.0
 #ifdef XDOUBLE
-#define ERROR_NAME "QGEMT "
+#define ERROR_NAME "QGEMMT "
 #elif defined(DOUBLE)
-#define ERROR_NAME "DGEMT "
+#define ERROR_NAME "DGEMMT "
 #elif defined(BFLOAT16)
-#define ERROR_NAME "SBGEMT "
+#define ERROR_NAME "SBGEMMT "
 #else
-#define ERROR_NAME "SGEMT "
+#define ERROR_NAME "SGEMMT "
 #endif
 #else
 #define SMP_THRESHOLD_MIN 8192.0
 #ifdef XDOUBLE
-#define ERROR_NAME "XGEMT "
+#define ERROR_NAME "XGEMMT "
 #elif defined(DOUBLE)
-#define ERROR_NAME "ZGEMT "
+#define ERROR_NAME "ZGEMMT "
 #else
-#define ERROR_NAME "CGEMT "
+#define ERROR_NAME "CGEMMT "
 #endif
 #endif
 
@@ -68,13 +65,13 @@
 #ifndef CBLAS
 
 void NAME(char *UPLO, char *TRANSA, char *TRANSB,
-	  blasint * M, blasint * N, blasint * K,
+	  blasint * M, blasint * K,
 	  FLOAT * Alpha,
 	  IFLOAT * a, blasint * ldA,
 	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
 {
 
-	blasint m, n, k;
+	blasint m, k;
 	blasint lda, ldb, ldc;
 	int transa, transb, uplo;
 	blasint info;
@@ -92,7 +89,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 	PRINT_DEBUG_NAME;
 
 	m = *M;
-	n = *N;
 	k = *K;
 
 #if defined(COMPLEX)
@@ -167,8 +163,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 		info = 13;
 	if (k < 0)
 		info = 5;
-	if (n < 0)
-		info = 4;
 	if (m < 0)
 		info = 3;
 	if (transb < 0)
@@ -184,7 +178,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 
 void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
-	   blasint N, blasint k,
+	   blasint k,
 #ifndef COMPLEX
 	   FLOAT alpha,
 	   IFLOAT * A, blasint LDA,
@@ -205,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 	int transa, transb, uplo;
 	blasint info;
-	blasint m, n, lda, ldb;
+	blasint m, lda, ldb;
 	FLOAT *a, *b;
 	XFLOAT *buffer;
 
@@ -248,9 +242,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 			transb = 3;
 #endif
 
-		m = M;
-		n = N;
-
 		a = (void *)A;
 		b = (void *)B;
 		lda = LDA;
@@ -262,8 +253,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 			info = 13;
 		if (k < 0)
 			info = 5;
-		if (n < 0)
-			info = 4;
 		if (m < 0)
 			info = 3;
 		if (transb < 0)
@@ -273,8 +262,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 	}
 
 	if (order == CblasRowMajor) {
-		m = N;
-		n = M;
 
 		a = (void *)B;
 		b = (void *)A;
@@ -319,8 +306,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 			info = 13;
 		if (k < 0)
 			info = 5;
-		if (n < 0)
-			info = 4;
 		if (m < 0)
 			info = 3;
 		if (transb < 0)
@@ -407,37 +392,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 #endif
 
-	if ((m == 0) || (n == 0))
+	if ((m == 0) )
 		return;
 
 	IDEBUG_START;
 
-	FUNCTION_PROFILE_START();
-
 	const blasint incb = (transb == 0) ? 1 : ldb;
 
 	if (uplo == 1) {
-		for (i = 0; i < n; i++) {
-			j = n - i;
+		for (i = 0; i < m; i++) {
+			j = m - i;
 
 			l = j;
 #if defined(COMPLEX)
 			aa = a + i * 2;
 			bb = b + i * ldb * 2;
 			if (transa) {
-				l = k;
 				aa = a + lda * i * 2;
-				bb = b + i * 2;
 			}
+			if (transb)
+				bb = b + i * 2;
 			cc = c + i * 2 * ldc + i * 2;
 #else
 			aa = a + i;
 			bb = b + i * ldb;
 			if (transa) {
-				l = k;
 				aa = a + lda * i;
-				bb = b + i;
 			}
+			if (transb)
+				bb = b + i;
 			cc = c + i * ldc + i;
 #endif
 
@@ -458,8 +441,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 			IDEBUG_START;
 
-			FUNCTION_PROFILE_START();
-
 			buffer_size = j + k + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
 			buffer_size += 160 / sizeof(FLOAT);
@@ -479,20 +460,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 
 #if defined(COMPLEX)
+				if (!transa)
 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 						     aa, lda, bb, incb, cc, 1,
 						     buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
+						     aa, lda, bb, incb, cc, 1,
+						     buffer);
 #else
+				if (!transa)
 				(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
 						     bb, incb, cc, 1, buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
+						     bb, incb, cc, 1, buffer);
 #endif
 #ifdef SMP
 			} else {
-
+				if (!transa)
 				(gemv_thread[(int)transa]) (j, k, alpha, aa,
 							    lda, bb, incb, cc,
 							    1, buffer,
 							    nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, aa,
+							    lda, bb, incb, cc,
+							    1, buffer,
+							    nthreads);
 
 			}
 #endif
@@ -501,21 +496,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 		}
 	} else {
 
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < m; i++) {
 			j = i + 1;
 
 			l = j;
 #if defined COMPLEX
 			bb = b + i * ldb * 2;
-			if (transa) {
-				l = k;
+			if (transb) {
 				bb = b + i * 2;
 			}
 			cc = c + i * 2 * ldc;
 #else
 			bb = b + i * ldb;
-			if (transa) {
-				l = k;
+			if (transb) {
 				bb = b + i;
 			}
 			cc = c + i * ldc;
@@ -537,8 +530,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 			IDEBUG_START;
 
-			FUNCTION_PROFILE_START();
-
 			buffer_size = j + k + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
 			buffer_size += 160 / sizeof(FLOAT);
@@ -558,30 +549,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 
 #if defined(COMPLEX)
+				if (!transa)
 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 						     a, lda, bb, incb, cc, 1,
 						     buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
+						     a, lda, bb, incb, cc, 1,
+						     buffer);
 #else
+				if (!transa)
 				(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
 						     incb, cc, 1, buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
+						     incb, cc, 1, buffer);
 #endif
 
 #ifdef SMP
 			} else {
-
+				if (!transa)
 				(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
 							    bb, incb, cc, 1,
 							    buffer, nthreads);
-
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
+							    bb, incb, cc, 1,
+							    buffer, nthreads);
 			}
 #endif
 
 			STACK_FREE(buffer);
 		}
 	}
-	FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
-			     args.m * args.k + args.k * args.n +
-			     args.m * args.n, 2 * args.m * args.n * args.k);
 
 	IDEBUG_END;
 

From bfc20c2e97c9695d1f04c6b5ad16d21fb6c1db76 Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Mon, 17 Apr 2023 11:17:42 +0100
Subject: [PATCH 041/718] Add Chris Sidebottom to CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f5e9dda91..71df13634 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -23,6 +23,9 @@
   * Optimization on AMD Piledriver
   * Optimization on Intel Haswell
 
+* Chris Sidebottom <chris.sidebottom@arm.com>
+  * Optimizations and other improvements targeting AArch64
+
 ## Previous Developers
 
 * Zaheer Chothia <zaheer.chothia@gmail.com>
@@ -212,4 +215,4 @@ In chronological order:
   * [2022-03] Support RISC-V Vector Intrinisc 1.0 version.
   
 * Pablo Romero <https://github.com/pablorcum>
-  * [2022-08] Fix building from sources for QNX
\ No newline at end of file
+  * [2022-08] Fix building from sources for QNX

From 44164e3a3d7f5c956728596b9f88d43cad0a8c14 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 17 Apr 2023 14:23:13 +0200
Subject: [PATCH 042/718] revert "move alpha out of register 18" (out of PR
 scope, no SVE on Apple hw)

---
 kernel/arm64/sgemm_kernel_sve_v2x8.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S
index 60e1f347b..c969ed4db 100644
--- a/kernel/arm64/sgemm_kernel_sve_v2x8.S
+++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S
@@ -55,8 +55,8 @@ With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */
 #define lanes		x15
 #define pA1	    	x16
 #define pA2	    	x17
-#define alpha		w19
-#define vec_len		x20
+#define alpha		w18
+#define vec_len		x19
 #define vec_lenx2   x20
 
 #define alpha0		s10

From 32f2fafde75ca674401c1ce4bc4301ca271536fa Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Thu, 24 Nov 2022 13:38:20 +0000
Subject: [PATCH 043/718] Propagate SWITCH_RATIO to DYNAMIC_ARCH builds

Previously dynamic builds were either using the default SWITCH_RATIO
or one from the higher level architecture; this patch ensures the
dynamic builds can use this parameter as well.
---
 common_param.h                       |  2 ++
 driver/level3/level3_gemm3m_thread.c | 15 ++++++++------
 driver/level3/level3_syrk_threaded.c | 13 +++++++-----
 driver/level3/level3_thread.c        | 31 +++++++++++++++++-----------
 kernel/setparam-ref.c                |  5 ++++-
 lapack/potrf/potrf_parallel.c        |  4 ----
 param.h                              |  4 ++++
 7 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/common_param.h b/common_param.h
index e14ef2782..7230dd00d 100644
--- a/common_param.h
+++ b/common_param.h
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -45,6 +46,7 @@
 
 typedef struct {
   int dtb_entries;
+  int switch_ratio;
   int offsetA, offsetB, align;
 
 #ifdef BUILD_BFLOAT16
diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c
index 39824fc5a..26d07fa94 100644
--- a/driver/level3/level3_gemm3m_thread.c
+++ b/driver/level3/level3_gemm3m_thread.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -44,10 +45,6 @@
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 //The array of job_t may overflow the stack.
 //Instead, use malloc to alloc job_t.
 #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
   BLASLONG divN, divT;
   int mode;
 
+#if defined(DYNAMIC_ARCH)
+  int switch_ratio = gotoblas->switch_ratio;
+#else
+  int switch_ratio = SWITCH_RATIO;
+#endif
+
   if (range_m) {
     BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
     BLASLONG m_to   = *(((BLASLONG *)range_m) + 1);
@@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
   }
 */
 
-  if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
+  if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
     GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
     return 0;
   }
@@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
   divT = nthreads;
   divN = 1;
 
-  while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
+  while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
     do {
       divT --;
       divN = 1;
diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c
index d7dcd68a3..b03577fb3 100644
--- a/driver/level3/level3_syrk_threaded.c
+++ b/driver/level3/level3_syrk_threaded.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -44,10 +45,6 @@
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 //The array of job_t may overflow the stack.
 //Instead, use malloc to alloc job_t.
 #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
   int  mode, mask;
   double dnum, di, dinum;
 
-  if ((nthreads  == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
+#if defined(DYNAMIC_ARCH)
+  int switch_ratio = gotoblas->switch_ratio;
+#else
+  int switch_ratio = SWITCH_RATIO;
+#endif
+
+  if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) {
     SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
     return 0;
   }
diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index 02b60b50d..c9ecf73e8 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -44,10 +45,6 @@
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 #ifndef GEMM_PREFERED_SIZE
 #define GEMM_PREFERED_SIZE 1
 #endif
@@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
   BLASLONG width, i, j, k, js;
   BLASLONG m, n, n_from, n_to;
   int mode;
+#if defined(DYNAMIC_ARCH)
+  int switch_ratio = gotoblas->switch_ratio;
+#else
+  int switch_ratio = SWITCH_RATIO;
+#endif
 
   /* Get execution mode */
 #ifndef COMPLEX
@@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
     num_parts  = 0;
     while (n > 0){
       width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
-      if (width < SWITCH_RATIO) {
-        width = SWITCH_RATIO;
+      if (width < switch_ratio) {
+        width = switch_ratio;
       }
       width = round_up(n, width, GEMM_PREFERED_SIZE);
 
@@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
   BLASLONG m = args -> m;
   BLASLONG n = args -> n;
   BLASLONG nthreads_m, nthreads_n;
+#if defined(DYNAMIC_ARCH)
+  int switch_ratio = gotoblas->switch_ratio;
+#else
+  int switch_ratio = SWITCH_RATIO;
+#endif
 
   /* Get dimensions from index ranges if available */
   if (range_m) {
@@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
     n = range_n[1] - range_n[0];
   }
 
-  /* Partitions in m should have at least SWITCH_RATIO rows */
-  if (m < 2 * SWITCH_RATIO) {
+  /* Partitions in m should have at least switch_ratio rows */
+  if (m < 2 * switch_ratio) {
     nthreads_m = 1;
   } else {
     nthreads_m = args -> nthreads;
-    while (m < nthreads_m * SWITCH_RATIO) {
+    while (m < nthreads_m * switch_ratio) {
       nthreads_m = nthreads_m / 2;
     }
   }
 
-  /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
-  if (n < SWITCH_RATIO * nthreads_m) {
+  /* Partitions in n should have at most switch_ratio * nthreads_m columns */
+  if (n < switch_ratio * nthreads_m) {
     nthreads_n = 1;
   } else {
-    nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
+    nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
     if (nthreads_m * nthreads_n > args -> nthreads) {
       nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
     }
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index 522c6d7d9..79436f43b 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -49,7 +50,9 @@
 static void init_parameter(void);
 
 gotoblas_t TABLE_NAME = {
-  DTB_DEFAULT_ENTRIES ,
+  DTB_DEFAULT_ENTRIES,
+
+  SWITCH_RATIO,
 
   GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,
 
diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c
index 29364cc05..a7c28f4c2 100644
--- a/lapack/potrf/potrf_parallel.c
+++ b/lapack/potrf/potrf_parallel.c
@@ -80,10 +80,6 @@ static FLOAT dm1 = -1.;
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 #ifndef LOWER
 #define TRANS
 #endif
diff --git a/param.h b/param.h
index 19cbe75a5..aec1b6a92 100644
--- a/param.h
+++ b/param.h
@@ -3838,6 +3838,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 
 #endif
 
+#ifndef SWITCH_RATIO
+#define SWITCH_RATIO 2
+#endif
+
 #ifndef QGEMM_DEFAULT_UNROLL_M
 #define QGEMM_DEFAULT_UNROLL_M 2
 #endif

From 5b165420b5962b2b73319f55b747be4f6c697860 Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Mon, 5 Dec 2022 15:17:52 +0000
Subject: [PATCH 044/718] SWITCH_RATIO for Arm(R) Neoverse(TM) architecture

This seems like a good balance of values for reasonably sized matrices. With `SWITCH_RATIO=16` the DGEMM scales better to bigger sizes but the better solution would be some kind of
thread throttling so I've gone with `SWITCH_RATIO=8`.
---
 param.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/param.h b/param.h
index f1f5cbdad..ae391dd3f 100644
--- a/param.h
+++ b/param.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2014, The OpenBLAS Project
+Copyright (c) 2011-2023, The OpenBLAS Project
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -3338,6 +3338,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(NEOVERSEN1)
 
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            8
+#else
+#define SWITCH_RATIO            16
+#endif
+
 #define SGEMM_DEFAULT_UNROLL_M  16
 #define SGEMM_DEFAULT_UNROLL_N  4
 
@@ -3367,7 +3373,11 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(NEOVERSEV1)
 
-#define SWITCH_RATIO  16
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            8
+#else
+#define SWITCH_RATIO            16
+#endif
 
 #define SGEMM_DEFAULT_UNROLL_M  16
 #define SGEMM_DEFAULT_UNROLL_N  4
@@ -3398,6 +3408,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(NEOVERSEN2)
 
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            8
+#else
+#define SWITCH_RATIO            16
+#endif
+
 #undef SBGEMM_ALIGN_K
 #define SBGEMM_ALIGN_K 4
 

From ec334e69dc1db8ad607ffedd7bcac53d29026ac8 Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Mon, 17 Apr 2023 17:38:42 +0100
Subject: [PATCH 045/718] Use SVE kernel for SGEMM/DGEMM on Arm(R) Neoverse(TM)
 V1

This re-spins #3869 with some additional copy unrolling which helps maintain SYRK performance.

After #3868, the SVE kernels represent a pretty good boost.

This re-uses ARMV8SVE as a base and I'm going to incrementally move everything to use ARMV8SVE in additional patches (as well as fix up anything that's not already in ARMV8SVE).
---
 benchmark/syrk.c                   |   6 +-
 kernel/arm64/KERNEL.ARMV8SVE       |  12 +--
 kernel/arm64/KERNEL.NEOVERSEV1     | 167 +++++++----------------------
 kernel/arm64/dgemm_ncopy_sve_v1.c  |  79 --------------
 kernel/arm64/dgemm_tcopy_sve_v1.c  |  77 -------------
 kernel/arm64/gemm_ncopy_sve_v1x8.c | 131 ++++++++++++++++++++++
 kernel/arm64/gemm_tcopy_sve_v1x8.c | 125 +++++++++++++++++++++
 kernel/arm64/sgemm_ncopy_sve_v1.c  |  78 --------------
 kernel/arm64/sgemm_tcopy_sve_v1.c  |  77 -------------
 param.h                            |   8 +-
 10 files changed, 307 insertions(+), 453 deletions(-)
 delete mode 100644 kernel/arm64/dgemm_ncopy_sve_v1.c
 delete mode 100644 kernel/arm64/dgemm_tcopy_sve_v1.c
 create mode 100644 kernel/arm64/gemm_ncopy_sve_v1x8.c
 create mode 100644 kernel/arm64/gemm_tcopy_sve_v1x8.c
 delete mode 100644 kernel/arm64/sgemm_ncopy_sve_v1.c
 delete mode 100644 kernel/arm64/sgemm_tcopy_sve_v1.c

diff --git a/benchmark/syrk.c b/benchmark/syrk.c
index fa0f24666..e0ae58707 100644
--- a/benchmark/syrk.c
+++ b/benchmark/syrk.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
+Copyright (c) 2014, 2023 The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -67,7 +67,7 @@ int main(int argc, char *argv[]){
   int step =   1;
   int loops =  1;
 
-  if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
 
   double time1,timeg;
 
@@ -77,7 +77,7 @@ int main(int argc, char *argv[]){
   if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
   if (argc > 0) { step     = atol(*argv);		argc--; argv++;}
 
-  fprintf(stderr, "From : %3d  To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
+  fprintf(stderr, "From : %3d  To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops);
 
 
   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE
index bd25f7cd8..07393624c 100644
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -128,10 +128,10 @@ SGEMM_BETA     = sgemm_beta.S
 SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
 
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
+SGEMMINCOPY    =  gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
+SGEMMITCOPY    =  gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
+SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
 
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
@@ -149,8 +149,8 @@ SSYMMLCOPY_M    =  symm_lcopy_sve.c
 DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
 DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
 
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
+DGEMMINCOPY    =  gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
+DGEMMITCOPY    =  gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
 DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
 DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
 
diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1
index 9a5938459..d6617e8a4 100644
--- a/kernel/arm64/KERNEL.NEOVERSEV1
+++ b/kernel/arm64/KERNEL.NEOVERSEV1
@@ -1,75 +1,6 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
 
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
 DAXPYKERNEL  = daxpy_thunderx2t99.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
 
 SASUMKERNEL    = sasum_thunderx2t99.c
 DASUMKERNEL    = dasum_thunderx2t99.c
@@ -100,67 +31,20 @@ DDOTKERNEL     = dot.c
 SDOTKERNEL     = dot.c
 CDOTKERNEL     = zdot_thunderx2t99.c
 ZDOTKERNEL     = zdot_thunderx2t99.c
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ifeq ($(SGEMM_UNROLL_N), 16)
-SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
-endif
-ifeq ($(SGEMM_UNROLL_N), 4)
-SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
-endif
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
 
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+CTRMMUNCOPY_M  =
+CTRMMLNCOPY_M  =
+CTRMMUTCOPY_M  =
+CTRMMLTCOPY_M  =
+CHEMMLTCOPY_M  =
+CHEMMUTCOPY_M  =
+CSYMMUCOPY_M   =
+CSYMMLCOPY_M   =
 
 CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
 CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
@@ -169,12 +53,34 @@ CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
 CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
 CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
 CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+else
+CGEMMINCOPYOBJ =
+CGEMMITCOPYOBJ =
 endif
 CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
 CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
 CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+ZTRSMCOPYLN_M    =
+ZTRSMCOPYLT_M    =
+ZTRSMCOPYUN_M    =
+ZTRSMCOPYUT_M    =
+
+ZTRMMUNCOPY_M  =
+ZTRMMLNCOPY_M  =
+ZTRMMUTCOPY_M  =
+ZTRMMLTCOPY_M  =
+ZHEMMLTCOPY_M  =
+ZHEMMUTCOPY_M  =
+ZSYMMUCOPY_M   =
+ZSYMMLCOPY_M   =
+
 ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
 ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
 ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
@@ -182,6 +88,9 @@ ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
 ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
 ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
 ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+else
+ZGEMMINCOPYOBJ =
+ZGEMMITCOPYOBJ =
 endif
 ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c
deleted file mode 100644
index 1f812c775..000000000
--- a/kernel/arm64/dgemm_ncopy_sve_v1.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-    BLASLONG j;
-    IFLOAT *aoffset, *aoffset1, *boffset;
-
-    svint64_t lda_vec = svindex_s64(0LL, lda);
-    uint64_t sve_size = svcntd();
-
-    aoffset = a;
-    boffset = b;
-
-    j = 0;
-    svbool_t pg = svwhilelt_b64(j, n);
-    uint64_t active = svcntp_b64(svptrue_b64(), pg);
-    do {
-
-        aoffset1 = aoffset;
-
-        uint64_t i_cnt = m;
-        while (i_cnt--) {
-            svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
-            svst1_f64(pg, (double *) boffset, a_vec);
-            aoffset1++;
-            boffset += active;
-        }
-        aoffset += sve_size * lda;
-
-        j += svcntd();
-        pg = svwhilelt_b64(j, n);
-        active = svcntp_b64(svptrue_b64(), pg);
-
-
-    } while (svptest_any(svptrue_b64(), pg));
-
-    return 0;
-}
diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c
deleted file mode 100644
index cb645a1b6..000000000
--- a/kernel/arm64/dgemm_tcopy_sve_v1.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-    BLASLONG j;
-    IFLOAT *aoffset, *aoffset1, *boffset;
-
-    uint64_t sve_size = svcntd();
-
-    aoffset = a;
-    boffset = b;
-
-    j = 0;
-    svbool_t pg = svwhilelt_b64(j, n);
-    uint64_t active = svcntp_b64(svptrue_b64(), pg);
-    do {
-
-        aoffset1 = aoffset;
-
-        uint64_t i_cnt = m;
-        while (i_cnt--) {
-            svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
-            svst1_f64(pg, (double *) boffset, a_vec);
-            aoffset1 += lda;
-            boffset += active;
-        }
-        aoffset += sve_size;
-
-        j += svcntd();
-        pg = svwhilelt_b64(j, n);
-        active = svcntp_b64(svptrue_b64(), pg);
-
-    } while (svptest_any(svptrue_b64(), pg));
-
-    return 0;
-}
diff --git a/kernel/arm64/gemm_ncopy_sve_v1x8.c b/kernel/arm64/gemm_ncopy_sve_v1x8.c
new file mode 100644
index 000000000..113b1ee40
--- /dev/null
+++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_INDEX svuint64_t
+#define SV_INDEXER svindex_u64
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#define SV_PREFETCH svprfd_gather_index
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_INDEX svuint32_t
+#define SV_INDEXER svindex_u32
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#define SV_PREFETCH svprfw_gather_index
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)   \
+    a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec);    \
+    svst1(pg, b_offset, a_vec);                                 \
+    a_offset_inner++;                                           \
+    b_offset += active;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+    uint64_t sve_size;
+    asm(COUNT" %[SIZE_]" : [SIZE_]  "=r" (sve_size) : : );
+
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+
+    SV_INDEX lda_vec = SV_INDEXER(0LL, lda);
+    SV_TYPE a_vec;
+    svbool_t pg_true = SV_TRUE();
+
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 3;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 4) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        a_offset += sve_size * lda;
+    }
+
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE(0L, remaining_n);
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/arm64/gemm_tcopy_sve_v1x8.c b/kernel/arm64/gemm_tcopy_sve_v1x8.c
new file mode 100644
index 000000000..68a2cc07c
--- /dev/null
+++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c
@@ -0,0 +1,125 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)   \
+    a_vec = svld1(pg, a_offset_inner);                          \
+    svst1(pg, b_offset, a_vec);                                 \
+    a_offset_inner += lda;                                      \
+    b_offset += active;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+    uint64_t sve_size = svcntw();
+    asm(COUNT" %[SIZE_]" : [SIZE_]  "=r" (sve_size) : : );
+
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+
+    SV_TYPE a_vec;
+    svbool_t pg_true = SV_TRUE();
+
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 3;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 4) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        a_offset += sve_size;
+    }
+
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE(0L, remaining_n);
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
+
diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c
deleted file mode 100644
index 1bc186335..000000000
--- a/kernel/arm64/sgemm_ncopy_sve_v1.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-    BLASLONG j;
-    IFLOAT *aoffset, *aoffset1, *boffset;
-
-    svint32_t lda_vec = svindex_s32(0LL, lda);
-    uint32_t sve_size = svcntw();
-
-    aoffset = a;
-    boffset = b;
-
-    j = 0;
-    svbool_t pg = svwhilelt_b32(j, n);
-    uint32_t active = svcntp_b32(svptrue_b32(), pg);
-    do {
-
-        aoffset1 = aoffset;
-
-        uint32_t i_cnt = m;
-        while (i_cnt--) {
-            svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
-            svst1_f32(pg, (float *) boffset, a_vec);
-            aoffset1++;
-            boffset += active;
-        }
-        aoffset += sve_size * lda;
-
-        j += svcntw();
-        pg = svwhilelt_b32(j, n);
-        active = svcntp_b32(svptrue_b32(), pg);
-
-    } while (svptest_any(svptrue_b32(), pg));
-
-    return 0;
-}
diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c
deleted file mode 100644
index 9f8cf502a..000000000
--- a/kernel/arm64/sgemm_tcopy_sve_v1.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-    BLASLONG j;
-    IFLOAT *aoffset, *aoffset1, *boffset;
-
-    uint32_t sve_size = svcntw();
-
-    aoffset = a;
-    boffset = b;
-
-    j = 0;
-    svbool_t pg = svwhilelt_b32(j, n);
-    uint32_t active = svcntp_b32(svptrue_b32(), pg);
-    do {
-
-        aoffset1 = aoffset;
-
-        uint32_t i_cnt = m;
-        while (i_cnt--) {
-            svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
-            svst1_f32(pg, (float *) boffset, a_vec);
-            aoffset1 += lda;
-            boffset += active;
-        }
-        aoffset += sve_size;
-
-        j += svcntw();
-        pg = svwhilelt_b32(j, n);
-        active = svcntp_b32(svptrue_b32(), pg);
-
-    } while (svptest_any(svptrue_b32(), pg));
-
-    return 0;
-}
diff --git a/param.h b/param.h
index f1f5cbdad..99ff7cb4c 100644
--- a/param.h
+++ b/param.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2014, The OpenBLAS Project
+Copyright (c) 2011-2023, The OpenBLAS Project
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -3370,10 +3370,10 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define SWITCH_RATIO  16
 
 #define SGEMM_DEFAULT_UNROLL_M  16
-#define SGEMM_DEFAULT_UNROLL_N  4
+#define SGEMM_DEFAULT_UNROLL_N  8
 
-#define DGEMM_DEFAULT_UNROLL_M  8
-#define DGEMM_DEFAULT_UNROLL_N  4
+#define DGEMM_DEFAULT_UNROLL_M  4 // Actually 2VL (8) but kept separate to keep copies separate
+#define DGEMM_DEFAULT_UNROLL_N  8
 
 #define CGEMM_DEFAULT_UNROLL_M  8
 #define CGEMM_DEFAULT_UNROLL_N  4

From 479509bb37d5cd26baa62462abe461a0a1d43bb2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 17 Apr 2023 21:57:25 +0200
Subject: [PATCH 046/718] Remove any stray trailing dash from CROSS_SUFFIX (as
 would result from clang -arch)

---
 c_check | 1 +
 1 file changed, 1 insertion(+)

diff --git a/c_check b/c_check
index 9be152b12..232adba67 100755
--- a/c_check
+++ b/c_check
@@ -40,6 +40,7 @@ bn=`basename \"$compiler_name\"`
 case "$bn" in
     *-*) if [ "$bn" != '-' ]; then
            cross_suffix="$cross_suffix${bn%-*}-"
+	   cross_suffix=`echo $cross_suffix|sed -e 's/ -$//'`
          fi
 esac
 

From 7de9335c56aed6fb7fb1590c4d146338ca666726 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Apr 2023 23:42:09 +0200
Subject: [PATCH 047/718] Disable gcc's tree-vectorizer pass on all operating
 systems

---
 kernel/x86_64/sgemv_n_4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index c9681fa8b..296eded5a 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 

From 99f6d31ed52822ec69bba1f225ef889e99d38f99 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Apr 2023 23:42:55 +0200
Subject: [PATCH 048/718] Disable gcc's tree-vectorizer pass on all operating
 systems

---
 kernel/x86_64/sgemv_t_4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
index 07aa51503..ea89a2aaf 100644
--- a/kernel/x86_64/sgemv_t_4.c
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 

From d18efaed20e4ed48bba3777a091fa4e49e35b67f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Apr 2023 23:43:43 +0200
Subject: [PATCH 049/718] Disable gcc's tree-vectorizer pass on all operating
 systems

---
 kernel/x86_64/ssymv_L.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c
index 45914daf5..4826b00c6 100644
--- a/kernel/x86_64/ssymv_L.c
+++ b/kernel/x86_64/ssymv_L.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 

From bb6d6735bf094b8f1bf6fb7a986f5360c0baf2c7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Apr 2023 23:44:15 +0200
Subject: [PATCH 050/718] Disable gcc's tree-vectorizer pass on all operating
 systems

---
 kernel/x86_64/ssymv_U.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c
index 26e5ca7e9..06db14ebe 100644
--- a/kernel/x86_64/ssymv_U.c
+++ b/kernel/x86_64/ssymv_U.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 

From 66b39b835c33b1f9300d042010d255f1465a8e3e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Apr 2023 23:44:45 +0200
Subject: [PATCH 051/718] Disable gcc's tree-vectorizer pass on all operating
 systems

---
 kernel/x86_64/zdot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index 27397ccfa..72a712a9e 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 

From c2fe9cb91fab22ff00a3a660aa106cfe6b9e132f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Apr 2023 23:45:14 +0200
Subject: [PATCH 052/718] Disable gcc's tree-vectorizer pass on all operating
 systems

---
 kernel/x86_64/zgemv_n_4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c
index 8fc960610..678cea957 100644
--- a/kernel/x86_64/zgemv_n_4.c
+++ b/kernel/x86_64/zgemv_n_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 

From c9174ae8d7e385c9fd030d263f6ff5e07aa9b2ee Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Apr 2023 23:45:44 +0200
Subject: [PATCH 053/718] Disable gcc's tree-vectorizer pass on all operating
 systems

---
 kernel/x86_64/zgemv_t_4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c
index 63c8b11a4..44d545df7 100644
--- a/kernel/x86_64/zgemv_t_4.c
+++ b/kernel/x86_64/zgemv_t_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 

From 84bcf6639fb2de9ae15350a6b3979bf5757bae85 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 20 Apr 2023 23:24:52 +0200
Subject: [PATCH 054/718]  Disable gcc's tree-vectorizer pass on all operating
 systems

---
 kernel/x86_64/cgemv_n_4.c | 5 +++--
 kernel/x86_64/cgemv_t_4.c | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c
index 3ca173c20..94e6d2c77 100644
--- a/kernel/x86_64/cgemv_n_4.c
+++ b/kernel/x86_64/cgemv_n_4.c
@@ -24,9 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
+#if (defined(__GNUC__) && __GNUC__ > 11) 
+#pragma GCC optimize("no-tree-vectorize")
+#endif
 
-#include <stdlib.h>
-#include <stdio.h>
 #include "common.h"
 
 #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c
index 3187e196c..f123e81d1 100644
--- a/kernel/x86_64/cgemv_t_4.c
+++ b/kernel/x86_64/cgemv_t_4.c
@@ -24,7 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
+#if (defined(__GNUC__) && __GNUC__ > 11) 
+#pragma GCC optimize("no-tree-vectorize")
+#endif
 
 #include "common.h"
 

From 67d33e5b9878a42e0b61151a6bd40ac728fbe19c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20M=C3=BCtzel?= <markus.muetzel@gmx.de>
Date: Fri, 21 Apr 2023 13:02:23 +0200
Subject: [PATCH 055/718] CI (MSYS2): Update location of compiler cache.

---
 .github/workflows/dynamic_arch.yml | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index ff40b354d..fdfad3365 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -217,11 +217,16 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v3
 
+      - name: Prepare ccache
+        # Get cache location of ccache
+        id: ccache-prepare
+        run: |
+          echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
+
       - name: Compilation cache
         uses: actions/cache@v3
         with:
-          # It looks like this path needs to be hard-coded.
-          path: C:/msys64/home/runneradmin/.ccache
+          path: ${{ steps.ccache-prepare.outputs.ccachedir }}
           # We include the commit sha in the cache key, as new cache entries are
           # only created if there is no existing entry for the key yet.
           key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
@@ -234,9 +239,10 @@ jobs:
         # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
         run: |
           which ccache
-          test -d ~/.ccache || mkdir -p ~/.ccache
-          echo "max_size = 250M" > ~/.ccache/ccache.conf
-          echo "compression = true" >> ~/.ccache/ccache.conf
+          test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }}
+          echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
+          echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
+          ccache -p
           ccache -s
           echo $HOME
           cygpath -w $HOME

From e27e9a50b1d831d2b02a7d1c7477fab5ab9f460d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20M=C3=BCtzel?= <markus.muetzel@gmx.de>
Date: Fri, 21 Apr 2023 14:10:40 +0200
Subject: [PATCH 056/718] CI (MSYS2): Save ccache before running tests.

---
 .github/workflows/dynamic_arch.yml | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index fdfad3365..b537c8494 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -219,17 +219,19 @@ jobs:
 
       - name: Prepare ccache
         # Get cache location of ccache
+        # Create key that is used in action/cache/restore and action/cache/save steps
         id: ccache-prepare
         run: |
           echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
+          # We include the commit sha in the cache key, as new cache entries are
+          # only created if there is no existing entry for the key yet.
+          echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT
 
-      - name: Compilation cache
-        uses: actions/cache@v3
+      - name: Restore ccache
+        uses: actions/cache/restore@v3
         with:
           path: ${{ steps.ccache-prepare.outputs.ccachedir }}
-          # We include the commit sha in the cache key, as new cache entries are
-          # only created if there is no existing entry for the key yet.
-          key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
+          key: ${{ steps.ccache-prepare.outputs.key }}
           # Restore a matching ccache cache entry. Prefer same branch.
           restore-keys: |
             ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
@@ -270,6 +272,13 @@ jobs:
         continue-on-error: true
         run: ccache -s
 
+      - name: Save ccache
+        # Save the cache after we are done (successfully) building
+        uses: actions/cache/save@v3
+        with:
+          path: ${{ steps.ccache-prepare.outputs.ccachedir }}
+          key: ${{ steps.ccache-prepare.outputs.key }}
+
       - name: Run tests
         timeout-minutes: 60
         run: cd build && ctest

From f2c6a34de9836ff9c2b3684f5ce19e08098f373e Mon Sep 17 00:00:00 2001
From: lilinjie <lilinjie@uniontech.com>
Date: Mon, 24 Apr 2023 10:29:59 +0800
Subject: [PATCH 057/718] fix typo

Signed-off-by: lilinjie <lilinjie@uniontech.com>
---
 GotoBLAS_06WeirdPerformance.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GotoBLAS_06WeirdPerformance.txt b/GotoBLAS_06WeirdPerformance.txt
index 05766e17b..0f7cec5c9 100644
--- a/GotoBLAS_06WeirdPerformance.txt
+++ b/GotoBLAS_06WeirdPerformance.txt
@@ -11,7 +11,7 @@
    operation is finished.
 
 
-2. Simlar problem may happen under virtual machine. If supervisor
+2. Similar problem may happen under virtual machine. If supervisor
    allocates different cores for each scheduling, BLAS performnace
    will be bad. This is because BLAS also utilizes all cache,
    unexpected re-schedule for different core may result of heavy

From 7152d6b06d549e7f1b474c0eedcf0c8bba5d9421 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 27 Apr 2023 08:36:20 +0200
Subject: [PATCH 058/718] fix cblas_gemmt

---
 interface/gemmt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/gemmt.c b/interface/gemmt.c
index d35406411..0ec754fb9 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -177,7 +177,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 #else
 
 void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
-	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
+	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
 	   blasint k,
 #ifndef COMPLEX
 	   FLOAT alpha,
@@ -392,7 +392,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 #endif
 
-	if ((m == 0) )
+	if (m == 0)
 		return;
 
 	IDEBUG_START;

From 1f6f7328ebcada538eb30894e46d89f188e3a217 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 27 Apr 2023 09:14:12 +0200
Subject: [PATCH 059/718] remove redundant declaration

---
 interface/gemmt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/gemmt.c b/interface/gemmt.c
index 0ec754fb9..cebc7918d 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -199,7 +199,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 	int transa, transb, uplo;
 	blasint info;
-	blasint m, lda, ldb;
+	blasint lda, ldb;
 	FLOAT *a, *b;
 	XFLOAT *buffer;
 

From 7b16c4c051bfafcf8f926daec6bdbe48983308cf Mon Sep 17 00:00:00 2001
From: Han Gao <gaohan@iscas.ac.cn>
Date: Fri, 28 Apr 2023 04:32:06 +0000
Subject: [PATCH 060/718] CI (C910V): add test

Signed-off-by: Han Gao <gaohan@iscas.ac.cn>
---
 .github/workflows/c910v.yml | 121 ++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)
 create mode 100644 .github/workflows/c910v.yml

diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
new file mode 100644
index 000000000..199304fb1
--- /dev/null
+++ b/.github/workflows/c910v.yml
@@ -0,0 +1,121 @@
+name: c910v qemu test
+
+on: [push, pull_request]
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  TEST:
+    runs-on: ubuntu-latest
+    env:
+      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
+      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: RISCV64_GENERIC
+            triple: riscv64-linux-gnu
+            apt_triple: riscv64-linux-gnu
+            opts: NO_SHARED=1 TARGET=RISCV64_GENERIC
+          - target: C910V
+            triple: riscv64-unknown-linux-gnu
+            apt_triple: riscv64-linux-gnu
+            opts: NO_SHARED=1 TARGET=C910V
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: install build deps
+        run: |
+          sudo apt-get update
+          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
+          gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
+
+      - name: checkout qemu
+        uses: actions/checkout@v3
+        with:
+          repository: T-head-Semi/qemu
+          path: qemu
+          ref: 1e692ebb43d396c52352406323fc782c1ac99a42
+
+      - name: build qemu
+        run: |
+          # Force use c910v qemu-user
+          wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
+          cd qemu
+          patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
+          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
+          make -j$(nproc)
+          make install
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: build OpenBLAS
+        run: |
+            wget ${xuetie_toolchain}/${toolchain_file_name}
+            tar -xvf ${toolchain_file_name} -C /opt
+            export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
+
+            make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
+
+      - name: test
+        run: |
+          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
+          qemu-riscv64 ./utest/openblas_utest
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat

From b1781ad338842d4e91d8c851986e8e1a78986fe8 Mon Sep 17 00:00:00 2001
From: "Kirill A. Korinsky" <kirill@korins.ky>
Date: Sat, 29 Apr 2023 11:13:26 +0200
Subject: [PATCH 061/718] Do not requires GNU mktemp

Historically the GNU mktemp was the first one which doesn't requires
`-t` to create a directory.

Here I've introduced a fallback when `-t` is required.

For example MacPorts contains similar patch: https://github.com/macports/macports-ports/blob/bbe8abfe26651cc46d496bf48ecda506faa40a4a/math/OpenBLAS/files/patch-MacOSX-mktemp.diff
---
 c_check | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/c_check b/c_check
index 232adba67..8b633ab8b 100755
--- a/c_check
+++ b/c_check
@@ -168,7 +168,7 @@ fi
 
 no_msa=0
 if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
-    tmpd="$(mktemp -d)"
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"addvi.b $w0, $w1, 1"'
     msa_flags='-mmsa -mfp64 -mload-store-pairs'
@@ -208,7 +208,7 @@ esac
 
 no_avx512=0
 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
     printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@@ -229,7 +229,7 @@ fi
 
 no_rv64gv=0
 if [ "$architecture" = "riscv64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"vsetvli    zero, zero, e8, m1\n"'
     printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@@ -245,7 +245,7 @@ fi
 
 no_sve=0
 if [ "$architecture" = "arm64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
     args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
@@ -261,7 +261,7 @@ fi
 c11_atomics=0
 case "$data" in
     *HAVE_C11*)
-        tmpd=`mktemp -d`
+        tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
         tmpf="$tmpd/a.c"
         printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
         args=" -c -o $tmpf.o $tmpf"

From e5538a62cb71c50623e117888a85ba26cd11dc9d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 4 May 2023 22:56:39 +0200
Subject: [PATCH 062/718] Add suggestions to NUM_THREADS/auxiliary buffer
 message

---
 driver/others/memory.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 4493b7d71..3cbd17bc2 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -3015,6 +3015,8 @@ void *blas_memory_alloc(int procpos){
 #endif
  if (memory_overflowed) goto terminate;
   fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
+  fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
+  fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS);
   memory_overflowed=1;
   new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
   newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));

From 60226b35e1e24a64be69c7f65934286df2dcfedb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 6 May 2023 12:44:38 +0200
Subject: [PATCH 063/718] Fix (redundant) lapack-runtest target

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 3c4b8948a..299970c67 100644
--- a/Makefile
+++ b/Makefile
@@ -373,10 +373,10 @@ ifneq ($(CROSS), 1)
 	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
 endif
 
-lapack-runtest:
+lapack-runtest: lapack-test
 	( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
         ./testsecond; ./testdsecnd; ./testieee; ./testversion )
-	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
+	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
 
 
 blas-test:

From c2078b2356fdad2badfd385cae50ea8499c2c6a6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 May 2023 20:15:13 +0200
Subject: [PATCH 064/718] Mark xerbla's arguments as const

---
 cblas.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cblas.h b/cblas.h
index c2bdd27fa..8a5055cf8 100644
--- a/cblas.h
+++ b/cblas.h
@@ -350,7 +350,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
 void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                   OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
 
-void cblas_xerbla(blasint p, char *rout, char *form, ...);
+void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
 
 /*** BLAS extensions ***/
 

From c74ee11376600ecf546240a78c6bf6a9728b91e0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 8 May 2023 14:24:38 +0200
Subject: [PATCH 065/718] Add an M1-based OSX crossbuild and a NeoverseN1 build
 to CIRRUS CI (#3997)

* Add an M1-based OSX crossbuild and a NeoverseN1 build (plus Windows//LLVM commented out for now)
---
 .cirrus.yml | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/.cirrus.yml b/.cirrus.yml
index d16eb811a..b8f03acaf 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -2,9 +2,65 @@ macos_instance:
   image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
 
 task:
+  name: AppleM1/LLVM
   compile_script:
   - brew install llvm
   - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
   - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
   - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
   - make TARGET=VORTEX USE_OPENMP=1 CC=clang
+
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4
+task:
+  name: AppleM1/LLVM x86_64 xbuild
+  compile_script:
+  - #brew install llvm
+  - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" 
+  - export ARCHS="i386 x86_64"
+  - export ARCHS_STANDARD="i386 x86_64"
+  - export ARCHS_STANDARD_32_64_BIT="i386 x86_64"
+  - export ARCHS_STANDARD_64_BIT=x86_64
+  - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
+  - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
+  - export VALID_ARCHS="i386 x86_64"
+  - #find /Applications/Xcode-13.4.1.app -name libunwind.dylib
+  - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator15.5.sdk -arch x86_64 -miphoneos-version-min=10.0"
+  - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
+  always:
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
+#  lib_artifacts:
+#    path: "libopenblas*"
+#    type: application/octet-streamm
+
+
+task:
+  name: NeoverseN1
+  arm_container:
+    image: node:latest
+  compile_script:
+  - make
+  
+
+#task:
+#  name: Windows/LLVM16     --- too slow ---
+#  windows_container:
+#    image: cirrusci/windowsservercore:cmake-2021.12.07
+#  install_script: 
+#    - choco list --localonly
+#    - choco install -y llvm
+#    - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"'
+#    - choco install -y ninja
+#    - refreshenv
+#    - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build"
+#    - vcvarsall x64
+#    - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build"
+#    - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release
+#    - cd build
+#    - cmake --build .
+#    - ctest

From 33ab415f68e90fbe78874e4b18fee185e998fb96 Mon Sep 17 00:00:00 2001
From: Ken Ho <klho@alumni.caltech.edu>
Date: Mon, 8 May 2023 14:43:56 -0700
Subject: [PATCH 066/718] Bug fix and improvements for [z]imatcopy interface.

---
 interface/imatcopy.c  | 34 ++++++++---------
 interface/zimatcopy.c | 89 +++++++++++++++++++------------------------
 2 files changed, 57 insertions(+), 66 deletions(-)

diff --git a/interface/imatcopy.c b/interface/imatcopy.c
index 91975f7f4..495243949 100644
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@@ -120,17 +120,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
+
 #ifdef NEW_IMATCOPY
-    if ( *lda == *ldb && *rows == *cols) {
+    if ( *lda == *ldb ) {
         if ( order == BlasColMajor )
         {
             if ( trans == BlasNoTrans )
             {
                 IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda );
+                return;
             }
-            else
+            else if ( *rows == *cols )
             {
                 IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda );
+                return;
             }
         }
         else
@@ -138,21 +141,18 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
             if ( trans == BlasNoTrans )
             {
                 IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda );
+                return;
             }
-            else
+            else if ( *rows == *cols )
             {
                 IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda );
+                return;
             }
         }
-        return; 
     }
-
 #endif
 
-	if ( *lda >  *ldb )
-		msize = (size_t)(*lda) * (*ldb)  * sizeof(FLOAT);
-	else
-		msize = (size_t)(*ldb) * (*ldb)  * sizeof(FLOAT);
+	msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
 
 	b = malloc(msize);
 	if ( b == NULL )
@@ -165,26 +165,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 	{
 		if ( trans == BlasNoTrans )
 		{
-			OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb );
-			OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb );
+			OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows );
+			OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb );
 		}
 		else
 		{
-			OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb );
-			OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
+			OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols );
+			OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb );
 		}
 	}
 	else
 	{
 		if ( trans == BlasNoTrans )
 		{
-			OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb );
-			OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
+			OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols );
+			OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb );
 		}
 		else
 		{
-			OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb );
-			OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
+			OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows );
+			OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb );
 		}
 	}
 
diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c
index ecda5ef4e..fcad398ba 100644
--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@@ -125,27 +125,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
+
 #ifdef NEW_IMATCOPY
-    if (*lda == *ldb && *cols == *rows) {
+    if (*lda == *ldb ) {
         if ( order == BlasColMajor )
         {
 
             if ( trans == BlasNoTrans )
             {
                 IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
             if ( trans == BlasConj )
             {
                 IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
-            if ( trans == BlasTrans )
+            if ( trans == BlasTrans && *rows == *cols )
             {
                 IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
-            if ( trans == BlasTransConj )
+            if ( trans == BlasTransConj && *rows == *cols )
             {
                 IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
+
         }
         else
         {
@@ -153,67 +159,59 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
             if ( trans == BlasNoTrans )
             {
                 IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
             if ( trans == BlasConj )
             {
                 IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
-            if ( trans == BlasTrans )
+            if ( trans == BlasTrans && *rows == *cols )
             {
                 IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
-            if ( trans == BlasTransConj )
+            if ( trans == BlasTransConj && *rows == *cols )
             {
                 IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
+
         }
-        return; 
     }
 #endif
 
-	if ( *lda >  *ldb )
-                msize = (size_t)(*lda) * (*ldb)  * sizeof(FLOAT) * 2;
-        else
-                msize = (size_t)(*ldb) * (*ldb)  * sizeof(FLOAT) * 2;
-
-        b = malloc(msize);
-        if ( b == NULL )
-        {
-                printf("Memory alloc failed in zimatcopy\n");
-                exit(1);
-        }
+	msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
 
+	b = malloc(msize);
+	if ( b == NULL )
+	{
+		printf("Memory alloc failed\n");
+		exit(1);
+	}
 
 	if ( order == BlasColMajor )
 	{
 
 		if ( trans == BlasNoTrans )
 		{
-	  		OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+	  		OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
+	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
 		if ( trans == BlasConj )
 		{
-			OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
+	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
 		if ( trans == BlasTrans )
 		{
-			OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
+	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
 		if ( trans == BlasTransConj )
 		{
-			OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
+	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
 
 	}
@@ -222,34 +220,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 
 		if ( trans == BlasNoTrans )
 		{
-			OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
+	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
 		if ( trans == BlasConj )
 		{
-			OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
+	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
 		if ( trans == BlasTrans )
 		{
-			OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
+	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
 		if ( trans == BlasTransConj )
 		{
-			OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
+	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
 
 	}
+
 	free(b);
 	return;
 

From 7a86c437b52c0664889769b663f3937cf45a2d94 Mon Sep 17 00:00:00 2001
From: Ken Ho <klho@alumni.caltech.edu>
Date: Wed, 10 May 2023 09:13:04 -0700
Subject: [PATCH 067/718] Change some "if" statements to "else if" following
 suggestion by @mmuetzel.

---
 interface/zimatcopy.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c
index fcad398ba..34454c582 100644
--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@@ -198,17 +198,17 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 	  		OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
 	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
-		if ( trans == BlasConj )
+		else if ( trans == BlasConj )
 		{
 			OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
 	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
-		if ( trans == BlasTrans )
+		else if ( trans == BlasTrans )
 		{
 			OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
 	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
-		if ( trans == BlasTransConj )
+		else if ( trans == BlasTransConj )
 		{
 			OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
 	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
@@ -223,17 +223,17 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 			OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
 	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
-		if ( trans == BlasConj )
+		else if ( trans == BlasConj )
 		{
 			OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
 	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
-		if ( trans == BlasTrans )
+		else if ( trans == BlasTrans )
 		{
 			OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
 	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
-		if ( trans == BlasTransConj )
+		else if ( trans == BlasTransConj )
 		{
 			OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
 	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );

From df1b1f6a91461c5e0e7b3564693e4474d744bf19 Mon Sep 17 00:00:00 2001
From: Ken Ho <klho@alumni.caltech.edu>
Date: Fri, 12 May 2023 09:41:52 -0700
Subject: [PATCH 068/718] More detailed error message in [z]imatcopy.c.

---
 interface/imatcopy.c  | 2 +-
 interface/zimatcopy.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/imatcopy.c b/interface/imatcopy.c
index 495243949..c4417e99c 100644
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@@ -157,7 +157,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 	b = malloc(msize);
 	if ( b == NULL )
 	{
-		printf("Memory alloc failed\n");
+		printf("Memory alloc failed in imatcopy\n");
 		exit(1);
 	}
 
diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c
index 34454c582..b0b32dc87 100644
--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@@ -186,7 +186,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 	b = malloc(msize);
 	if ( b == NULL )
 	{
-		printf("Memory alloc failed\n");
+		printf("Memory alloc failed in zimatcopy\n");
 		exit(1);
 	}
 

From e2779c852f0630b6ff6a2eec9e8cfbb014e300f7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 15 May 2023 20:49:56 +0200
Subject: [PATCH 069/718] Do not build the tests when only the CBLAS interface
 is selected (#4041)

* Do not build the tests when only the CBLAS interface is selected
---
 CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d59290c90..cc964b76e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -311,19 +311,25 @@ endif()
 
 #if (MSVC OR NOT NOFORTRAN)
 if (NOT NO_CBLAS)
+  if (NOT ONLY_CBLAS)
   # Broken without fortran on unix
-  add_subdirectory(utest)
+    add_subdirectory(utest)
+endif()
 endif()
 
 if (NOT NOFORTRAN)
+ if (NOT ONLY_CBLAS)
   # Build test and ctest
   add_subdirectory(test)
+ endif() 
   if (BUILD_TESTING)
     add_subdirectory(lapack-netlib/TESTING)
   endif()
 endif()
   if(NOT NO_CBLAS)
+   if (NOT ONLY_CBLAS)
     add_subdirectory(ctest)
+   endif()
   endif()
   if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
     add_subdirectory(cpp_thread_test)

From 86f48997c749d54a7c13193c500247c5610bb9f6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 16 May 2023 12:01:50 +0200
Subject: [PATCH 070/718] CirrusCI: Add Neoverse build with OpenMP (#4042)

* Add Neoverse build with OpenMP
---
 .cirrus.yml | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index b8f03acaf..7ca1cc7bc 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -46,7 +46,14 @@ task:
   compile_script:
   - make
   
-
+task:
+  name: NeoverseN1-OMP
+  arm_container:
+    image: node:latest
+    cpu: 8
+  compile_script:
+  - make USE_OPENMP=1
+  
 #task:
 #  name: Windows/LLVM16     --- too slow ---
 #  windows_container:

From c5f7e46526a602689f1b19e0598a411fb0a33111 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 16 May 2023 19:54:42 +0200
Subject: [PATCH 071/718] Fix typos and errors in comments (Reference-LAPACK
 809)

---
 lapack-netlib/cgbsvx.f |  644 ++++++++++++
 lapack-netlib/cgejsv.f | 2232 +++++++++++++++++++++++++++++++++++++++
 lapack-netlib/cgesvx.f |  602 +++++++++++
 lapack-netlib/dgbsvx.f |  639 ++++++++++++
 lapack-netlib/dgejsv.f | 1780 ++++++++++++++++++++++++++++++++
 lapack-netlib/dgesvx.f |  599 +++++++++++
 lapack-netlib/sgbsvx.f |  641 ++++++++++++
 lapack-netlib/sgesvx.f |  599 +++++++++++
 lapack-netlib/zgbsvx.f |  644 ++++++++++++
 lapack-netlib/zgejsv.f | 2234 ++++++++++++++++++++++++++++++++++++++++
 lapack-netlib/zgesvx.f |  602 +++++++++++
 11 files changed, 11216 insertions(+)
 create mode 100644 lapack-netlib/cgbsvx.f
 create mode 100644 lapack-netlib/cgejsv.f
 create mode 100644 lapack-netlib/cgesvx.f
 create mode 100644 lapack-netlib/dgbsvx.f
 create mode 100644 lapack-netlib/dgejsv.f
 create mode 100644 lapack-netlib/dgesvx.f
 create mode 100644 lapack-netlib/sgbsvx.f
 create mode 100644 lapack-netlib/sgesvx.f
 create mode 100644 lapack-netlib/zgbsvx.f
 create mode 100644 lapack-netlib/zgejsv.f
 create mode 100644 lapack-netlib/zgesvx.f

diff --git a/lapack-netlib/cgbsvx.f b/lapack-netlib/cgbsvx.f
new file mode 100644
index 000000000..eaab5682c
--- /dev/null
+++ b/lapack-netlib/cgbsvx.f
@@ -0,0 +1,644 @@
+*> \brief <b> CGBSVX computes the solution to system of linear equations A * X = B for GB matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGBSVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgbsvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgbsvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgbsvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+*                          LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+*                          RCOND, FERR, BERR, WORK, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+*       REAL               RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * )
+*       REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   RWORK( * )
+*       COMPLEX            AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGBSVX uses the LU factorization to compute the solution to a complex
+*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+*> where A is a band matrix of order N with KL subdiagonals and KU
+*> superdiagonals, and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed by this subroutine:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = L * U,
+*>    where L is a product of permutation and unit lower triangular
+*>    matrices with KL subdiagonals, and U is upper triangular with
+*>    KL+KU superdiagonals.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AFB and IPIV contain the factored form of
+*>                  A.  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  AB, AFB, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AFB and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AFB and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations.
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KL
+*> \verbatim
+*>          KL is INTEGER
+*>          The number of subdiagonals within the band of A.  KL >= 0.
+*> \endverbatim
+*>
+*> \param[in] KU
+*> \verbatim
+*>          KU is INTEGER
+*>          The number of superdiagonals within the band of A.  KU >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX array, dimension (LDAB,N)
+*>          On entry, the matrix A in band storage, in rows 1 to KL+KU+1.
+*>          The j-th column of A is stored in the j-th column of the
+*>          array AB as follows:
+*>          AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl)
+*>
+*>          If FACT = 'F' and EQUED is not 'N', then A must have been
+*>          equilibrated by the scaling factors in R and/or C.  AB is not
+*>          modified if FACT = 'F' or 'N', or if FACT = 'E' and
+*>          EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] AFB
+*> \verbatim
+*>          AFB is COMPLEX array, dimension (LDAFB,N)
+*>          If FACT = 'F', then AFB is an input argument and on entry
+*>          contains details of the LU factorization of the band matrix
+*>          A, as computed by CGBTRF.  U is stored as an upper triangular
+*>          band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1,
+*>          and the multipliers used during the factorization are stored
+*>          in rows KL+KU+2 to 2*KL+KU+1.  If EQUED .ne. 'N', then AFB is
+*>          the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of A.
+*>
+*>          If FACT = 'E', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of the equilibrated
+*>          matrix A (see the description of AB for the form of the
+*>          equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAFB
+*> \verbatim
+*>          LDAFB is INTEGER
+*>          The leading dimension of the array AFB.  LDAFB >= 2*KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = L*U
+*>          as computed by CGBTRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is REAL array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (LDB,NRHS)
+*>          On entry, the right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is COMPLEX array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is REAL
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (MAX(1,N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexGBsolve
+*
+*  =====================================================================
+      SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, RWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX            AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*  Moved setting of INFO = N+1 so INFO does not subsequently get
+*  overwritten.  Sven, 17 Mar 05.
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J, J1, J2
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               CLANGB, CLANTB, SLAMCH
+      EXTERNAL           LSAME, CLANGB, CLANTB, SLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CGBCON, CGBEQU, CGBRFS, CGBTRF, CGBTRS,
+     $                   CLACPY, CLAQGB, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KL.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KU.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDAB.LT.KL+KU+1 ) THEN
+         INFO = -8
+      ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN
+         INFO = -10
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -12
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -13
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -14
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -18
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGBSVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL CGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL CLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                   AMAX, EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of the band matrix A.
+*
+         DO 70 J = 1, N
+            J1 = MAX( J-KU, 1 )
+            J2 = MIN( J+KL, N )
+            CALL CCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1,
+     $                  AFB( KL+KU+1-J+J1, J ), 1 )
+   70    CONTINUE
+*
+         CALL CGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            ANORM = ZERO
+            DO 90 J = 1, INFO
+               DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 )
+                  ANORM = MAX( ANORM, ABS( AB( I, J ) ) )
+   80          CONTINUE
+   90       CONTINUE
+            RPVGRW = CLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ),
+     $                       AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB,
+     $                       RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ANORM / RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = CLANGB( NORM, N, KL, KU, AB, LDAB, RWORK )
+      RPVGRW = CLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = CLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL CGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND,
+     $             WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL CGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX,
+     $             INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL CGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV,
+     $             B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 110 J = 1, NRHS
+               DO 100 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+  100          CONTINUE
+  110       CONTINUE
+            DO 120 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+  120       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 140 J = 1, NRHS
+            DO 130 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  130       CONTINUE
+  140    CONTINUE
+         DO 150 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  150    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of CGBSVX
+*
+      END
diff --git a/lapack-netlib/cgejsv.f b/lapack-netlib/cgejsv.f
new file mode 100644
index 000000000..51a6cee4e
--- /dev/null
+++ b/lapack-netlib/cgejsv.f
@@ -0,0 +1,2232 @@
+*> \brief \b CGEJSV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGEJSV + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgejsv.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgejsv.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgejsv.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+*                         M, N, A, LDA, SVA, U, LDU, V, LDV,
+*                         CWORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*     .. Scalar Arguments ..
+*     IMPLICIT    NONE
+*     INTEGER     INFO, LDA, LDU, LDV, LWORK, M, N
+*     ..
+*     .. Array Arguments ..
+*     COMPLEX     A( LDA, * ),  U( LDU, * ), V( LDV, * ), CWORK( LWORK )
+*     REAL        SVA( N ), RWORK( LRWORK )
+*     INTEGER     IWORK( * )
+*     CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGEJSV computes the singular value decomposition (SVD) of a complex M-by-N
+*> matrix [A], where M >= N. The SVD of [A] is written as
+*>
+*>              [A] = [U] * [SIGMA] * [V]^*,
+*>
+*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and
+*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are
+*> the singular values of [A]. The columns of [U] and [V] are the left and
+*> the right singular vectors of [A], respectively. The matrices [U] and [V]
+*> are computed and stored in the arrays U and V, respectively. The diagonal
+*> of [SIGMA] is computed and stored in the array SVA.
+*> \endverbatim
+*>
+*>  Arguments:
+*>  ==========
+*>
+*> \param[in] JOBA
+*> \verbatim
+*>          JOBA is CHARACTER*1
+*>         Specifies the level of accuracy:
+*>       = 'C': This option works well (high relative accuracy) if A = B * D,
+*>              with well-conditioned B and arbitrary diagonal matrix D.
+*>              The accuracy cannot be spoiled by COLUMN scaling. The
+*>              accuracy of the computed output depends on the condition of
+*>              B, and the procedure aims at the best theoretical accuracy.
+*>              The relative error max_{i=1:N}|d sigma_i| / sigma_i is
+*>              bounded by f(M,N)*epsilon* cond(B), independent of D.
+*>              The input matrix is preprocessed with the QRF with column
+*>              pivoting. This initial preprocessing and preconditioning by
+*>              a rank revealing QR factorization is common for all values of
+*>              JOBA. Additional actions are specified as follows:
+*>       = 'E': Computation as with 'C' with an additional estimate of the
+*>              condition number of B. It provides a realistic error bound.
+*>       = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings
+*>              D1, D2, and well-conditioned matrix C, this option gives
+*>              higher accuracy than the 'C' option. If the structure of the
+*>              input matrix is not known, and relative accuracy is
+*>              desirable, then this option is advisable. The input matrix A
+*>              is preprocessed with QR factorization with FULL (row and
+*>              column) pivoting.
+*>       = 'G': Computation as with 'F' with an additional estimate of the
+*>              condition number of B, where A=B*D. If A has heavily weighted
+*>              rows, then using this condition number gives too pessimistic
+*>              error bound.
+*>       = 'A': Small singular values are not well determined by the data 
+*>              and are considered as noisy; the matrix is treated as
+*>              numerically rank deficient. The error in the computed
+*>              singular values is bounded by f(m,n)*epsilon*||A||.
+*>              The computed SVD A = U * S * V^* restores A up to
+*>              f(m,n)*epsilon*||A||.
+*>              This gives the procedure the licence to discard (set to zero)
+*>              all singular values below N*epsilon*||A||.
+*>       = 'R': Similar as in 'A'. Rank revealing property of the initial
+*>              QR factorization is used do reveal (using triangular factor)
+*>              a gap sigma_{r+1} < epsilon * sigma_r in which case the
+*>              numerical RANK is declared to be r. The SVD is computed with
+*>              absolute error bounds, but more accurately than with 'A'.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>         Specifies whether to compute the columns of U:
+*>       = 'U': N columns of U are returned in the array U.
+*>       = 'F': full set of M left sing. vectors is returned in the array U.
+*>       = 'W': U may be used as workspace of length M*N. See the description
+*>              of U.
+*>       = 'N': U is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>         Specifies whether to compute the matrix V:
+*>       = 'V': N columns of V are returned in the array V; Jacobi rotations
+*>              are not explicitly accumulated.
+*>       = 'J': N columns of V are returned in the array V, but they are
+*>              computed as the product of Jacobi rotations, if JOBT = 'N'.
+*>       = 'W': V may be used as workspace of length N*N. See the description
+*>              of V.
+*>       = 'N': V is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>         Specifies the RANGE for the singular values. Issues the licence to
+*>         set to zero small positive singular values if they are outside
+*>         specified range. If A .NE. 0 is scaled so that the largest singular
+*>         value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues
+*>         the licence to kill columns of A whose norm in c*A is less than
+*>         SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
+*>         where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E').
+*>       = 'N': Do not kill small columns of c*A. This option assumes that
+*>              BLAS and QR factorizations and triangular solvers are
+*>              implemented to work in that range. If the condition of A
+*>              is greater than BIG, use CGESVJ.
+*>       = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)]
+*>              (roughly, as described above). This option is recommended.
+*>                                             ===========================
+*>         For computing the singular values in the FULL range [SFMIN,BIG]
+*>         use CGESVJ.
+*> \endverbatim
+*>
+*> \param[in] JOBT
+*> \verbatim
+*>          JOBT is CHARACTER*1
+*>         If the matrix is square then the procedure may determine to use
+*>         transposed A if A^* seems to be better with respect to convergence.
+*>         If the matrix is not square, JOBT is ignored.
+*>         The decision is based on two values of entropy over the adjoint
+*>         orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
+*>       = 'T': transpose if entropy test indicates possibly faster
+*>         convergence of Jacobi process if A^* is taken as input. If A is
+*>         replaced with A^*, then the row pivoting is included automatically.
+*>       = 'N': do not speculate.
+*>         The option 'T' can be used to compute only the singular values, or
+*>         the full SVD (U, SIGMA and V). For only one set of singular vectors
+*>         (U or V), the caller should provide both U and V, as one of the
+*>         matrices is used as workspace if the matrix A is transposed.
+*>         The implementer can easily remove this constraint and make the
+*>         code more complicated. See the descriptions of U and V.
+*>         In general, this option is considered experimental, and 'N'; should
+*>         be preferred. This is subject to changes in the future.
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>          JOBP is CHARACTER*1
+*>         Issues the licence to introduce structured perturbations to drown
+*>         denormalized numbers. This licence should be active if the
+*>         denormals are poorly implemented, causing slow computation,
+*>         especially in cases of fast convergence (!). For details see [1,2].
+*>         For the sake of simplicity, this perturbations are included only
+*>         when the full SVD or only the singular values are requested. The
+*>         implementer/user can easily add the perturbation for the cases of
+*>         computing one set of singular vectors.
+*>       = 'P': introduce perturbation
+*>       = 'N': do not perturb
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>         The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>         The number of columns of the input matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] SVA
+*> \verbatim
+*>          SVA is REAL array, dimension (N)
+*>          On exit,
+*>          - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>            the computation SVA contains Euclidean column norms of the
+*>            iterated matrices in the array A.
+*>          - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>            (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
+*>            sigma_max(A) overflows or if small singular values have been
+*>            saved from underflow by scaling the input matrix A.
+*>          - If JOBR='R' then some of the singular values may be returned
+*>            as exact zeros obtained by "set to zero" because they are
+*>            below the numerical rank threshold or are denormalized numbers.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is COMPLEX array, dimension ( LDU, N ) or ( LDU, M )
+*>          If JOBU = 'U', then U contains on exit the M-by-N matrix of
+*>                         the left singular vectors.
+*>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
+*>                         the left singular vectors, including an ONB
+*>                         of the orthogonal complement of the Range(A).
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
+*>                         then U is used as workspace if the procedure
+*>                         replaces A with A^*. In that case, [V] is computed
+*>                         in U as left singular vectors of A^* and then
+*>                         copied back to the V array. This 'W' option is just
+*>                         a reminder to the caller that in this case U is
+*>                         reserved as workspace of length N*N.
+*>          If JOBU = 'N'  U is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER
+*>          The leading dimension of the array U,  LDU >= 1.
+*>          IF  JOBU = 'U' or 'F' or 'W',  then LDU >= M.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is COMPLEX array, dimension ( LDV, N )
+*>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
+*>                         the right singular vectors;
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
+*>                         then V is used as workspace if the pprocedure
+*>                         replaces A with A^*. In that case, [U] is computed
+*>                         in V as right singular vectors of A^* and then
+*>                         copied back to the U array. This 'W' option is just
+*>                         a reminder to the caller that in this case V is
+*>                         reserved as workspace of length N*N.
+*>          If JOBV = 'N'  V is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V,  LDV >= 1.
+*>          If JOBV = 'V' or 'J' or 'W', then LDV >= N.
+*> \endverbatim
+*>
+*> \param[out] CWORK
+*> \verbatim
+*>          CWORK is COMPLEX array, dimension (MAX(2,LWORK))
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit CWORK(1) contains the required length of 
+*>          CWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          Length of CWORK to confirm proper allocation of workspace.
+*>          LWORK depends on the job:
+*>
+*>          1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and
+*>            1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>               LWORK >= 2*N+1. This is the minimal requirement.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= N + (N+1)*NB. Here NB is the optimal
+*>               block size for CGEQP3 and CGEQRF.
+*>               In general, optimal LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ)).        
+*>            1.2. .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G'). In this case, LWORK the minimal
+*>               requirement is LWORK >= N*N + 2*N.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ),
+*>                            N*N+LWORK(CPOCON)).
+*>          2. If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>             (JOBU = 'N')
+*>            2.1   .. no scaled condition estimate requested (JOBE = 'N'):    
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance, 
+*>               LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF,
+*>               CUNMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3), N+LWORK(CGESVJ),
+*>                       N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)).
+*>            2.2 .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G').
+*>            -> the minimal requirement is LWORK >= 3*N.      
+*>            -> For optimal performance, 
+*>               LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF,
+*>               CUNMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3), LWORK(CPOCON), N+LWORK(CGESVJ),
+*>                       N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)).   
+*>          3. If SIGMA and the left singular vectors are needed
+*>            3.1  .. no scaled condition estimate requested (JOBE = 'N'):
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3), 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). 
+*>            3.2  .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G').
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CPOCON),
+*>                        2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)).
+*>
+*>          4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and
+*>            4.1. if JOBV = 'V'
+*>               the minimal requirement is LWORK >= 5*N+2*N*N.
+*>            4.2. if JOBV = 'J' the minimal requirement is
+*>               LWORK >= 4*N+N*N.
+*>            In both cases, the allocated CWORK can accommodate blocked runs
+*>            of CGEQP3, CGEQRF, CGELQF, CUNMQR, CUNMLQ.
+*> 
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the
+*>          minimal length of CWORK for the job parameters used in the call.        
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (MAX(7,LRWORK))
+*>          On exit,
+*>          RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1)
+*>                    such that SCALE*SVA(1:N) are the computed singular values
+*>                    of A. (See the description of SVA().)
+*>          RWORK(2) = See the description of RWORK(1).
+*>          RWORK(3) = SCONDA is an estimate for the condition number of
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
+*>                    SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
+*>                    It is computed using CPOCON. It holds
+*>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*>                    where R is the triangular factor from the QRF of A.
+*>                    However, if R is truncated and the numerical rank is
+*>                    determined to be strictly smaller than N, SCONDA is
+*>                    returned as -1, thus indicating that the smallest
+*>                    singular values might be lost.
+*>
+*>          If full SVD is needed, the following two condition numbers are
+*>          useful for the analysis of the algorithm. They are provided for
+*>          a developer/implementer who is familiar with the details of
+*>          the method.
+*>
+*>          RWORK(4) = an estimate of the scaled condition number of the
+*>                    triangular factor in the first QR factorization.
+*>          RWORK(5) = an estimate of the scaled condition number of the
+*>                    triangular factor in the second QR factorization.
+*>          The following two parameters are computed if JOBT = 'T'.
+*>          They are provided for a developer/implementer who is familiar
+*>          with the details of the method.
+*>          RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy
+*>                    of diag(A^* * A) / Trace(A^* * A) taken as point in the
+*>                    probability simplex.
+*>          RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).)
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit RWORK(1) contains the required length of
+*>          RWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          Length of RWORK to confirm proper allocation of workspace.
+*>          LRWORK depends on the job:
+*>
+*>       1. If only the singular values are requested i.e. if
+*>          LSAME(JOBU,'N') .AND. LSAME(JOBV,'N')
+*>          then:
+*>          1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>               then: LRWORK = max( 7, 2 * M ).
+*>          1.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       2. If singular values with the right singular vectors are requested
+*>          i.e. if
+*>          (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND.
+*>          .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F'))
+*>          then:
+*>          2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          2.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       3. If singular values with the left singular vectors are requested, i.e. if
+*>          (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND.
+*>          .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J'))
+*>          then:
+*>          3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          3.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       4. If singular values with both the left and the right singular vectors
+*>          are requested, i.e. if
+*>          (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND.
+*>          (LSAME(JOBV,'V').OR.LSAME(JOBV,'J'))
+*>          then:
+*>          4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          4.2. Otherwise, LRWORK  = max( 7, N ).
+*> 
+*>          If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and 
+*>          the length of RWORK is returned in RWORK(1). 
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, of dimension at least 4, that further depends
+*>          on the job:
+*> 
+*>          1. If only the singular values are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N.
+*>          2. If the singular values and the right singular vectors are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>          3. If the singular values and the left singular vectors are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>          4. If the singular values with both the left and the right singular vectors
+*>             are requested, then:      
+*>             4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows:
+*>                  If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>                  then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>             4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows:
+*>                  If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>                  then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N.
+*>        
+*>          On exit,
+*>          IWORK(1) = the numerical rank determined after the initial
+*>                     QR factorization with pivoting. See the descriptions
+*>                     of JOBA and JOBR.
+*>          IWORK(2) = the number of the computed nonzero singular values
+*>          IWORK(3) = if nonzero, a warning message:
+*>                     If IWORK(3) = 1 then some of the column norms of A
+*>                     were denormalized floats. The requested high accuracy
+*>                     is not warranted by the data.
+*>          IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to
+*>                     do the job as specified by the JOB parameters.
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK = -1 and 
+*>          LRWORK = -1), then on exit IWORK(1) contains the required length of 
+*>          IWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  CGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexGEsing
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>  CGEJSV implements a preconditioned Jacobi SVD algorithm. It uses CGEQP3,
+*>  CGEQRF, and CGELQF as preprocessors and preconditioners. Optionally, an
+*>  additional row pivoting can be used as a preprocessor, which in some
+*>  cases results in much higher accuracy. An example is matrix A with the
+*>  structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned
+*>  diagonal matrices and C is well-conditioned matrix. In that case, complete
+*>  pivoting in the first QR factorizations provides accuracy dependent on the
+*>  condition number of C, and independent of D1, D2. Such higher accuracy is
+*>  not completely understood theoretically, but it works well in practice.
+*>  Further, if A can be written as A = B*D, with well-conditioned B and some
+*>  diagonal D, then the high accuracy is guaranteed, both theoretically and
+*>  in software, independent of D. For more details see [1], [2].
+*>     The computational range for the singular values can be the full range
+*>  ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS
+*>  & LAPACK routines called by CGEJSV are implemented to work in that range.
+*>  If that is not the case, then the restriction for safe computation with
+*>  the singular values in the range of normalized IEEE numbers is that the
+*>  spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not
+*>  overflow. This code (CGEJSV) is best used in this restricted range,
+*>  meaning that singular values of magnitude below ||A||_2 / SLAMCH('O') are
+*>  returned as zeros. See JOBR for details on this.
+*>     Further, this implementation is somewhat slower than the one described
+*>  in [1,2] due to replacement of some non-LAPACK components, and because
+*>  the choice of some tuning parameters in the iterative part (CGESVJ) is
+*>  left to the implementer on a particular machine.
+*>     The rank revealing QR factorization (in this code: CGEQP3) should be
+*>  implemented as in [3]. We have a new version of CGEQP3 under development
+*>  that is more robust than the current one in LAPACK, with a cleaner cut in
+*>  rank deficient cases. It will be available in the SIGMA library [4].
+*>  If M is much larger than N, it is obvious that the initial QRF with
+*>  column pivoting can be preprocessed by the QRF without pivoting. That
+*>  well known trick is not used in CGEJSV because in some cases heavy row
+*>  weighting can be treated with complete pivoting. The overhead in cases
+*>  M much larger than N is then only due to pivoting, but the benefits in
+*>  terms of accuracy have prevailed. The implementer/user can incorporate
+*>  this extra QRF step easily. The implementer can also improve data movement
+*>  (matrix transpose, matrix copy, matrix transposed copy) - this
+*>  implementation of CGEJSV uses only the simplest, naive data movement.
+*> \endverbatim
+*
+*> \par Contributor:
+*  ==================
+*>
+*>  Zlatko Drmac (Zagreb, Croatia)
+*
+*> \par References:
+*  ================
+*>
+*> \verbatim
+*>
+*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I.
+*>     SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342.
+*>     LAPACK Working note 169.
+*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II.
+*>     SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362.
+*>     LAPACK Working note 170.
+*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR
+*>     factorization software - a case study.
+*>     ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28.
+*>     LAPACK Working note 176.
+*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV,
+*>     QSVD, (H,K)-SVD computations.
+*>     Department of Mathematics, University of Zagreb, 2008, 2016.
+*> \endverbatim
+*
+*>  \par Bugs, examples and comments:
+*   =================================
+*>
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*>
+*  =====================================================================
+      SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+     $                   M, N, A, LDA, SVA, U, LDU, V, LDV,
+     $                   CWORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      INTEGER     INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX     A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK )
+      REAL        SVA( N ), RWORK( LRWORK )
+      INTEGER     IWORK( * )
+      CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*     ..
+*
+*  ===========================================================================
+*
+*     .. Local Parameters ..
+      REAL        ZERO,         ONE
+      PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+      COMPLEX     CZERO,                    CONE
+      PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), CONE = ( 1.0E0, 0.0E0 ) )
+*     ..
+*     .. Local Scalars ..
+      COMPLEX CTEMP
+      REAL    AAPP,   AAQQ,   AATMAX, AATMIN, BIG,    BIG1,   COND_OK,
+     $        CONDR1, CONDR2, ENTRA,  ENTRAT, EPSLN,  MAXPRJ, SCALEM,
+     $        SCONDA, SFMIN,  SMALL,  TEMP1,  USCAL1, USCAL2, XSC
+      INTEGER IERR,   N1,     NR,     NUMRANK,        p, q,   WARNING
+      LOGICAL ALMORT, DEFR,   ERREST, GOSCAL,  JRACC,  KILL,   LQUERY,
+     $        LSVEC,  L2ABER, L2KILL, L2PERT,  L2RANK, L2TRAN, NOSCAL,
+     $        ROWPIV, RSVEC,  TRANSP
+*
+      INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK
+      INTEGER LWCON,  LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM,
+     $        LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF
+      INTEGER LWRK_CGELQF, LWRK_CGEQP3,  LWRK_CGEQP3N, LWRK_CGEQRF,  
+     $        LWRK_CGESVJ, LWRK_CGESVJV, LWRK_CGESVJU, LWRK_CUNMLQ, 
+     $        LWRK_CUNMQR, LWRK_CUNMQRM    
+*     ..
+*     .. Local Arrays
+      COMPLEX CDUMMY(1)
+      REAL    RDUMMY(1)
+*
+*     .. Intrinsic Functions ..
+      INTRINSIC ABS, CMPLX, CONJG, ALOG, MAX, MIN, REAL, NINT, SQRT
+*     ..
+*     .. External Functions ..
+      REAL      SLAMCH, SCNRM2
+      INTEGER   ISAMAX, ICAMAX
+      LOGICAL   LSAME
+      EXTERNAL  ISAMAX, ICAMAX, LSAME, SLAMCH, SCNRM2
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL  SLASSQ, CCOPY,  CGELQF, CGEQP3, CGEQRF, CLACPY, CLAPMR,
+     $          CLASCL, SLASCL, CLASET, CLASSQ, CLASWP, CUNGQR, CUNMLQ,
+     $          CUNMQR, CPOCON, SSCAL,  CSSCAL, CSWAP,  CTRSM,  CLACGV,
+     $          XERBLA
+*
+      EXTERNAL  CGESVJ
+*     ..
+*
+*     Test the input arguments
+*
+      LSVEC  = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' )
+      JRACC  = LSAME( JOBV, 'J' )
+      RSVEC  = LSAME( JOBV, 'V' ) .OR. JRACC
+      ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' )
+      L2RANK = LSAME( JOBA, 'R' )
+      L2ABER = LSAME( JOBA, 'A' )
+      ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' )
+      L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N )
+      L2KILL = LSAME( JOBR, 'R' )
+      DEFR   = LSAME( JOBR, 'N' )
+      L2PERT = LSAME( JOBP, 'P' )
+*
+      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+*
+      IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR.
+     $     ERREST .OR. LSAME( JOBA, 'C' ) )) THEN
+         INFO = - 1
+      ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR.
+     $   ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN
+         INFO = - 2
+      ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR.
+     $   ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN
+         INFO = - 3
+      ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) )    THEN
+         INFO = - 4
+      ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN
+         INFO = - 5
+      ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN
+         INFO = - 6
+      ELSE IF ( M .LT. 0 ) THEN
+         INFO = - 7
+      ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN
+         INFO = - 8
+      ELSE IF ( LDA .LT. M ) THEN
+         INFO = - 10
+      ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN
+         INFO = - 13
+      ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN
+         INFO = - 15
+      ELSE
+*        #:)
+         INFO = 0
+      END IF
+*
+      IF ( INFO .EQ. 0 ) THEN 
+*         .. compute the minimal and the optimal workspace lengths 
+*         [[The expressions for computing the minimal and the optimal
+*         values of LCWORK, LRWORK are written with a lot of redundancy and
+*         can be simplified. However, this verbose form is useful for
+*         maintenance and modifications of the code.]]
+*
+*        .. minimal workspace length for CGEQP3 of an M x N matrix,
+*         CGEQRF of an N x N matrix, CGELQF of an N x N matrix,
+*         CUNMLQ for computing N x N matrix, CUNMQR for computing N x N
+*         matrix, CUNMQR for computing M x N matrix, respectively.
+          LWQP3 = N+1   
+          LWQRF = MAX( 1, N )
+          LWLQF = MAX( 1, N )
+          LWUNMLQ  = MAX( 1, N )
+          LWUNMQR  = MAX( 1, N )
+          LWUNMQRM = MAX( 1, M )
+*        .. minimal workspace length for CPOCON of an N x N matrix
+          LWCON = 2 * N 
+*        .. minimal workspace length for CGESVJ of an N x N matrix,
+*         without and with explicit accumulation of Jacobi rotations
+          LWSVDJ  = MAX( 2 * N, 1 )         
+          LWSVDJV = MAX( 2 * N, 1 )
+*         .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ
+          LRWQP3  = 2 * N 
+          LRWCON  = N 
+          LRWSVDJ = N 
+          IF ( LQUERY ) THEN 
+              CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, 
+     $             RDUMMY, IERR )
+              LWRK_CGEQP3 = INT( CDUMMY(1) )
+              CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR )
+              LWRK_CGEQRF = INT( CDUMMY(1) )
+              CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR )
+              LWRK_CGELQF = INT( CDUMMY(1) )
+          END IF
+          MINWRK  = 2
+          OPTWRK  = 2
+          MINIWRK = N 
+          IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN
+*             .. minimal and optimal sizes of the complex workspace if
+*             only the singular values are requested
+              IF ( ERREST ) THEN 
+                  MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ )
+              ELSE
+                  MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ )
+              END IF
+              IF ( LQUERY ) THEN 
+                  CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, 
+     $                 LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                  LWRK_CGESVJ = INT( CDUMMY(1) )
+                  IF ( ERREST ) THEN 
+                      OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, 
+     $                              N+LWRK_CGEQRF, LWRK_CGESVJ )
+                  ELSE
+                      OPTWRK = MAX( N+LWRK_CGEQP3, N+LWRK_CGEQRF, 
+     $                              LWRK_CGESVJ )
+                  END IF
+              END IF
+              IF ( L2TRAN .OR. ROWPIV ) THEN 
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWCON, LRWSVDJ )
+                  ELSE
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ )
+                  END IF                 
+              ELSE
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ )
+                  ELSE
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ )
+                  END IF
+              END IF   
+              IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M 
+          ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the right singular vectors are requested
+             IF ( ERREST ) THEN 
+                 MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF,  
+     $                         2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ )
+             ELSE
+                 MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, 
+     $                         N+LWSVDJ, N+LWUNMLQ )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A,
+     $                LDA, CDUMMY, -1, RDUMMY, -1, IERR )
+                 LWRK_CGESVJ = INT( CDUMMY(1) )
+                 CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY,
+     $                V, LDV, CDUMMY, -1, IERR )
+                 LWRK_CUNMLQ = INT( CDUMMY(1) )
+                 IF ( ERREST ) THEN 
+                 OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, 
+     $                         N+LWRK_CGELQF, 2*N+LWRK_CGEQRF,
+     $                         N+LWRK_CGESVJ,  N+LWRK_CUNMLQ )
+                 ELSE
+                 OPTWRK = MAX( N+LWRK_CGEQP3, LWRK_CGESVJ,N+LWRK_CGELQF,
+     $                         2*N+LWRK_CGEQRF, N+LWRK_CGESVJ, 
+     $                         N+LWRK_CUNMLQ )
+                 END IF
+             END IF
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+                  ELSE
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ ) 
+                  END IF                  
+             ELSE
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+                  ELSE
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) 
+                  END IF                 
+             END IF
+             IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+          ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN  
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the left singular vectors are requested
+             IF ( ERREST ) THEN
+                 MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A,
+     $                LDA, CDUMMY, -1, RDUMMY, -1, IERR )
+                 LWRK_CGESVJ = INT( CDUMMY(1) )
+                 CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $               LDU, CDUMMY, -1, IERR )
+                 LWRK_CUNMQRM = INT( CDUMMY(1) )
+                 IF ( ERREST ) THEN
+                 OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF,
+     $                             LWRK_CGESVJ, LWRK_CUNMQRM )
+                 ELSE
+                 OPTWRK = N + MAX( LWRK_CGEQP3, N+LWRK_CGEQRF,
+     $                             LWRK_CGESVJ, LWRK_CUNMQRM )
+                 END IF
+             END IF
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                 IF ( ERREST ) THEN 
+                    MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+                 ELSE
+                    MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ )
+                 END IF                 
+             ELSE
+                 IF ( ERREST ) THEN 
+                    MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+                 ELSE
+                    MINRWRK = MAX( 7, LRWQP3, LRWSVDJ )
+                 END IF                
+             END IF 
+             IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+          ELSE
+*            .. minimal and optimal sizes of the complex workspace if the
+*            full SVD is requested
+             IF ( .NOT. JRACC ) THEN                
+                 IF ( ERREST ) THEN 
+                    MINWRK = MAX( N+LWQP3, N+LWCON,  2*N+N**2+LWCON, 
+     $                         2*N+LWQRF,         2*N+LWQP3, 
+     $                         2*N+N**2+N+LWLQF,  2*N+N**2+N+N**2+LWCON,
+     $                         2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, 
+     $                         2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, 
+     $                         N+N**2+LWSVDJ,   N+LWUNMQRM )
+                 ELSE
+                    MINWRK = MAX( N+LWQP3,        2*N+N**2+LWCON, 
+     $                         2*N+LWQRF,         2*N+LWQP3, 
+     $                         2*N+N**2+N+LWLQF,  2*N+N**2+N+N**2+LWCON,
+     $                         2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV,
+     $                         2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ,
+     $                         N+N**2+LWSVDJ,      N+LWUNMQRM ) 
+                 END IF 
+                 MINIWRK = MINIWRK + N 
+                 IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+             ELSE
+                 IF ( ERREST ) THEN 
+                    MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, 
+     $                         2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, 
+     $                         N+LWUNMQRM )
+                 ELSE
+                    MINWRK = MAX( N+LWQP3, 2*N+LWQRF, 
+     $                         2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, 
+     $                         N+LWUNMQRM ) 
+                 END IF   
+                 IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_CUNMQRM = INT( CDUMMY(1) )
+                 CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_CUNMQR = INT( CDUMMY(1) )
+                 IF ( .NOT. JRACC ) THEN
+                     CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1,
+     $                    RDUMMY, IERR )
+                     LWRK_CGEQP3N = INT( CDUMMY(1) )
+                     CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_CGESVJ = INT( CDUMMY(1) )
+                     CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_CGESVJU = INT( CDUMMY(1) )
+                     CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_CGESVJV = INT( CDUMMY(1) )
+                     CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY,
+     $                    V, LDV, CDUMMY, -1, IERR )
+                     LWRK_CUNMLQ = INT( CDUMMY(1) )
+                     IF ( ERREST ) THEN 
+                       OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, 
+     $                          2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, 
+     $                          2*N+LWRK_CGEQP3N, 
+     $                          2*N+N**2+N+LWRK_CGELQF,  
+     $                          2*N+N**2+N+N**2+LWCON,
+     $                          2*N+N**2+N+LWRK_CGESVJ, 
+     $                          2*N+N**2+N+LWRK_CGESVJV,               
+     $                          2*N+N**2+N+LWRK_CUNMQR,
+     $                          2*N+N**2+N+LWRK_CUNMLQ, 
+     $                          N+N**2+LWRK_CGESVJU,                  
+     $                          N+LWRK_CUNMQRM )
+                     ELSE
+                       OPTWRK = MAX( N+LWRK_CGEQP3,  
+     $                          2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, 
+     $                          2*N+LWRK_CGEQP3N, 
+     $                          2*N+N**2+N+LWRK_CGELQF,  
+     $                          2*N+N**2+N+N**2+LWCON,
+     $                          2*N+N**2+N+LWRK_CGESVJ,               
+     $                          2*N+N**2+N+LWRK_CGESVJV, 
+     $                          2*N+N**2+N+LWRK_CUNMQR,
+     $                          2*N+N**2+N+LWRK_CUNMLQ, 
+     $                          N+N**2+LWRK_CGESVJU,
+     $                          N+LWRK_CUNMQRM )
+                     END IF                    
+                 ELSE
+                     CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_CGESVJV = INT( CDUMMY(1) )
+                     CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY,
+     $                    V, LDV, CDUMMY, -1, IERR )
+                     LWRK_CUNMQR = INT( CDUMMY(1) )
+                     CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                    LDU, CDUMMY, -1, IERR )
+                     LWRK_CUNMQRM = INT( CDUMMY(1) )
+                     IF ( ERREST ) THEN 
+                        OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON,   
+     $                           2*N+LWRK_CGEQRF, 2*N+N**2,  
+     $                           2*N+N**2+LWRK_CGESVJV,  
+     $                           2*N+N**2+N+LWRK_CUNMQR,N+LWRK_CUNMQRM )
+                     ELSE
+                        OPTWRK = MAX( N+LWRK_CGEQP3, 2*N+LWRK_CGEQRF,  
+     $                           2*N+N**2, 2*N+N**2+LWRK_CGESVJV, 
+     $                           2*N+N**2+N+LWRK_CUNMQR, 
+     $                           N+LWRK_CUNMQRM )   
+                     END IF                  
+                 END IF               
+             END IF 
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                 MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+             ELSE
+                 MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+             END IF 
+          END IF
+          MINWRK = MAX( 2, MINWRK )
+          OPTWRK = MAX( OPTWRK, MINWRK )
+          IF ( LWORK  .LT. MINWRK  .AND. (.NOT.LQUERY) ) INFO = - 17
+          IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19   
+      END IF
+*      
+      IF ( INFO .NE. 0 ) THEN
+*       #:(
+         CALL XERBLA( 'CGEJSV', - INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+          CWORK(1) = OPTWRK
+          CWORK(2) = MINWRK
+          RWORK(1) = MINRWRK
+          IWORK(1) = MAX( 4, MINIWRK )
+          RETURN   
+      END IF
+*
+*     Quick return for void matrix (Y3K safe)
+* #:)
+      IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN
+         IWORK(1:4) = 0
+         RWORK(1:7) = 0
+         RETURN
+      ENDIF
+*
+*     Determine whether the matrix U should be M x N or M x M
+*
+      IF ( LSVEC ) THEN
+         N1 = N
+         IF ( LSAME( JOBU, 'F' ) ) N1 = M
+      END IF
+*
+*     Set numerical parameters
+*
+*!    NOTE: Make sure SLAMCH() does not fail on the target architecture.
+*
+      EPSLN = SLAMCH('Epsilon')
+      SFMIN = SLAMCH('SafeMinimum')
+      SMALL = SFMIN / EPSLN
+      BIG   = SLAMCH('O')
+*     BIG   = ONE / SFMIN
+*
+*     Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
+*
+*(!)  If necessary, scale SVA() to protect the largest norm from
+*     overflow. It is possible that this scaling pushes the smallest
+*     column norm left from the underflow threshold (extreme case).
+*
+      SCALEM  = ONE / SQRT(REAL(M)*REAL(N))
+      NOSCAL  = .TRUE.
+      GOSCAL  = .TRUE.
+      DO 1874 p = 1, N
+         AAPP = ZERO
+         AAQQ = ONE
+         CALL CLASSQ( M, A(1,p), 1, AAPP, AAQQ )
+         IF ( AAPP .GT. BIG ) THEN
+            INFO = - 9
+            CALL XERBLA( 'CGEJSV', -INFO )
+            RETURN
+         END IF
+         AAQQ = SQRT(AAQQ)
+         IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL  ) THEN
+            SVA(p)  = AAPP * AAQQ
+         ELSE
+            NOSCAL  = .FALSE.
+            SVA(p)  = AAPP * ( AAQQ * SCALEM )
+            IF ( GOSCAL ) THEN
+               GOSCAL = .FALSE.
+               CALL SSCAL( p-1, SCALEM, SVA, 1 )
+            END IF
+         END IF
+ 1874 CONTINUE
+*
+      IF ( NOSCAL ) SCALEM = ONE
+*
+      AAPP = ZERO
+      AAQQ = BIG
+      DO 4781 p = 1, N
+         AAPP = MAX( AAPP, SVA(p) )
+         IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) )
+ 4781 CONTINUE
+*
+*     Quick return for zero M x N matrix
+* #:)
+      IF ( AAPP .EQ. ZERO ) THEN
+         IF ( LSVEC ) CALL CLASET( 'G', M, N1, CZERO, CONE, U, LDU )
+         IF ( RSVEC ) CALL CLASET( 'G', N, N,  CZERO, CONE, V, LDV )
+         RWORK(1) = ONE
+         RWORK(2) = ONE
+         IF ( ERREST ) RWORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            RWORK(4) = ONE
+            RWORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            RWORK(6) = ZERO
+            RWORK(7) = ZERO
+         END IF
+         IWORK(1) = 0
+         IWORK(2) = 0
+         IWORK(3) = 0
+         IWORK(4) = -1
+         RETURN
+      END IF
+*
+*     Issue warning if denormalized column norms detected. Override the
+*     high relative accuracy request. Issue licence to kill nonzero columns
+*     (set them to zero) whose norm is less than sigma_max / BIG (roughly).
+* #:(
+      WARNING = 0
+      IF ( AAQQ .LE. SFMIN ) THEN
+         L2RANK = .TRUE.
+         L2KILL = .TRUE.
+         WARNING = 1
+      END IF
+*
+*     Quick return for one-column matrix
+* #:)
+      IF ( N .EQ. 1 ) THEN
+*
+         IF ( LSVEC ) THEN
+            CALL CLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR )
+            CALL CLACPY( 'A', M, 1, A, LDA, U, LDU )
+*           computing all M left singular vectors of the M x 1 matrix
+            IF ( N1 .NE. N  ) THEN
+              CALL CGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR )
+              CALL CUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR )
+              CALL CCOPY( M, A(1,1), 1, U(1,1), 1 )
+            END IF
+         END IF
+         IF ( RSVEC ) THEN
+             V(1,1) = CONE
+         END IF
+         IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN
+            SVA(1)  = SVA(1) / SCALEM
+            SCALEM  = ONE
+         END IF
+         RWORK(1) = ONE / SCALEM
+         RWORK(2) = ONE
+         IF ( SVA(1) .NE. ZERO ) THEN
+            IWORK(1) = 1
+            IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN
+               IWORK(2) = 1
+            ELSE
+               IWORK(2) = 0
+            END IF
+         ELSE
+            IWORK(1) = 0
+            IWORK(2) = 0
+         END IF
+         IWORK(3) = 0
+         IWORK(4) = -1
+         IF ( ERREST ) RWORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            RWORK(4) = ONE
+            RWORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            RWORK(6) = ZERO
+            RWORK(7) = ZERO
+         END IF
+         RETURN
+*
+      END IF
+*
+      TRANSP = .FALSE.
+*
+      AATMAX = -ONE
+      AATMIN =  BIG
+      IF ( ROWPIV .OR. L2TRAN ) THEN
+*
+*     Compute the row norms, needed to determine row pivoting sequence
+*     (in the case of heavily row weighted A, row pivoting is strongly
+*     advised) and to collect information needed to compare the
+*     structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.).
+*
+         IF ( L2TRAN ) THEN
+            DO 1950 p = 1, M
+               XSC   = ZERO
+               TEMP1 = ONE
+               CALL CLASSQ( N, A(p,1), LDA, XSC, TEMP1 )
+*              CLASSQ gets both the ell_2 and the ell_infinity norm
+*              in one pass through the vector
+               RWORK(M+p)  = XSC * SCALEM
+               RWORK(p)    = XSC * (SCALEM*SQRT(TEMP1))
+               AATMAX = MAX( AATMAX, RWORK(p) )
+               IF (RWORK(p) .NE. ZERO) 
+     $            AATMIN = MIN(AATMIN,RWORK(p))
+ 1950       CONTINUE
+         ELSE
+            DO 1904 p = 1, M
+               RWORK(M+p) = SCALEM*ABS( A(p,ICAMAX(N,A(p,1),LDA)) )
+               AATMAX = MAX( AATMAX, RWORK(M+p) )
+               AATMIN = MIN( AATMIN, RWORK(M+p) )
+ 1904       CONTINUE
+         END IF
+*
+      END IF
+*
+*     For square matrix A try to determine whether A^*  would be better
+*     input for the preconditioned Jacobi SVD, with faster convergence.
+*     The decision is based on an O(N) function of the vector of column
+*     and row norms of A, based on the Shannon entropy. This should give
+*     the right choice in most cases when the difference actually matters.
+*     It may fail and pick the slower converging side.
+*
+      ENTRA  = ZERO
+      ENTRAT = ZERO
+      IF ( L2TRAN ) THEN
+*
+         XSC   = ZERO
+         TEMP1 = ONE
+         CALL SLASSQ( N, SVA, 1, XSC, TEMP1 )
+         TEMP1 = ONE / TEMP1
+*
+         ENTRA = ZERO
+         DO 1113 p = 1, N
+            BIG1  = ( ( SVA(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * ALOG(BIG1)
+ 1113    CONTINUE
+         ENTRA = - ENTRA / ALOG(REAL(N))
+*
+*        Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex.
+*        It is derived from the diagonal of  A^* * A.  Do the same with the
+*        diagonal of A * A^*, compute the entropy of the corresponding
+*        probability distribution. Note that A * A^* and A^* * A have the
+*        same trace.
+*
+         ENTRAT = ZERO
+         DO 1114 p = 1, M
+            BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * ALOG(BIG1)
+ 1114    CONTINUE
+         ENTRAT = - ENTRAT / ALOG(REAL(M))
+*
+*        Analyze the entropies and decide A or A^*. Smaller entropy
+*        usually means better input for the algorithm.
+*
+         TRANSP = ( ENTRAT .LT. ENTRA )
+* 
+*        If A^* is better than A, take the adjoint of A. This is allowed
+*        only for square matrices, M=N.  
+         IF ( TRANSP ) THEN
+*           In an optimal implementation, this trivial transpose
+*           should be replaced with faster transpose.
+            DO 1115 p = 1, N - 1
+               A(p,p) = CONJG(A(p,p))
+               DO 1116 q = p + 1, N
+                   CTEMP = CONJG(A(q,p))
+                  A(q,p) = CONJG(A(p,q))
+                  A(p,q) = CTEMP
+ 1116          CONTINUE
+ 1115       CONTINUE
+            A(N,N) = CONJG(A(N,N))
+            DO 1117 p = 1, N
+               RWORK(M+p) = SVA(p)
+               SVA(p) = RWORK(p)
+*              previously computed row 2-norms are now column 2-norms
+*              of the transposed matrix
+ 1117       CONTINUE
+            TEMP1  = AAPP
+            AAPP   = AATMAX
+            AATMAX = TEMP1
+            TEMP1  = AAQQ
+            AAQQ   = AATMIN
+            AATMIN = TEMP1
+            KILL   = LSVEC
+            LSVEC  = RSVEC
+            RSVEC  = KILL
+            IF ( LSVEC ) N1 = N
+*
+            ROWPIV = .TRUE.
+         END IF
+*
+      END IF
+*     END IF L2TRAN
+*
+*     Scale the matrix so that its maximal singular value remains less
+*     than SQRT(BIG) -- the matrix is scaled so that its maximal column
+*     has Euclidean norm equal to SQRT(BIG/N). The only reason to keep
+*     SQRT(BIG) instead of BIG is the fact that CGEJSV uses LAPACK and
+*     BLAS routines that, in some implementations, are not capable of
+*     working in the full interval [SFMIN,BIG] and that they may provoke
+*     overflows in the intermediate results. If the singular values spread
+*     from SFMIN to BIG, then CGESVJ will compute them. So, in that case,
+*     one should use CGESVJ instead of CGEJSV.
+      BIG1   = SQRT( BIG )
+      TEMP1  = SQRT( BIG / REAL(N) )
+*     >> for future updates: allow bigger range, i.e. the largest column
+*     will be allowed up to BIG/N and CGESVJ will do the rest. However, for
+*     this all other (LAPACK) components must allow such a range.      
+*     TEMP1  = BIG/REAL(N)
+*     TEMP1  = BIG * EPSLN  this should 'almost' work with current LAPACK components
+      CALL SLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR )
+      IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN
+          AAQQ = ( AAQQ / AAPP ) * TEMP1
+      ELSE
+          AAQQ = ( AAQQ * TEMP1 ) / AAPP
+      END IF
+      TEMP1 = TEMP1 * SCALEM
+      CALL CLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR )
+*
+*     To undo scaling at the end of this procedure, multiply the
+*     computed singular values with USCAL2 / USCAL1.
+*
+      USCAL1 = TEMP1
+      USCAL2 = AAPP
+*
+      IF ( L2KILL ) THEN
+*        L2KILL enforces computation of nonzero singular values in
+*        the restricted range of condition number of the initial A,
+*        sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN).
+         XSC = SQRT( SFMIN )
+      ELSE
+         XSC = SMALL
+*
+*        Now, if the condition number of A is too big,
+*        sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN,
+*        as a precaution measure, the full SVD is computed using CGESVJ
+*        with accumulated Jacobi rotations. This provides numerically
+*        more robust computation, at the cost of slightly increased run
+*        time. Depending on the concrete implementation of BLAS and LAPACK
+*        (i.e. how they behave in presence of extreme ill-conditioning) the
+*        implementor may decide to remove this switch.
+         IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN
+            JRACC = .TRUE.
+         END IF
+*
+      END IF
+      IF ( AAQQ .LT. XSC ) THEN
+         DO 700 p = 1, N
+            IF ( SVA(p) .LT. XSC ) THEN
+               CALL CLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA )
+               SVA(p) = ZERO
+            END IF
+ 700     CONTINUE
+      END IF
+*
+*     Preconditioning using QR factorization with pivoting
+*
+      IF ( ROWPIV ) THEN
+*        Optional row permutation (Bjoerck row pivoting):
+*        A result by Cox and Higham shows that the Bjoerck's
+*        row pivoting combined with standard column pivoting
+*        has similar effect as Powell-Reid complete pivoting.
+*        The ell-infinity norms of A are made nonincreasing.
+         IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN 
+              IWOFF = 2*N
+         ELSE
+              IWOFF = N
+         END IF
+         DO 1952 p = 1, M - 1
+            q = ISAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1
+            IWORK(IWOFF+p) = q
+            IF ( p .NE. q ) THEN
+               TEMP1      = RWORK(M+p)
+               RWORK(M+p) = RWORK(M+q)
+               RWORK(M+q) = TEMP1
+            END IF
+ 1952    CONTINUE
+         CALL CLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 )
+      END IF
+*
+*     End of the preparation phase (scaling, optional sorting and
+*     transposing, optional flushing of small columns).
+*
+*     Preconditioning
+*
+*     If the full SVD is needed, the right singular vectors are computed
+*     from a matrix equation, and for that we need theoretical analysis
+*     of the Businger-Golub pivoting. So we use CGEQP3 as the first RR QRF.
+*     In all other cases the first RR QRF can be chosen by other criteria
+*     (eg speed by replacing global with restricted window pivoting, such
+*     as in xGEQPX from TOMS # 782). Good results will be obtained using
+*     xGEQPX with properly (!) chosen numerical parameters.
+*     Any improvement of CGEQP3 improves overall performance of CGEJSV.
+*
+*     A * P1 = Q1 * [ R1^* 0]^*:
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL CGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N,
+     $             RWORK, IERR )
+*
+*     The upper triangular matrix R1 from the first QRF is inspected for
+*     rank deficiency and possibilities for deflation, or possible
+*     ill-conditioning. Depending on the user specified flag L2RANK,
+*     the procedure explores possibilities to reduce the numerical
+*     rank by inspecting the computed upper triangular factor. If
+*     L2RANK or L2ABER are up, then CGEJSV will compute the SVD of
+*     A + dA, where ||dA|| <= f(M,N)*EPSLN.
+*
+      NR = 1
+      IF ( L2ABER ) THEN
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPSLN*||A||.
+         TEMP1 = SQRT(REAL(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN
+               NR = NR + 1
+            ELSE
+               GO TO 3002
+            END IF
+ 3001    CONTINUE
+ 3002    CONTINUE
+      ELSE IF ( L2RANK ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R1 is used as the criterion for
+*        close-to-rank-deficient.
+         TEMP1 = SQRT(SFMIN)
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        The goal is high relative accuracy. However, if the matrix
+*        has high scaled condition number the relative accuracy is in
+*        general not feasible. Later on, a condition number estimator
+*        will be deployed to estimate the scaled condition number.
+*        Here we just remove the underflowed part of the triangular
+*        factor. This prevents the situation in which the code is
+*        working hard to get the accuracy not warranted by the data.
+         TEMP1  = SQRT(SFMIN)
+         DO 3301 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302
+            NR = NR + 1
+ 3301    CONTINUE
+ 3302    CONTINUE
+*
+      END IF
+*
+      ALMORT = .FALSE.
+      IF ( NR .EQ. N ) THEN
+         MAXPRJ = ONE
+         DO 3051 p = 2, N
+            TEMP1  = ABS(A(p,p)) / SVA(IWORK(p))
+            MAXPRJ = MIN( MAXPRJ, TEMP1 )
+ 3051    CONTINUE
+         IF ( MAXPRJ**2 .GE. ONE - REAL(N)*EPSLN ) ALMORT = .TRUE.
+      END IF
+*
+*
+      SCONDA = - ONE
+      CONDR1 = - ONE
+      CONDR2 = - ONE
+*
+      IF ( ERREST ) THEN
+         IF ( N .EQ. NR ) THEN
+            IF ( RSVEC ) THEN
+*              .. V is available as workspace
+               CALL CLACPY( 'U', N, N, A, LDA, V, LDV )
+               DO 3053 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL CSSCAL( p, ONE/TEMP1, V(1,p), 1 )
+ 3053          CONTINUE
+               IF ( LSVEC )THEN
+                   CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $                  CWORK(N+1), RWORK, IERR )
+               ELSE
+                   CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $                  CWORK, RWORK, IERR )
+               END IF               
+*          
+            ELSE IF ( LSVEC ) THEN
+*              .. U is available as workspace
+               CALL CLACPY( 'U', N, N, A, LDA, U, LDU )
+               DO 3054 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL CSSCAL( p, ONE/TEMP1, U(1,p), 1 )
+ 3054          CONTINUE
+               CALL CPOCON( 'U', N, U, LDU, ONE, TEMP1,
+     $              CWORK(N+1), RWORK, IERR )
+            ELSE
+               CALL CLACPY( 'U', N, N, A, LDA, CWORK, N )
+*[]            CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N )
+*              Change: here index shifted by N to the left, CWORK(1:N) 
+*              not needed for SIGMA only computation
+               DO 3052 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+*[]               CALL CSSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 )
+                  CALL CSSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 )
+ 3052          CONTINUE
+*           .. the columns of R are scaled to have unit Euclidean lengths.
+*[]               CALL CPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1,
+*[]     $              CWORK(N+N*N+1), RWORK, IERR )
+               CALL CPOCON( 'U', N, CWORK, N, ONE, TEMP1,
+     $              CWORK(N*N+1), RWORK, IERR )               
+*              
+            END IF
+            IF ( TEMP1 .NE. ZERO ) THEN 
+               SCONDA = ONE / SQRT(TEMP1)
+            ELSE
+               SCONDA = - ONE
+            END IF
+*           SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+         ELSE
+            SCONDA = - ONE
+         END IF
+      END IF
+*
+      L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) )
+*     If there is no violent scaling, artificial perturbation is not needed.
+*
+*     Phase 3:
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*
+*         Singular Values only
+*
+*         .. transpose A(1:NR,1:N)
+         DO 1946 p = 1, MIN( N-1, NR )
+            CALL CCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 )
+            CALL CLACGV( N-p+1, A(p,p), 1 )
+ 1946    CONTINUE
+         IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N))
+*
+*        The following two DO-loops introduce small relative perturbation
+*        into the strict upper triangle of the lower triangular matrix.
+*        Small entries below the main diagonal are also changed.
+*        This modification is useful if the computing environment does not
+*        provide/allow FLUSH TO ZERO underflow, for it prevents many
+*        annoying denormalized numbers in case of strongly scaled matrices.
+*        The perturbation is structured so that it does not introduce any
+*        new perturbation of the singular values, and it does not destroy
+*        the job done by the preconditioner.
+*        The licence for this perturbation is in the variable L2PERT, which
+*        should be .FALSE. if FLUSH TO ZERO underflow is active.
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+            IF ( L2PERT ) THEN
+*              XSC = SQRT(SMALL)
+               XSC = EPSLN / REAL(N)
+               DO 4947 q = 1, NR
+                  CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO)
+                  DO 4949 p = 1, N
+                     IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) )
+     $                    .OR. ( p .LT. q ) )
+*     $                     A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) )
+     $                     A(p,q) = CTEMP
+ 4949             CONTINUE
+ 4947          CONTINUE
+            ELSE
+               CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA )
+            END IF
+*
+*            .. second preconditioning using the QR factorization
+*
+            CALL CGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR )
+*
+*           .. and transpose upper to lower triangular
+            DO 1948 p = 1, NR - 1
+               CALL CCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 )
+               CALL CLACGV( NR-p+1, A(p,p), 1 )
+ 1948       CONTINUE
+*
+         END IF
+*
+*           Row-cyclic Jacobi SVD algorithm with column pivoting
+*
+*           .. again some perturbation (a "background noise") is added
+*           to drown denormals
+            IF ( L2PERT ) THEN
+*              XSC = SQRT(SMALL)
+               XSC = EPSLN / REAL(N)
+               DO 1947 q = 1, NR
+                  CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO)
+                  DO 1949 p = 1, NR
+                     IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) )
+     $                       .OR. ( p .LT. q ) )
+*     $                   A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) )
+     $                   A(p,q) = CTEMP
+ 1949             CONTINUE
+ 1947          CONTINUE
+            ELSE
+               CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA )
+            END IF
+*
+*           .. and one-sided Jacobi rotations are started on a lower
+*           triangular matrix (plus perturbation which is ignored in
+*           the part which destroys triangular form (confusing?!))
+*
+            CALL CGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA,
+     $                N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO )
+*
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+*
+*
+      ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) ) 
+     $       .OR. 
+     $   ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN
+*
+*        -> Singular Values and Right Singular Vectors <-
+*
+         IF ( ALMORT ) THEN
+*
+*           .. in this case NR equals N
+            DO 1998 p = 1, NR
+               CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+               CALL CLACGV( N-p+1, V(p,p), 1 )
+ 1998       CONTINUE
+            CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV )
+*
+            CALL CGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA,
+     $                  CWORK, LWORK, RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+
+         ELSE
+*
+*        .. two more QR factorizations ( one QRF is not enough, two require
+*        accumulated product of Jacobi rotations, three are perfect )
+*
+            CALL CLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA )
+            CALL CGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR)
+            CALL CLACPY( 'L', NR, NR, A, LDA, V, LDV )
+            CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV )
+            CALL CGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $                   LWORK-2*N, IERR )
+            DO 8998 p = 1, NR
+               CALL CCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 )
+               CALL CLACGV( NR-p+1, V(p,p), 1 )
+ 8998       CONTINUE
+            CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV)
+*
+            CALL CGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U,
+     $                  LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            IF ( NR .LT. N ) THEN
+               CALL CLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1),  LDV )
+               CALL CLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1),  LDV )
+               CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV )
+            END IF
+*
+         CALL CUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK,
+     $               V, LDV, CWORK(N+1), LWORK-N, IERR )
+*
+         END IF
+*         .. permute the rows of V
+*         DO 8991 p = 1, N
+*            CALL CCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA )
+* 8991    CONTINUE
+*         CALL CLACPY( 'All', N, N, A, LDA, V, LDV )
+         CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK )
+*
+          IF ( TRANSP ) THEN
+            CALL CLACPY( 'A', N, N, V, LDV, U, LDU )
+          END IF
+*
+      ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN 
+*          
+         CALL CLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA )
+*
+         CALL CGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV,
+     $               CWORK, LWORK, RWORK, LRWORK, INFO )
+          SCALEM  = RWORK(1)
+          NUMRANK = NINT(RWORK(2))
+          CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK )
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN
+*
+*        .. Singular Values and Left Singular Vectors                 ..
+*
+*        .. second preconditioning step to avoid need to accumulate
+*        Jacobi rotations in the Jacobi iterations.
+         DO 1965 p = 1, NR
+            CALL CCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 )
+            CALL CLACGV( N-p+1, U(p,p), 1 )
+ 1965    CONTINUE
+         CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+*
+         CALL CGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+         DO 1967 p = 1, NR - 1
+            CALL CCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 )
+            CALL CLACGV( N-p+1, U(p,p), 1 )
+ 1967    CONTINUE
+         CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+*
+         CALL CGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A,
+     $        LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO )
+         SCALEM  = RWORK(1)
+         NUMRANK = NINT(RWORK(2))
+*
+         IF ( NR .LT. M ) THEN
+            CALL CLASET( 'A',  M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL CLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU )
+               CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU )
+            END IF
+         END IF
+*
+         CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $               LDU, CWORK(N+1), LWORK-N, IERR )
+*
+         IF ( ROWPIV )
+     $       CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         DO 1974 p = 1, N1
+            XSC = ONE / SCNRM2( M, U(1,p), 1 )
+            CALL CSSCAL( M, XSC, U(1,p), 1 )
+ 1974    CONTINUE
+*
+         IF ( TRANSP ) THEN
+            CALL CLACPY( 'A', N, N, U, LDU, V, LDV )
+         END IF
+*
+      ELSE
+*
+*        .. Full SVD ..
+*
+         IF ( .NOT. JRACC ) THEN
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+*           Second Preconditioning Step (QRF [with pivoting])
+*           Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
+*           equivalent to an LQF CALL. Since in many libraries the QRF
+*           seems to be better optimized than the LQF, we do explicit
+*           transpose and use the QRF. This is subject to changes in an
+*           optimized implementation of CGEJSV.
+*
+            DO 1968 p = 1, NR
+               CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+               CALL CLACGV( N-p+1, V(p,p), 1 )
+ 1968       CONTINUE
+*
+*           .. the following two loops perturb small entries to avoid
+*           denormals in the second QR factorization, where they are
+*           as good as zeros. This is done to avoid painfully slow
+*           computation with denormals. The relative size of the perturbation
+*           is a parameter that can be changed by the implementer.
+*           This perturbation device will be obsolete on machines with
+*           properly implemented arithmetic.
+*           To switch it off, set L2PERT=.FALSE. To remove it from  the
+*           code, remove the action under L2PERT=.TRUE., leave the ELSE part.
+*           The following two loops should be blocked and fused with the
+*           transposed copy above.
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 2969 q = 1, NR
+                  CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO)
+                  DO 2968 p = 1, N
+                     IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 )
+     $                   .OR. ( p .LT. q ) )
+*     $                   V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) )
+     $                   V(p,q) = CTEMP
+                     IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 2968             CONTINUE
+ 2969          CONTINUE
+            ELSE
+               CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV )
+            END IF
+*
+*           Estimate the row scaled condition number of R1
+*           (If R1 is rectangular, N > NR, then the condition number
+*           of the leading NR x NR submatrix is estimated.)
+*
+            CALL CLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR )
+            DO 3950 p = 1, NR
+               TEMP1 = SCNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1)
+               CALL CSSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1)
+ 3950       CONTINUE
+            CALL CPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1,
+     $                   CWORK(2*N+NR*NR+1),RWORK,IERR)
+            CONDR1 = ONE / SQRT(TEMP1)
+*           .. here need a second opinion on the condition number
+*           .. then assume worst case scenario
+*           R1 is OK for inverse <=> CONDR1 .LT. REAL(N)
+*           more conservative    <=> CONDR1 .LT. SQRT(REAL(N))
+*
+            COND_OK = SQRT(SQRT(REAL(NR)))
+*[TP]       COND_OK is a tuning parameter.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*              .. the second QRF without pivoting. Note: in an optimized
+*              implementation, this QRF should be implemented as the QRF
+*              of a lower triangular matrix.
+*              R1^* = Q2 * R2
+               CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)/EPSLN
+                  DO 3959 p = 2, NR
+                     DO 3958 q = 1, p - 1
+                        CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                              ZERO)
+                        IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                     V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                     V(q,p) = CTEMP
+ 3958                CONTINUE
+ 3959             CONTINUE
+               END IF
+*
+               IF ( NR .NE. N )
+     $         CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*              .. save ...
+*
+*           .. this transposed copy should be better than naive
+               DO 1969 p = 1, NR - 1
+                  CALL CCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 )
+                  CALL CLACGV(NR-p+1, V(p,p), 1 )
+ 1969          CONTINUE
+               V(NR,NR)=CONJG(V(NR,NR))
+*
+               CONDR2 = CONDR1
+*
+            ELSE
+*
+*              .. ill-conditioned case: second QRF with pivoting
+*              Note that windowed pivoting would be equally good
+*              numerically, and more run-time efficient. So, in
+*              an optimal implementation, the next call to CGEQP3
+*              should be replaced with eg. CALL CGEQPX (ACM TOMS #782)
+*              with properly (carefully) chosen parameters.
+*
+*              R1^* * P2 = Q2 * R2
+               DO 3003 p = 1, NR
+                  IWORK(N+p) = 0
+ 3003          CONTINUE
+               CALL CGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1),
+     $                  CWORK(2*N+1), LWORK-2*N, RWORK, IERR )
+**               CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+**     $              LWORK-2*N, IERR )
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)
+                  DO 3969 p = 2, NR
+                     DO 3968 q = 1, p - 1
+                        CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                                ZERO)
+                        IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                     V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                     V(q,p) = CTEMP
+ 3968                CONTINUE
+ 3969             CONTINUE
+               END IF
+*
+               CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)
+                  DO 8970 p = 2, NR
+                     DO 8971 q = 1, p - 1
+                        CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                               ZERO)
+*                        V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) )
+                        V(p,q) = - CTEMP
+ 8971                CONTINUE
+ 8970             CONTINUE
+               ELSE
+                  CALL CLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV )
+               END IF
+*              Now, compute R2 = L3 * Q3, the LQ factorization.
+               CALL CGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1),
+     $               CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*              .. and estimate the condition number
+               CALL CLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR )
+               DO 4950 p = 1, NR
+                  TEMP1 = SCNRM2( p, CWORK(2*N+N*NR+NR+p), NR )
+                  CALL CSSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR )
+ 4950          CONTINUE
+               CALL CPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $              CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR )
+               CONDR2 = ONE / SQRT(TEMP1)
+*
+*
+               IF ( CONDR2 .GE. COND_OK ) THEN
+*                 .. save the Householder vectors used for Q3
+*                 (this overwrites the copy of R2, as it will not be
+*                 needed in this branch, but it does not overwritte the
+*                 Huseholder vectors of Q2.).
+                  CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
+*                 .. and the rest of the information on Q3 is in
+*                 WORK(2*N+N*NR+1:2*N+N*NR+N)
+               END IF
+*
+            END IF
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 4968 q = 2, NR
+                  CTEMP = XSC * V(q,q)
+                  DO 4969 p = 1, q - 1
+*                     V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) )
+                     V(p,q) = - CTEMP
+ 4969             CONTINUE
+ 4968          CONTINUE
+            ELSE
+               CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+            END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower trinagular.
+*
+*        Recover the right singular vectors as solution of a well
+*        conditioned triangular matrix equation.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*
+               CALL CGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU,
+     $              CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK,
+     $              LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               DO 3970 p = 1, NR
+                  CALL CCOPY(  NR, V(1,p), 1, U(1,p), 1 )
+                  CALL CSSCAL( NR, SVA(p),    V(1,p), 1 )
+ 3970          CONTINUE
+
+*        .. pick the right matrix equation and solve it
+*
+               IF ( NR .EQ. N ) THEN
+* :))             .. best case, R1 is inverted. The solution of this matrix
+*                 equation is Q2*V2 = the product of the Jacobi rotations
+*                 used in CGESVJ, premultiplied with the orthogonal matrix
+*                 from the second QR factorization.
+                  CALL CTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV)
+               ELSE
+*                 .. R1 is well conditioned, but non-square. Adjoint of R2
+*                 is inverted to get the product of the Jacobi rotations
+*                 used in CGESVJ. The Q-factor from the second QR
+*                 factorization is then built in explicitly.
+                  CALL CTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1),
+     $                 N,V,LDV)
+                  IF ( NR .LT. N ) THEN
+                  CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+                  END IF
+                  CALL CUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $                V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+               END IF
+*
+            ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+*              The matrix R2 is inverted. The solution of the matrix equation
+*              is Q3^* * V3 = the product of the Jacobi rotations (appplied to
+*              the lower triangular L3 from the LQ factorization of
+*              R2=L3*Q3), pre-multiplied with the transposed Q3.
+               CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $              RWORK, LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               DO 3870 p = 1, NR
+                  CALL CCOPY( NR, V(1,p), 1, U(1,p), 1 )
+                  CALL CSSCAL( NR, SVA(p),    U(1,p), 1 )
+ 3870          CONTINUE
+               CALL CTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N,
+     $                    U,LDU)
+*              .. apply the permutation from the second QR factorization
+               DO 873 q = 1, NR
+                  DO 872 p = 1, NR
+                     CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872              CONTINUE
+                  DO 874 p = 1, NR
+                     U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 874              CONTINUE
+ 873           CONTINUE
+               IF ( NR .LT. N ) THEN
+                  CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+                  CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+            ELSE
+*              Last line of defense.
+* #:(          This is a rather pathological case: no scaled condition
+*              improvement after two pivoted QR factorizations. Other
+*              possibility is that the rank revealing QR factorization
+*              or the condition estimator has failed, or the COND_OK
+*              is set very close to ONE (which is unnecessary). Normally,
+*              this branch should never be executed, but in rare cases of
+*              failure of the RRQR or condition estimator, the last line of
+*              defense ensures that CGEJSV completes the task.
+*              Compute the full SVD of L3 using CGESVJ with explicit
+*              accumulation of Jacobi rotations.
+               CALL CGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $                         RWORK, LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               IF ( NR .LT. N ) THEN
+                  CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+                  CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+               CALL CUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N,
+     $              CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1),
+     $              LWORK-2*N-N*NR-NR, IERR )
+               DO 773 q = 1, NR
+                  DO 772 p = 1, NR
+                     CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772              CONTINUE
+                  DO 774 p = 1, NR
+                     U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 774              CONTINUE
+ 773           CONTINUE
+*
+            END IF
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = SQRT(REAL(N)) * EPSLN
+            DO 1972 q = 1, N
+               DO 972 p = 1, N
+                  CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+  972          CONTINUE
+               DO 973 p = 1, N
+                  V(p,q) = CWORK(2*N+N*NR+NR+p)
+  973          CONTINUE
+               XSC = ONE / SCNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL CSSCAL( N, XSC, V(1,q), 1 )
+ 1972       CONTINUE
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+            IF ( NR .LT. M ) THEN
+               CALL CLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU)
+               IF ( NR .LT. N1 ) THEN
+                  CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                  CALL CLASET('A',M-NR,N1-NR,CZERO,CONE,
+     $                        U(NR+1,NR+1),LDU)
+               END IF
+            END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           matrix U. This applies to all cases.
+*
+            CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $           LDU, CWORK(N+1), LWORK-N, IERR )
+
+*           The columns of U are normalized. The cost is O(M*N) flops.
+            TEMP1 = SQRT(REAL(M)) * EPSLN
+            DO 1973 p = 1, NR
+               XSC = ONE / SCNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $          CALL CSSCAL( M, XSC, U(1,p), 1 )
+ 1973       CONTINUE
+*
+*           If the initial QRF is computed with row pivoting, the left
+*           singular vectors must be adjusted.
+*
+            IF ( ROWPIV )
+     $          CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         ELSE
+*
+*        .. the initial matrix A has almost orthogonal columns and
+*        the second QRF is not needed
+*
+            CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N )
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 5970 p = 2, N
+                  CTEMP = XSC * CWORK( N + (p-1)*N + p )
+                  DO 5971 q = 1, p - 1
+*                     CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) /
+*     $                                        ABS(CWORK(N+(p-1)*N+q)) )
+                     CWORK(N+(q-1)*N+p)=-CTEMP
+ 5971             CONTINUE
+ 5970          CONTINUE
+            ELSE
+               CALL CLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N )
+            END IF
+*
+            CALL CGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA,
+     $           N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK,
+     $       INFO )
+*
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            DO 6970 p = 1, N
+               CALL CCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 )
+               CALL CSSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 )
+ 6970       CONTINUE
+*
+            CALL CTRSM( 'L', 'U', 'N', 'N', N, N,
+     $           CONE, A, LDA, CWORK(N+1), N )
+            DO 6972 p = 1, N
+               CALL CCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV )
+ 6972       CONTINUE
+            TEMP1 = SQRT(REAL(N))*EPSLN
+            DO 6971 p = 1, N
+               XSC = ONE / SCNRM2( N, V(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL CSSCAL( N, XSC, V(1,p), 1 )
+ 6971       CONTINUE
+*
+*           Assemble the left singular vector matrix U (M x N).
+*
+            IF ( N .LT. M ) THEN
+               CALL CLASET( 'A',  M-N, N, CZERO, CZERO, U(N+1,1), LDU )
+               IF ( N .LT. N1 ) THEN
+                  CALL CLASET('A',N,  N1-N, CZERO, CZERO,  U(1,N+1),LDU)
+                  CALL CLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU)
+               END IF
+            END IF
+            CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $           LDU, CWORK(N+1), LWORK-N, IERR )
+            TEMP1 = SQRT(REAL(M))*EPSLN
+            DO 6973 p = 1, N1
+               XSC = ONE / SCNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL CSSCAL( M, XSC, U(1,p), 1 )
+ 6973       CONTINUE
+*
+            IF ( ROWPIV )
+     $         CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         END IF
+*
+*        end of the  >> almost orthogonal case <<  in the full SVD
+*
+         ELSE
+*
+*        This branch deploys a preconditioned Jacobi SVD with explicitly
+*        accumulated rotations. It is included as optional, mainly for
+*        experimental purposes. It does perform well, and can also be used.
+*        In this implementation, this branch will be automatically activated
+*        if the  condition number sigma_max(A) / sigma_min(A) is predicted
+*        to be greater than the overflow threshold. This is because the
+*        a posteriori computation of the singular vectors assumes robust
+*        implementation of BLAS and some LAPACK procedures, capable of working
+*        in presence of extreme values, e.g. when the singular values spread from
+*        the underflow to the overflow threshold. 
+*
+         DO 7968 p = 1, NR
+            CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+            CALL CLACGV( N-p+1, V(p,p), 1 )
+ 7968    CONTINUE
+*
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL/EPSLN)
+            DO 5969 q = 1, NR
+               CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO)
+               DO 5968 p = 1, N
+                  IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 )
+     $                .OR. ( p .LT. q ) )
+*     $                V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) )
+     $                V(p,q) = CTEMP
+                  IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 5968          CONTINUE
+ 5969       CONTINUE
+         ELSE
+            CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV )
+         END IF
+
+         CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $        LWORK-2*N, IERR )
+         CALL CLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+         DO 7969 p = 1, NR
+            CALL CCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
+            CALL CLACGV( NR-p+1, U(p,p), 1 )
+ 7969    CONTINUE
+
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL/EPSLN)
+            DO 9970 q = 2, NR
+               DO 9971 p = 1, q - 1
+                  CTEMP = CMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))),
+     $                           ZERO)
+*                  U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) )
+                  U(p,q) = - CTEMP
+ 9971          CONTINUE
+ 9970       CONTINUE
+         ELSE
+            CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+         END IF
+
+         CALL CGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA,
+     $        N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR,
+     $         RWORK, LRWORK, INFO )
+         SCALEM  = RWORK(1)
+         NUMRANK = NINT(RWORK(2))
+
+         IF ( NR .LT. N ) THEN
+            CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+            CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+            CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV )
+         END IF
+
+         CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $        V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = SQRT(REAL(N)) * EPSLN
+            DO 7972 q = 1, N
+               DO 8972 p = 1, N
+                  CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 8972          CONTINUE
+               DO 8973 p = 1, N
+                  V(p,q) = CWORK(2*N+N*NR+NR+p)
+ 8973          CONTINUE
+               XSC = ONE / SCNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL CSSCAL( N, XSC, V(1,q), 1 )
+ 7972       CONTINUE
+*
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+*
+         IF ( NR .LT. M ) THEN
+            CALL CLASET( 'A',  M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL CLASET('A',NR,  N1-NR, CZERO, CZERO,  U(1,NR+1),LDU)
+               CALL CLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU)
+            END IF
+         END IF
+*
+         CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $        LDU, CWORK(N+1), LWORK-N, IERR )
+*
+            IF ( ROWPIV )
+     $         CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+*
+         END IF
+         IF ( TRANSP ) THEN
+*           .. swap U and V because the procedure worked on A^*
+            DO 6974 p = 1, N
+               CALL CSWAP( N, U(1,p), 1, V(1,p), 1 )
+ 6974       CONTINUE
+         END IF
+*
+      END IF
+*     end of the full SVD
+*
+*     Undo scaling, if necessary (and possible)
+*
+      IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN
+         CALL SLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
+         USCAL1 = ONE
+         USCAL2 = ONE
+      END IF
+*
+      IF ( NR .LT. N ) THEN
+         DO 3004 p = NR+1, N
+            SVA(p) = ZERO
+ 3004    CONTINUE
+      END IF
+*
+      RWORK(1) = USCAL2 * SCALEM
+      RWORK(2) = USCAL1
+      IF ( ERREST ) RWORK(3) = SCONDA
+      IF ( LSVEC .AND. RSVEC ) THEN
+         RWORK(4) = CONDR1
+         RWORK(5) = CONDR2
+      END IF
+      IF ( L2TRAN ) THEN
+         RWORK(6) = ENTRA
+         RWORK(7) = ENTRAT
+      END IF
+*
+      IWORK(1) = NR
+      IWORK(2) = NUMRANK
+      IWORK(3) = WARNING
+      IF ( TRANSP ) THEN
+          IWORK(4) =  1 
+      ELSE
+          IWORK(4) = -1
+      END IF 
+      
+*
+      RETURN
+*     ..
+*     .. END OF CGEJSV
+*     ..
+      END
+*
diff --git a/lapack-netlib/cgesvx.f b/lapack-netlib/cgesvx.f
new file mode 100644
index 000000000..74a37e9a0
--- /dev/null
+++ b/lapack-netlib/cgesvx.f
@@ -0,0 +1,602 @@
+*> \brief <b> CGESVX computes the solution to system of linear equations A * X = B for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGESVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgesvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgesvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgesvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+*                          EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+*                          WORK, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+*       REAL               RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * )
+*       REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   RWORK( * )
+*       COMPLEX            A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGESVX uses the LU factorization to compute the solution to a complex
+*> system of linear equations
+*>    A * X = B,
+*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = P * L * U,
+*>    where P is a permutation matrix, L is a unit lower triangular
+*>    matrix, and U is upper triangular.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AF and IPIV contain the factored form of A.
+*>                  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  A, AF, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AF and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AF and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations:
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the N-by-N matrix A.  If FACT = 'F' and EQUED is
+*>          not 'N', then A must have been equilibrated by the scaling
+*>          factors in R and/or C.  A is not modified if FACT = 'F' or
+*>          'N', or if FACT = 'E' and EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] AF
+*> \verbatim
+*>          AF is COMPLEX array, dimension (LDAF,N)
+*>          If FACT = 'F', then AF is an input argument and on entry
+*>          contains the factors L and U from the factorization
+*>          A = P*L*U as computed by CGETRF.  If EQUED .ne. 'N', then
+*>          AF is the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the equilibrated matrix A (see the description of A for
+*>          the form of the equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAF
+*> \verbatim
+*>          LDAF is INTEGER
+*>          The leading dimension of the array AF.  LDAF >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          as computed by CGETRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is REAL array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (LDB,NRHS)
+*>          On entry, the N-by-NRHS right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is COMPLEX array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is REAL
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (MAX(1,2*N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexGEsolve
+*
+*  =====================================================================
+      SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, RWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX            A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               CLANGE, CLANTR, SLAMCH
+      EXTERNAL           LSAME, CLANGE, CLANTR, SLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGECON, CGEEQU, CGERFS, CGETRF, CGETRS, CLACPY,
+     $                   CLAQGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL CGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL CLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*
+         CALL CLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL CGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = CLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $               RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = CLANGE( 'M', N, INFO, A, LDA, RWORK ) /
+     $                  RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = CLANGE( NORM, N, N, A, LDA, RWORK )
+      RPVGRW = CLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = CLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL CGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL CGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL CGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of CGESVX
+*
+      END
diff --git a/lapack-netlib/dgbsvx.f b/lapack-netlib/dgbsvx.f
new file mode 100644
index 000000000..0ee5eecb3
--- /dev/null
+++ b/lapack-netlib/dgbsvx.f
@@ -0,0 +1,639 @@
+*> \brief <b> DGBSVX computes the solution to system of linear equations A * X = B for GB matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGBSVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgbsvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgbsvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgbsvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+*                          LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+*                          RCOND, FERR, BERR, WORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+*       DOUBLE PRECISION   RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * ), IWORK( * )
+*       DOUBLE PRECISION   AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+*      $                   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGBSVX uses the LU factorization to compute the solution to a real
+*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+*> where A is a band matrix of order N with KL subdiagonals and KU
+*> superdiagonals, and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed by this subroutine:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = L * U,
+*>    where L is a product of permutation and unit lower triangular
+*>    matrices with KL subdiagonals, and U is upper triangular with
+*>    KL+KU superdiagonals.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AFB and IPIV contain the factored form of
+*>                  A.  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  AB, AFB, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AFB and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AFB and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations.
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KL
+*> \verbatim
+*>          KL is INTEGER
+*>          The number of subdiagonals within the band of A.  KL >= 0.
+*> \endverbatim
+*>
+*> \param[in] KU
+*> \verbatim
+*>          KU is INTEGER
+*>          The number of superdiagonals within the band of A.  KU >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is DOUBLE PRECISION array, dimension (LDAB,N)
+*>          On entry, the matrix A in band storage, in rows 1 to KL+KU+1.
+*>          The j-th column of A is stored in the j-th column of the
+*>          array AB as follows:
+*>          AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl)
+*>
+*>          If FACT = 'F' and EQUED is not 'N', then A must have been
+*>          equilibrated by the scaling factors in R and/or C.  AB is not
+*>          modified if FACT = 'F' or 'N', or if FACT = 'E' and
+*>          EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] AFB
+*> \verbatim
+*>          AFB is DOUBLE PRECISION array, dimension (LDAFB,N)
+*>          If FACT = 'F', then AFB is an input argument and on entry
+*>          contains details of the LU factorization of the band matrix
+*>          A, as computed by DGBTRF.  U is stored as an upper triangular
+*>          band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1,
+*>          and the multipliers used during the factorization are stored
+*>          in rows KL+KU+2 to 2*KL+KU+1.  If EQUED .ne. 'N', then AFB is
+*>          the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of A.
+*>
+*>          If FACT = 'E', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of the equilibrated
+*>          matrix A (see the description of AB for the form of the
+*>          equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAFB
+*> \verbatim
+*>          LDAFB is INTEGER
+*>          The leading dimension of the array AFB.  LDAFB >= 2*KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = L*U
+*>          as computed by DGBTRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is DOUBLE PRECISION array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (LDB,NRHS)
+*>          On entry, the right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is DOUBLE PRECISION array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is DOUBLE PRECISION
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,3*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleGBsolve
+*
+*  =====================================================================
+      SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      DOUBLE PRECISION   AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J, J1, J2
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, DLANGB, DLANTB
+      EXTERNAL           LSAME, DLAMCH, DLANGB, DLANTB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DGBCON, DGBEQU, DGBRFS, DGBTRF, DGBTRS,
+     $                   DLACPY, DLAQGB, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KL.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KU.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDAB.LT.KL+KU+1 ) THEN
+         INFO = -8
+      ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN
+         INFO = -10
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -12
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -13
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -14
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -18
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGBSVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL DGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL DLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                   AMAX, EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of the band matrix A.
+*
+         DO 70 J = 1, N
+            J1 = MAX( J-KU, 1 )
+            J2 = MIN( J+KL, N )
+            CALL DCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1,
+     $                  AFB( KL+KU+1-J+J1, J ), 1 )
+   70    CONTINUE
+*
+         CALL DGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            ANORM = ZERO
+            DO 90 J = 1, INFO
+               DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 )
+                  ANORM = MAX( ANORM, ABS( AB( I, J ) ) )
+   80          CONTINUE
+   90       CONTINUE
+            RPVGRW = DLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ),
+     $                       AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB,
+     $                       WORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ANORM / RPVGRW
+            END IF
+            WORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = DLANGB( NORM, N, KL, KU, AB, LDAB, WORK )
+      RPVGRW = DLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = DLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL DGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND,
+     $             WORK, IWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL DGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX,
+     $             INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL DGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV,
+     $             B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 110 J = 1, NRHS
+               DO 100 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+  100          CONTINUE
+  110       CONTINUE
+            DO 120 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+  120       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 140 J = 1, NRHS
+            DO 130 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  130       CONTINUE
+  140    CONTINUE
+         DO 150 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  150    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      WORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of DGBSVX
+*
+      END
diff --git a/lapack-netlib/dgejsv.f b/lapack-netlib/dgejsv.f
new file mode 100644
index 000000000..ee769bb38
--- /dev/null
+++ b/lapack-netlib/dgejsv.f
@@ -0,0 +1,1780 @@
+*> \brief \b DGEJSV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGEJSV + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgejsv.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgejsv.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgejsv.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+*                          M, N, A, LDA, SVA, U, LDU, V, LDV,
+*                          WORK, LWORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       IMPLICIT    NONE
+*       INTEGER     INFO, LDA, LDU, LDV, LWORK, M, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ),
+*      $            WORK( LWORK )
+*       INTEGER     IWORK( * )
+*       CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGEJSV computes the singular value decomposition (SVD) of a real M-by-N
+*> matrix [A], where M >= N. The SVD of [A] is written as
+*>
+*>              [A] = [U] * [SIGMA] * [V]^t,
+*>
+*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+*> diagonal elements, [U] is an M-by-N (or M-by-M) orthonormal matrix, and
+*> [V] is an N-by-N orthogonal matrix. The diagonal elements of [SIGMA] are
+*> the singular values of [A]. The columns of [U] and [V] are the left and
+*> the right singular vectors of [A], respectively. The matrices [U] and [V]
+*> are computed and stored in the arrays U and V, respectively. The diagonal
+*> of [SIGMA] is computed and stored in the array SVA.
+*> DGEJSV can sometimes compute tiny singular values and their singular vectors much
+*> more accurately than other SVD routines, see below under Further Details.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBA
+*> \verbatim
+*>          JOBA is CHARACTER*1
+*>        Specifies the level of accuracy:
+*>       = 'C': This option works well (high relative accuracy) if A = B * D,
+*>             with well-conditioned B and arbitrary diagonal matrix D.
+*>             The accuracy cannot be spoiled by COLUMN scaling. The
+*>             accuracy of the computed output depends on the condition of
+*>             B, and the procedure aims at the best theoretical accuracy.
+*>             The relative error max_{i=1:N}|d sigma_i| / sigma_i is
+*>             bounded by f(M,N)*epsilon* cond(B), independent of D.
+*>             The input matrix is preprocessed with the QRF with column
+*>             pivoting. This initial preprocessing and preconditioning by
+*>             a rank revealing QR factorization is common for all values of
+*>             JOBA. Additional actions are specified as follows:
+*>       = 'E': Computation as with 'C' with an additional estimate of the
+*>             condition number of B. It provides a realistic error bound.
+*>       = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings
+*>             D1, D2, and well-conditioned matrix C, this option gives
+*>             higher accuracy than the 'C' option. If the structure of the
+*>             input matrix is not known, and relative accuracy is
+*>             desirable, then this option is advisable. The input matrix A
+*>             is preprocessed with QR factorization with FULL (row and
+*>             column) pivoting.
+*>       = 'G': Computation as with 'F' with an additional estimate of the
+*>             condition number of B, where A=D*B. If A has heavily weighted
+*>             rows, then using this condition number gives too pessimistic
+*>             error bound.
+*>       = 'A': Small singular values are the noise and the matrix is treated
+*>             as numerically rank deficient. The error in the computed
+*>             singular values is bounded by f(m,n)*epsilon*||A||.
+*>             The computed SVD A = U * S * V^t restores A up to
+*>             f(m,n)*epsilon*||A||.
+*>             This gives the procedure the licence to discard (set to zero)
+*>             all singular values below N*epsilon*||A||.
+*>       = 'R': Similar as in 'A'. Rank revealing property of the initial
+*>             QR factorization is used do reveal (using triangular factor)
+*>             a gap sigma_{r+1} < epsilon * sigma_r in which case the
+*>             numerical RANK is declared to be r. The SVD is computed with
+*>             absolute error bounds, but more accurately than with 'A'.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>        Specifies whether to compute the columns of U:
+*>       = 'U': N columns of U are returned in the array U.
+*>       = 'F': full set of M left sing. vectors is returned in the array U.
+*>       = 'W': U may be used as workspace of length M*N. See the description
+*>             of U.
+*>       = 'N': U is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>        Specifies whether to compute the matrix V:
+*>       = 'V': N columns of V are returned in the array V; Jacobi rotations
+*>             are not explicitly accumulated.
+*>       = 'J': N columns of V are returned in the array V, but they are
+*>             computed as the product of Jacobi rotations. This option is
+*>             allowed only if JOBU .NE. 'N', i.e. in computing the full SVD.
+*>       = 'W': V may be used as workspace of length N*N. See the description
+*>             of V.
+*>       = 'N': V is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>        Specifies the RANGE for the singular values. Issues the licence to
+*>        set to zero small positive singular values if they are outside
+*>        specified range. If A .NE. 0 is scaled so that the largest singular
+*>        value of c*A is around DSQRT(BIG), BIG=SLAMCH('O'), then JOBR issues
+*>        the licence to kill columns of A whose norm in c*A is less than
+*>        DSQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
+*>        where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E').
+*>       = 'N': Do not kill small columns of c*A. This option assumes that
+*>             BLAS and QR factorizations and triangular solvers are
+*>             implemented to work in that range. If the condition of A
+*>             is greater than BIG, use DGESVJ.
+*>       = 'R': RESTRICTED range for sigma(c*A) is [DSQRT(SFMIN), DSQRT(BIG)]
+*>             (roughly, as described above). This option is recommended.
+*>                                            ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*>        For computing the singular values in the FULL range [SFMIN,BIG]
+*>        use DGESVJ.
+*> \endverbatim
+*>
+*> \param[in] JOBT
+*> \verbatim
+*>          JOBT is CHARACTER*1
+*>        If the matrix is square then the procedure may determine to use
+*>        transposed A if A^t seems to be better with respect to convergence.
+*>        If the matrix is not square, JOBT is ignored. This is subject to
+*>        changes in the future.
+*>        The decision is based on two values of entropy over the adjoint
+*>        orbit of A^t * A. See the descriptions of WORK(6) and WORK(7).
+*>       = 'T': transpose if entropy test indicates possibly faster
+*>        convergence of Jacobi process if A^t is taken as input. If A is
+*>        replaced with A^t, then the row pivoting is included automatically.
+*>       = 'N': do not speculate.
+*>        This option can be used to compute only the singular values, or the
+*>        full SVD (U, SIGMA and V). For only one set of singular vectors
+*>        (U or V), the caller should provide both U and V, as one of the
+*>        matrices is used as workspace if the matrix A is transposed.
+*>        The implementer can easily remove this constraint and make the
+*>        code more complicated. See the descriptions of U and V.
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>          JOBP is CHARACTER*1
+*>        Issues the licence to introduce structured perturbations to drown
+*>        denormalized numbers. This licence should be active if the
+*>        denormals are poorly implemented, causing slow computation,
+*>        especially in cases of fast convergence (!). For details see [1,2].
+*>        For the sake of simplicity, this perturbations are included only
+*>        when the full SVD or only the singular values are requested. The
+*>        implementer/user can easily add the perturbation for the cases of
+*>        computing one set of singular vectors.
+*>       = 'P': introduce perturbation
+*>       = 'N': do not perturb
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>         The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>         The number of columns of the input matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] SVA
+*> \verbatim
+*>          SVA is DOUBLE PRECISION array, dimension (N)
+*>          On exit,
+*>          - For WORK(1)/WORK(2) = ONE: The singular values of A. During the
+*>            computation SVA contains Euclidean column norms of the
+*>            iterated matrices in the array A.
+*>          - For WORK(1) .NE. WORK(2): The singular values of A are
+*>            (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if
+*>            sigma_max(A) overflows or if small singular values have been
+*>            saved from underflow by scaling the input matrix A.
+*>          - If JOBR='R' then some of the singular values may be returned
+*>            as exact zeros obtained by "set to zero" because they are
+*>            below the numerical rank threshold or are denormalized numbers.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is DOUBLE PRECISION array, dimension ( LDU, N ) or ( LDU, M )
+*>          If JOBU = 'U', then U contains on exit the M-by-N matrix of
+*>                         the left singular vectors.
+*>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
+*>                         the left singular vectors, including an ONB
+*>                         of the orthogonal complement of the Range(A).
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
+*>                         then U is used as workspace if the procedure
+*>                         replaces A with A^t. In that case, [V] is computed
+*>                         in U as left singular vectors of A^t and then
+*>                         copied back to the V array. This 'W' option is just
+*>                         a reminder to the caller that in this case U is
+*>                         reserved as workspace of length N*N.
+*>          If JOBU = 'N'  U is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER
+*>          The leading dimension of the array U,  LDU >= 1.
+*>          IF  JOBU = 'U' or 'F' or 'W',  then LDU >= M.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension ( LDV, N )
+*>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
+*>                         the right singular vectors;
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
+*>                         then V is used as workspace if the pprocedure
+*>                         replaces A with A^t. In that case, [U] is computed
+*>                         in V as right singular vectors of A^t and then
+*>                         copied back to the U array. This 'W' option is just
+*>                         a reminder to the caller that in this case V is
+*>                         reserved as workspace of length N*N.
+*>          If JOBV = 'N'  V is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V,  LDV >= 1.
+*>          If JOBV = 'V' or 'J' or 'W', then LDV >= N.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(7,LWORK))
+*>          On exit, if N > 0 .AND. M > 0 (else not referenced),
+*>          WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such
+*>                    that SCALE*SVA(1:N) are the computed singular values
+*>                    of A. (See the description of SVA().)
+*>          WORK(2) = See the description of WORK(1).
+*>          WORK(3) = SCONDA is an estimate for the condition number of
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
+*>                    SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
+*>                    It is computed using DPOCON. It holds
+*>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*>                    where R is the triangular factor from the QRF of A.
+*>                    However, if R is truncated and the numerical rank is
+*>                    determined to be strictly smaller than N, SCONDA is
+*>                    returned as -1, thus indicating that the smallest
+*>                    singular values might be lost.
+*>
+*>          If full SVD is needed, the following two condition numbers are
+*>          useful for the analysis of the algorithm. They are provided for
+*>          a developer/implementer who is familiar with the details of
+*>          the method.
+*>
+*>          WORK(4) = an estimate of the scaled condition number of the
+*>                    triangular factor in the first QR factorization.
+*>          WORK(5) = an estimate of the scaled condition number of the
+*>                    triangular factor in the second QR factorization.
+*>          The following two parameters are computed if JOBT = 'T'.
+*>          They are provided for a developer/implementer who is familiar
+*>          with the details of the method.
+*>
+*>          WORK(6) = the entropy of A^t*A :: this is the Shannon entropy
+*>                    of diag(A^t*A) / Trace(A^t*A) taken as point in the
+*>                    probability simplex.
+*>          WORK(7) = the entropy of A*A^t.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          Length of WORK to confirm proper allocation of work space.
+*>          LWORK depends on the job:
+*>
+*>          If only SIGMA is needed (JOBU = 'N', JOBV = 'N') and
+*>            -> .. no scaled condition estimate required (JOBE = 'N'):
+*>               LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal
+*>               block size for DGEQP3 and DGEQRF.
+*>               In general, optimal LWORK is computed as
+*>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), 7).
+*>            -> .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', 'G'). In this case, LWORK is the maximum
+*>               of the above and N*N+4*N, i.e. LWORK >= max(2*M+N,N*N+4*N,7).
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= max(2*M+N,3*N+(N+1)*NB, N*N+4*N, 7).
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF),
+*>                                                     N+N*N+LWORK(DPOCON),7).
+*>
+*>          If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>            -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
+*>            -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
+*>               where NB is the optimal block size for DGEQP3, DGEQRF, DGELQF,
+*>               DORMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(2*M+N,N+LWORK(DGEQP3), N+LWORK(DPOCON),
+*>                       N+LWORK(DGELQF), 2*N+LWORK(DGEQRF), N+LWORK(DORMLQ)).
+*>
+*>          If SIGMA and the left singular vectors are needed
+*>            -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
+*>               if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7),
+*>               where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON),
+*>                        2*N+LWORK(DGEQRF), N+LWORK(DORMQR)).
+*>               Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or
+*>               M*NB (for JOBU = 'F').
+*>
+*>          If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and
+*>            -> if JOBV = 'V'
+*>               the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N).
+*>            -> if JOBV = 'J' the minimal requirement is
+*>               LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6).
+*>            -> For optimal performance, LWORK should be additionally
+*>               larger than N+M*NB, where NB is the optimal block size
+*>               for DORMQR.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(3,M+3*N)).
+*>          On exit,
+*>          IWORK(1) = the numerical rank determined after the initial
+*>                     QR factorization with pivoting. See the descriptions
+*>                     of JOBA and JOBR.
+*>          IWORK(2) = the number of the computed nonzero singular values
+*>          IWORK(3) = if nonzero, a warning message:
+*>                     If IWORK(3) = 1 then some of the column norms of A
+*>                     were denormalized floats. The requested high accuracy
+*>                     is not warranted by the data.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  DGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleGEsing
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  DGEJSV implements a preconditioned Jacobi SVD algorithm. It uses DGEQP3,
+*>  DGEQRF, and DGELQF as preprocessors and preconditioners. Optionally, an
+*>  additional row pivoting can be used as a preprocessor, which in some
+*>  cases results in much higher accuracy. An example is matrix A with the
+*>  structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned
+*>  diagonal matrices and C is well-conditioned matrix. In that case, complete
+*>  pivoting in the first QR factorizations provides accuracy dependent on the
+*>  condition number of C, and independent of D1, D2. Such higher accuracy is
+*>  not completely understood theoretically, but it works well in practice.
+*>  Further, if A can be written as A = B*D, with well-conditioned B and some
+*>  diagonal D, then the high accuracy is guaranteed, both theoretically and
+*>  in software, independent of D. For more details see [1], [2].
+*>     The computational range for the singular values can be the full range
+*>  ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS
+*>  & LAPACK routines called by DGEJSV are implemented to work in that range.
+*>  If that is not the case, then the restriction for safe computation with
+*>  the singular values in the range of normalized IEEE numbers is that the
+*>  spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not
+*>  overflow. This code (DGEJSV) is best used in this restricted range,
+*>  meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are
+*>  returned as zeros. See JOBR for details on this.
+*>     Further, this implementation is somewhat slower than the one described
+*>  in [1,2] due to replacement of some non-LAPACK components, and because
+*>  the choice of some tuning parameters in the iterative part (DGESVJ) is
+*>  left to the implementer on a particular machine.
+*>     The rank revealing QR factorization (in this code: DGEQP3) should be
+*>  implemented as in [3]. We have a new version of DGEQP3 under development
+*>  that is more robust than the current one in LAPACK, with a cleaner cut in
+*>  rank deficient cases. It will be available in the SIGMA library [4].
+*>  If M is much larger than N, it is obvious that the initial QRF with
+*>  column pivoting can be preprocessed by the QRF without pivoting. That
+*>  well known trick is not used in DGEJSV because in some cases heavy row
+*>  weighting can be treated with complete pivoting. The overhead in cases
+*>  M much larger than N is then only due to pivoting, but the benefits in
+*>  terms of accuracy have prevailed. The implementer/user can incorporate
+*>  this extra QRF step easily. The implementer can also improve data movement
+*>  (matrix transpose, matrix copy, matrix transposed copy) - this
+*>  implementation of DGEJSV uses only the simplest, naive data movement.
+*> \endverbatim
+*
+*> \par Contributors:
+*  ==================
+*>
+*>  Zlatko Drmac (Zagreb, Croatia) and Kresimir Veselic (Hagen, Germany)
+*
+*> \par References:
+*  ================
+*>
+*> \verbatim
+*>
+*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I.
+*>     SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342.
+*>     LAPACK Working note 169.
+*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II.
+*>     SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362.
+*>     LAPACK Working note 170.
+*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR
+*>     factorization software - a case study.
+*>     ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28.
+*>     LAPACK Working note 176.
+*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV,
+*>     QSVD, (H,K)-SVD computations.
+*>     Department of Mathematics, University of Zagreb, 2008.
+*> \endverbatim
+*
+*>  \par Bugs, examples and comments:
+*   =================================
+*>
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*>
+*  =====================================================================
+      SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+     $                   M, N, A, LDA, SVA, U, LDU, V, LDV,
+     $                   WORK, LWORK, IWORK, INFO )
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      INTEGER     INFO, LDA, LDU, LDV, LWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ),
+     $            WORK( LWORK )
+      INTEGER     IWORK( * )
+      CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*     ..
+*
+*  ===========================================================================
+*
+*     .. Local Parameters ..
+      DOUBLE PRECISION   ZERO,  ONE
+      PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, COND_OK,
+     $        CONDR1, CONDR2, ENTRA,  ENTRAT, EPSLN,  MAXPRJ, SCALEM,
+     $        SCONDA, SFMIN,  SMALL,  TEMP1,  USCAL1, USCAL2, XSC
+      INTEGER IERR,   N1,     NR,     NUMRANK,        p, q,   WARNING
+      LOGICAL ALMORT, DEFR,   ERREST, GOSCAL, JRACC,  KILL,   LSVEC,
+     $        L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN,
+     $        NOSCAL, ROWPIV, RSVEC,  TRANSP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC DABS, DLOG, MAX, MIN, DBLE, IDNINT, DSIGN, DSQRT
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION  DLAMCH, DNRM2
+      INTEGER   IDAMAX
+      LOGICAL   LSAME
+      EXTERNAL  IDAMAX, LSAME, DLAMCH, DNRM2
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL  DCOPY,  DGELQF, DGEQP3, DGEQRF, DLACPY, DLASCL,
+     $          DLASET, DLASSQ, DLASWP, DORGQR, DORMLQ,
+     $          DORMQR, DPOCON, DSCAL,  DSWAP,  DTRSM,  XERBLA
+*
+      EXTERNAL  DGESVJ
+*     ..
+*
+*     Test the input arguments
+*
+      LSVEC  = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' )
+      JRACC  = LSAME( JOBV, 'J' )
+      RSVEC  = LSAME( JOBV, 'V' ) .OR. JRACC
+      ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' )
+      L2RANK = LSAME( JOBA, 'R' )
+      L2ABER = LSAME( JOBA, 'A' )
+      ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' )
+      L2TRAN = LSAME( JOBT, 'T' )
+      L2KILL = LSAME( JOBR, 'R' )
+      DEFR   = LSAME( JOBR, 'N' )
+      L2PERT = LSAME( JOBP, 'P' )
+*
+      IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR.
+     $     ERREST .OR. LSAME( JOBA, 'C' ) )) THEN
+         INFO = - 1
+      ELSE IF ( .NOT.( LSVEC  .OR. LSAME( JOBU, 'N' ) .OR.
+     $                             LSAME( JOBU, 'W' )) ) THEN
+         INFO = - 2
+      ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR.
+     $   LSAME( JOBV, 'W' )) .OR. ( JRACC .AND. (.NOT.LSVEC) ) ) THEN
+         INFO = - 3
+      ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) )    THEN
+         INFO = - 4
+      ELSE IF ( .NOT. ( L2TRAN .OR. LSAME( JOBT, 'N' ) ) ) THEN
+         INFO = - 5
+      ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN
+         INFO = - 6
+      ELSE IF ( M .LT. 0 ) THEN
+         INFO = - 7
+      ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN
+         INFO = - 8
+      ELSE IF ( LDA .LT. M ) THEN
+         INFO = - 10
+      ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN
+         INFO = - 13
+      ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN
+         INFO = - 15
+      ELSE IF ( (.NOT.(LSVEC .OR. RSVEC .OR. ERREST).AND.
+     &                           (LWORK .LT. MAX(7,4*N+1,2*M+N))) .OR.
+     & (.NOT.(LSVEC .OR. RSVEC) .AND. ERREST .AND.
+     &                         (LWORK .LT. MAX(7,4*N+N*N,2*M+N))) .OR.
+     & (LSVEC .AND. (.NOT.RSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1)))
+     & .OR.
+     & (RSVEC .AND. (.NOT.LSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1)))
+     & .OR.
+     & (LSVEC .AND. RSVEC .AND. (.NOT.JRACC) .AND.
+     &                          (LWORK.LT.MAX(2*M+N,6*N+2*N*N)))
+     & .OR. (LSVEC .AND. RSVEC .AND. JRACC .AND.
+     &                          LWORK.LT.MAX(2*M+N,4*N+N*N,2*N+N*N+6)))
+     &   THEN
+         INFO = - 17
+      ELSE
+*        #:)
+         INFO = 0
+      END IF
+*
+      IF ( INFO .NE. 0 ) THEN
+*       #:(
+         CALL XERBLA( 'DGEJSV', - INFO )
+         RETURN
+      END IF
+*
+*     Quick return for void matrix (Y3K safe)
+* #:)
+      IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN
+         IWORK(1:3) = 0
+         WORK(1:7) = 0
+         RETURN
+      ENDIF
+*
+*     Determine whether the matrix U should be M x N or M x M
+*
+      IF ( LSVEC ) THEN
+         N1 = N
+         IF ( LSAME( JOBU, 'F' ) ) N1 = M
+      END IF
+*
+*     Set numerical parameters
+*
+*!    NOTE: Make sure DLAMCH() does not fail on the target architecture.
+*
+      EPSLN = DLAMCH('Epsilon')
+      SFMIN = DLAMCH('SafeMinimum')
+      SMALL = SFMIN / EPSLN
+      BIG   = DLAMCH('O')
+*     BIG   = ONE / SFMIN
+*
+*     Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
+*
+*(!)  If necessary, scale SVA() to protect the largest norm from
+*     overflow. It is possible that this scaling pushes the smallest
+*     column norm left from the underflow threshold (extreme case).
+*
+      SCALEM  = ONE / DSQRT(DBLE(M)*DBLE(N))
+      NOSCAL  = .TRUE.
+      GOSCAL  = .TRUE.
+      DO 1874 p = 1, N
+         AAPP = ZERO
+         AAQQ = ONE
+         CALL DLASSQ( M, A(1,p), 1, AAPP, AAQQ )
+         IF ( AAPP .GT. BIG ) THEN
+            INFO = - 9
+            CALL XERBLA( 'DGEJSV', -INFO )
+            RETURN
+         END IF
+         AAQQ = DSQRT(AAQQ)
+         IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL  ) THEN
+            SVA(p)  = AAPP * AAQQ
+         ELSE
+            NOSCAL  = .FALSE.
+            SVA(p)  = AAPP * ( AAQQ * SCALEM )
+            IF ( GOSCAL ) THEN
+               GOSCAL = .FALSE.
+               CALL DSCAL( p-1, SCALEM, SVA, 1 )
+            END IF
+         END IF
+ 1874 CONTINUE
+*
+      IF ( NOSCAL ) SCALEM = ONE
+*
+      AAPP = ZERO
+      AAQQ = BIG
+      DO 4781 p = 1, N
+         AAPP = MAX( AAPP, SVA(p) )
+         IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) )
+ 4781 CONTINUE
+*
+*     Quick return for zero M x N matrix
+* #:)
+      IF ( AAPP .EQ. ZERO ) THEN
+         IF ( LSVEC ) CALL DLASET( 'G', M, N1, ZERO, ONE, U, LDU )
+         IF ( RSVEC ) CALL DLASET( 'G', N, N,  ZERO, ONE, V, LDV )
+         WORK(1) = ONE
+         WORK(2) = ONE
+         IF ( ERREST ) WORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            WORK(4) = ONE
+            WORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            WORK(6) = ZERO
+            WORK(7) = ZERO
+         END IF
+         IWORK(1) = 0
+         IWORK(2) = 0
+         IWORK(3) = 0
+         RETURN
+      END IF
+*
+*     Issue warning if denormalized column norms detected. Override the
+*     high relative accuracy request. Issue licence to kill columns
+*     (set them to zero) whose norm is less than sigma_max / BIG (roughly).
+* #:(
+      WARNING = 0
+      IF ( AAQQ .LE. SFMIN ) THEN
+         L2RANK = .TRUE.
+         L2KILL = .TRUE.
+         WARNING = 1
+      END IF
+*
+*     Quick return for one-column matrix
+* #:)
+      IF ( N .EQ. 1 ) THEN
+*
+         IF ( LSVEC ) THEN
+            CALL DLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR )
+            CALL DLACPY( 'A', M, 1, A, LDA, U, LDU )
+*           computing all M left singular vectors of the M x 1 matrix
+            IF ( N1 .NE. N  ) THEN
+               CALL DGEQRF( M, N, U,LDU, WORK, WORK(N+1),LWORK-N,IERR )
+               CALL DORGQR( M,N1,1, U,LDU,WORK,WORK(N+1),LWORK-N,IERR )
+               CALL DCOPY( M, A(1,1), 1, U(1,1), 1 )
+            END IF
+         END IF
+         IF ( RSVEC ) THEN
+             V(1,1) = ONE
+         END IF
+         IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN
+            SVA(1)  = SVA(1) / SCALEM
+            SCALEM  = ONE
+         END IF
+         WORK(1) = ONE / SCALEM
+         WORK(2) = ONE
+         IF ( SVA(1) .NE. ZERO ) THEN
+            IWORK(1) = 1
+            IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN
+               IWORK(2) = 1
+            ELSE
+               IWORK(2) = 0
+            END IF
+         ELSE
+            IWORK(1) = 0
+            IWORK(2) = 0
+         END IF
+         IWORK(3) = 0
+         IF ( ERREST ) WORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            WORK(4) = ONE
+            WORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            WORK(6) = ZERO
+            WORK(7) = ZERO
+         END IF
+         RETURN
+*
+      END IF
+*
+      TRANSP = .FALSE.
+      L2TRAN = L2TRAN .AND. ( M .EQ. N )
+*
+      AATMAX = -ONE
+      AATMIN =  BIG
+      IF ( ROWPIV .OR. L2TRAN ) THEN
+*
+*     Compute the row norms, needed to determine row pivoting sequence
+*     (in the case of heavily row weighted A, row pivoting is strongly
+*     advised) and to collect information needed to compare the
+*     structures of A * A^t and A^t * A (in the case L2TRAN.EQ..TRUE.).
+*
+         IF ( L2TRAN ) THEN
+            DO 1950 p = 1, M
+               XSC   = ZERO
+               TEMP1 = ONE
+               CALL DLASSQ( N, A(p,1), LDA, XSC, TEMP1 )
+*              DLASSQ gets both the ell_2 and the ell_infinity norm
+*              in one pass through the vector
+               WORK(M+N+p)  = XSC * SCALEM
+               WORK(N+p)    = XSC * (SCALEM*DSQRT(TEMP1))
+               AATMAX = MAX( AATMAX, WORK(N+p) )
+               IF (WORK(N+p) .NE. ZERO) AATMIN = MIN(AATMIN,WORK(N+p))
+ 1950       CONTINUE
+         ELSE
+            DO 1904 p = 1, M
+               WORK(M+N+p) = SCALEM*DABS( A(p,IDAMAX(N,A(p,1),LDA)) )
+               AATMAX = MAX( AATMAX, WORK(M+N+p) )
+               AATMIN = MIN( AATMIN, WORK(M+N+p) )
+ 1904       CONTINUE
+         END IF
+*
+      END IF
+*
+*     For square matrix A try to determine whether A^t  would be  better
+*     input for the preconditioned Jacobi SVD, with faster convergence.
+*     The decision is based on an O(N) function of the vector of column
+*     and row norms of A, based on the Shannon entropy. This should give
+*     the right choice in most cases when the difference actually matters.
+*     It may fail and pick the slower converging side.
+*
+      ENTRA  = ZERO
+      ENTRAT = ZERO
+      IF ( L2TRAN ) THEN
+*
+         XSC   = ZERO
+         TEMP1 = ONE
+         CALL DLASSQ( N, SVA, 1, XSC, TEMP1 )
+         TEMP1 = ONE / TEMP1
+*
+         ENTRA = ZERO
+         DO 1113 p = 1, N
+            BIG1  = ( ( SVA(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1)
+ 1113    CONTINUE
+         ENTRA = - ENTRA / DLOG(DBLE(N))
+*
+*        Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex.
+*        It is derived from the diagonal of  A^t * A.  Do the same with the
+*        diagonal of A * A^t, compute the entropy of the corresponding
+*        probability distribution. Note that A * A^t and A^t * A have the
+*        same trace.
+*
+         ENTRAT = ZERO
+         DO 1114 p = N+1, N+M
+            BIG1 = ( ( WORK(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1)
+ 1114    CONTINUE
+         ENTRAT = - ENTRAT / DLOG(DBLE(M))
+*
+*        Analyze the entropies and decide A or A^t. Smaller entropy
+*        usually means better input for the algorithm.
+*
+         TRANSP = ( ENTRAT .LT. ENTRA )
+*
+*        If A^t is better than A, transpose A.
+*
+         IF ( TRANSP ) THEN
+*           In an optimal implementation, this trivial transpose
+*           should be replaced with faster transpose.
+            DO 1115 p = 1, N - 1
+               DO 1116 q = p + 1, N
+                   TEMP1 = A(q,p)
+                  A(q,p) = A(p,q)
+                  A(p,q) = TEMP1
+ 1116          CONTINUE
+ 1115       CONTINUE
+            DO 1117 p = 1, N
+               WORK(M+N+p) = SVA(p)
+               SVA(p)      = WORK(N+p)
+ 1117       CONTINUE
+            TEMP1  = AAPP
+            AAPP   = AATMAX
+            AATMAX = TEMP1
+            TEMP1  = AAQQ
+            AAQQ   = AATMIN
+            AATMIN = TEMP1
+            KILL   = LSVEC
+            LSVEC  = RSVEC
+            RSVEC  = KILL
+            IF ( LSVEC ) N1 = N
+*
+            ROWPIV = .TRUE.
+         END IF
+*
+      END IF
+*     END IF L2TRAN
+*
+*     Scale the matrix so that its maximal singular value remains less
+*     than DSQRT(BIG) -- the matrix is scaled so that its maximal column
+*     has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep
+*     DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and
+*     BLAS routines that, in some implementations, are not capable of
+*     working in the full interval [SFMIN,BIG] and that they may provoke
+*     overflows in the intermediate results. If the singular values spread
+*     from SFMIN to BIG, then DGESVJ will compute them. So, in that case,
+*     one should use DGESVJ instead of DGEJSV.
+*
+      BIG1   = DSQRT( BIG )
+      TEMP1  = DSQRT( BIG / DBLE(N) )
+*
+      CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR )
+      IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN
+          AAQQ = ( AAQQ / AAPP ) * TEMP1
+      ELSE
+          AAQQ = ( AAQQ * TEMP1 ) / AAPP
+      END IF
+      TEMP1 = TEMP1 * SCALEM
+      CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR )
+*
+*     To undo scaling at the end of this procedure, multiply the
+*     computed singular values with USCAL2 / USCAL1.
+*
+      USCAL1 = TEMP1
+      USCAL2 = AAPP
+*
+      IF ( L2KILL ) THEN
+*        L2KILL enforces computation of nonzero singular values in
+*        the restricted range of condition number of the initial A,
+*        sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN).
+         XSC = DSQRT( SFMIN )
+      ELSE
+         XSC = SMALL
+*
+*        Now, if the condition number of A is too big,
+*        sigma_max(A) / sigma_min(A) .GT. DSQRT(BIG/N) * EPSLN / SFMIN,
+*        as a precaution measure, the full SVD is computed using DGESVJ
+*        with accumulated Jacobi rotations. This provides numerically
+*        more robust computation, at the cost of slightly increased run
+*        time. Depending on the concrete implementation of BLAS and LAPACK
+*        (i.e. how they behave in presence of extreme ill-conditioning) the
+*        implementor may decide to remove this switch.
+         IF ( ( AAQQ.LT.DSQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN
+            JRACC = .TRUE.
+         END IF
+*
+      END IF
+      IF ( AAQQ .LT. XSC ) THEN
+         DO 700 p = 1, N
+            IF ( SVA(p) .LT. XSC ) THEN
+               CALL DLASET( 'A', M, 1, ZERO, ZERO, A(1,p), LDA )
+               SVA(p) = ZERO
+            END IF
+ 700     CONTINUE
+      END IF
+*
+*     Preconditioning using QR factorization with pivoting
+*
+      IF ( ROWPIV ) THEN
+*        Optional row permutation (Bjoerck row pivoting):
+*        A result by Cox and Higham shows that the Bjoerck's
+*        row pivoting combined with standard column pivoting
+*        has similar effect as Powell-Reid complete pivoting.
+*        The ell-infinity norms of A are made nonincreasing.
+         DO 1952 p = 1, M - 1
+            q = IDAMAX( M-p+1, WORK(M+N+p), 1 ) + p - 1
+            IWORK(2*N+p) = q
+            IF ( p .NE. q ) THEN
+               TEMP1       = WORK(M+N+p)
+               WORK(M+N+p) = WORK(M+N+q)
+               WORK(M+N+q) = TEMP1
+            END IF
+ 1952    CONTINUE
+         CALL DLASWP( N, A, LDA, 1, M-1, IWORK(2*N+1), 1 )
+      END IF
+*
+*     End of the preparation phase (scaling, optional sorting and
+*     transposing, optional flushing of small columns).
+*
+*     Preconditioning
+*
+*     If the full SVD is needed, the right singular vectors are computed
+*     from a matrix equation, and for that we need theoretical analysis
+*     of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF.
+*     In all other cases the first RR QRF can be chosen by other criteria
+*     (eg speed by replacing global with restricted window pivoting, such
+*     as in SGEQPX from TOMS # 782). Good results will be obtained using
+*     SGEQPX with properly (!) chosen numerical parameters.
+*     Any improvement of DGEQP3 improves overall performance of DGEJSV.
+*
+*     A * P1 = Q1 * [ R1^t 0]^t:
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL DGEQP3( M,N,A,LDA, IWORK,WORK, WORK(N+1),LWORK-N, IERR )
+*
+*     The upper triangular matrix R1 from the first QRF is inspected for
+*     rank deficiency and possibilities for deflation, or possible
+*     ill-conditioning. Depending on the user specified flag L2RANK,
+*     the procedure explores possibilities to reduce the numerical
+*     rank by inspecting the computed upper triangular factor. If
+*     L2RANK or L2ABER are up, then DGEJSV will compute the SVD of
+*     A + dA, where ||dA|| <= f(M,N)*EPSLN.
+*
+      NR = 1
+      IF ( L2ABER ) THEN
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPSLN*||A||.
+         TEMP1 = DSQRT(DBLE(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( DABS(A(p,p)) .GE. (TEMP1*DABS(A(1,1))) ) THEN
+               NR = NR + 1
+            ELSE
+               GO TO 3002
+            END IF
+ 3001    CONTINUE
+ 3002    CONTINUE
+      ELSE IF ( L2RANK ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R1 is used as the criterion for
+*        close-to-rank-deficient.
+         TEMP1 = DSQRT(SFMIN)
+         DO 3401 p = 2, N
+            IF ( ( DABS(A(p,p)) .LT. (EPSLN*DABS(A(p-1,p-1))) ) .OR.
+     $           ( DABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        The goal is high relative accuracy. However, if the matrix
+*        has high scaled condition number the relative accuracy is in
+*        general not feasible. Later on, a condition number estimator
+*        will be deployed to estimate the scaled condition number.
+*        Here we just remove the underflowed part of the triangular
+*        factor. This prevents the situation in which the code is
+*        working hard to get the accuracy not warranted by the data.
+         TEMP1  = DSQRT(SFMIN)
+         DO 3301 p = 2, N
+            IF ( ( DABS(A(p,p)) .LT. SMALL ) .OR.
+     $          ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302
+            NR = NR + 1
+ 3301    CONTINUE
+ 3302    CONTINUE
+*
+      END IF
+*
+      ALMORT = .FALSE.
+      IF ( NR .EQ. N ) THEN
+         MAXPRJ = ONE
+         DO 3051 p = 2, N
+            TEMP1  = DABS(A(p,p)) / SVA(IWORK(p))
+            MAXPRJ = MIN( MAXPRJ, TEMP1 )
+ 3051    CONTINUE
+         IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE.
+      END IF
+*
+*
+      SCONDA = - ONE
+      CONDR1 = - ONE
+      CONDR2 = - ONE
+*
+      IF ( ERREST ) THEN
+         IF ( N .EQ. NR ) THEN
+            IF ( RSVEC ) THEN
+*              .. V is available as workspace
+               CALL DLACPY( 'U', N, N, A, LDA, V, LDV )
+               DO 3053 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL DSCAL( p, ONE/TEMP1, V(1,p), 1 )
+ 3053          CONTINUE
+               CALL DPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $              WORK(N+1), IWORK(2*N+M+1), IERR )
+            ELSE IF ( LSVEC ) THEN
+*              .. U is available as workspace
+               CALL DLACPY( 'U', N, N, A, LDA, U, LDU )
+               DO 3054 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL DSCAL( p, ONE/TEMP1, U(1,p), 1 )
+ 3054          CONTINUE
+               CALL DPOCON( 'U', N, U, LDU, ONE, TEMP1,
+     $              WORK(N+1), IWORK(2*N+M+1), IERR )
+            ELSE
+               CALL DLACPY( 'U', N, N, A, LDA, WORK(N+1), N )
+               DO 3052 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL DSCAL( p, ONE/TEMP1, WORK(N+(p-1)*N+1), 1 )
+ 3052          CONTINUE
+*           .. the columns of R are scaled to have unit Euclidean lengths.
+               CALL DPOCON( 'U', N, WORK(N+1), N, ONE, TEMP1,
+     $              WORK(N+N*N+1), IWORK(2*N+M+1), IERR )
+            END IF
+            SCONDA = ONE / DSQRT(TEMP1)
+*           SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+         ELSE
+            SCONDA = - ONE
+         END IF
+      END IF
+*
+      L2PERT = L2PERT .AND. ( DABS( A(1,1)/A(NR,NR) ) .GT. DSQRT(BIG1) )
+*     If there is no violent scaling, artificial perturbation is not needed.
+*
+*     Phase 3:
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*
+*         Singular Values only
+*
+*         .. transpose A(1:NR,1:N)
+         DO 1946 p = 1, MIN( N-1, NR )
+            CALL DCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 )
+ 1946    CONTINUE
+*
+*        The following two DO-loops introduce small relative perturbation
+*        into the strict upper triangle of the lower triangular matrix.
+*        Small entries below the main diagonal are also changed.
+*        This modification is useful if the computing environment does not
+*        provide/allow FLUSH TO ZERO underflow, for it prevents many
+*        annoying denormalized numbers in case of strongly scaled matrices.
+*        The perturbation is structured so that it does not introduce any
+*        new perturbation of the singular values, and it does not destroy
+*        the job done by the preconditioner.
+*        The licence for this perturbation is in the variable L2PERT, which
+*        should be .FALSE. if FLUSH TO ZERO underflow is active.
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+            IF ( L2PERT ) THEN
+*              XSC = DSQRT(SMALL)
+               XSC = EPSLN / DBLE(N)
+               DO 4947 q = 1, NR
+                  TEMP1 = XSC*DABS(A(q,q))
+                  DO 4949 p = 1, N
+                     IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) )
+     $                    .OR. ( p .LT. q ) )
+     $                     A(p,q) = DSIGN( TEMP1, A(p,q) )
+ 4949             CONTINUE
+ 4947          CONTINUE
+            ELSE
+               CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, A(1,2),LDA )
+            END IF
+*
+*            .. second preconditioning using the QR factorization
+*
+            CALL DGEQRF( N,NR, A,LDA, WORK, WORK(N+1),LWORK-N, IERR )
+*
+*           .. and transpose upper to lower triangular
+            DO 1948 p = 1, NR - 1
+               CALL DCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 )
+ 1948       CONTINUE
+*
+         END IF
+*
+*           Row-cyclic Jacobi SVD algorithm with column pivoting
+*
+*           .. again some perturbation (a "background noise") is added
+*           to drown denormals
+            IF ( L2PERT ) THEN
+*              XSC = DSQRT(SMALL)
+               XSC = EPSLN / DBLE(N)
+               DO 1947 q = 1, NR
+                  TEMP1 = XSC*DABS(A(q,q))
+                  DO 1949 p = 1, NR
+                     IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) )
+     $                       .OR. ( p .LT. q ) )
+     $                   A(p,q) = DSIGN( TEMP1, A(p,q) )
+ 1949             CONTINUE
+ 1947          CONTINUE
+            ELSE
+               CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, A(1,2), LDA )
+            END IF
+*
+*           .. and one-sided Jacobi rotations are started on a lower
+*           triangular matrix (plus perturbation which is ignored in
+*           the part which destroys triangular form (confusing?!))
+*
+            CALL DGESVJ( 'L', 'NoU', 'NoV', NR, NR, A, LDA, SVA,
+     $                      N, V, LDV, WORK, LWORK, INFO )
+*
+            SCALEM  = WORK(1)
+            NUMRANK = IDNINT(WORK(2))
+*
+*
+      ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
+*
+*        -> Singular Values and Right Singular Vectors <-
+*
+         IF ( ALMORT ) THEN
+*
+*           .. in this case NR equals N
+            DO 1998 p = 1, NR
+               CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+ 1998       CONTINUE
+            CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+*
+            CALL DGESVJ( 'L','U','N', N, NR, V,LDV, SVA, NR, A,LDA,
+     $                  WORK, LWORK, INFO )
+            SCALEM  = WORK(1)
+            NUMRANK = IDNINT(WORK(2))
+
+         ELSE
+*
+*        .. two more QR factorizations ( one QRF is not enough, two require
+*        accumulated product of Jacobi rotations, three are perfect )
+*
+            CALL DLASET( 'Lower', NR-1, NR-1, ZERO, ZERO, A(2,1), LDA )
+            CALL DGELQF( NR, N, A, LDA, WORK, WORK(N+1), LWORK-N, IERR)
+            CALL DLACPY( 'Lower', NR, NR, A, LDA, V, LDV )
+            CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+            CALL DGEQRF( NR, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+     $                   LWORK-2*N, IERR )
+            DO 8998 p = 1, NR
+               CALL DCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 )
+ 8998       CONTINUE
+            CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+*
+            CALL DGESVJ( 'Lower', 'U','N', NR, NR, V,LDV, SVA, NR, U,
+     $                  LDU, WORK(N+1), LWORK, INFO )
+            SCALEM  = WORK(N+1)
+            NUMRANK = IDNINT(WORK(N+2))
+            IF ( NR .LT. N ) THEN
+               CALL DLASET( 'A',N-NR, NR, ZERO,ZERO, V(NR+1,1),   LDV )
+               CALL DLASET( 'A',NR, N-NR, ZERO,ZERO, V(1,NR+1),   LDV )
+               CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE, V(NR+1,NR+1), LDV )
+            END IF
+*
+         CALL DORMLQ( 'Left', 'Transpose', N, N, NR, A, LDA, WORK,
+     $               V, LDV, WORK(N+1), LWORK-N, IERR )
+*
+         END IF
+*
+         DO 8991 p = 1, N
+            CALL DCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA )
+ 8991    CONTINUE
+         CALL DLACPY( 'All', N, N, A, LDA, V, LDV )
+*
+         IF ( TRANSP ) THEN
+            CALL DLACPY( 'All', N, N, V, LDV, U, LDU )
+         END IF
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN
+*
+*        .. Singular Values and Left Singular Vectors                 ..
+*
+*        .. second preconditioning step to avoid need to accumulate
+*        Jacobi rotations in the Jacobi iterations.
+         DO 1965 p = 1, NR
+            CALL DCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 )
+ 1965    CONTINUE
+         CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU )
+*
+         CALL DGEQRF( N, NR, U, LDU, WORK(N+1), WORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+         DO 1967 p = 1, NR - 1
+            CALL DCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 )
+ 1967    CONTINUE
+         CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU )
+*
+         CALL DGESVJ( 'Lower', 'U', 'N', NR,NR, U, LDU, SVA, NR, A,
+     $        LDA, WORK(N+1), LWORK-N, INFO )
+         SCALEM  = WORK(N+1)
+         NUMRANK = IDNINT(WORK(N+2))
+*
+         IF ( NR .LT. M ) THEN
+            CALL DLASET( 'A',  M-NR, NR,ZERO, ZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL DLASET( 'A',NR, N1-NR, ZERO, ZERO, U(1,NR+1), LDU )
+               CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1), LDU )
+            END IF
+         END IF
+*
+         CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U,
+     $               LDU, WORK(N+1), LWORK-N, IERR )
+*
+         IF ( ROWPIV )
+     $       CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 )
+*
+         DO 1974 p = 1, N1
+            XSC = ONE / DNRM2( M, U(1,p), 1 )
+            CALL DSCAL( M, XSC, U(1,p), 1 )
+ 1974    CONTINUE
+*
+         IF ( TRANSP ) THEN
+            CALL DLACPY( 'All', N, N, U, LDU, V, LDV )
+         END IF
+*
+      ELSE
+*
+*        .. Full SVD ..
+*
+         IF ( .NOT. JRACC ) THEN
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+*           Second Preconditioning Step (QRF [with pivoting])
+*           Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
+*           equivalent to an LQF CALL. Since in many libraries the QRF
+*           seems to be better optimized than the LQF, we do explicit
+*           transpose and use the QRF. This is subject to changes in an
+*           optimized implementation of DGEJSV.
+*
+            DO 1968 p = 1, NR
+               CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+ 1968       CONTINUE
+*
+*           .. the following two loops perturb small entries to avoid
+*           denormals in the second QR factorization, where they are
+*           as good as zeros. This is done to avoid painfully slow
+*           computation with denormals. The relative size of the perturbation
+*           is a parameter that can be changed by the implementer.
+*           This perturbation device will be obsolete on machines with
+*           properly implemented arithmetic.
+*           To switch it off, set L2PERT=.FALSE. To remove it from  the
+*           code, remove the action under L2PERT=.TRUE., leave the ELSE part.
+*           The following two loops should be blocked and fused with the
+*           transposed copy above.
+*
+            IF ( L2PERT ) THEN
+               XSC = DSQRT(SMALL)
+               DO 2969 q = 1, NR
+                  TEMP1 = XSC*DABS( V(q,q) )
+                  DO 2968 p = 1, N
+                     IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 )
+     $                   .OR. ( p .LT. q ) )
+     $                   V(p,q) = DSIGN( TEMP1, V(p,q) )
+                     IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 2968             CONTINUE
+ 2969          CONTINUE
+            ELSE
+               CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+            END IF
+*
+*           Estimate the row scaled condition number of R1
+*           (If R1 is rectangular, N > NR, then the condition number
+*           of the leading NR x NR submatrix is estimated.)
+*
+            CALL DLACPY( 'L', NR, NR, V, LDV, WORK(2*N+1), NR )
+            DO 3950 p = 1, NR
+               TEMP1 = DNRM2(NR-p+1,WORK(2*N+(p-1)*NR+p),1)
+               CALL DSCAL(NR-p+1,ONE/TEMP1,WORK(2*N+(p-1)*NR+p),1)
+ 3950       CONTINUE
+            CALL DPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1,
+     $                   WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR)
+            CONDR1 = ONE / DSQRT(TEMP1)
+*           .. here need a second opinion on the condition number
+*           .. then assume worst case scenario
+*           R1 is OK for inverse <=> CONDR1 .LT. DBLE(N)
+*           more conservative    <=> CONDR1 .LT. DSQRT(DBLE(N))
+*
+            COND_OK = DSQRT(DBLE(NR))
+*[TP]       COND_OK is a tuning parameter.
+
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*              .. the second QRF without pivoting. Note: in an optimized
+*              implementation, this QRF should be implemented as the QRF
+*              of a lower triangular matrix.
+*              R1^t = Q2 * R2
+               CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+               IF ( L2PERT ) THEN
+                  XSC = DSQRT(SMALL)/EPSLN
+                  DO 3959 p = 2, NR
+                     DO 3958 q = 1, p - 1
+                        TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q)))
+                        IF ( DABS(V(q,p)) .LE. TEMP1 )
+     $                     V(q,p) = DSIGN( TEMP1, V(q,p) )
+ 3958                CONTINUE
+ 3959             CONTINUE
+               END IF
+*
+               IF ( NR .NE. N )
+     $         CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N )
+*              .. save ...
+*
+*           .. this transposed copy should be better than naive
+               DO 1969 p = 1, NR - 1
+                  CALL DCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 )
+ 1969          CONTINUE
+*
+               CONDR2 = CONDR1
+*
+            ELSE
+*
+*              .. ill-conditioned case: second QRF with pivoting
+*              Note that windowed pivoting would be equally good
+*              numerically, and more run-time efficient. So, in
+*              an optimal implementation, the next call to DGEQP3
+*              should be replaced with eg. CALL SGEQPX (ACM TOMS #782)
+*              with properly (carefully) chosen parameters.
+*
+*              R1^t * P2 = Q2 * R2
+               DO 3003 p = 1, NR
+                  IWORK(N+p) = 0
+ 3003          CONTINUE
+               CALL DGEQP3( N, NR, V, LDV, IWORK(N+1), WORK(N+1),
+     $                  WORK(2*N+1), LWORK-2*N, IERR )
+**               CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+**     $              LWORK-2*N, IERR )
+               IF ( L2PERT ) THEN
+                  XSC = DSQRT(SMALL)
+                  DO 3969 p = 2, NR
+                     DO 3968 q = 1, p - 1
+                        TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q)))
+                        IF ( DABS(V(q,p)) .LE. TEMP1 )
+     $                     V(q,p) = DSIGN( TEMP1, V(q,p) )
+ 3968                CONTINUE
+ 3969             CONTINUE
+               END IF
+*
+               CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N )
+*
+               IF ( L2PERT ) THEN
+                  XSC = DSQRT(SMALL)
+                  DO 8970 p = 2, NR
+                     DO 8971 q = 1, p - 1
+                        TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q)))
+                        V(p,q) = - DSIGN( TEMP1, V(q,p) )
+ 8971                CONTINUE
+ 8970             CONTINUE
+               ELSE
+                  CALL DLASET( 'L',NR-1,NR-1,ZERO,ZERO,V(2,1),LDV )
+               END IF
+*              Now, compute R2 = L3 * Q3, the LQ factorization.
+               CALL DGELQF( NR, NR, V, LDV, WORK(2*N+N*NR+1),
+     $               WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*              .. and estimate the condition number
+               CALL DLACPY( 'L',NR,NR,V,LDV,WORK(2*N+N*NR+NR+1),NR )
+               DO 4950 p = 1, NR
+                  TEMP1 = DNRM2( p, WORK(2*N+N*NR+NR+p), NR )
+                  CALL DSCAL( p, ONE/TEMP1, WORK(2*N+N*NR+NR+p), NR )
+ 4950          CONTINUE
+               CALL DPOCON( 'L',NR,WORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $              WORK(2*N+N*NR+NR+NR*NR+1),IWORK(M+2*N+1),IERR )
+               CONDR2 = ONE / DSQRT(TEMP1)
+*
+               IF ( CONDR2 .GE. COND_OK ) THEN
+*                 .. save the Householder vectors used for Q3
+*                 (this overwrites the copy of R2, as it will not be
+*                 needed in this branch, but it does not overwritte the
+*                 Huseholder vectors of Q2.).
+                  CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
+*                 .. and the rest of the information on Q3 is in
+*                 WORK(2*N+N*NR+1:2*N+N*NR+N)
+               END IF
+*
+            END IF
+*
+            IF ( L2PERT ) THEN
+               XSC = DSQRT(SMALL)
+               DO 4968 q = 2, NR
+                  TEMP1 = XSC * V(q,q)
+                  DO 4969 p = 1, q - 1
+*                    V(p,q) = - DSIGN( TEMP1, V(q,p) )
+                     V(p,q) = - DSIGN( TEMP1, V(p,q) )
+ 4969             CONTINUE
+ 4968          CONTINUE
+            ELSE
+               CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+            END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower trinagular.
+*
+*        Recover the right singular vectors as solution of a well
+*        conditioned triangular matrix equation.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*
+               CALL DGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U,
+     $              LDU,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,INFO )
+               SCALEM  = WORK(2*N+N*NR+NR+1)
+               NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+               DO 3970 p = 1, NR
+                  CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
+                  CALL DSCAL( NR, SVA(p),    V(1,p), 1 )
+ 3970          CONTINUE
+
+*        .. pick the right matrix equation and solve it
+*
+               IF ( NR .EQ. N ) THEN
+* :))             .. best case, R1 is inverted. The solution of this matrix
+*                 equation is Q2*V2 = the product of the Jacobi rotations
+*                 used in DGESVJ, premultiplied with the orthogonal matrix
+*                 from the second QR factorization.
+                  CALL DTRSM( 'L','U','N','N', NR,NR,ONE, A,LDA, V,LDV )
+               ELSE
+*                 .. R1 is well conditioned, but non-square. Transpose(R2)
+*                 is inverted to get the product of the Jacobi rotations
+*                 used in DGESVJ. The Q-factor from the second QR
+*                 factorization is then built in explicitly.
+                  CALL DTRSM('L','U','T','N',NR,NR,ONE,WORK(2*N+1),
+     $                 N,V,LDV)
+                  IF ( NR .LT. N ) THEN
+                    CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV)
+                    CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                    CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV)
+                  END IF
+                  CALL DORMQR('L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $                 V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+               END IF
+*
+            ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+* :)           .. the input matrix A is very likely a relative of
+*              the Kahan matrix :)
+*              The matrix R2 is inverted. The solution of the matrix equation
+*              is Q3^T*V3 = the product of the Jacobi rotations (appplied to
+*              the lower triangular L3 from the LQ factorization of
+*              R2=L3*Q3), pre-multiplied with the transposed Q3.
+               CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
+               SCALEM  = WORK(2*N+N*NR+NR+1)
+               NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+               DO 3870 p = 1, NR
+                  CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
+                  CALL DSCAL( NR, SVA(p),    U(1,p), 1 )
+ 3870          CONTINUE
+               CALL DTRSM('L','U','N','N',NR,NR,ONE,WORK(2*N+1),N,U,LDU)
+*              .. apply the permutation from the second QR factorization
+               DO 873 q = 1, NR
+                  DO 872 p = 1, NR
+                     WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872              CONTINUE
+                  DO 874 p = 1, NR
+                     U(p,q) = WORK(2*N+N*NR+NR+p)
+ 874              CONTINUE
+ 873           CONTINUE
+               IF ( NR .LT. N ) THEN
+                  CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV )
+                  CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV )
+                  CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV )
+               END IF
+               CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $              V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+            ELSE
+*              Last line of defense.
+* #:(          This is a rather pathological case: no scaled condition
+*              improvement after two pivoted QR factorizations. Other
+*              possibility is that the rank revealing QR factorization
+*              or the condition estimator has failed, or the COND_OK
+*              is set very close to ONE (which is unnecessary). Normally,
+*              this branch should never be executed, but in rare cases of
+*              failure of the RRQR or condition estimator, the last line of
+*              defense ensures that DGEJSV completes the task.
+*              Compute the full SVD of L3 using DGESVJ with explicit
+*              accumulation of Jacobi rotations.
+               CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
+               SCALEM  = WORK(2*N+N*NR+NR+1)
+               NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+               IF ( NR .LT. N ) THEN
+                  CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV )
+                  CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV )
+                  CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV )
+               END IF
+               CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $              V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+               CALL DORMLQ( 'L', 'T', NR, NR, NR, WORK(2*N+1), N,
+     $              WORK(2*N+N*NR+1), U, LDU, WORK(2*N+N*NR+NR+1),
+     $              LWORK-2*N-N*NR-NR, IERR )
+               DO 773 q = 1, NR
+                  DO 772 p = 1, NR
+                     WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772              CONTINUE
+                  DO 774 p = 1, NR
+                     U(p,q) = WORK(2*N+N*NR+NR+p)
+ 774              CONTINUE
+ 773           CONTINUE
+*
+            END IF
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = DSQRT(DBLE(N)) * EPSLN
+            DO 1972 q = 1, N
+               DO 972 p = 1, N
+                  WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+  972          CONTINUE
+               DO 973 p = 1, N
+                  V(p,q) = WORK(2*N+N*NR+NR+p)
+  973          CONTINUE
+               XSC = ONE / DNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL DSCAL( N, XSC, V(1,q), 1 )
+ 1972       CONTINUE
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+            IF ( NR .LT. M ) THEN
+               CALL DLASET( 'A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU )
+               IF ( NR .LT. N1 ) THEN
+                  CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                  CALL DLASET('A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1),LDU)
+               END IF
+            END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           matrix U. This applies to all cases.
+*
+            CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, WORK, U,
+     $           LDU, WORK(N+1), LWORK-N, IERR )
+
+*           The columns of U are normalized. The cost is O(M*N) flops.
+            TEMP1 = DSQRT(DBLE(M)) * EPSLN
+            DO 1973 p = 1, NR
+               XSC = ONE / DNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $          CALL DSCAL( M, XSC, U(1,p), 1 )
+ 1973       CONTINUE
+*
+*           If the initial QRF is computed with row pivoting, the left
+*           singular vectors must be adjusted.
+*
+            IF ( ROWPIV )
+     $          CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 )
+*
+         ELSE
+*
+*        .. the initial matrix A has almost orthogonal columns and
+*        the second QRF is not needed
+*
+            CALL DLACPY( 'Upper', N, N, A, LDA, WORK(N+1), N )
+            IF ( L2PERT ) THEN
+               XSC = DSQRT(SMALL)
+               DO 5970 p = 2, N
+                  TEMP1 = XSC * WORK( N + (p-1)*N + p )
+                  DO 5971 q = 1, p - 1
+                     WORK(N+(q-1)*N+p)=-DSIGN(TEMP1,WORK(N+(p-1)*N+q))
+ 5971             CONTINUE
+ 5970          CONTINUE
+            ELSE
+               CALL DLASET( 'Lower',N-1,N-1,ZERO,ZERO,WORK(N+2),N )
+            END IF
+*
+            CALL DGESVJ( 'Upper', 'U', 'N', N, N, WORK(N+1), N, SVA,
+     $           N, U, LDU, WORK(N+N*N+1), LWORK-N-N*N, INFO )
+*
+            SCALEM  = WORK(N+N*N+1)
+            NUMRANK = IDNINT(WORK(N+N*N+2))
+            DO 6970 p = 1, N
+               CALL DCOPY( N, WORK(N+(p-1)*N+1), 1, U(1,p), 1 )
+               CALL DSCAL( N, SVA(p), WORK(N+(p-1)*N+1), 1 )
+ 6970       CONTINUE
+*
+            CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N,
+     $           ONE, A, LDA, WORK(N+1), N )
+            DO 6972 p = 1, N
+               CALL DCOPY( N, WORK(N+p), N, V(IWORK(p),1), LDV )
+ 6972       CONTINUE
+            TEMP1 = DSQRT(DBLE(N))*EPSLN
+            DO 6971 p = 1, N
+               XSC = ONE / DNRM2( N, V(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL DSCAL( N, XSC, V(1,p), 1 )
+ 6971       CONTINUE
+*
+*           Assemble the left singular vector matrix U (M x N).
+*
+            IF ( N .LT. M ) THEN
+               CALL DLASET( 'A',  M-N, N, ZERO, ZERO, U(N+1,1), LDU )
+               IF ( N .LT. N1 ) THEN
+                  CALL DLASET( 'A',N,  N1-N, ZERO, ZERO,  U(1,N+1),LDU )
+                  CALL DLASET( 'A',M-N,N1-N, ZERO, ONE,U(N+1,N+1),LDU )
+               END IF
+            END IF
+            CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U,
+     $           LDU, WORK(N+1), LWORK-N, IERR )
+            TEMP1 = DSQRT(DBLE(M))*EPSLN
+            DO 6973 p = 1, N1
+               XSC = ONE / DNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL DSCAL( M, XSC, U(1,p), 1 )
+ 6973       CONTINUE
+*
+            IF ( ROWPIV )
+     $         CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 )
+*
+         END IF
+*
+*        end of the  >> almost orthogonal case <<  in the full SVD
+*
+         ELSE
+*
+*        This branch deploys a preconditioned Jacobi SVD with explicitly
+*        accumulated rotations. It is included as optional, mainly for
+*        experimental purposes. It does perform well, and can also be used.
+*        In this implementation, this branch will be automatically activated
+*        if the  condition number sigma_max(A) / sigma_min(A) is predicted
+*        to be greater than the overflow threshold. This is because the
+*        a posteriori computation of the singular vectors assumes robust
+*        implementation of BLAS and some LAPACK procedures, capable of working
+*        in presence of extreme values. Since that is not always the case, ...
+*
+         DO 7968 p = 1, NR
+            CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+ 7968    CONTINUE
+*
+         IF ( L2PERT ) THEN
+            XSC = DSQRT(SMALL/EPSLN)
+            DO 5969 q = 1, NR
+               TEMP1 = XSC*DABS( V(q,q) )
+               DO 5968 p = 1, N
+                  IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 )
+     $                .OR. ( p .LT. q ) )
+     $                V(p,q) = DSIGN( TEMP1, V(p,q) )
+                  IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 5968          CONTINUE
+ 5969       CONTINUE
+         ELSE
+            CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+         END IF
+
+         CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+     $        LWORK-2*N, IERR )
+         CALL DLACPY( 'L', N, NR, V, LDV, WORK(2*N+1), N )
+*
+         DO 7969 p = 1, NR
+            CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
+ 7969    CONTINUE
+
+         IF ( L2PERT ) THEN
+            XSC = DSQRT(SMALL/EPSLN)
+            DO 9970 q = 2, NR
+               DO 9971 p = 1, q - 1
+                  TEMP1 = XSC * MIN(DABS(U(p,p)),DABS(U(q,q)))
+                  U(p,q) = - DSIGN( TEMP1, U(q,p) )
+ 9971          CONTINUE
+ 9970       CONTINUE
+         ELSE
+            CALL DLASET('U', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU )
+         END IF
+
+         CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA,
+     $        N, V, LDV, WORK(2*N+N*NR+1), LWORK-2*N-N*NR, INFO )
+         SCALEM  = WORK(2*N+N*NR+1)
+         NUMRANK = IDNINT(WORK(2*N+N*NR+2))
+
+         IF ( NR .LT. N ) THEN
+            CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV )
+            CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV )
+            CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV )
+         END IF
+
+         CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $        V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = DSQRT(DBLE(N)) * EPSLN
+            DO 7972 q = 1, N
+               DO 8972 p = 1, N
+                  WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 8972          CONTINUE
+               DO 8973 p = 1, N
+                  V(p,q) = WORK(2*N+N*NR+NR+p)
+ 8973          CONTINUE
+               XSC = ONE / DNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL DSCAL( N, XSC, V(1,q), 1 )
+ 7972       CONTINUE
+*
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+*
+         IF ( NR .LT. M ) THEN
+            CALL DLASET( 'A',  M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL DLASET( 'A',NR,  N1-NR, ZERO, ZERO,  U(1,NR+1),LDU )
+               CALL DLASET( 'A',M-NR,N1-NR, ZERO, ONE,U(NR+1,NR+1),LDU )
+            END IF
+         END IF
+*
+         CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U,
+     $        LDU, WORK(N+1), LWORK-N, IERR )
+*
+            IF ( ROWPIV )
+     $         CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 )
+*
+*
+         END IF
+         IF ( TRANSP ) THEN
+*           .. swap U and V because the procedure worked on A^t
+            DO 6974 p = 1, N
+               CALL DSWAP( N, U(1,p), 1, V(1,p), 1 )
+ 6974       CONTINUE
+         END IF
+*
+      END IF
+*     end of the full SVD
+*
+*     Undo scaling, if necessary (and possible)
+*
+      IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN
+         CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
+         USCAL1 = ONE
+         USCAL2 = ONE
+      END IF
+*
+      IF ( NR .LT. N ) THEN
+         DO 3004 p = NR+1, N
+            SVA(p) = ZERO
+ 3004    CONTINUE
+      END IF
+*
+      WORK(1) = USCAL2 * SCALEM
+      WORK(2) = USCAL1
+      IF ( ERREST ) WORK(3) = SCONDA
+      IF ( LSVEC .AND. RSVEC ) THEN
+         WORK(4) = CONDR1
+         WORK(5) = CONDR2
+      END IF
+      IF ( L2TRAN ) THEN
+         WORK(6) = ENTRA
+         WORK(7) = ENTRAT
+      END IF
+*
+      IWORK(1) = NR
+      IWORK(2) = NUMRANK
+      IWORK(3) = WARNING
+*
+      RETURN
+*     ..
+*     .. END OF DGEJSV
+*     ..
+      END
+*
diff --git a/lapack-netlib/dgesvx.f b/lapack-netlib/dgesvx.f
new file mode 100644
index 000000000..f787488dc
--- /dev/null
+++ b/lapack-netlib/dgesvx.f
@@ -0,0 +1,599 @@
+*> \brief <b> DGESVX computes the solution to system of linear equations A * X = B for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGESVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgesvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgesvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgesvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+*                          EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+*                          WORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+*       DOUBLE PRECISION   RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * ), IWORK( * )
+*       DOUBLE PRECISION   A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+*      $                   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGESVX uses the LU factorization to compute the solution to a real
+*> system of linear equations
+*>    A * X = B,
+*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = P * L * U,
+*>    where P is a permutation matrix, L is a unit lower triangular
+*>    matrix, and U is upper triangular.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AF and IPIV contain the factored form of A.
+*>                  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  A, AF, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AF and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AF and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations:
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the N-by-N matrix A.  If FACT = 'F' and EQUED is
+*>          not 'N', then A must have been equilibrated by the scaling
+*>          factors in R and/or C.  A is not modified if FACT = 'F' or
+*>          'N', or if FACT = 'E' and EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] AF
+*> \verbatim
+*>          AF is DOUBLE PRECISION array, dimension (LDAF,N)
+*>          If FACT = 'F', then AF is an input argument and on entry
+*>          contains the factors L and U from the factorization
+*>          A = P*L*U as computed by DGETRF.  If EQUED .ne. 'N', then
+*>          AF is the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the equilibrated matrix A (see the description of A for
+*>          the form of the equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAF
+*> \verbatim
+*>          LDAF is INTEGER
+*>          The leading dimension of the array AF.  LDAF >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          as computed by DGETRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is DOUBLE PRECISION array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (LDB,NRHS)
+*>          On entry, the N-by-NRHS right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is DOUBLE PRECISION array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is DOUBLE PRECISION
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,4*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleGEsolve
+*
+*  =====================================================================
+      SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      DOUBLE PRECISION   A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, DLANGE, DLANTR
+      EXTERNAL           LSAME, DLAMCH, DLANGE, DLANTR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGECON, DGEEQU, DGERFS, DGETRF, DGETRS, DLACPY,
+     $                   DLAQGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL DGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL DLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*
+         CALL DLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL DGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = DLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $               WORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = DLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW
+            END IF
+            WORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = DLANGE( NORM, N, N, A, LDA, WORK )
+      RPVGRW = DLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = DLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL DGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL DGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL DGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, IWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*
+      WORK( 1 ) = RPVGRW
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+      RETURN
+*
+*     End of DGESVX
+*
+      END
diff --git a/lapack-netlib/sgbsvx.f b/lapack-netlib/sgbsvx.f
new file mode 100644
index 000000000..df3a721d9
--- /dev/null
+++ b/lapack-netlib/sgbsvx.f
@@ -0,0 +1,641 @@
+*> \brief <b> SGBSVX computes the solution to system of linear equations A * X = B for GB matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGBSVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgbsvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgbsvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgbsvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+*                          LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+*                          RCOND, FERR, BERR, WORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+*       REAL               RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * ), IWORK( * )
+*       REAL               AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+*      $                   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGBSVX uses the LU factorization to compute the solution to a real
+*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+*> where A is a band matrix of order N with KL subdiagonals and KU
+*> superdiagonals, and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed by this subroutine:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = L * U,
+*>    where L is a product of permutation and unit lower triangular
+*>    matrices with KL subdiagonals, and U is upper triangular with
+*>    KL+KU superdiagonals.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AFB and IPIV contain the factored form of
+*>                  A.  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  AB, AFB, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AFB and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AFB and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations.
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KL
+*> \verbatim
+*>          KL is INTEGER
+*>          The number of subdiagonals within the band of A.  KL >= 0.
+*> \endverbatim
+*>
+*> \param[in] KU
+*> \verbatim
+*>          KU is INTEGER
+*>          The number of superdiagonals within the band of A.  KU >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is REAL array, dimension (LDAB,N)
+*>          On entry, the matrix A in band storage, in rows 1 to KL+KU+1.
+*>          The j-th column of A is stored in the j-th column of the
+*>          array AB as follows:
+*>          AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl)
+*>
+*>          If FACT = 'F' and EQUED is not 'N', then A must have been
+*>          equilibrated by the scaling factors in R and/or C.  AB is not
+*>          modified if FACT = 'F' or 'N', or if FACT = 'E' and
+*>          EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] AFB
+*> \verbatim
+*>          AFB is REAL array, dimension (LDAFB,N)
+*>          If FACT = 'F', then AFB is an input argument and on entry
+*>          contains details of the LU factorization of the band matrix
+*>          A, as computed by SGBTRF.  U is stored as an upper triangular
+*>          band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1,
+*>          and the multipliers used during the factorization are stored
+*>          in rows KL+KU+2 to 2*KL+KU+1.  If EQUED .ne. 'N', then AFB is
+*>          the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of A.
+*>
+*>          If FACT = 'E', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of the equilibrated
+*>          matrix A (see the description of AB for the form of the
+*>          equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAFB
+*> \verbatim
+*>          LDAFB is INTEGER
+*>          The leading dimension of the array AFB.  LDAFB >= 2*KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = L*U
+*>          as computed by SGBTRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is REAL array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is REAL array, dimension (LDB,NRHS)
+*>          On entry, the right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is REAL array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is REAL
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,3*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup realGBsolve
+*
+*  =====================================================================
+      SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      REAL               AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*  Moved setting of INFO = N+1 so INFO does not subsequently get
+*  overwritten.  Sven, 17 Mar 05.
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J, J1, J2
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               SLAMCH, SLANGB, SLANTB
+      EXTERNAL           LSAME, SLAMCH, SLANGB, SLANTB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SGBCON, SGBEQU, SGBRFS, SGBTRF, SGBTRS,
+     $                   SLACPY, SLAQGB, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KL.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KU.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDAB.LT.KL+KU+1 ) THEN
+         INFO = -8
+      ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN
+         INFO = -10
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -12
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -13
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -14
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -18
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SGBSVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL SGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL SLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                   AMAX, EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of the band matrix A.
+*
+         DO 70 J = 1, N
+            J1 = MAX( J-KU, 1 )
+            J2 = MIN( J+KL, N )
+            CALL SCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1,
+     $                  AFB( KL+KU+1-J+J1, J ), 1 )
+   70    CONTINUE
+*
+         CALL SGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            ANORM = ZERO
+            DO 90 J = 1, INFO
+               DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 )
+                  ANORM = MAX( ANORM, ABS( AB( I, J ) ) )
+   80          CONTINUE
+   90       CONTINUE
+            RPVGRW = SLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ),
+     $                       AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB,
+     $                       WORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ANORM / RPVGRW
+            END IF
+            WORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = SLANGB( NORM, N, KL, KU, AB, LDAB, WORK )
+      RPVGRW = SLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = SLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL SGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND,
+     $             WORK, IWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL SGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX,
+     $             INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL SGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV,
+     $             B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 110 J = 1, NRHS
+               DO 100 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+  100          CONTINUE
+  110       CONTINUE
+            DO 120 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+  120       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 140 J = 1, NRHS
+            DO 130 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  130       CONTINUE
+  140    CONTINUE
+         DO 150 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  150    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      WORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of SGBSVX
+*
+      END
diff --git a/lapack-netlib/sgesvx.f b/lapack-netlib/sgesvx.f
new file mode 100644
index 000000000..385e626cf
--- /dev/null
+++ b/lapack-netlib/sgesvx.f
@@ -0,0 +1,599 @@
+*> \brief <b> SGESVX computes the solution to system of linear equations A * X = B for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGESVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgesvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgesvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgesvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+*                          EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+*                          WORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+*       REAL               RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * ), IWORK( * )
+*       REAL               A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+*      $                   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGESVX uses the LU factorization to compute the solution to a real
+*> system of linear equations
+*>    A * X = B,
+*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = P * L * U,
+*>    where P is a permutation matrix, L is a unit lower triangular
+*>    matrix, and U is upper triangular.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AF and IPIV contain the factored form of A.
+*>                  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  A, AF, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AF and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AF and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations:
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          On entry, the N-by-N matrix A.  If FACT = 'F' and EQUED is
+*>          not 'N', then A must have been equilibrated by the scaling
+*>          factors in R and/or C.  A is not modified if FACT = 'F' or
+*>          'N', or if FACT = 'E' and EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] AF
+*> \verbatim
+*>          AF is REAL array, dimension (LDAF,N)
+*>          If FACT = 'F', then AF is an input argument and on entry
+*>          contains the factors L and U from the factorization
+*>          A = P*L*U as computed by SGETRF.  If EQUED .ne. 'N', then
+*>          AF is the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the equilibrated matrix A (see the description of A for
+*>          the form of the equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAF
+*> \verbatim
+*>          LDAF is INTEGER
+*>          The leading dimension of the array AF.  LDAF >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          as computed by SGETRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is REAL array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is REAL array, dimension (LDB,NRHS)
+*>          On entry, the N-by-NRHS right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is REAL array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is REAL
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,4*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup realGEsolve
+*
+*  =====================================================================
+      SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      REAL               A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               SLAMCH, SLANGE, SLANTR
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SLANTR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGECON, SGEEQU, SGERFS, SGETRF, SGETRS, SLACPY,
+     $                   SLAQGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL SGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL SLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*
+         CALL SLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL SGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = SLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $               WORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = SLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW
+            END IF
+            WORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = SLANGE( NORM, N, N, A, LDA, WORK )
+      RPVGRW = SLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = SLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL SGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL SGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL SGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, IWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      WORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of SGESVX
+*
+      END
diff --git a/lapack-netlib/zgbsvx.f b/lapack-netlib/zgbsvx.f
new file mode 100644
index 000000000..871564a81
--- /dev/null
+++ b/lapack-netlib/zgbsvx.f
@@ -0,0 +1,644 @@
+*> \brief <b> ZGBSVX computes the solution to system of linear equations A * X = B for GB matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGBSVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgbsvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgbsvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgbsvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+*                          LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+*                          RCOND, FERR, BERR, WORK, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+*       DOUBLE PRECISION   RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * )
+*       DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   RWORK( * )
+*       COMPLEX*16         AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGBSVX uses the LU factorization to compute the solution to a complex
+*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+*> where A is a band matrix of order N with KL subdiagonals and KU
+*> superdiagonals, and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed by this subroutine:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = L * U,
+*>    where L is a product of permutation and unit lower triangular
+*>    matrices with KL subdiagonals, and U is upper triangular with
+*>    KL+KU superdiagonals.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AFB and IPIV contain the factored form of
+*>                  A.  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  AB, AFB, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AFB and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AFB and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations.
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KL
+*> \verbatim
+*>          KL is INTEGER
+*>          The number of subdiagonals within the band of A.  KL >= 0.
+*> \endverbatim
+*>
+*> \param[in] KU
+*> \verbatim
+*>          KU is INTEGER
+*>          The number of superdiagonals within the band of A.  KU >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX*16 array, dimension (LDAB,N)
+*>          On entry, the matrix A in band storage, in rows 1 to KL+KU+1.
+*>          The j-th column of A is stored in the j-th column of the
+*>          array AB as follows:
+*>          AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl)
+*>
+*>          If FACT = 'F' and EQUED is not 'N', then A must have been
+*>          equilibrated by the scaling factors in R and/or C.  AB is not
+*>          modified if FACT = 'F' or 'N', or if FACT = 'E' and
+*>          EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] AFB
+*> \verbatim
+*>          AFB is COMPLEX*16 array, dimension (LDAFB,N)
+*>          If FACT = 'F', then AFB is an input argument and on entry
+*>          contains details of the LU factorization of the band matrix
+*>          A, as computed by ZGBTRF.  U is stored as an upper triangular
+*>          band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1,
+*>          and the multipliers used during the factorization are stored
+*>          in rows KL+KU+2 to 2*KL+KU+1.  If EQUED .ne. 'N', then AFB is
+*>          the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of A.
+*>
+*>          If FACT = 'E', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of the equilibrated
+*>          matrix A (see the description of AB for the form of the
+*>          equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAFB
+*> \verbatim
+*>          LDAFB is INTEGER
+*>          The leading dimension of the array AFB.  LDAFB >= 2*KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = L*U
+*>          as computed by ZGBTRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is DOUBLE PRECISION array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (LDB,NRHS)
+*>          On entry, the right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is COMPLEX*16 array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is DOUBLE PRECISION
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16GBsolve
+*
+*  =====================================================================
+      SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, RWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX*16         AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*  Moved setting of INFO = N+1 so INFO does not subsequently get
+*  overwritten.  Sven, 17 Mar 05.
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J, J1, J2
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, ZLANGB, ZLANTB
+      EXTERNAL           LSAME, DLAMCH, ZLANGB, ZLANTB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZCOPY, ZGBCON, ZGBEQU, ZGBRFS, ZGBTRF,
+     $                   ZGBTRS, ZLACPY, ZLAQGB
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KL.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KU.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDAB.LT.KL+KU+1 ) THEN
+         INFO = -8
+      ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN
+         INFO = -10
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -12
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -13
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -14
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -18
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGBSVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL ZGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL ZLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                   AMAX, EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of the band matrix A.
+*
+         DO 70 J = 1, N
+            J1 = MAX( J-KU, 1 )
+            J2 = MIN( J+KL, N )
+            CALL ZCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1,
+     $                  AFB( KL+KU+1-J+J1, J ), 1 )
+   70    CONTINUE
+*
+         CALL ZGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            ANORM = ZERO
+            DO 90 J = 1, INFO
+               DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 )
+                  ANORM = MAX( ANORM, ABS( AB( I, J ) ) )
+   80          CONTINUE
+   90       CONTINUE
+            RPVGRW = ZLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ),
+     $                       AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB,
+     $                       RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ANORM / RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = ZLANGB( NORM, N, KL, KU, AB, LDAB, RWORK )
+      RPVGRW = ZLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = ZLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL ZGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND,
+     $             WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL ZGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX,
+     $             INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL ZGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV,
+     $             B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 110 J = 1, NRHS
+               DO 100 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+  100          CONTINUE
+  110       CONTINUE
+            DO 120 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+  120       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 140 J = 1, NRHS
+            DO 130 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  130       CONTINUE
+  140    CONTINUE
+         DO 150 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  150    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of ZGBSVX
+*
+      END
diff --git a/lapack-netlib/zgejsv.f b/lapack-netlib/zgejsv.f
new file mode 100644
index 000000000..5fe899e50
--- /dev/null
+++ b/lapack-netlib/zgejsv.f
@@ -0,0 +1,2234 @@
+*> \brief \b ZGEJSV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGEJSV + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgejsv.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgejsv.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgejsv.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+*                         M, N, A, LDA, SVA, U, LDU, V, LDV,
+*                         CWORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*     .. Scalar Arguments ..
+*     IMPLICIT    NONE
+*     INTEGER     INFO, LDA, LDU, LDV, LWORK, M, N
+*     ..
+*     .. Array Arguments ..
+*     COMPLEX*16     A( LDA, * ),  U( LDU, * ), V( LDV, * ), CWORK( LWORK )
+*     DOUBLE PRECISION   SVA( N ), RWORK( LRWORK )
+*     INTEGER     IWORK( * )
+*     CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGEJSV computes the singular value decomposition (SVD) of a complex M-by-N
+*> matrix [A], where M >= N. The SVD of [A] is written as
+*>
+*>              [A] = [U] * [SIGMA] * [V]^*,
+*>
+*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and
+*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are
+*> the singular values of [A]. The columns of [U] and [V] are the left and
+*> the right singular vectors of [A], respectively. The matrices [U] and [V]
+*> are computed and stored in the arrays U and V, respectively. The diagonal
+*> of [SIGMA] is computed and stored in the array SVA.
+*> \endverbatim
+*>
+*>  Arguments:
+*>  ==========
+*>
+*> \param[in] JOBA
+*> \verbatim
+*>          JOBA is CHARACTER*1
+*>         Specifies the level of accuracy:
+*>       = 'C': This option works well (high relative accuracy) if A = B * D,
+*>              with well-conditioned B and arbitrary diagonal matrix D.
+*>              The accuracy cannot be spoiled by COLUMN scaling. The
+*>              accuracy of the computed output depends on the condition of
+*>              B, and the procedure aims at the best theoretical accuracy.
+*>              The relative error max_{i=1:N}|d sigma_i| / sigma_i is
+*>              bounded by f(M,N)*epsilon* cond(B), independent of D.
+*>              The input matrix is preprocessed with the QRF with column
+*>              pivoting. This initial preprocessing and preconditioning by
+*>              a rank revealing QR factorization is common for all values of
+*>              JOBA. Additional actions are specified as follows:
+*>       = 'E': Computation as with 'C' with an additional estimate of the
+*>              condition number of B. It provides a realistic error bound.
+*>       = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings
+*>              D1, D2, and well-conditioned matrix C, this option gives
+*>              higher accuracy than the 'C' option. If the structure of the
+*>              input matrix is not known, and relative accuracy is
+*>              desirable, then this option is advisable. The input matrix A
+*>              is preprocessed with QR factorization with FULL (row and
+*>              column) pivoting.
+*>       = 'G': Computation as with 'F' with an additional estimate of the
+*>              condition number of B, where A=B*D. If A has heavily weighted
+*>              rows, then using this condition number gives too pessimistic
+*>              error bound.
+*>       = 'A': Small singular values are not well determined by the data 
+*>              and are considered as noisy; the matrix is treated as
+*>              numerically rank deficient. The error in the computed
+*>              singular values is bounded by f(m,n)*epsilon*||A||.
+*>              The computed SVD A = U * S * V^* restores A up to
+*>              f(m,n)*epsilon*||A||.
+*>              This gives the procedure the licence to discard (set to zero)
+*>              all singular values below N*epsilon*||A||.
+*>       = 'R': Similar as in 'A'. Rank revealing property of the initial
+*>              QR factorization is used do reveal (using triangular factor)
+*>              a gap sigma_{r+1} < epsilon * sigma_r in which case the
+*>              numerical RANK is declared to be r. The SVD is computed with
+*>              absolute error bounds, but more accurately than with 'A'.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>         Specifies whether to compute the columns of U:
+*>       = 'U': N columns of U are returned in the array U.
+*>       = 'F': full set of M left sing. vectors is returned in the array U.
+*>       = 'W': U may be used as workspace of length M*N. See the description
+*>              of U.
+*>       = 'N': U is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>         Specifies whether to compute the matrix V:
+*>       = 'V': N columns of V are returned in the array V; Jacobi rotations
+*>              are not explicitly accumulated.
+*>       = 'J': N columns of V are returned in the array V, but they are
+*>              computed as the product of Jacobi rotations, if JOBT = 'N'.
+*>       = 'W': V may be used as workspace of length N*N. See the description
+*>              of V.
+*>       = 'N': V is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>         Specifies the RANGE for the singular values. Issues the licence to
+*>         set to zero small positive singular values if they are outside
+*>         specified range. If A .NE. 0 is scaled so that the largest singular
+*>         value of c*A is around SQRT(BIG), BIG=DLAMCH('O'), then JOBR issues
+*>         the licence to kill columns of A whose norm in c*A is less than
+*>         SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
+*>         where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E').
+*>       = 'N': Do not kill small columns of c*A. This option assumes that
+*>              BLAS and QR factorizations and triangular solvers are
+*>              implemented to work in that range. If the condition of A
+*>              is greater than BIG, use ZGESVJ.
+*>       = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)]
+*>              (roughly, as described above). This option is recommended.
+*>                                             ===========================
+*>         For computing the singular values in the FULL range [SFMIN,BIG]
+*>         use ZGESVJ.
+*> \endverbatim
+*>
+*> \param[in] JOBT
+*> \verbatim
+*>          JOBT is CHARACTER*1
+*>         If the matrix is square then the procedure may determine to use
+*>         transposed A if A^* seems to be better with respect to convergence.
+*>         If the matrix is not square, JOBT is ignored. 
+*>         The decision is based on two values of entropy over the adjoint
+*>         orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
+*>       = 'T': transpose if entropy test indicates possibly faster
+*>         convergence of Jacobi process if A^* is taken as input. If A is
+*>         replaced with A^*, then the row pivoting is included automatically.
+*>       = 'N': do not speculate.
+*>         The option 'T' can be used to compute only the singular values, or
+*>         the full SVD (U, SIGMA and V). For only one set of singular vectors
+*>         (U or V), the caller should provide both U and V, as one of the
+*>         matrices is used as workspace if the matrix A is transposed.
+*>         The implementer can easily remove this constraint and make the
+*>         code more complicated. See the descriptions of U and V.
+*>         In general, this option is considered experimental, and 'N'; should
+*>         be preferred. This is subject to changes in the future.
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>          JOBP is CHARACTER*1
+*>         Issues the licence to introduce structured perturbations to drown
+*>         denormalized numbers. This licence should be active if the
+*>         denormals are poorly implemented, causing slow computation,
+*>         especially in cases of fast convergence (!). For details see [1,2].
+*>         For the sake of simplicity, this perturbations are included only
+*>         when the full SVD or only the singular values are requested. The
+*>         implementer/user can easily add the perturbation for the cases of
+*>         computing one set of singular vectors.
+*>       = 'P': introduce perturbation
+*>       = 'N': do not perturb
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>         The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>         The number of columns of the input matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] SVA
+*> \verbatim
+*>          SVA is DOUBLE PRECISION array, dimension (N)
+*>          On exit,
+*>          - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>            the computation SVA contains Euclidean column norms of the
+*>            iterated matrices in the array A.
+*>          - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>            (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
+*>            sigma_max(A) overflows or if small singular values have been
+*>            saved from underflow by scaling the input matrix A.
+*>          - If JOBR='R' then some of the singular values may be returned
+*>            as exact zeros obtained by "set to zero" because they are
+*>            below the numerical rank threshold or are denormalized numbers.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is COMPLEX*16 array, dimension ( LDU, N )
+*>          If JOBU = 'U', then U contains on exit the M-by-N matrix of
+*>                         the left singular vectors.
+*>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
+*>                         the left singular vectors, including an ONB
+*>                         of the orthogonal complement of the Range(A).
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
+*>                         then U is used as workspace if the procedure
+*>                         replaces A with A^*. In that case, [V] is computed
+*>                         in U as left singular vectors of A^* and then
+*>                         copied back to the V array. This 'W' option is just
+*>                         a reminder to the caller that in this case U is
+*>                         reserved as workspace of length N*N.
+*>          If JOBU = 'N'  U is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER
+*>          The leading dimension of the array U,  LDU >= 1.
+*>          IF  JOBU = 'U' or 'F' or 'W',  then LDU >= M.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension ( LDV, N )
+*>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
+*>                         the right singular vectors;
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
+*>                         then V is used as workspace if the pprocedure
+*>                         replaces A with A^*. In that case, [U] is computed
+*>                         in V as right singular vectors of A^* and then
+*>                         copied back to the U array. This 'W' option is just
+*>                         a reminder to the caller that in this case V is
+*>                         reserved as workspace of length N*N.
+*>          If JOBV = 'N'  V is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V,  LDV >= 1.
+*>          If JOBV = 'V' or 'J' or 'W', then LDV >= N.
+*> \endverbatim
+*>
+*> \param[out] CWORK
+*> \verbatim
+*>          CWORK is COMPLEX*16 array, dimension (MAX(2,LWORK))
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit CWORK(1) contains the required length of
+*>          CWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          Length of CWORK to confirm proper allocation of workspace.
+*>          LWORK depends on the job:
+*>
+*>          1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and
+*>            1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>               LWORK >= 2*N+1. This is the minimal requirement.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= N + (N+1)*NB. Here NB is the optimal
+*>               block size for ZGEQP3 and ZGEQRF.
+*>               In general, optimal LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ)).
+*>            1.2. .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G'). In this case, LWORK the minimal
+*>               requirement is LWORK >= N*N + 2*N.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ),
+*>                            N*N+LWORK(ZPOCON)).
+*>          2. If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>             (JOBU = 'N')
+*>            2.1   .. no scaled condition estimate requested (JOBE = 'N'):    
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance, 
+*>               LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF,
+*>               ZUNMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3), N+LWORK(ZGESVJ),
+*>                       N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)).
+*>            2.2 .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G').
+*>            -> the minimal requirement is LWORK >= 3*N.      
+*>            -> For optimal performance, 
+*>               LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF,
+*>               ZUNMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3), LWORK(ZPOCON), N+LWORK(ZGESVJ),
+*>                       N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)).   
+*>          3. If SIGMA and the left singular vectors are needed
+*>            3.1  .. no scaled condition estimate requested (JOBE = 'N'):
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). 
+*>            3.2  .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G').
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZPOCON),
+*>                        2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)).
+*>          4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and 
+*>            4.1. if JOBV = 'V'  
+*>               the minimal requirement is LWORK >= 5*N+2*N*N. 
+*>            4.2. if JOBV = 'J' the minimal requirement is 
+*>               LWORK >= 4*N+N*N.
+*>            In both cases, the allocated CWORK can accommodate blocked runs
+*>            of ZGEQP3, ZGEQRF, ZGELQF, SUNMQR, ZUNMLQ.
+*>
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the
+*>          minimal length of CWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(7,LRWORK))
+*>          On exit,
+*>          RWORK(1) = Determines the scaling factor SCALE = RWORK(2) / RWORK(1)
+*>                    such that SCALE*SVA(1:N) are the computed singular values
+*>                    of A. (See the description of SVA().)
+*>          RWORK(2) = See the description of RWORK(1).
+*>          RWORK(3) = SCONDA is an estimate for the condition number of
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
+*>                    SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
+*>                    It is computed using ZPOCON. It holds
+*>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*>                    where R is the triangular factor from the QRF of A.
+*>                    However, if R is truncated and the numerical rank is
+*>                    determined to be strictly smaller than N, SCONDA is
+*>                    returned as -1, thus indicating that the smallest
+*>                    singular values might be lost.
+*>
+*>          If full SVD is needed, the following two condition numbers are
+*>          useful for the analysis of the algorithm. They are provided for
+*>          a developer/implementer who is familiar with the details of
+*>          the method.
+*>
+*>          RWORK(4) = an estimate of the scaled condition number of the
+*>                    triangular factor in the first QR factorization.
+*>          RWORK(5) = an estimate of the scaled condition number of the
+*>                    triangular factor in the second QR factorization.
+*>          The following two parameters are computed if JOBT = 'T'.
+*>          They are provided for a developer/implementer who is familiar
+*>          with the details of the method.
+*>          RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy
+*>                    of diag(A^* * A) / Trace(A^* * A) taken as point in the
+*>                    probability simplex.
+*>          RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).)
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit RWORK(1) contains the required length of
+*>          RWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          Length of RWORK to confirm proper allocation of workspace.
+*>          LRWORK depends on the job:
+*>
+*>       1. If only the singular values are requested i.e. if
+*>          LSAME(JOBU,'N') .AND. LSAME(JOBV,'N')
+*>          then:
+*>          1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>               then: LRWORK = max( 7, 2 * M ).
+*>          1.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       2. If singular values with the right singular vectors are requested
+*>          i.e. if
+*>          (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND.
+*>          .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F'))
+*>          then:
+*>          2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          2.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       3. If singular values with the left singular vectors are requested, i.e. if
+*>          (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND.
+*>          .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J'))
+*>          then:
+*>          3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          3.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       4. If singular values with both the left and the right singular vectors
+*>          are requested, i.e. if
+*>          (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND.
+*>          (LSAME(JOBV,'V').OR.LSAME(JOBV,'J'))
+*>          then:
+*>          4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          4.2. Otherwise, LRWORK  = max( 7, N ).
+*>
+*>          If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and 
+*>          the length of RWORK is returned in RWORK(1). 
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, of dimension at least 4, that further depends 
+*>          on the job:
+*>
+*>          1. If only the singular values are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N.
+*>          2. If the singular values and the right singular vectors are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>          3. If the singular values and the left singular vectors are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>          4. If the singular values with both the left and the right singular vectors
+*>             are requested, then:      
+*>             4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows:
+*>                  If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>                  then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>             4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows:
+*>                  If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>                  then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N.
+*>        
+*>          On exit,
+*>          IWORK(1) = the numerical rank determined after the initial
+*>                     QR factorization with pivoting. See the descriptions
+*>                     of JOBA and JOBR.
+*>          IWORK(2) = the number of the computed nonzero singular values
+*>          IWORK(3) = if nonzero, a warning message:
+*>                     If IWORK(3) = 1 then some of the column norms of A
+*>                     were denormalized floats. The requested high accuracy
+*>                     is not warranted by the data.
+*>          IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to
+*>                     do the job as specified by the JOB parameters.
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK = -1 or
+*>          LRWORK = -1), then on exit IWORK(1) contains the required length of 
+*>          IWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  ZGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16GEsing
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  ZGEJSV implements a preconditioned Jacobi SVD algorithm. It uses ZGEQP3,
+*>  ZGEQRF, and ZGELQF as preprocessors and preconditioners. Optionally, an
+*>  additional row pivoting can be used as a preprocessor, which in some
+*>  cases results in much higher accuracy. An example is matrix A with the
+*>  structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned
+*>  diagonal matrices and C is well-conditioned matrix. In that case, complete
+*>  pivoting in the first QR factorizations provides accuracy dependent on the
+*>  condition number of C, and independent of D1, D2. Such higher accuracy is
+*>  not completely understood theoretically, but it works well in practice.
+*>  Further, if A can be written as A = B*D, with well-conditioned B and some
+*>  diagonal D, then the high accuracy is guaranteed, both theoretically and
+*>  in software, independent of D. For more details see [1], [2].
+*>     The computational range for the singular values can be the full range
+*>  ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS
+*>  & LAPACK routines called by ZGEJSV are implemented to work in that range.
+*>  If that is not the case, then the restriction for safe computation with
+*>  the singular values in the range of normalized IEEE numbers is that the
+*>  spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not
+*>  overflow. This code (ZGEJSV) is best used in this restricted range,
+*>  meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are
+*>  returned as zeros. See JOBR for details on this.
+*>     Further, this implementation is somewhat slower than the one described
+*>  in [1,2] due to replacement of some non-LAPACK components, and because
+*>  the choice of some tuning parameters in the iterative part (ZGESVJ) is
+*>  left to the implementer on a particular machine.
+*>     The rank revealing QR factorization (in this code: ZGEQP3) should be
+*>  implemented as in [3]. We have a new version of ZGEQP3 under development
+*>  that is more robust than the current one in LAPACK, with a cleaner cut in
+*>  rank deficient cases. It will be available in the SIGMA library [4].
+*>  If M is much larger than N, it is obvious that the initial QRF with
+*>  column pivoting can be preprocessed by the QRF without pivoting. That
+*>  well known trick is not used in ZGEJSV because in some cases heavy row
+*>  weighting can be treated with complete pivoting. The overhead in cases
+*>  M much larger than N is then only due to pivoting, but the benefits in
+*>  terms of accuracy have prevailed. The implementer/user can incorporate
+*>  this extra QRF step easily. The implementer can also improve data movement
+*>  (matrix transpose, matrix copy, matrix transposed copy) - this
+*>  implementation of ZGEJSV uses only the simplest, naive data movement.
+*> \endverbatim
+*
+*> \par Contributor:
+*  ==================
+*>
+*>  Zlatko Drmac, Department of Mathematics, Faculty of Science,
+*>  University of Zagreb (Zagreb, Croatia); drmac@math.hr
+*
+*> \par References:
+*  ================
+*>
+*> \verbatim
+*>
+*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I.
+*>     SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1322-1342.
+*>     LAPACK Working note 169.
+*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II.
+*>     SIAM J. Matrix Anal. Appl. Vol. 35, No. 2 (2008), pp. 1343-1362.
+*>     LAPACK Working note 170.
+*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR
+*>     factorization software - a case study.
+*>     ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28.
+*>     LAPACK Working note 176.
+*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV,
+*>     QSVD, (H,K)-SVD computations.
+*>     Department of Mathematics, University of Zagreb, 2008, 2016.
+*> \endverbatim
+*
+*>  \par Bugs, examples and comments:
+*   =================================
+*>
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*>
+*  =====================================================================
+      SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+     $                   M, N, A, LDA, SVA, U, LDU, V, LDV,
+     $                   CWORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      INTEGER     INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16       A( LDA, * ), U( LDU, * ), V( LDV, * ),
+     $                 CWORK( LWORK )
+      DOUBLE PRECISION SVA( N ), RWORK( LRWORK )
+      INTEGER          IWORK( * )
+      CHARACTER*1      JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*     ..
+*
+*  ===========================================================================
+*
+*     .. Local Parameters ..
+      DOUBLE PRECISION ZERO, ONE
+      PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+      COMPLEX*16 CZERO, CONE
+      PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ), CONE = ( 1.0D0, 0.0D0 ) )
+*     ..
+*     .. Local Scalars ..
+      COMPLEX*16       CTEMP
+      DOUBLE PRECISION AAPP,    AAQQ,   AATMAX, AATMIN, BIG,    BIG1,
+     $                 COND_OK, CONDR1, CONDR2, ENTRA,  ENTRAT, EPSLN,
+     $                 MAXPRJ,  SCALEM, SCONDA, SFMIN,  SMALL,  TEMP1,
+     $                 USCAL1,  USCAL2, XSC
+      INTEGER IERR,   N1,     NR,     NUMRANK,        p, q,   WARNING
+      LOGICAL ALMORT, DEFR,   ERREST, GOSCAL,  JRACC,  KILL,   LQUERY,
+     $        LSVEC,  L2ABER, L2KILL, L2PERT,  L2RANK, L2TRAN, NOSCAL,
+     $        ROWPIV, RSVEC,  TRANSP
+*
+      INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK
+      INTEGER LWCON,  LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM,
+     $        LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF
+      INTEGER LWRK_ZGELQF, LWRK_ZGEQP3,  LWRK_ZGEQP3N, LWRK_ZGEQRF,  
+     $        LWRK_ZGESVJ, LWRK_ZGESVJV, LWRK_ZGESVJU, LWRK_ZUNMLQ, 
+     $        LWRK_ZUNMQR, LWRK_ZUNMQRM    
+*     ..
+*     .. Local Arrays
+      COMPLEX*16         CDUMMY(1)
+      DOUBLE PRECISION   RDUMMY(1)
+*
+*     .. Intrinsic Functions ..
+      INTRINSIC ABS, DCMPLX, CONJG, DLOG, MAX, MIN, DBLE, NINT, SQRT
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION      DLAMCH, DZNRM2
+      INTEGER   IDAMAX, IZAMAX
+      LOGICAL   LSAME
+      EXTERNAL  IDAMAX, IZAMAX, LSAME, DLAMCH, DZNRM2
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL  DLASSQ, ZCOPY,  ZGELQF, ZGEQP3, ZGEQRF, ZLACPY, ZLAPMR,
+     $          ZLASCL, DLASCL, ZLASET, ZLASSQ, ZLASWP, ZUNGQR, ZUNMLQ,
+     $          ZUNMQR, ZPOCON, DSCAL,  ZDSCAL, ZSWAP,  ZTRSM,  ZLACGV,
+     $          XERBLA
+*
+      EXTERNAL  ZGESVJ
+*     ..
+*
+*     Test the input arguments
+*
+      LSVEC  = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' )
+      JRACC  = LSAME( JOBV, 'J' )
+      RSVEC  = LSAME( JOBV, 'V' ) .OR. JRACC
+      ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' )
+      L2RANK = LSAME( JOBA, 'R' )
+      L2ABER = LSAME( JOBA, 'A' )
+      ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' )
+      L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N )
+      L2KILL = LSAME( JOBR, 'R' )
+      DEFR   = LSAME( JOBR, 'N' )
+      L2PERT = LSAME( JOBP, 'P' )
+*
+      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+*
+      IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR.
+     $     ERREST .OR. LSAME( JOBA, 'C' ) )) THEN
+         INFO = - 1
+      ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR.
+     $   ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN
+         INFO = - 2
+      ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR.
+     $   ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN
+         INFO = - 3
+      ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) )    THEN
+         INFO = - 4
+      ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN
+         INFO = - 5
+      ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN
+         INFO = - 6
+      ELSE IF ( M .LT. 0 ) THEN
+         INFO = - 7
+      ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN
+         INFO = - 8
+      ELSE IF ( LDA .LT. M ) THEN
+         INFO = - 10
+      ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN
+         INFO = - 13
+      ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN
+         INFO = - 15
+      ELSE
+*        #:)
+         INFO = 0
+      END IF
+*
+      IF ( INFO .EQ. 0 ) THEN 
+*         .. compute the minimal and the optimal workspace lengths 
+*         [[The expressions for computing the minimal and the optimal
+*         values of LCWORK, LRWORK are written with a lot of redundancy and
+*         can be simplified. However, this verbose form is useful for
+*         maintenance and modifications of the code.]]
+*
+*        .. minimal workspace length for ZGEQP3 of an M x N matrix,
+*         ZGEQRF of an N x N matrix, ZGELQF of an N x N matrix,
+*         ZUNMLQ for computing N x N matrix, ZUNMQR for computing N x N
+*         matrix, ZUNMQR for computing M x N matrix, respectively.
+          LWQP3 = N+1   
+          LWQRF = MAX( 1, N )
+          LWLQF = MAX( 1, N )
+          LWUNMLQ  = MAX( 1, N )
+          LWUNMQR  = MAX( 1, N )
+          LWUNMQRM = MAX( 1, M )
+*        .. minimal workspace length for ZPOCON of an N x N matrix
+          LWCON = 2 * N 
+*        .. minimal workspace length for ZGESVJ of an N x N matrix,
+*         without and with explicit accumulation of Jacobi rotations
+          LWSVDJ  = MAX( 2 * N, 1 )         
+          LWSVDJV = MAX( 2 * N, 1 )
+*         .. minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ
+          LRWQP3  = 2 * N 
+          LRWCON  = N 
+          LRWSVDJ = N 
+          IF ( LQUERY ) THEN 
+              CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, 
+     $             RDUMMY, IERR )
+              LWRK_ZGEQP3 = INT( CDUMMY(1) )
+              CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR )
+              LWRK_ZGEQRF = INT( CDUMMY(1) )
+              CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR )
+              LWRK_ZGELQF = INT( CDUMMY(1) )
+          END IF
+          MINWRK  = 2
+          OPTWRK  = 2
+          MINIWRK = N 
+          IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN
+*             .. minimal and optimal sizes of the complex workspace if
+*             only the singular values are requested
+              IF ( ERREST ) THEN 
+                  MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ )
+              ELSE
+                  MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ )
+              END IF
+              IF ( LQUERY ) THEN 
+                  CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, 
+     $                 LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                  LWRK_ZGESVJ = INT( CDUMMY(1) )
+                  IF ( ERREST ) THEN 
+                      OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, 
+     $                              N+LWRK_ZGEQRF, LWRK_ZGESVJ )
+                  ELSE
+                      OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWRK_ZGEQRF, 
+     $                              LWRK_ZGESVJ )
+                  END IF
+              END IF
+              IF ( L2TRAN .OR. ROWPIV ) THEN 
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWCON, LRWSVDJ )
+                  ELSE
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ )
+                  END IF                 
+              ELSE
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ )
+                  ELSE
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ )
+                  END IF
+              END IF   
+              IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M 
+          ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the right singular vectors are requested
+             IF ( ERREST ) THEN 
+                 MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF,  
+     $                         2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ )
+             ELSE
+                 MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, 
+     $                         N+LWSVDJ, N+LWUNMLQ )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A,
+     $                LDA, CDUMMY, -1, RDUMMY, -1, IERR )
+                 LWRK_ZGESVJ = INT( CDUMMY(1) )
+                 CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY,
+     $                V, LDV, CDUMMY, -1, IERR )
+                 LWRK_ZUNMLQ = INT( CDUMMY(1) )
+                 IF ( ERREST ) THEN 
+                 OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, 
+     $                         N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF,
+     $                         N+LWRK_ZGESVJ,  N+LWRK_ZUNMLQ )
+                 ELSE
+                 OPTWRK = MAX( N+LWRK_ZGEQP3, LWRK_ZGESVJ,N+LWRK_ZGELQF,
+     $                         2*N+LWRK_ZGEQRF, N+LWRK_ZGESVJ, 
+     $                         N+LWRK_ZUNMLQ )
+                 END IF
+             END IF
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+                  ELSE
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ ) 
+                  END IF                  
+             ELSE
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+                  ELSE
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) 
+                  END IF                 
+             END IF
+             IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+          ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN  
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the left singular vectors are requested
+             IF ( ERREST ) THEN
+                 MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A,
+     $                LDA, CDUMMY, -1, RDUMMY, -1, IERR )
+                 LWRK_ZGESVJ = INT( CDUMMY(1) )
+                 CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $               LDU, CDUMMY, -1, IERR )
+                 LWRK_ZUNMQRM = INT( CDUMMY(1) )
+                 IF ( ERREST ) THEN
+                 OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF,
+     $                             LWRK_ZGESVJ, LWRK_ZUNMQRM )
+                 ELSE
+                 OPTWRK = N + MAX( LWRK_ZGEQP3, N+LWRK_ZGEQRF,
+     $                             LWRK_ZGESVJ, LWRK_ZUNMQRM )
+                 END IF
+             END IF
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                 IF ( ERREST ) THEN 
+                    MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+                 ELSE
+                    MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ )
+                 END IF                 
+             ELSE
+                 IF ( ERREST ) THEN 
+                    MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+                 ELSE
+                    MINRWRK = MAX( 7, LRWQP3, LRWSVDJ )
+                 END IF                
+             END IF 
+             IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+          ELSE
+*            .. minimal and optimal sizes of the complex workspace if the
+*            full SVD is requested
+             IF ( .NOT. JRACC ) THEN                
+                 IF ( ERREST ) THEN 
+                    MINWRK = MAX( N+LWQP3, N+LWCON,  2*N+N**2+LWCON, 
+     $                         2*N+LWQRF,         2*N+LWQP3, 
+     $                         2*N+N**2+N+LWLQF,  2*N+N**2+N+N**2+LWCON,
+     $                         2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, 
+     $                         2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, 
+     $                         N+N**2+LWSVDJ,   N+LWUNMQRM )
+                 ELSE
+                    MINWRK = MAX( N+LWQP3,        2*N+N**2+LWCON, 
+     $                         2*N+LWQRF,         2*N+LWQP3, 
+     $                         2*N+N**2+N+LWLQF,  2*N+N**2+N+N**2+LWCON,
+     $                         2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV,
+     $                         2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ,
+     $                         N+N**2+LWSVDJ,      N+LWUNMQRM ) 
+                 END IF 
+                 MINIWRK = MINIWRK + N 
+                 IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+             ELSE
+                 IF ( ERREST ) THEN 
+                    MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, 
+     $                         2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, 
+     $                         N+LWUNMQRM )
+                 ELSE
+                    MINWRK = MAX( N+LWQP3, 2*N+LWQRF, 
+     $                         2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, 
+     $                         N+LWUNMQRM ) 
+                 END IF   
+                 IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_ZUNMQRM = INT( CDUMMY(1) )
+                 CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_ZUNMQR = INT( CDUMMY(1) )
+                 IF ( .NOT. JRACC ) THEN
+                     CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1,
+     $                    RDUMMY, IERR )
+                     LWRK_ZGEQP3N = INT( CDUMMY(1) )
+                     CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_ZGESVJ = INT( CDUMMY(1) )
+                     CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_ZGESVJU = INT( CDUMMY(1) )
+                     CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_ZGESVJV = INT( CDUMMY(1) )
+                     CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY,
+     $                    V, LDV, CDUMMY, -1, IERR )
+                     LWRK_ZUNMLQ = INT( CDUMMY(1) )
+                     IF ( ERREST ) THEN 
+                       OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, 
+     $                          2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, 
+     $                          2*N+LWRK_ZGEQP3N, 
+     $                          2*N+N**2+N+LWRK_ZGELQF,  
+     $                          2*N+N**2+N+N**2+LWCON,
+     $                          2*N+N**2+N+LWRK_ZGESVJ, 
+     $                          2*N+N**2+N+LWRK_ZGESVJV,               
+     $                          2*N+N**2+N+LWRK_ZUNMQR,
+     $                          2*N+N**2+N+LWRK_ZUNMLQ, 
+     $                          N+N**2+LWRK_ZGESVJU,                  
+     $                          N+LWRK_ZUNMQRM )
+                     ELSE
+                       OPTWRK = MAX( N+LWRK_ZGEQP3,  
+     $                          2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, 
+     $                          2*N+LWRK_ZGEQP3N, 
+     $                          2*N+N**2+N+LWRK_ZGELQF,  
+     $                          2*N+N**2+N+N**2+LWCON,
+     $                          2*N+N**2+N+LWRK_ZGESVJ,               
+     $                          2*N+N**2+N+LWRK_ZGESVJV, 
+     $                          2*N+N**2+N+LWRK_ZUNMQR,
+     $                          2*N+N**2+N+LWRK_ZUNMLQ, 
+     $                          N+N**2+LWRK_ZGESVJU,
+     $                          N+LWRK_ZUNMQRM )
+                     END IF                    
+                 ELSE
+                     CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_ZGESVJV = INT( CDUMMY(1) )
+                     CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY,
+     $                    V, LDV, CDUMMY, -1, IERR )
+                     LWRK_ZUNMQR = INT( CDUMMY(1) )
+                     CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                    LDU, CDUMMY, -1, IERR )
+                     LWRK_ZUNMQRM = INT( CDUMMY(1) )
+                     IF ( ERREST ) THEN 
+                        OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON,   
+     $                           2*N+LWRK_ZGEQRF, 2*N+N**2,  
+     $                           2*N+N**2+LWRK_ZGESVJV,  
+     $                           2*N+N**2+N+LWRK_ZUNMQR,N+LWRK_ZUNMQRM )
+                     ELSE
+                        OPTWRK = MAX( N+LWRK_ZGEQP3, 2*N+LWRK_ZGEQRF,  
+     $                           2*N+N**2, 2*N+N**2+LWRK_ZGESVJV, 
+     $                           2*N+N**2+N+LWRK_ZUNMQR, 
+     $                           N+LWRK_ZUNMQRM )   
+                     END IF                  
+                 END IF               
+             END IF 
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                 MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+             ELSE
+                 MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+             END IF 
+          END IF
+          MINWRK = MAX( 2, MINWRK )
+          OPTWRK = MAX( MINWRK, OPTWRK )
+          IF ( LWORK  .LT. MINWRK  .AND. (.NOT.LQUERY) ) INFO = - 17
+          IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19   
+      END IF
+*      
+      IF ( INFO .NE. 0 ) THEN
+*       #:(
+         CALL XERBLA( 'ZGEJSV', - INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+          CWORK(1) = OPTWRK
+          CWORK(2) = MINWRK
+          RWORK(1) = MINRWRK
+          IWORK(1) = MAX( 4, MINIWRK )
+          RETURN   
+      END IF
+*
+*     Quick return for void matrix (Y3K safe)
+* #:)
+      IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN
+         IWORK(1:4) = 0
+         RWORK(1:7) = 0
+         RETURN
+      ENDIF
+*
+*     Determine whether the matrix U should be M x N or M x M
+*
+      IF ( LSVEC ) THEN
+         N1 = N
+         IF ( LSAME( JOBU, 'F' ) ) N1 = M
+      END IF
+*
+*     Set numerical parameters
+*
+*!    NOTE: Make sure DLAMCH() does not fail on the target architecture.
+*
+      EPSLN = DLAMCH('Epsilon')
+      SFMIN = DLAMCH('SafeMinimum')
+      SMALL = SFMIN / EPSLN
+      BIG   = DLAMCH('O')
+*     BIG   = ONE / SFMIN
+*
+*     Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
+*
+*(!)  If necessary, scale SVA() to protect the largest norm from
+*     overflow. It is possible that this scaling pushes the smallest
+*     column norm left from the underflow threshold (extreme case).
+*
+      SCALEM  = ONE / SQRT(DBLE(M)*DBLE(N))
+      NOSCAL  = .TRUE.
+      GOSCAL  = .TRUE.
+      DO 1874 p = 1, N
+         AAPP = ZERO
+         AAQQ = ONE
+         CALL ZLASSQ( M, A(1,p), 1, AAPP, AAQQ )
+         IF ( AAPP .GT. BIG ) THEN
+            INFO = - 9
+            CALL XERBLA( 'ZGEJSV', -INFO )
+            RETURN
+         END IF
+         AAQQ = SQRT(AAQQ)
+         IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL  ) THEN
+            SVA(p)  = AAPP * AAQQ
+         ELSE
+            NOSCAL  = .FALSE.
+            SVA(p)  = AAPP * ( AAQQ * SCALEM )
+            IF ( GOSCAL ) THEN
+               GOSCAL = .FALSE.
+               CALL DSCAL( p-1, SCALEM, SVA, 1 )
+            END IF
+         END IF
+ 1874 CONTINUE
+*
+      IF ( NOSCAL ) SCALEM = ONE
+*
+      AAPP = ZERO
+      AAQQ = BIG
+      DO 4781 p = 1, N
+         AAPP = MAX( AAPP, SVA(p) )
+         IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) )
+ 4781 CONTINUE
+*
+*     Quick return for zero M x N matrix
+* #:)
+      IF ( AAPP .EQ. ZERO ) THEN
+         IF ( LSVEC ) CALL ZLASET( 'G', M, N1, CZERO, CONE, U, LDU )
+         IF ( RSVEC ) CALL ZLASET( 'G', N, N,  CZERO, CONE, V, LDV )
+         RWORK(1) = ONE
+         RWORK(2) = ONE
+         IF ( ERREST ) RWORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            RWORK(4) = ONE
+            RWORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            RWORK(6) = ZERO
+            RWORK(7) = ZERO
+         END IF
+         IWORK(1) = 0
+         IWORK(2) = 0
+         IWORK(3) = 0
+         IWORK(4) = -1
+         RETURN
+      END IF
+*
+*     Issue warning if denormalized column norms detected. Override the
+*     high relative accuracy request. Issue licence to kill nonzero columns
+*     (set them to zero) whose norm is less than sigma_max / BIG (roughly).
+* #:(
+      WARNING = 0
+      IF ( AAQQ .LE. SFMIN ) THEN
+         L2RANK = .TRUE.
+         L2KILL = .TRUE.
+         WARNING = 1
+      END IF
+*
+*     Quick return for one-column matrix
+* #:)
+      IF ( N .EQ. 1 ) THEN
+*
+         IF ( LSVEC ) THEN
+            CALL ZLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR )
+            CALL ZLACPY( 'A', M, 1, A, LDA, U, LDU )
+*           computing all M left singular vectors of the M x 1 matrix
+            IF ( N1 .NE. N  ) THEN
+              CALL ZGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR )
+              CALL ZUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR )
+              CALL ZCOPY( M, A(1,1), 1, U(1,1), 1 )
+            END IF
+         END IF
+         IF ( RSVEC ) THEN
+             V(1,1) = CONE
+         END IF
+         IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN
+            SVA(1)  = SVA(1) / SCALEM
+            SCALEM  = ONE
+         END IF
+         RWORK(1) = ONE / SCALEM
+         RWORK(2) = ONE
+         IF ( SVA(1) .NE. ZERO ) THEN
+            IWORK(1) = 1
+            IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN
+               IWORK(2) = 1
+            ELSE
+               IWORK(2) = 0
+            END IF
+         ELSE
+            IWORK(1) = 0
+            IWORK(2) = 0
+         END IF
+         IWORK(3) = 0
+         IWORK(4) = -1
+         IF ( ERREST ) RWORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            RWORK(4) = ONE
+            RWORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            RWORK(6) = ZERO
+            RWORK(7) = ZERO
+         END IF
+         RETURN
+*
+      END IF
+*
+      TRANSP = .FALSE.
+*
+      AATMAX = -ONE
+      AATMIN =  BIG
+      IF ( ROWPIV .OR. L2TRAN ) THEN
+*
+*     Compute the row norms, needed to determine row pivoting sequence
+*     (in the case of heavily row weighted A, row pivoting is strongly
+*     advised) and to collect information needed to compare the
+*     structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.).
+*
+         IF ( L2TRAN ) THEN
+            DO 1950 p = 1, M
+               XSC   = ZERO
+               TEMP1 = ONE
+               CALL ZLASSQ( N, A(p,1), LDA, XSC, TEMP1 )
+*              ZLASSQ gets both the ell_2 and the ell_infinity norm
+*              in one pass through the vector
+               RWORK(M+p)  = XSC * SCALEM
+               RWORK(p)    = XSC * (SCALEM*SQRT(TEMP1))
+               AATMAX = MAX( AATMAX, RWORK(p) )
+               IF (RWORK(p) .NE. ZERO) 
+     $            AATMIN = MIN(AATMIN,RWORK(p))
+ 1950       CONTINUE
+         ELSE
+            DO 1904 p = 1, M
+               RWORK(M+p) = SCALEM*ABS( A(p,IZAMAX(N,A(p,1),LDA)) )
+               AATMAX = MAX( AATMAX, RWORK(M+p) )
+               AATMIN = MIN( AATMIN, RWORK(M+p) )
+ 1904       CONTINUE
+         END IF
+*
+      END IF
+*
+*     For square matrix A try to determine whether A^*  would be better
+*     input for the preconditioned Jacobi SVD, with faster convergence.
+*     The decision is based on an O(N) function of the vector of column
+*     and row norms of A, based on the Shannon entropy. This should give
+*     the right choice in most cases when the difference actually matters.
+*     It may fail and pick the slower converging side.
+*
+      ENTRA  = ZERO
+      ENTRAT = ZERO
+      IF ( L2TRAN ) THEN
+*
+         XSC   = ZERO
+         TEMP1 = ONE
+         CALL DLASSQ( N, SVA, 1, XSC, TEMP1 )
+         TEMP1 = ONE / TEMP1
+*
+         ENTRA = ZERO
+         DO 1113 p = 1, N
+            BIG1  = ( ( SVA(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1)
+ 1113    CONTINUE
+         ENTRA = - ENTRA / DLOG(DBLE(N))
+*
+*        Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex.
+*        It is derived from the diagonal of  A^* * A.  Do the same with the
+*        diagonal of A * A^*, compute the entropy of the corresponding
+*        probability distribution. Note that A * A^* and A^* * A have the
+*        same trace.
+*
+         ENTRAT = ZERO
+         DO 1114 p = 1, M
+            BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1)
+ 1114    CONTINUE
+         ENTRAT = - ENTRAT / DLOG(DBLE(M))
+*
+*        Analyze the entropies and decide A or A^*. Smaller entropy
+*        usually means better input for the algorithm.
+*
+         TRANSP = ( ENTRAT .LT. ENTRA )
+* 
+*        If A^* is better than A, take the adjoint of A. This is allowed
+*        only for square matrices, M=N.
+         IF ( TRANSP ) THEN
+*           In an optimal implementation, this trivial transpose
+*           should be replaced with faster transpose.
+            DO 1115 p = 1, N - 1
+               A(p,p) = CONJG(A(p,p))
+               DO 1116 q = p + 1, N
+                   CTEMP = CONJG(A(q,p))
+                  A(q,p) = CONJG(A(p,q))
+                  A(p,q) = CTEMP
+ 1116          CONTINUE
+ 1115       CONTINUE
+            A(N,N) = CONJG(A(N,N))
+            DO 1117 p = 1, N
+               RWORK(M+p) = SVA(p)
+               SVA(p)     = RWORK(p)
+*              previously computed row 2-norms are now column 2-norms
+*              of the transposed matrix
+ 1117       CONTINUE
+            TEMP1  = AAPP
+            AAPP   = AATMAX
+            AATMAX = TEMP1
+            TEMP1  = AAQQ
+            AAQQ   = AATMIN
+            AATMIN = TEMP1
+            KILL   = LSVEC
+            LSVEC  = RSVEC
+            RSVEC  = KILL
+            IF ( LSVEC ) N1 = N
+*
+            ROWPIV = .TRUE.
+         END IF
+*
+      END IF
+*     END IF L2TRAN
+*
+*     Scale the matrix so that its maximal singular value remains less
+*     than SQRT(BIG) -- the matrix is scaled so that its maximal column
+*     has Euclidean norm equal to SQRT(BIG/N). The only reason to keep
+*     SQRT(BIG) instead of BIG is the fact that ZGEJSV uses LAPACK and
+*     BLAS routines that, in some implementations, are not capable of
+*     working in the full interval [SFMIN,BIG] and that they may provoke
+*     overflows in the intermediate results. If the singular values spread
+*     from SFMIN to BIG, then ZGESVJ will compute them. So, in that case,
+*     one should use ZGESVJ instead of ZGEJSV.
+*     >> change in the April 2016 update: allow bigger range, i.e. the
+*     largest column is allowed up to BIG/N and ZGESVJ will do the rest.
+      BIG1   = SQRT( BIG )
+      TEMP1  = SQRT( BIG / DBLE(N) ) 
+*      TEMP1  = BIG/DBLE(N)
+*
+      CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR )
+      IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN
+          AAQQ = ( AAQQ / AAPP ) * TEMP1
+      ELSE
+          AAQQ = ( AAQQ * TEMP1 ) / AAPP
+      END IF
+      TEMP1 = TEMP1 * SCALEM
+      CALL ZLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR )
+*
+*     To undo scaling at the end of this procedure, multiply the
+*     computed singular values with USCAL2 / USCAL1.
+*
+      USCAL1 = TEMP1
+      USCAL2 = AAPP
+*
+      IF ( L2KILL ) THEN
+*        L2KILL enforces computation of nonzero singular values in
+*        the restricted range of condition number of the initial A,
+*        sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN).
+         XSC = SQRT( SFMIN )
+      ELSE
+         XSC = SMALL
+*
+*        Now, if the condition number of A is too big,
+*        sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN,
+*        as a precaution measure, the full SVD is computed using ZGESVJ
+*        with accumulated Jacobi rotations. This provides numerically
+*        more robust computation, at the cost of slightly increased run
+*        time. Depending on the concrete implementation of BLAS and LAPACK
+*        (i.e. how they behave in presence of extreme ill-conditioning) the
+*        implementor may decide to remove this switch.
+         IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN
+            JRACC = .TRUE.
+         END IF
+*
+      END IF
+      IF ( AAQQ .LT. XSC ) THEN
+         DO 700 p = 1, N
+            IF ( SVA(p) .LT. XSC ) THEN
+               CALL ZLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA )
+               SVA(p) = ZERO
+            END IF
+ 700     CONTINUE
+      END IF
+*
+*     Preconditioning using QR factorization with pivoting
+*
+      IF ( ROWPIV ) THEN
+*        Optional row permutation (Bjoerck row pivoting):
+*        A result by Cox and Higham shows that the Bjoerck's
+*        row pivoting combined with standard column pivoting
+*        has similar effect as Powell-Reid complete pivoting.
+*        The ell-infinity norms of A are made nonincreasing.
+         IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN 
+              IWOFF = 2*N
+         ELSE
+              IWOFF = N
+         END IF
+         DO 1952 p = 1, M - 1
+            q = IDAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1
+            IWORK(IWOFF+p) = q
+            IF ( p .NE. q ) THEN
+               TEMP1      = RWORK(M+p)
+               RWORK(M+p) = RWORK(M+q)
+               RWORK(M+q) = TEMP1
+            END IF
+ 1952    CONTINUE
+         CALL ZLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 )
+      END IF
+*
+*     End of the preparation phase (scaling, optional sorting and
+*     transposing, optional flushing of small columns).
+*
+*     Preconditioning
+*
+*     If the full SVD is needed, the right singular vectors are computed
+*     from a matrix equation, and for that we need theoretical analysis
+*     of the Businger-Golub pivoting. So we use ZGEQP3 as the first RR QRF.
+*     In all other cases the first RR QRF can be chosen by other criteria
+*     (eg speed by replacing global with restricted window pivoting, such
+*     as in xGEQPX from TOMS # 782). Good results will be obtained using
+*     xGEQPX with properly (!) chosen numerical parameters.
+*     Any improvement of ZGEQP3 improves overall performance of ZGEJSV.
+*
+*     A * P1 = Q1 * [ R1^* 0]^*:
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL ZGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N,
+     $             RWORK, IERR )
+*
+*     The upper triangular matrix R1 from the first QRF is inspected for
+*     rank deficiency and possibilities for deflation, or possible
+*     ill-conditioning. Depending on the user specified flag L2RANK,
+*     the procedure explores possibilities to reduce the numerical
+*     rank by inspecting the computed upper triangular factor. If
+*     L2RANK or L2ABER are up, then ZGEJSV will compute the SVD of
+*     A + dA, where ||dA|| <= f(M,N)*EPSLN.
+*
+      NR = 1
+      IF ( L2ABER ) THEN
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPSLN*||A||.
+         TEMP1 = SQRT(DBLE(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN
+               NR = NR + 1
+            ELSE
+               GO TO 3002
+            END IF
+ 3001    CONTINUE
+ 3002    CONTINUE
+      ELSE IF ( L2RANK ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R1 is used as the criterion for
+*        close-to-rank-deficient.
+         TEMP1 = SQRT(SFMIN)
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        The goal is high relative accuracy. However, if the matrix
+*        has high scaled condition number the relative accuracy is in
+*        general not feasible. Later on, a condition number estimator
+*        will be deployed to estimate the scaled condition number.
+*        Here we just remove the underflowed part of the triangular
+*        factor. This prevents the situation in which the code is
+*        working hard to get the accuracy not warranted by the data.
+         TEMP1  = SQRT(SFMIN)
+         DO 3301 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302
+            NR = NR + 1
+ 3301    CONTINUE
+ 3302    CONTINUE
+*
+      END IF
+*
+      ALMORT = .FALSE.
+      IF ( NR .EQ. N ) THEN
+         MAXPRJ = ONE
+         DO 3051 p = 2, N
+            TEMP1  = ABS(A(p,p)) / SVA(IWORK(p))
+            MAXPRJ = MIN( MAXPRJ, TEMP1 )
+ 3051    CONTINUE
+         IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE.
+      END IF
+*
+*
+      SCONDA = - ONE
+      CONDR1 = - ONE
+      CONDR2 = - ONE
+*
+      IF ( ERREST ) THEN
+         IF ( N .EQ. NR ) THEN
+            IF ( RSVEC ) THEN
+*              .. V is available as workspace
+               CALL ZLACPY( 'U', N, N, A, LDA, V, LDV )
+               DO 3053 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL ZDSCAL( p, ONE/TEMP1, V(1,p), 1 )
+ 3053          CONTINUE
+               IF ( LSVEC )THEN
+                   CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $                  CWORK(N+1), RWORK, IERR )
+               ELSE
+                   CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $                  CWORK, RWORK, IERR )
+               END IF               
+*          
+            ELSE IF ( LSVEC ) THEN
+*              .. U is available as workspace
+               CALL ZLACPY( 'U', N, N, A, LDA, U, LDU )
+               DO 3054 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL ZDSCAL( p, ONE/TEMP1, U(1,p), 1 )
+ 3054          CONTINUE
+               CALL ZPOCON( 'U', N, U, LDU, ONE, TEMP1,
+     $              CWORK(N+1), RWORK, IERR )
+            ELSE
+               CALL ZLACPY( 'U', N, N, A, LDA, CWORK, N )
+*[]            CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N )
+*              Change: here index shifted by N to the left, CWORK(1:N) 
+*              not needed for SIGMA only computation
+               DO 3052 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+*[]               CALL ZDSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 )
+                  CALL ZDSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 )
+ 3052          CONTINUE
+*           .. the columns of R are scaled to have unit Euclidean lengths.
+*[]               CALL ZPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1,
+*[]     $              CWORK(N+N*N+1), RWORK, IERR )
+               CALL ZPOCON( 'U', N, CWORK, N, ONE, TEMP1,
+     $              CWORK(N*N+1), RWORK, IERR )               
+*              
+            END IF
+            IF ( TEMP1 .NE. ZERO ) THEN 
+               SCONDA = ONE / SQRT(TEMP1)
+            ELSE
+               SCONDA = - ONE
+            END IF
+*           SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+         ELSE
+            SCONDA = - ONE
+         END IF
+      END IF
+*
+      L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) )
+*     If there is no violent scaling, artificial perturbation is not needed.
+*
+*     Phase 3:
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*
+*         Singular Values only
+*
+*         .. transpose A(1:NR,1:N)
+         DO 1946 p = 1, MIN( N-1, NR )
+            CALL ZCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 )
+            CALL ZLACGV( N-p+1, A(p,p), 1 )
+ 1946    CONTINUE
+         IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N))
+*
+*        The following two DO-loops introduce small relative perturbation
+*        into the strict upper triangle of the lower triangular matrix.
+*        Small entries below the main diagonal are also changed.
+*        This modification is useful if the computing environment does not
+*        provide/allow FLUSH TO ZERO underflow, for it prevents many
+*        annoying denormalized numbers in case of strongly scaled matrices.
+*        The perturbation is structured so that it does not introduce any
+*        new perturbation of the singular values, and it does not destroy
+*        the job done by the preconditioner.
+*        The licence for this perturbation is in the variable L2PERT, which
+*        should be .FALSE. if FLUSH TO ZERO underflow is active.
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+            IF ( L2PERT ) THEN
+*              XSC = SQRT(SMALL)
+               XSC = EPSLN / DBLE(N)
+               DO 4947 q = 1, NR
+                  CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO)
+                  DO 4949 p = 1, N
+                     IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) )
+     $                    .OR. ( p .LT. q ) )
+*     $                     A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) )
+     $                     A(p,q) = CTEMP
+ 4949             CONTINUE
+ 4947          CONTINUE
+            ELSE
+               CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA )
+            END IF
+*
+*            .. second preconditioning using the QR factorization
+*
+            CALL ZGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR )
+*
+*           .. and transpose upper to lower triangular
+            DO 1948 p = 1, NR - 1
+               CALL ZCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 )
+               CALL ZLACGV( NR-p+1, A(p,p), 1 )
+ 1948       CONTINUE
+*
+      END IF
+*
+*           Row-cyclic Jacobi SVD algorithm with column pivoting
+*
+*           .. again some perturbation (a "background noise") is added
+*           to drown denormals
+            IF ( L2PERT ) THEN
+*              XSC = SQRT(SMALL)
+               XSC = EPSLN / DBLE(N)
+               DO 1947 q = 1, NR
+                  CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO)
+                  DO 1949 p = 1, NR
+                     IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) )
+     $                       .OR. ( p .LT. q ) )
+*     $                   A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) )
+     $                   A(p,q) = CTEMP
+ 1949             CONTINUE
+ 1947          CONTINUE
+            ELSE
+               CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA )
+            END IF
+*
+*           .. and one-sided Jacobi rotations are started on a lower
+*           triangular matrix (plus perturbation which is ignored in
+*           the part which destroys triangular form (confusing?!))
+*
+            CALL ZGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA,
+     $                N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO )
+*
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+*
+*
+      ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) )
+     $       .OR. 
+     $   ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN
+*
+*        -> Singular Values and Right Singular Vectors <-
+*
+         IF ( ALMORT ) THEN
+*
+*           .. in this case NR equals N
+            DO 1998 p = 1, NR
+               CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+               CALL ZLACGV( N-p+1, V(p,p), 1 )
+ 1998       CONTINUE
+            CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV )
+*
+            CALL ZGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA,
+     $                  CWORK, LWORK, RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+
+         ELSE
+*
+*        .. two more QR factorizations ( one QRF is not enough, two require
+*        accumulated product of Jacobi rotations, three are perfect )
+*
+            CALL ZLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA )
+            CALL ZGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR)
+            CALL ZLACPY( 'L', NR, NR, A, LDA, V, LDV )
+            CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV )
+            CALL ZGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $                   LWORK-2*N, IERR )
+            DO 8998 p = 1, NR
+               CALL ZCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 )
+               CALL ZLACGV( NR-p+1, V(p,p), 1 )
+ 8998       CONTINUE
+            CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV)
+*
+            CALL ZGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U,
+     $                  LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            IF ( NR .LT. N ) THEN
+               CALL ZLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1),  LDV )
+               CALL ZLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1),  LDV )
+               CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV )
+            END IF
+*
+         CALL ZUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK,
+     $               V, LDV, CWORK(N+1), LWORK-N, IERR )
+*
+         END IF
+*         .. permute the rows of V
+*         DO 8991 p = 1, N
+*            CALL ZCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA )
+* 8991    CONTINUE
+*         CALL ZLACPY( 'All', N, N, A, LDA, V, LDV )
+         CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK )
+*
+          IF ( TRANSP ) THEN
+            CALL ZLACPY( 'A', N, N, V, LDV, U, LDU )
+          END IF
+*
+      ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN 
+*          
+         CALL ZLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA )
+*
+         CALL ZGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV,
+     $               CWORK, LWORK, RWORK, LRWORK, INFO )
+          SCALEM  = RWORK(1)
+          NUMRANK = NINT(RWORK(2))
+          CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK )
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN
+*
+*        .. Singular Values and Left Singular Vectors                 ..
+*
+*        .. second preconditioning step to avoid need to accumulate
+*        Jacobi rotations in the Jacobi iterations.
+         DO 1965 p = 1, NR
+            CALL ZCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 )
+            CALL ZLACGV( N-p+1, U(p,p), 1 )
+ 1965    CONTINUE
+         CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+*
+         CALL ZGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+         DO 1967 p = 1, NR - 1
+            CALL ZCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 )
+            CALL ZLACGV( N-p+1, U(p,p), 1 )
+ 1967    CONTINUE
+         CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+*
+         CALL ZGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A,
+     $        LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO )
+         SCALEM  = RWORK(1)
+         NUMRANK = NINT(RWORK(2))
+*
+         IF ( NR .LT. M ) THEN
+            CALL ZLASET( 'A',  M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL ZLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU )
+               CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU )
+            END IF
+         END IF
+*
+         CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $               LDU, CWORK(N+1), LWORK-N, IERR )
+*
+         IF ( ROWPIV )
+     $       CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         DO 1974 p = 1, N1
+            XSC = ONE / DZNRM2( M, U(1,p), 1 )
+            CALL ZDSCAL( M, XSC, U(1,p), 1 )
+ 1974    CONTINUE
+*
+         IF ( TRANSP ) THEN
+            CALL ZLACPY( 'A', N, N, U, LDU, V, LDV )
+         END IF
+*
+      ELSE
+*
+*        .. Full SVD ..
+*
+         IF ( .NOT. JRACC ) THEN
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+*           Second Preconditioning Step (QRF [with pivoting])
+*           Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
+*           equivalent to an LQF CALL. Since in many libraries the QRF
+*           seems to be better optimized than the LQF, we do explicit
+*           transpose and use the QRF. This is subject to changes in an
+*           optimized implementation of ZGEJSV.
+*
+            DO 1968 p = 1, NR
+               CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+               CALL ZLACGV( N-p+1, V(p,p), 1 )
+ 1968       CONTINUE
+*
+*           .. the following two loops perturb small entries to avoid
+*           denormals in the second QR factorization, where they are
+*           as good as zeros. This is done to avoid painfully slow
+*           computation with denormals. The relative size of the perturbation
+*           is a parameter that can be changed by the implementer.
+*           This perturbation device will be obsolete on machines with
+*           properly implemented arithmetic.
+*           To switch it off, set L2PERT=.FALSE. To remove it from  the
+*           code, remove the action under L2PERT=.TRUE., leave the ELSE part.
+*           The following two loops should be blocked and fused with the
+*           transposed copy above.
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 2969 q = 1, NR
+                  CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO)
+                  DO 2968 p = 1, N
+                     IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 )
+     $                   .OR. ( p .LT. q ) )
+*     $                   V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) )
+     $                   V(p,q) = CTEMP
+                     IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 2968             CONTINUE
+ 2969          CONTINUE
+            ELSE
+               CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV )
+            END IF
+*
+*           Estimate the row scaled condition number of R1
+*           (If R1 is rectangular, N > NR, then the condition number
+*           of the leading NR x NR submatrix is estimated.)
+*
+            CALL ZLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR )
+            DO 3950 p = 1, NR
+               TEMP1 = DZNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1)
+               CALL ZDSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1)
+ 3950       CONTINUE
+            CALL ZPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1,
+     $                   CWORK(2*N+NR*NR+1),RWORK,IERR)
+            CONDR1 = ONE / SQRT(TEMP1)
+*           .. here need a second opinion on the condition number
+*           .. then assume worst case scenario
+*           R1 is OK for inverse <=> CONDR1 .LT. DBLE(N)
+*           more conservative    <=> CONDR1 .LT. SQRT(DBLE(N))
+*
+            COND_OK = SQRT(SQRT(DBLE(NR)))
+*[TP]       COND_OK is a tuning parameter.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*              .. the second QRF without pivoting. Note: in an optimized
+*              implementation, this QRF should be implemented as the QRF
+*              of a lower triangular matrix.
+*              R1^* = Q2 * R2
+               CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)/EPSLN
+                  DO 3959 p = 2, NR
+                     DO 3958 q = 1, p - 1
+                        CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                              ZERO)
+                        IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                     V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                     V(q,p) = CTEMP
+ 3958                CONTINUE
+ 3959             CONTINUE
+               END IF
+*
+               IF ( NR .NE. N )
+     $         CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*              .. save ...
+*
+*           .. this transposed copy should be better than naive
+               DO 1969 p = 1, NR - 1
+                  CALL ZCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 )
+                  CALL ZLACGV(NR-p+1, V(p,p), 1 )
+ 1969          CONTINUE
+               V(NR,NR)=CONJG(V(NR,NR))
+*
+               CONDR2 = CONDR1
+*
+            ELSE
+*
+*              .. ill-conditioned case: second QRF with pivoting
+*              Note that windowed pivoting would be equally good
+*              numerically, and more run-time efficient. So, in
+*              an optimal implementation, the next call to ZGEQP3
+*              should be replaced with eg. CALL ZGEQPX (ACM TOMS #782)
+*              with properly (carefully) chosen parameters.
+*
+*              R1^* * P2 = Q2 * R2
+               DO 3003 p = 1, NR
+                  IWORK(N+p) = 0
+ 3003          CONTINUE
+               CALL ZGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1),
+     $                  CWORK(2*N+1), LWORK-2*N, RWORK, IERR )
+**               CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+**     $              LWORK-2*N, IERR )
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)
+                  DO 3969 p = 2, NR
+                     DO 3968 q = 1, p - 1
+                        CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                                ZERO)
+                        IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                     V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                     V(q,p) = CTEMP
+ 3968                CONTINUE
+ 3969             CONTINUE
+               END IF
+*
+               CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)
+                  DO 8970 p = 2, NR
+                     DO 8971 q = 1, p - 1
+                        CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                               ZERO)
+*                        V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) )
+                        V(p,q) = - CTEMP
+ 8971                CONTINUE
+ 8970             CONTINUE
+               ELSE
+                  CALL ZLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV )
+               END IF
+*              Now, compute R2 = L3 * Q3, the LQ factorization.
+               CALL ZGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1),
+     $               CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*              .. and estimate the condition number
+               CALL ZLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR )
+               DO 4950 p = 1, NR
+                  TEMP1 = DZNRM2( p, CWORK(2*N+N*NR+NR+p), NR )
+                  CALL ZDSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR )
+ 4950          CONTINUE
+               CALL ZPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $              CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR )
+               CONDR2 = ONE / SQRT(TEMP1)
+*
+*
+               IF ( CONDR2 .GE. COND_OK ) THEN
+*                 .. save the Householder vectors used for Q3
+*                 (this overwrites the copy of R2, as it will not be
+*                 needed in this branch, but it does not overwritte the
+*                 Huseholder vectors of Q2.).
+                  CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
+*                 .. and the rest of the information on Q3 is in
+*                 WORK(2*N+N*NR+1:2*N+N*NR+N)
+               END IF
+*
+            END IF
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 4968 q = 2, NR
+                  CTEMP = XSC * V(q,q)
+                  DO 4969 p = 1, q - 1
+*                     V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) )
+                     V(p,q) = - CTEMP
+ 4969             CONTINUE
+ 4968          CONTINUE
+            ELSE
+               CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+            END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower trinagular.
+*
+*        Recover the right singular vectors as solution of a well
+*        conditioned triangular matrix equation.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*
+               CALL ZGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU,
+     $              CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK,
+     $              LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               DO 3970 p = 1, NR
+                  CALL ZCOPY(  NR, V(1,p), 1, U(1,p), 1 )
+                  CALL ZDSCAL( NR, SVA(p),    V(1,p), 1 )
+ 3970          CONTINUE
+
+*        .. pick the right matrix equation and solve it
+*
+               IF ( NR .EQ. N ) THEN
+* :))             .. best case, R1 is inverted. The solution of this matrix
+*                 equation is Q2*V2 = the product of the Jacobi rotations
+*                 used in ZGESVJ, premultiplied with the orthogonal matrix
+*                 from the second QR factorization.
+                  CALL ZTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV)
+               ELSE
+*                 .. R1 is well conditioned, but non-square. Adjoint of R2
+*                 is inverted to get the product of the Jacobi rotations
+*                 used in ZGESVJ. The Q-factor from the second QR
+*                 factorization is then built in explicitly.
+                  CALL ZTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1),
+     $                 N,V,LDV)
+                  IF ( NR .LT. N ) THEN
+                  CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+                  END IF
+                  CALL ZUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $                V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+               END IF
+*
+            ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+*              The matrix R2 is inverted. The solution of the matrix equation
+*              is Q3^* * V3 = the product of the Jacobi rotations (appplied to
+*              the lower triangular L3 from the LQ factorization of
+*              R2=L3*Q3), pre-multiplied with the transposed Q3.
+               CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $              RWORK, LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               DO 3870 p = 1, NR
+                  CALL ZCOPY( NR, V(1,p), 1, U(1,p), 1 )
+                  CALL ZDSCAL( NR, SVA(p),    U(1,p), 1 )
+ 3870          CONTINUE
+               CALL ZTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N,
+     $                    U,LDU)
+*              .. apply the permutation from the second QR factorization
+               DO 873 q = 1, NR
+                  DO 872 p = 1, NR
+                     CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872              CONTINUE
+                  DO 874 p = 1, NR
+                     U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 874              CONTINUE
+ 873           CONTINUE
+               IF ( NR .LT. N ) THEN
+                  CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+                  CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+            ELSE
+*              Last line of defense.
+* #:(          This is a rather pathological case: no scaled condition
+*              improvement after two pivoted QR factorizations. Other
+*              possibility is that the rank revealing QR factorization
+*              or the condition estimator has failed, or the COND_OK
+*              is set very close to ONE (which is unnecessary). Normally,
+*              this branch should never be executed, but in rare cases of
+*              failure of the RRQR or condition estimator, the last line of
+*              defense ensures that ZGEJSV completes the task.
+*              Compute the full SVD of L3 using ZGESVJ with explicit
+*              accumulation of Jacobi rotations.
+               CALL ZGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $                         RWORK, LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               IF ( NR .LT. N ) THEN
+                  CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+                  CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+               CALL ZUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N,
+     $              CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1),
+     $              LWORK-2*N-N*NR-NR, IERR )
+               DO 773 q = 1, NR
+                  DO 772 p = 1, NR
+                     CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772              CONTINUE
+                  DO 774 p = 1, NR
+                     U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 774              CONTINUE
+ 773           CONTINUE
+*
+            END IF
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = SQRT(DBLE(N)) * EPSLN
+            DO 1972 q = 1, N
+               DO 972 p = 1, N
+                  CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+  972          CONTINUE
+               DO 973 p = 1, N
+                  V(p,q) = CWORK(2*N+N*NR+NR+p)
+  973          CONTINUE
+               XSC = ONE / DZNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL ZDSCAL( N, XSC, V(1,q), 1 )
+ 1972       CONTINUE
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+            IF ( NR .LT. M ) THEN
+               CALL ZLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU)
+               IF ( NR .LT. N1 ) THEN
+                  CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                  CALL ZLASET('A',M-NR,N1-NR,CZERO,CONE,
+     $                        U(NR+1,NR+1),LDU)
+               END IF
+            END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           matrix U. This applies to all cases.
+*
+            CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $           LDU, CWORK(N+1), LWORK-N, IERR )
+
+*           The columns of U are normalized. The cost is O(M*N) flops.
+            TEMP1 = SQRT(DBLE(M)) * EPSLN
+            DO 1973 p = 1, NR
+               XSC = ONE / DZNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $          CALL ZDSCAL( M, XSC, U(1,p), 1 )
+ 1973       CONTINUE
+*
+*           If the initial QRF is computed with row pivoting, the left
+*           singular vectors must be adjusted.
+*
+            IF ( ROWPIV )
+     $          CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         ELSE
+*
+*        .. the initial matrix A has almost orthogonal columns and
+*        the second QRF is not needed
+*
+            CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N )
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 5970 p = 2, N
+                  CTEMP = XSC * CWORK( N + (p-1)*N + p )
+                  DO 5971 q = 1, p - 1
+*                     CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) /
+*     $                                        ABS(CWORK(N+(p-1)*N+q)) )
+                     CWORK(N+(q-1)*N+p)=-CTEMP
+ 5971             CONTINUE
+ 5970          CONTINUE
+            ELSE
+               CALL ZLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N )
+            END IF
+*
+            CALL ZGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA,
+     $           N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK,
+     $       INFO )
+*
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            DO 6970 p = 1, N
+               CALL ZCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 )
+               CALL ZDSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 )
+ 6970       CONTINUE
+*
+            CALL ZTRSM( 'L', 'U', 'N', 'N', N, N,
+     $           CONE, A, LDA, CWORK(N+1), N )
+            DO 6972 p = 1, N
+               CALL ZCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV )
+ 6972       CONTINUE
+            TEMP1 = SQRT(DBLE(N))*EPSLN
+            DO 6971 p = 1, N
+               XSC = ONE / DZNRM2( N, V(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL ZDSCAL( N, XSC, V(1,p), 1 )
+ 6971       CONTINUE
+*
+*           Assemble the left singular vector matrix U (M x N).
+*
+            IF ( N .LT. M ) THEN
+               CALL ZLASET( 'A',  M-N, N, CZERO, CZERO, U(N+1,1), LDU )
+               IF ( N .LT. N1 ) THEN
+                  CALL ZLASET('A',N,  N1-N, CZERO, CZERO,  U(1,N+1),LDU)
+                  CALL ZLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU)
+               END IF
+            END IF
+            CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $           LDU, CWORK(N+1), LWORK-N, IERR )
+            TEMP1 = SQRT(DBLE(M))*EPSLN
+            DO 6973 p = 1, N1
+               XSC = ONE / DZNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL ZDSCAL( M, XSC, U(1,p), 1 )
+ 6973       CONTINUE
+*
+            IF ( ROWPIV )
+     $         CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         END IF
+*
+*        end of the  >> almost orthogonal case <<  in the full SVD
+*
+         ELSE
+*
+*        This branch deploys a preconditioned Jacobi SVD with explicitly
+*        accumulated rotations. It is included as optional, mainly for
+*        experimental purposes. It does perform well, and can also be used.
+*        In this implementation, this branch will be automatically activated
+*        if the  condition number sigma_max(A) / sigma_min(A) is predicted
+*        to be greater than the overflow threshold. This is because the
+*        a posteriori computation of the singular vectors assumes robust
+*        implementation of BLAS and some LAPACK procedures, capable of working
+*        in presence of extreme values, e.g. when the singular values spread from
+*        the underflow to the overflow threshold. 
+*
+         DO 7968 p = 1, NR
+            CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+            CALL ZLACGV( N-p+1, V(p,p), 1 )
+ 7968    CONTINUE
+*
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL/EPSLN)
+            DO 5969 q = 1, NR
+               CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO)
+               DO 5968 p = 1, N
+                  IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 )
+     $                .OR. ( p .LT. q ) )
+*     $                V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) )
+     $                V(p,q) = CTEMP
+                  IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 5968          CONTINUE
+ 5969       CONTINUE
+         ELSE
+            CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV )
+         END IF
+
+         CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $        LWORK-2*N, IERR )
+         CALL ZLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+         DO 7969 p = 1, NR
+            CALL ZCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
+            CALL ZLACGV( NR-p+1, U(p,p), 1 )
+ 7969    CONTINUE
+
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL/EPSLN)
+            DO 9970 q = 2, NR
+               DO 9971 p = 1, q - 1
+                  CTEMP = DCMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))),
+     $                            ZERO)
+*                  U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) )
+                  U(p,q) = - CTEMP
+ 9971          CONTINUE
+ 9970       CONTINUE
+         ELSE
+            CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+         END IF
+
+         CALL ZGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA,
+     $        N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR,
+     $         RWORK, LRWORK, INFO )
+         SCALEM  = RWORK(1)
+         NUMRANK = NINT(RWORK(2))
+
+         IF ( NR .LT. N ) THEN
+            CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+            CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+            CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV )
+         END IF
+
+         CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $        V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = SQRT(DBLE(N)) * EPSLN
+            DO 7972 q = 1, N
+               DO 8972 p = 1, N
+                  CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 8972          CONTINUE
+               DO 8973 p = 1, N
+                  V(p,q) = CWORK(2*N+N*NR+NR+p)
+ 8973          CONTINUE
+               XSC = ONE / DZNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL ZDSCAL( N, XSC, V(1,q), 1 )
+ 7972       CONTINUE
+*
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+*
+         IF ( NR .LT. M ) THEN
+            CALL ZLASET( 'A',  M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL ZLASET('A',NR,  N1-NR, CZERO, CZERO,  U(1,NR+1),LDU)
+               CALL ZLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU)
+            END IF
+         END IF
+*
+         CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $        LDU, CWORK(N+1), LWORK-N, IERR )
+*
+            IF ( ROWPIV )
+     $         CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+*
+         END IF
+         IF ( TRANSP ) THEN
+*           .. swap U and V because the procedure worked on A^*
+            DO 6974 p = 1, N
+               CALL ZSWAP( N, U(1,p), 1, V(1,p), 1 )
+ 6974       CONTINUE
+         END IF
+*
+      END IF
+*     end of the full SVD
+*
+*     Undo scaling, if necessary (and possible)
+*
+      IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN
+         CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
+         USCAL1 = ONE
+         USCAL2 = ONE
+      END IF
+*
+      IF ( NR .LT. N ) THEN
+         DO 3004 p = NR+1, N
+            SVA(p) = ZERO
+ 3004    CONTINUE
+      END IF
+*
+      RWORK(1) = USCAL2 * SCALEM
+      RWORK(2) = USCAL1
+      IF ( ERREST ) RWORK(3) = SCONDA
+      IF ( LSVEC .AND. RSVEC ) THEN
+         RWORK(4) = CONDR1
+         RWORK(5) = CONDR2
+      END IF
+      IF ( L2TRAN ) THEN
+         RWORK(6) = ENTRA
+         RWORK(7) = ENTRAT
+      END IF
+*
+      IWORK(1) = NR
+      IWORK(2) = NUMRANK
+      IWORK(3) = WARNING
+      IF ( TRANSP ) THEN
+          IWORK(4) =  1 
+      ELSE
+          IWORK(4) = -1
+      END IF 
+      
+*
+      RETURN
+*     ..
+*     .. END OF ZGEJSV
+*     ..
+      END
+*
diff --git a/lapack-netlib/zgesvx.f b/lapack-netlib/zgesvx.f
new file mode 100644
index 000000000..3b193a1b2
--- /dev/null
+++ b/lapack-netlib/zgesvx.f
@@ -0,0 +1,602 @@
+*> \brief <b> ZGESVX computes the solution to system of linear equations A * X = B for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGESVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgesvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgesvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgesvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+*                          EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+*                          WORK, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+*       DOUBLE PRECISION   RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * )
+*       DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   RWORK( * )
+*       COMPLEX*16         A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGESVX uses the LU factorization to compute the solution to a complex
+*> system of linear equations
+*>    A * X = B,
+*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = P * L * U,
+*>    where P is a permutation matrix, L is a unit lower triangular
+*>    matrix, and U is upper triangular.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AF and IPIV contain the factored form of A.
+*>                  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  A, AF, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AF and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AF and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations:
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the N-by-N matrix A.  If FACT = 'F' and EQUED is
+*>          not 'N', then A must have been equilibrated by the scaling
+*>          factors in R and/or C.  A is not modified if FACT = 'F' or
+*>          'N', or if FACT = 'E' and EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] AF
+*> \verbatim
+*>          AF is COMPLEX*16 array, dimension (LDAF,N)
+*>          If FACT = 'F', then AF is an input argument and on entry
+*>          contains the factors L and U from the factorization
+*>          A = P*L*U as computed by ZGETRF.  If EQUED .ne. 'N', then
+*>          AF is the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the equilibrated matrix A (see the description of A for
+*>          the form of the equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAF
+*> \verbatim
+*>          LDAF is INTEGER
+*>          The leading dimension of the array AF.  LDAF >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          as computed by ZGETRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is DOUBLE PRECISION array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (LDB,NRHS)
+*>          On entry, the N-by-NRHS right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is COMPLEX*16 array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is DOUBLE PRECISION
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,2*N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16GEsolve
+*
+*  =====================================================================
+      SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, RWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX*16         A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, ZLANGE, ZLANTR
+      EXTERNAL           LSAME, DLAMCH, ZLANGE, ZLANTR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZGECON, ZGEEQU, ZGERFS, ZGETRF, ZGETRS,
+     $                   ZLACPY, ZLAQGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL ZGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL ZLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*
+         CALL ZLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL ZGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = ZLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $               RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ZLANGE( 'M', N, INFO, A, LDA, RWORK ) /
+     $                  RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = ZLANGE( NORM, N, N, A, LDA, RWORK )
+      RPVGRW = ZLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = ZLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL ZGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL ZGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL ZGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of ZGESVX
+*
+      END

From 02efa8d6be38db3810d955ebe0b11769eb338e28 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 16 May 2023 20:01:47 +0200
Subject: [PATCH 072/718] Fix typos in comments (Reference-LAPACK 811)

---
 lapack-netlib/SRC/cgejsv.f     | 18 +++++++++---------
 lapack-netlib/SRC/claswlq.f    |  1 +
 lapack-netlib/SRC/clatsqr.f    |  1 +
 lapack-netlib/SRC/cuncsd2by1.f |  4 ++--
 lapack-netlib/SRC/cungtsqr.f   |  3 ++-
 lapack-netlib/SRC/dlaswlq.f    |  1 +
 lapack-netlib/SRC/dlatsqr.f    |  1 +
 lapack-netlib/SRC/dorgtsqr.f   |  3 ++-
 lapack-netlib/SRC/slaswlq.f    |  1 +
 lapack-netlib/SRC/slatsqr.f    |  1 +
 lapack-netlib/SRC/sorgtsqr.f   |  3 ++-
 lapack-netlib/SRC/zgejsv.f     | 18 +++++++++---------
 lapack-netlib/SRC/zlaswlq.f    |  1 +
 lapack-netlib/SRC/zlatsqr.f    |  1 +
 lapack-netlib/SRC/zuncsd2by1.f |  4 ++--
 lapack-netlib/SRC/zungtsqr.f   |  3 ++-
 16 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f
index 062ac182b..8fe4159f6 100644
--- a/lapack-netlib/SRC/cgejsv.f
+++ b/lapack-netlib/SRC/cgejsv.f
@@ -52,10 +52,10 @@
 *> are computed and stored in the arrays U and V, respectively. The diagonal
 *> of [SIGMA] is computed and stored in the array SVA.
 *> \endverbatim
-*>
-*>  Arguments:
-*>  ==========
-*>
+*
+*  Arguments:
+*  ==========
+*
 *> \param[in] JOBA
 *> \verbatim
 *>          JOBA is CHARACTER*1
@@ -151,7 +151,7 @@
 *>         transposed A if A^* seems to be better with respect to convergence.
 *>         If the matrix is not square, JOBT is ignored.
 *>         The decision is based on two values of entropy over the adjoint
-*>         orbit of A^* * A. See the descriptions of WORK(6) and WORK(7).
+*>         orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
 *>       = 'T': transpose if entropy test indicates possibly faster
 *>         convergence of Jacobi process if A^* is taken as input. If A is
 *>         replaced with A^*, then the row pivoting is included automatically.
@@ -209,11 +209,11 @@
 *> \verbatim
 *>          SVA is REAL array, dimension (N)
 *>          On exit,
-*>          - For WORK(1)/WORK(2) = ONE: The singular values of A. During the
-*>            computation SVA contains Euclidean column norms of the
+*>          - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>            the computation SVA contains Euclidean column norms of the
 *>            iterated matrices in the array A.
-*>          - For WORK(1) .NE. WORK(2): The singular values of A are
-*>            (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if
+*>          - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>            (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
 *>            sigma_max(A) overflows or if small singular values have been
 *>            saved from underflow by scaling the input matrix A.
 *>          - If JOBR='R' then some of the singular values may be returned
diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f
index 63cbd02c6..1a09b8305 100644
--- a/lapack-netlib/SRC/claswlq.f
+++ b/lapack-netlib/SRC/claswlq.f
@@ -104,6 +104,7 @@
 *> \endverbatim
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= MB*M.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f
index 7a0b85487..377190081 100644
--- a/lapack-netlib/SRC/clatsqr.f
+++ b/lapack-netlib/SRC/clatsqr.f
@@ -106,6 +106,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= NB*N.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/cuncsd2by1.f b/lapack-netlib/SRC/cuncsd2by1.f
index 8360d5932..f0c44f670 100644
--- a/lapack-netlib/SRC/cuncsd2by1.f
+++ b/lapack-netlib/SRC/cuncsd2by1.f
@@ -212,13 +212,13 @@
 *>          LRWORK is INTEGER
 *>          The dimension of the array RWORK.
 *>
-*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          If LRWORK=-1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK and RWORK
 *>          arrays, returns this value as the first entry of the WORK
 *>          and RWORK array, respectively, and no error message related
 *>          to LWORK or LRWORK is issued by XERBLA.
 *> \endverbatim
-*
+*>
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q))
diff --git a/lapack-netlib/SRC/cungtsqr.f b/lapack-netlib/SRC/cungtsqr.f
index 64ccb9731..1734be2aa 100644
--- a/lapack-netlib/SRC/cungtsqr.f
+++ b/lapack-netlib/SRC/cungtsqr.f
@@ -133,6 +133,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
@@ -302,4 +303,4 @@
 *
 *     End of CUNGTSQR
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f
index fb8857145..c95c94cbc 100644
--- a/lapack-netlib/SRC/dlaswlq.f
+++ b/lapack-netlib/SRC/dlaswlq.f
@@ -104,6 +104,7 @@
 *> \endverbatim
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= MB*M.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f
index f5cbb76bb..94a04be02 100644
--- a/lapack-netlib/SRC/dlatsqr.f
+++ b/lapack-netlib/SRC/dlatsqr.f
@@ -106,6 +106,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= NB*N.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/dorgtsqr.f b/lapack-netlib/SRC/dorgtsqr.f
index 34d96b238..2e5cd40be 100644
--- a/lapack-netlib/SRC/dorgtsqr.f
+++ b/lapack-netlib/SRC/dorgtsqr.f
@@ -133,6 +133,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
@@ -301,4 +302,4 @@
 *
 *     End of DORGTSQR
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f
index bc7cf343d..95e0ddcce 100644
--- a/lapack-netlib/SRC/slaswlq.f
+++ b/lapack-netlib/SRC/slaswlq.f
@@ -104,6 +104,7 @@
 *> \endverbatim
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= MB * M.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f
index 33966c01f..86733bb15 100644
--- a/lapack-netlib/SRC/slatsqr.f
+++ b/lapack-netlib/SRC/slatsqr.f
@@ -106,6 +106,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= NB*N.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f
index a755fa4a4..692eba1d9 100644
--- a/lapack-netlib/SRC/sorgtsqr.f
+++ b/lapack-netlib/SRC/sorgtsqr.f
@@ -133,6 +133,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
@@ -301,4 +302,4 @@
 *
 *     End of SORGTSQR
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f
index 5134ea891..464c4a0fa 100644
--- a/lapack-netlib/SRC/zgejsv.f
+++ b/lapack-netlib/SRC/zgejsv.f
@@ -52,10 +52,10 @@
 *> are computed and stored in the arrays U and V, respectively. The diagonal
 *> of [SIGMA] is computed and stored in the array SVA.
 *> \endverbatim
-*>
-*>  Arguments:
-*>  ==========
-*>
+*
+*  Arguments:
+*  ==========
+*
 *> \param[in] JOBA
 *> \verbatim
 *>          JOBA is CHARACTER*1
@@ -151,7 +151,7 @@
 *>         transposed A if A^* seems to be better with respect to convergence.
 *>         If the matrix is not square, JOBT is ignored. 
 *>         The decision is based on two values of entropy over the adjoint
-*>         orbit of A^* * A. See the descriptions of WORK(6) and WORK(7).
+*>         orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
 *>       = 'T': transpose if entropy test indicates possibly faster
 *>         convergence of Jacobi process if A^* is taken as input. If A is
 *>         replaced with A^*, then the row pivoting is included automatically.
@@ -209,11 +209,11 @@
 *> \verbatim
 *>          SVA is DOUBLE PRECISION array, dimension (N)
 *>          On exit,
-*>          - For WORK(1)/WORK(2) = ONE: The singular values of A. During the
-*>            computation SVA contains Euclidean column norms of the
+*>          - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>            the computation SVA contains Euclidean column norms of the
 *>            iterated matrices in the array A.
-*>          - For WORK(1) .NE. WORK(2): The singular values of A are
-*>            (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if
+*>          - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>            (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
 *>            sigma_max(A) overflows or if small singular values have been
 *>            saved from underflow by scaling the input matrix A.
 *>          - If JOBR='R' then some of the singular values may be returned
diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f
index e4e703343..be4c48539 100644
--- a/lapack-netlib/SRC/zlaswlq.f
+++ b/lapack-netlib/SRC/zlaswlq.f
@@ -104,6 +104,7 @@
 *> \endverbatim
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= MB*M.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f
index ffdbc68c0..8c938aebc 100644
--- a/lapack-netlib/SRC/zlatsqr.f
+++ b/lapack-netlib/SRC/zlatsqr.f
@@ -106,6 +106,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= NB*N.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/zuncsd2by1.f b/lapack-netlib/SRC/zuncsd2by1.f
index 2d861c2cc..399b598be 100644
--- a/lapack-netlib/SRC/zuncsd2by1.f
+++ b/lapack-netlib/SRC/zuncsd2by1.f
@@ -211,13 +211,13 @@
 *>          LRWORK is INTEGER
 *>          The dimension of the array RWORK.
 *>
-*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          If LRWORK=-1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK and RWORK
 *>          arrays, returns this value as the first entry of the WORK
 *>          and RWORK array, respectively, and no error message related
 *>          to LWORK or LRWORK is issued by XERBLA.
 *> \endverbatim
-*
+*>
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q))
diff --git a/lapack-netlib/SRC/zungtsqr.f b/lapack-netlib/SRC/zungtsqr.f
index 4f2d7dfdd..23e28ac5c 100644
--- a/lapack-netlib/SRC/zungtsqr.f
+++ b/lapack-netlib/SRC/zungtsqr.f
@@ -133,6 +133,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
@@ -302,4 +303,4 @@
 *
 *     End of ZUNGTSQR
 *
-      END
\ No newline at end of file
+      END

From 5fbd5f531bcd074603654948dca4ec771a25a833 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 16 May 2023 20:05:05 +0200
Subject: [PATCH 073/718] Fix typo in description of VR argument
 (Reference-LAPACK 812)

---
 lapack-netlib/SRC/ctgevc.f | 7 +++----
 lapack-netlib/SRC/ztgevc.f | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/SRC/ctgevc.f b/lapack-netlib/SRC/ctgevc.f
index 4e5289cb2..22144f259 100644
--- a/lapack-netlib/SRC/ctgevc.f
+++ b/lapack-netlib/SRC/ctgevc.f
@@ -53,7 +53,7 @@
 *>
 *>    S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
 *>
-*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y.
 *> The eigenvalues are not input to this routine, but are computed
 *> directly from the diagonal elements of S and P.
 *>
@@ -154,7 +154,7 @@
 *> \verbatim
 *>          VR is COMPLEX array, dimension (LDVR,MM)
 *>          On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must
-*>          contain an N-by-N matrix Q (usually the unitary matrix Z
+*>          contain an N-by-N matrix Z (usually the unitary matrix Z
 *>          of right Schur vectors returned by CHGEQZ).
 *>          On exit, if SIDE = 'R' or 'B', VR contains:
 *>          if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P);
@@ -259,7 +259,7 @@
       EXTERNAL           LSAME, SLAMCH, CLADIV
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGEMV, SLABAD, XERBLA
+      EXTERNAL           CGEMV, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL
@@ -367,7 +367,6 @@
 *
       SAFMIN = SLAMCH( 'Safe minimum' )
       BIG = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, BIG )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       SMALL = SAFMIN*N / ULP
       BIG = ONE / SMALL
diff --git a/lapack-netlib/SRC/ztgevc.f b/lapack-netlib/SRC/ztgevc.f
index 23bd36ddb..6dcbe582f 100644
--- a/lapack-netlib/SRC/ztgevc.f
+++ b/lapack-netlib/SRC/ztgevc.f
@@ -53,7 +53,7 @@
 *>
 *>    S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
 *>
-*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y.
 *> The eigenvalues are not input to this routine, but are computed
 *> directly from the diagonal elements of S and P.
 *>
@@ -154,7 +154,7 @@
 *> \verbatim
 *>          VR is COMPLEX*16 array, dimension (LDVR,MM)
 *>          On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must
-*>          contain an N-by-N matrix Q (usually the unitary matrix Z
+*>          contain an N-by-N matrix Z (usually the unitary matrix Z
 *>          of right Schur vectors returned by ZHGEQZ).
 *>          On exit, if SIDE = 'R' or 'B', VR contains:
 *>          if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P);
@@ -259,7 +259,7 @@
       EXTERNAL           LSAME, DLAMCH, ZLADIV
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, XERBLA, ZGEMV
+      EXTERNAL           XERBLA, ZGEMV
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN
@@ -367,7 +367,6 @@
 *
       SAFMIN = DLAMCH( 'Safe minimum' )
       BIG = ONE / SAFMIN
-      CALL DLABAD( SAFMIN, BIG )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       SMALL = SAFMIN*N / ULP
       BIG = ONE / SMALL

From a82c1443dbf40e4c46ed3158c6945a3cdb28d3cf Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 17 May 2023 14:33:46 +0200
Subject: [PATCH 074/718] Fix typos in comments (Reference-LAPACK PR 814)

---
 lapack-netlib/SRC/cgejsv.f       | 6 +++---
 lapack-netlib/SRC/cgsvj0.f       | 6 +++---
 lapack-netlib/SRC/cgsvj1.f       | 6 +++---
 lapack-netlib/SRC/clalsa.f       | 4 ++--
 lapack-netlib/SRC/cstegr.f       | 2 +-
 lapack-netlib/SRC/ctgsen.f       | 2 +-
 lapack-netlib/SRC/dgejsv.f       | 8 ++++----
 lapack-netlib/SRC/dgsvj0.f       | 6 +++---
 lapack-netlib/SRC/dgsvj1.f       | 6 +++---
 lapack-netlib/SRC/dlalsa.f       | 4 ++--
 lapack-netlib/SRC/dlarre.f       | 2 +-
 lapack-netlib/SRC/dstegr.f       | 2 +-
 lapack-netlib/SRC/dtgevc.f       | 5 ++---
 lapack-netlib/SRC/iparam2stage.F | 4 ++--
 lapack-netlib/SRC/sgejsv.f       | 6 +++---
 lapack-netlib/SRC/sgsvj0.f       | 6 +++---
 lapack-netlib/SRC/sgsvj1.f       | 6 +++---
 lapack-netlib/SRC/slalsa.f       | 4 ++--
 lapack-netlib/SRC/slarre.f       | 2 +-
 lapack-netlib/SRC/sstegr.f       | 2 +-
 lapack-netlib/SRC/stgevc.f       | 5 ++---
 lapack-netlib/SRC/zgejsv.f       | 6 +++---
 lapack-netlib/SRC/zgsvj0.f       | 6 +++---
 lapack-netlib/SRC/zgsvj1.f       | 6 +++---
 lapack-netlib/SRC/zlalsa.f       | 4 ++--
 lapack-netlib/SRC/zstegr.f       | 2 +-
 lapack-netlib/SRC/ztgsy2.f       | 2 +-
 27 files changed, 59 insertions(+), 61 deletions(-)

diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f
index 8fe4159f6..2b7b2dc70 100644
--- a/lapack-netlib/SRC/cgejsv.f
+++ b/lapack-netlib/SRC/cgejsv.f
@@ -1819,7 +1819,7 @@
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
 *                 (this overwrites the copy of R2, as it will not be
-*                 needed in this branch, but it does not overwritte the
+*                 needed in this branch, but it does not overwrite the
 *                 Huseholder vectors of Q2.).
                   CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
 *                 .. and the rest of the information on Q3 is in
@@ -1842,7 +1842,7 @@
             END IF
 *
 *        Second preconditioning finished; continue with Jacobi SVD
-*        The input matrix is lower trinagular.
+*        The input matrix is lower triangular.
 *
 *        Recover the right singular vectors as solution of a well
 *        conditioned triangular matrix equation.
@@ -1886,7 +1886,7 @@
             ELSE IF ( CONDR2 .LT. COND_OK ) THEN
 *
 *              The matrix R2 is inverted. The solution of the matrix equation
-*              is Q3^* * V3 = the product of the Jacobi rotations (appplied to
+*              is Q3^* * V3 = the product of the Jacobi rotations (applied to
 *              the lower triangular L3 from the LQ factorization of
 *              R2=L3*Q3), pre-multiplied with the transposed Q3.
                CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f
index 37853a10a..9e6053013 100644
--- a/lapack-netlib/SRC/cgsvj0.f
+++ b/lapack-netlib/SRC/cgsvj0.f
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f
index f1a5204e3..6d68e920d 100644
--- a/lapack-netlib/SRC/cgsvj1.f
+++ b/lapack-netlib/SRC/cgsvj1.f
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/clalsa.f b/lapack-netlib/SRC/clalsa.f
index 06883ab20..fceb380f5 100644
--- a/lapack-netlib/SRC/clalsa.f
+++ b/lapack-netlib/SRC/clalsa.f
@@ -42,9 +42,9 @@
 *>
 *> \verbatim
 *>
-*> CLALSA is an itermediate step in solving the least squares problem
+*> CLALSA is an intermediate step in solving the least squares problem
 *> by computing the SVD of the coefficient matrix in compact form (The
-*> singular vectors are computed as products of simple orthorgonal
+*> singular vectors are computed as products of simple orthogonal
 *> matrices.).
 *>
 *> If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector
diff --git a/lapack-netlib/SRC/cstegr.f b/lapack-netlib/SRC/cstegr.f
index 9d6e06da3..a162d5076 100644
--- a/lapack-netlib/SRC/cstegr.f
+++ b/lapack-netlib/SRC/cstegr.f
@@ -56,7 +56,7 @@
 *>
 *> Note : CSTEGR and CSTEMR work only on machines which follow
 *> IEEE-754 floating-point standard in their handling of infinities and
-*> NaNs.  Normal execution may create these exceptiona values and hence
+*> NaNs.  Normal execution may create these exceptional values and hence
 *> may abort due to a floating point exception in environments which
 *> do not conform to the IEEE-754 standard.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ctgsen.f b/lapack-netlib/SRC/ctgsen.f
index f9b6cd10c..ffd638099 100644
--- a/lapack-netlib/SRC/ctgsen.f
+++ b/lapack-netlib/SRC/ctgsen.f
@@ -339,7 +339,7 @@
 *>            [ kron(In2, B11)  -kron(B22**H, In1) ].
 *>
 *>  Here, Inx is the identity matrix of size nx and A22**H is the
-*>  conjuguate transpose of A22. kron(X, Y) is the Kronecker product between
+*>  conjugate transpose of A22. kron(X, Y) is the Kronecker product between
 *>  the matrices X and Y.
 *>
 *>  When DIF(2) is small, small changes in (A, B) can cause large changes
diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f
index 83d16c30e..798e9154d 100644
--- a/lapack-netlib/SRC/dgejsv.f
+++ b/lapack-netlib/SRC/dgejsv.f
@@ -362,7 +362,7 @@
 *>
 *> \param[out] IWORK
 *> \verbatim
-*>          IWORK is INTEGER array, dimension (M+3*N).
+*>          IWORK is INTEGER array, dimension (MAX(3,M+3*N)).
 *>          On exit,
 *>          IWORK(1) = the numerical rank determined after the initial
 *>                     QR factorization with pivoting. See the descriptions
@@ -1386,7 +1386,7 @@
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
 *                 (this overwrites the copy of R2, as it will not be
-*                 needed in this branch, but it does not overwritte the
+*                 needed in this branch, but it does not overwrite the
 *                 Huseholder vectors of Q2.).
                   CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
 *                 .. and the rest of the information on Q3 is in
@@ -1409,7 +1409,7 @@
             END IF
 *
 *        Second preconditioning finished; continue with Jacobi SVD
-*        The input matrix is lower trinagular.
+*        The input matrix is lower triangular.
 *
 *        Recover the right singular vectors as solution of a well
 *        conditioned triangular matrix equation.
@@ -1454,7 +1454,7 @@
 * :)           .. the input matrix A is very likely a relative of
 *              the Kahan matrix :)
 *              The matrix R2 is inverted. The solution of the matrix equation
-*              is Q3^T*V3 = the product of the Jacobi rotations (appplied to
+*              is Q3^T*V3 = the product of the Jacobi rotations (applied to
 *              the lower triangular L3 from the LQ factorization of
 *              R2=L3*Q3), pre-multiplied with the transposed Q3.
                CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f
index 197a9a626..586723e64 100644
--- a/lapack-netlib/SRC/dgsvj0.f
+++ b/lapack-netlib/SRC/dgsvj0.f
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is DOUBLE PRECISION array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f
index 11d1dde2b..d757bb927 100644
--- a/lapack-netlib/SRC/dgsvj1.f
+++ b/lapack-netlib/SRC/dgsvj1.f
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                         sequence of Jacobi rotations.
 *>          If JOBV = 'N', then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is DOUBLE PRECISION array, dimension (LDV,N)
-*>          If JOBV = 'V', then N rows of V are post-multipled by a
+*>          If JOBV = 'V', then N rows of V are post-multiplied by a
 *>                         sequence of Jacobi rotations.
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                         sequence of Jacobi rotations.
 *>          If JOBV = 'N', then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dlalsa.f b/lapack-netlib/SRC/dlalsa.f
index da8e0fa17..d89f5d2f9 100644
--- a/lapack-netlib/SRC/dlalsa.f
+++ b/lapack-netlib/SRC/dlalsa.f
@@ -43,9 +43,9 @@
 *>
 *> \verbatim
 *>
-*> DLALSA is an itermediate step in solving the least squares problem
+*> DLALSA is an intermediate step in solving the least squares problem
 *> by computing the SVD of the coefficient matrix in compact form (The
-*> singular vectors are computed as products of simple orthorgonal
+*> singular vectors are computed as products of simple orthogonal
 *> matrices.).
 *>
 *> If ICOMPQ = 0, DLALSA applies the inverse of the left singular vector
diff --git a/lapack-netlib/SRC/dlarre.f b/lapack-netlib/SRC/dlarre.f
index 70f59b829..1cceed1a2 100644
--- a/lapack-netlib/SRC/dlarre.f
+++ b/lapack-netlib/SRC/dlarre.f
@@ -51,7 +51,7 @@
 *> DSTEMR to compute the eigenvectors of T.
 *> The accuracy varies depending on whether bisection is used to
 *> find a few eigenvalues or the dqds algorithm (subroutine DLASQ2) to
-*> conpute all and then discard any unwanted one.
+*> compute all and then discard any unwanted one.
 *> As an added benefit, DLARRE also outputs the n
 *> Gerschgorin intervals for the matrices L_i D_i L_i^T.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dstegr.f b/lapack-netlib/SRC/dstegr.f
index 01ec101d8..598c60e8d 100644
--- a/lapack-netlib/SRC/dstegr.f
+++ b/lapack-netlib/SRC/dstegr.f
@@ -56,7 +56,7 @@
 *>
 *> Note : DSTEGR and DSTEMR work only on machines which follow
 *> IEEE-754 floating-point standard in their handling of infinities and
-*> NaNs.  Normal execution may create these exceptiona values and hence
+*> NaNs.  Normal execution may create these exceptional values and hence
 *> may abort due to a floating point exception in environments which
 *> do not conform to the IEEE-754 standard.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dtgevc.f b/lapack-netlib/SRC/dtgevc.f
index e7084664c..be70b2083 100644
--- a/lapack-netlib/SRC/dtgevc.f
+++ b/lapack-netlib/SRC/dtgevc.f
@@ -52,7 +52,7 @@
 *>
 *>    S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
 *>
-*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y.
 *> The eigenvalues are not input to this routine, but are computed
 *> directly from the diagonal blocks of S and P.
 *>
@@ -337,7 +337,7 @@
       EXTERNAL           LSAME, DLAMCH
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEMV, DLABAD, DLACPY, DLAG2, DLALN2, XERBLA
+      EXTERNAL           DGEMV, DLACPY, DLAG2, DLALN2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN
@@ -463,7 +463,6 @@
 *
       SAFMIN = DLAMCH( 'Safe minimum' )
       BIG = ONE / SAFMIN
-      CALL DLABAD( SAFMIN, BIG )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       SMALL = SAFMIN*N / ULP
       BIG = ONE / SMALL
diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F
index c701c2be0..52d507e9d 100644
--- a/lapack-netlib/SRC/iparam2stage.F
+++ b/lapack-netlib/SRC/iparam2stage.F
@@ -89,14 +89,14 @@
 *>
 *> \param[in] NBI
 *> \verbatim
-*>          NBI is INTEGER which is the used in the reduciton, 
+*>          NBI is INTEGER which is the used in the reduction, 
 *>          (e.g., the size of the band), needed to compute workspace
 *>          and LHOUS2.
 *> \endverbatim
 *>
 *> \param[in] IBI
 *> \verbatim
-*>          IBI is INTEGER which represent the IB of the reduciton,
+*>          IBI is INTEGER which represent the IB of the reduction,
 *>          needed to compute workspace and LHOUS2.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f
index 923573bdb..1333e54fb 100644
--- a/lapack-netlib/SRC/sgejsv.f
+++ b/lapack-netlib/SRC/sgejsv.f
@@ -1386,7 +1386,7 @@
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
 *                 (this overwrites the copy of R2, as it will not be
-*                 needed in this branch, but it does not overwritte the
+*                 needed in this branch, but it does not overwrite the
 *                 Huseholder vectors of Q2.).
                   CALL SLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
 *                 .. and the rest of the information on Q3 is in
@@ -1409,7 +1409,7 @@
             END IF
 *
 *        Second preconditioning finished; continue with Jacobi SVD
-*        The input matrix is lower trinagular.
+*        The input matrix is lower triangular.
 *
 *        Recover the right singular vectors as solution of a well
 *        conditioned triangular matrix equation.
@@ -1454,7 +1454,7 @@
 * :)           .. the input matrix A is very likely a relative of
 *              the Kahan matrix :)
 *              The matrix R2 is inverted. The solution of the matrix equation
-*              is Q3^T*V3 = the product of the Jacobi rotations (appplied to
+*              is Q3^T*V3 = the product of the Jacobi rotations (applied to
 *              the lower triangular L3 from the LQ factorization of
 *              R2=L3*Q3), pre-multiplied with the transposed Q3.
                CALL SGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f
index 23b6f0077..9249f4219 100644
--- a/lapack-netlib/SRC/sgsvj0.f
+++ b/lapack-netlib/SRC/sgsvj0.f
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is REAL array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f
index 0345ccb42..79fd4d2d3 100644
--- a/lapack-netlib/SRC/sgsvj1.f
+++ b/lapack-netlib/SRC/sgsvj1.f
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is REAL array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/slalsa.f b/lapack-netlib/SRC/slalsa.f
index 465455e4a..95becc76e 100644
--- a/lapack-netlib/SRC/slalsa.f
+++ b/lapack-netlib/SRC/slalsa.f
@@ -43,9 +43,9 @@
 *>
 *> \verbatim
 *>
-*> SLALSA is an itermediate step in solving the least squares problem
+*> SLALSA is an intermediate step in solving the least squares problem
 *> by computing the SVD of the coefficient matrix in compact form (The
-*> singular vectors are computed as products of simple orthorgonal
+*> singular vectors are computed as products of simple orthogonal
 *> matrices.).
 *>
 *> If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector
diff --git a/lapack-netlib/SRC/slarre.f b/lapack-netlib/SRC/slarre.f
index 2e34ca5a6..e1b52c385 100644
--- a/lapack-netlib/SRC/slarre.f
+++ b/lapack-netlib/SRC/slarre.f
@@ -51,7 +51,7 @@
 *> SSTEMR to compute the eigenvectors of T.
 *> The accuracy varies depending on whether bisection is used to
 *> find a few eigenvalues or the dqds algorithm (subroutine SLASQ2) to
-*> conpute all and then discard any unwanted one.
+*> compute all and then discard any unwanted one.
 *> As an added benefit, SLARRE also outputs the n
 *> Gerschgorin intervals for the matrices L_i D_i L_i^T.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sstegr.f b/lapack-netlib/SRC/sstegr.f
index 2e2975fdf..2967a6969 100644
--- a/lapack-netlib/SRC/sstegr.f
+++ b/lapack-netlib/SRC/sstegr.f
@@ -56,7 +56,7 @@
 *>
 *> Note : SSTEGR and SSTEMR work only on machines which follow
 *> IEEE-754 floating-point standard in their handling of infinities and
-*> NaNs.  Normal execution may create these exceptiona values and hence
+*> NaNs.  Normal execution may create these exceptional values and hence
 *> may abort due to a floating point exception in environments which
 *> do not conform to the IEEE-754 standard.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/stgevc.f b/lapack-netlib/SRC/stgevc.f
index 15fc88c4b..be4cb1829 100644
--- a/lapack-netlib/SRC/stgevc.f
+++ b/lapack-netlib/SRC/stgevc.f
@@ -52,7 +52,7 @@
 *>
 *>    S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
 *>
-*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y.
 *> The eigenvalues are not input to this routine, but are computed
 *> directly from the diagonal blocks of S and P.
 *>
@@ -337,7 +337,7 @@
       EXTERNAL           LSAME, SLAMCH
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEMV, SLABAD, SLACPY, SLAG2, SLALN2, XERBLA
+      EXTERNAL           SGEMV, SLACPY, SLAG2, SLALN2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN
@@ -463,7 +463,6 @@
 *
       SAFMIN = SLAMCH( 'Safe minimum' )
       BIG = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, BIG )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       SMALL = SAFMIN*N / ULP
       BIG = ONE / SMALL
diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f
index 464c4a0fa..a36a9188a 100644
--- a/lapack-netlib/SRC/zgejsv.f
+++ b/lapack-netlib/SRC/zgejsv.f
@@ -1821,7 +1821,7 @@
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
 *                 (this overwrites the copy of R2, as it will not be
-*                 needed in this branch, but it does not overwritte the
+*                 needed in this branch, but it does not overwrite the
 *                 Huseholder vectors of Q2.).
                   CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
 *                 .. and the rest of the information on Q3 is in
@@ -1844,7 +1844,7 @@
             END IF
 *
 *        Second preconditioning finished; continue with Jacobi SVD
-*        The input matrix is lower trinagular.
+*        The input matrix is lower triangular.
 *
 *        Recover the right singular vectors as solution of a well
 *        conditioned triangular matrix equation.
@@ -1888,7 +1888,7 @@
             ELSE IF ( CONDR2 .LT. COND_OK ) THEN
 *
 *              The matrix R2 is inverted. The solution of the matrix equation
-*              is Q3^* * V3 = the product of the Jacobi rotations (appplied to
+*              is Q3^* * V3 = the product of the Jacobi rotations (applied to
 *              the lower triangular L3 from the LQ factorization of
 *              R2=L3*Q3), pre-multiplied with the transposed Q3.
                CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f
index 672ab7590..11bfbd8dc 100644
--- a/lapack-netlib/SRC/zgsvj0.f
+++ b/lapack-netlib/SRC/zgsvj0.f
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX*16 array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f
index 2461a9555..efe0384ef 100644
--- a/lapack-netlib/SRC/zgsvj1.f
+++ b/lapack-netlib/SRC/zgsvj1.f
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX*16 array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zlalsa.f b/lapack-netlib/SRC/zlalsa.f
index 73ccf3a8b..d419598d2 100644
--- a/lapack-netlib/SRC/zlalsa.f
+++ b/lapack-netlib/SRC/zlalsa.f
@@ -42,9 +42,9 @@
 *>
 *> \verbatim
 *>
-*> ZLALSA is an itermediate step in solving the least squares problem
+*> ZLALSA is an intermediate step in solving the least squares problem
 *> by computing the SVD of the coefficient matrix in compact form (The
-*> singular vectors are computed as products of simple orthorgonal
+*> singular vectors are computed as products of simple orthogonal
 *> matrices.).
 *>
 *> If ICOMPQ = 0, ZLALSA applies the inverse of the left singular vector
diff --git a/lapack-netlib/SRC/zstegr.f b/lapack-netlib/SRC/zstegr.f
index ee43a0d21..3736a0517 100644
--- a/lapack-netlib/SRC/zstegr.f
+++ b/lapack-netlib/SRC/zstegr.f
@@ -56,7 +56,7 @@
 *>
 *> Note : ZSTEGR and ZSTEMR work only on machines which follow
 *> IEEE-754 floating-point standard in their handling of infinities and
-*> NaNs.  Normal execution may create these exceptiona values and hence
+*> NaNs.  Normal execution may create these exceptional values and hence
 *> may abort due to a floating point exception in environments which
 *> do not conform to the IEEE-754 standard.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ztgsy2.f b/lapack-netlib/SRC/ztgsy2.f
index ee26b5e7b..0cae8939e 100644
--- a/lapack-netlib/SRC/ztgsy2.f
+++ b/lapack-netlib/SRC/ztgsy2.f
@@ -57,7 +57,7 @@
 *>        Z = [ kron(In, A)  -kron(B**H, Im) ]             (2)
 *>            [ kron(In, D)  -kron(E**H, Im) ],
 *>
-*> Ik is the identity matrix of size k and X**H is the conjuguate transpose of X.
+*> Ik is the identity matrix of size k and X**H is the conjugate transpose of X.
 *> kron(X, Y) is the Kronecker product between the matrices X and Y.
 *>
 *> If TRANS = 'C', y in the conjugate transposed system Z**H*y = scale*b

From b01894adcb411abc976869e4a6b5bfb3bd697b69 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 17 May 2023 14:35:13 +0200
Subject: [PATCH 075/718] Fix typos in comments (Reference-LAPACK PR 814)

---
 lapack-netlib/TESTING/EIG/cdrgvx.f | 2 +-
 lapack-netlib/TESTING/EIG/ddrgvx.f | 2 +-
 lapack-netlib/TESTING/EIG/sdrgvx.f | 2 +-
 lapack-netlib/TESTING/EIG/zdrgvx.f | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lapack-netlib/TESTING/EIG/cdrgvx.f b/lapack-netlib/TESTING/EIG/cdrgvx.f
index 830a39d86..4114b697f 100644
--- a/lapack-netlib/TESTING/EIG/cdrgvx.f
+++ b/lapack-netlib/TESTING/EIG/cdrgvx.f
@@ -56,7 +56,7 @@
 *>
 *>    | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) )
 *>
-*>     where l**H is the conjugate tranpose of l.
+*>     where l**H is the conjugate transpose of l.
 *>
 *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of
 *>
diff --git a/lapack-netlib/TESTING/EIG/ddrgvx.f b/lapack-netlib/TESTING/EIG/ddrgvx.f
index c63762134..3de72eb70 100644
--- a/lapack-netlib/TESTING/EIG/ddrgvx.f
+++ b/lapack-netlib/TESTING/EIG/ddrgvx.f
@@ -56,7 +56,7 @@
 *>
 *>    | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) )
 *>
-*>     where l**H is the conjugate tranpose of l.
+*>     where l**H is the conjugate transpose of l.
 *>
 *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of
 *>
diff --git a/lapack-netlib/TESTING/EIG/sdrgvx.f b/lapack-netlib/TESTING/EIG/sdrgvx.f
index f6da720ad..91803f60a 100644
--- a/lapack-netlib/TESTING/EIG/sdrgvx.f
+++ b/lapack-netlib/TESTING/EIG/sdrgvx.f
@@ -57,7 +57,7 @@
 *>
 *>    | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) )
 *>
-*>     where l**H is the conjugate tranpose of l.
+*>     where l**H is the conjugate transpose of l.
 *>
 *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of
 *>
diff --git a/lapack-netlib/TESTING/EIG/zdrgvx.f b/lapack-netlib/TESTING/EIG/zdrgvx.f
index 72e0303fb..813151f13 100644
--- a/lapack-netlib/TESTING/EIG/zdrgvx.f
+++ b/lapack-netlib/TESTING/EIG/zdrgvx.f
@@ -55,7 +55,7 @@
 *>
 *>    | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) )
 *>
-*>     where l**H is the conjugate tranpose of l.
+*>     where l**H is the conjugate transpose of l.
 *>
 *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of
 *>

From 47715b57268cf3f32ad37b90e65cd5932fdb5620 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 17 May 2023 14:36:21 +0200
Subject: [PATCH 076/718] Fix typos in comments (Reference-LAPACK PR 814)

---
 lapack-netlib/TESTING/LIN/crzt01.f | 2 +-
 lapack-netlib/TESTING/LIN/drzt01.f | 2 +-
 lapack-netlib/TESTING/LIN/srzt01.f | 2 +-
 lapack-netlib/TESTING/LIN/zrzt01.f | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lapack-netlib/TESTING/LIN/crzt01.f b/lapack-netlib/TESTING/LIN/crzt01.f
index 6a75a8b31..94fdcb4a9 100644
--- a/lapack-netlib/TESTING/LIN/crzt01.f
+++ b/lapack-netlib/TESTING/LIN/crzt01.f
@@ -158,7 +158,7 @@
 *
 *     R = R * P(1) * ... *P(m)
 *
-      CALL CUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU,
+      CALL CUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU,
      $             WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
 *     R = R - A
diff --git a/lapack-netlib/TESTING/LIN/drzt01.f b/lapack-netlib/TESTING/LIN/drzt01.f
index 8e969aba7..7a88a2c20 100644
--- a/lapack-netlib/TESTING/LIN/drzt01.f
+++ b/lapack-netlib/TESTING/LIN/drzt01.f
@@ -158,7 +158,7 @@
 *
 *     R = R * P(1) * ... *P(m)
 *
-      CALL DORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU,
+      CALL DORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU,
      $             WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
 *     R = R - A
diff --git a/lapack-netlib/TESTING/LIN/srzt01.f b/lapack-netlib/TESTING/LIN/srzt01.f
index 4478c0506..a2b8b29ba 100644
--- a/lapack-netlib/TESTING/LIN/srzt01.f
+++ b/lapack-netlib/TESTING/LIN/srzt01.f
@@ -158,7 +158,7 @@
 *
 *     R = R * P(1) * ... *P(m)
 *
-      CALL SORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU,
+      CALL SORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU,
      $             WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
 *     R = R - A
diff --git a/lapack-netlib/TESTING/LIN/zrzt01.f b/lapack-netlib/TESTING/LIN/zrzt01.f
index 2eba7ba6f..b6e4d73f9 100644
--- a/lapack-netlib/TESTING/LIN/zrzt01.f
+++ b/lapack-netlib/TESTING/LIN/zrzt01.f
@@ -159,7 +159,7 @@
 *
 *     R = R * P(1) * ... *P(m)
 *
-      CALL ZUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU,
+      CALL ZUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU,
      $             WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
 *     R = R - A

From be05ba4374b40310cef2c874e797e76772119f4c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 16:28:20 +0200
Subject: [PATCH 077/718] Fix typos in comments and documentation of LAPACK
 (Reference-LAPACK PR 820) (#4045)

* Fix typos in comments and documentation (Reference-LAPACK PR 820)
---
 .../CMAKE/CheckFortranTypeSizes.cmake         |   2 +-
 lapack-netlib/CMAKE/Findcodecov.cmake         |   4 +-
 lapack-netlib/CMakeLists.txt                  | 300 ++++++++++++++++--
 lapack-netlib/DOCS/lawn81.tex                 |   2 +-
 .../LAPACKE/utils/lapacke_cgb_trans.c         |   2 +-
 .../LAPACKE/utils/lapacke_dgb_trans.c         |   2 +-
 .../LAPACKE/utils/lapacke_sgb_trans.c         |   2 +-
 .../LAPACKE/utils/lapacke_zgb_trans.c         |   2 +-
 lapack-netlib/SRC/DEPRECATED/cggsvd.f         |   2 +-
 lapack-netlib/SRC/DEPRECATED/zggsvd.f         |   2 +-
 lapack-netlib/SRC/cgejsv.f                    |   2 +-
 lapack-netlib/SRC/cgesvdq.f                   |   2 +-
 lapack-netlib/SRC/cgsvj0.f                    |   4 +-
 lapack-netlib/SRC/cgsvj1.f                    |   4 +-
 lapack-netlib/SRC/chesv_aa_2stage.f           |   2 +-
 lapack-netlib/SRC/chetf2_rk.f                 |   8 +-
 lapack-netlib/SRC/chetf2_rook.f               |   8 +-
 lapack-netlib/SRC/chetrf_aa.f                 |   2 +-
 lapack-netlib/SRC/chetrf_aa_2stage.f          |   2 +-
 lapack-netlib/SRC/cla_gbrfsx_extended.f       |   6 +-
 lapack-netlib/SRC/cla_gerfsx_extended.f       |   2 +-
 lapack-netlib/SRC/cla_herfsx_extended.f       |   2 +-
 lapack-netlib/SRC/cla_porfsx_extended.f       |   2 +-
 lapack-netlib/SRC/cla_syrfsx_extended.f       |   2 +-
 lapack-netlib/SRC/claed7.f                    |   2 +-
 lapack-netlib/SRC/claqz0.f                    |   7 +-
 lapack-netlib/SRC/clarfb_gett.f               |   2 +-
 lapack-netlib/SRC/clatdf.f                    |   2 +-
 lapack-netlib/SRC/clatrs3.f                   |   2 +-
 lapack-netlib/SRC/csyconvf.f                  |  10 +-
 lapack-netlib/SRC/csyconvf_rook.f             |   2 +-
 lapack-netlib/SRC/csysv_aa_2stage.f           |   2 +-
 lapack-netlib/SRC/csytrf_aa.f                 |   2 +-
 lapack-netlib/SRC/csytrf_aa_2stage.f          |   2 +-
 lapack-netlib/SRC/ctrexc.f                    |   2 +-
 lapack-netlib/SRC/cunbdb1.f                   |   2 +-
 lapack-netlib/SRC/cunbdb2.f                   |   2 +-
 lapack-netlib/SRC/cunbdb3.f                   |   2 +-
 lapack-netlib/SRC/cunbdb4.f                   |   2 +-
 lapack-netlib/SRC/dbdsvdx.f                   |   2 +-
 lapack-netlib/SRC/dgejsv.f                    |   2 +-
 lapack-netlib/SRC/dgesvdq.f                   |   2 +-
 lapack-netlib/SRC/dgsvj0.f                    |   4 +-
 lapack-netlib/SRC/dgsvj1.f                    |   4 +-
 lapack-netlib/SRC/dla_gbrfsx_extended.f       |   2 +-
 lapack-netlib/SRC/dla_gerfsx_extended.f       |   2 +-
 lapack-netlib/SRC/dla_porfsx_extended.f       |   2 +-
 lapack-netlib/SRC/dla_syrfsx_extended.f       |   2 +-
 lapack-netlib/SRC/dlaqz0.f                    |   7 +-
 lapack-netlib/SRC/dlarfb_gett.f               |   2 +-
 lapack-netlib/SRC/dlatrs3.f                   |   2 +-
 lapack-netlib/SRC/dorbdb1.f                   |   2 +-
 lapack-netlib/SRC/dorbdb2.f                   |   2 +-
 lapack-netlib/SRC/dorbdb3.f                   |   2 +-
 lapack-netlib/SRC/dorbdb4.f                   |   2 +-
 lapack-netlib/SRC/dsyconvf.f                  |  10 +-
 lapack-netlib/SRC/dsyconvf_rook.f             |   2 +-
 lapack-netlib/SRC/dsysv_aa_2stage.f           |   2 +-
 lapack-netlib/SRC/dsytrf_aa.f                 |   2 +-
 lapack-netlib/SRC/dsytrf_aa_2stage.f          |   2 +-
 lapack-netlib/SRC/sbdsvdx.f                   |   2 +-
 lapack-netlib/SRC/sgejsv.f                    |   2 +-
 lapack-netlib/SRC/sgesvdq.f                   |   2 +-
 lapack-netlib/SRC/sgsvj0.f                    |   4 +-
 lapack-netlib/SRC/sgsvj1.f                    |   4 +-
 lapack-netlib/SRC/sla_gbrfsx_extended.f       |   2 +-
 lapack-netlib/SRC/sla_gerfsx_extended.f       |   2 +-
 lapack-netlib/SRC/sla_porfsx_extended.f       |   2 +-
 lapack-netlib/SRC/sla_syrfsx_extended.f       |   2 +-
 lapack-netlib/SRC/slaqz0.f                    |   7 +-
 lapack-netlib/SRC/slarfb_gett.f               |   2 +-
 lapack-netlib/SRC/slatrs3.f                   |   2 +-
 lapack-netlib/SRC/sorbdb1.f                   |   2 +-
 lapack-netlib/SRC/sorbdb2.f                   |   2 +-
 lapack-netlib/SRC/sorbdb3.f                   |   2 +-
 lapack-netlib/SRC/sorbdb4.f                   |   2 +-
 lapack-netlib/SRC/ssyconvf.f                  |  10 +-
 lapack-netlib/SRC/ssyconvf_rook.f             |   2 +-
 lapack-netlib/SRC/ssysv_aa_2stage.f           |   2 +-
 lapack-netlib/SRC/ssytrf_aa.f                 |   2 +-
 lapack-netlib/SRC/ssytrf_aa_2stage.f          |   2 +-
 lapack-netlib/SRC/zgejsv.f                    |   2 +-
 lapack-netlib/SRC/zgesvdq.f                   |   2 +-
 lapack-netlib/SRC/zgsvj0.f                    |   4 +-
 lapack-netlib/SRC/zgsvj1.f                    |   4 +-
 lapack-netlib/SRC/zhesv_aa_2stage.f           |   2 +-
 lapack-netlib/SRC/zhetf2_rk.f                 |   8 +-
 lapack-netlib/SRC/zhetf2_rook.f               |   8 +-
 lapack-netlib/SRC/zhetrf_aa.f                 |   2 +-
 lapack-netlib/SRC/zhetrf_aa_2stage.f          |   2 +-
 lapack-netlib/SRC/zla_gbrfsx_extended.f       |   2 +-
 lapack-netlib/SRC/zla_gerfsx_extended.f       |   2 +-
 lapack-netlib/SRC/zla_herfsx_extended.f       |   2 +-
 lapack-netlib/SRC/zla_porfsx_extended.f       |   2 +-
 lapack-netlib/SRC/zla_syrfsx_extended.f       |   2 +-
 lapack-netlib/SRC/zlaed7.f                    |   2 +-
 lapack-netlib/SRC/zlaqz0.f                    |   7 +-
 lapack-netlib/SRC/zlarfb_gett.f               |   2 +-
 lapack-netlib/SRC/zlatdf.f                    |   2 +-
 lapack-netlib/SRC/zlatrs3.f                   |   2 +-
 lapack-netlib/SRC/zsyconvf.f                  |  10 +-
 lapack-netlib/SRC/zsyconvf_rook.f             |   2 +-
 lapack-netlib/SRC/zsysv_aa_2stage.f           |   2 +-
 lapack-netlib/SRC/zsytrf_aa.f                 |   2 +-
 lapack-netlib/SRC/zsytrf_aa_2stage.f          |   2 +-
 lapack-netlib/SRC/ztrexc.f                    |   2 +-
 lapack-netlib/SRC/zunbdb1.f                   |   2 +-
 lapack-netlib/SRC/zunbdb2.f                   |   2 +-
 lapack-netlib/SRC/zunbdb3.f                   |   2 +-
 lapack-netlib/SRC/zunbdb4.f                   |   2 +-
 lapack-netlib/TESTING/EIG/cchkst.f            |   6 +-
 lapack-netlib/TESTING/EIG/cchkst2stg.f        |  11 +-
 lapack-netlib/TESTING/EIG/cdrgsx.f            |   5 +-
 lapack-netlib/TESTING/EIG/cdrvsg.f            |   5 +-
 lapack-netlib/TESTING/EIG/cdrvsg2stg.f        |   5 +-
 lapack-netlib/TESTING/EIG/cdrvst.f            |   7 +-
 lapack-netlib/TESTING/EIG/cdrvst2stg.f        |   5 +-
 lapack-netlib/TESTING/EIG/cerrst.f            |  74 ++---
 lapack-netlib/TESTING/EIG/dchkst.f            |  11 +-
 lapack-netlib/TESTING/EIG/dchkst2stg.f        |  11 +-
 lapack-netlib/TESTING/EIG/ddrgsx.f            |   5 +-
 lapack-netlib/TESTING/EIG/ddrvsg.f            |   5 +-
 lapack-netlib/TESTING/EIG/ddrvsg2stg.f        |   5 +-
 lapack-netlib/TESTING/EIG/ddrvst.f            |  13 +-
 lapack-netlib/TESTING/EIG/ddrvst2stg.f        |   5 +-
 lapack-netlib/TESTING/EIG/schkst.f            |  11 +-
 lapack-netlib/TESTING/EIG/schkst2stg.f        |  11 +-
 lapack-netlib/TESTING/EIG/sdrgsx.f            |   5 +-
 lapack-netlib/TESTING/EIG/sdrvsg.f            |   5 +-
 lapack-netlib/TESTING/EIG/sdrvsg2stg.f        |   5 +-
 lapack-netlib/TESTING/EIG/sdrvst.f            |  13 +-
 lapack-netlib/TESTING/EIG/sdrvst2stg.f        |   5 +-
 lapack-netlib/TESTING/EIG/zchkst.f            |  12 +-
 lapack-netlib/TESTING/EIG/zchkst2stg.f        |  13 +-
 lapack-netlib/TESTING/EIG/zdrgsx.f            |   5 +-
 lapack-netlib/TESTING/EIG/zdrvsg.f            |   5 +-
 lapack-netlib/TESTING/EIG/zdrvsg2stg.f        |   5 +-
 lapack-netlib/TESTING/EIG/zdrvst.f            |  11 +-
 lapack-netlib/TESTING/EIG/zdrvst2stg.f        |   5 +-
 lapack-netlib/TESTING/LIN/alahd.f             |   2 +-
 lapack-netlib/TESTING/LIN/cchktp.f            |   2 +-
 lapack-netlib/TESTING/LIN/cerrhe.f            |   4 +-
 lapack-netlib/TESTING/LIN/cerrhex.f           |   4 +-
 lapack-netlib/TESTING/LIN/cerrsy.f            |   4 +-
 lapack-netlib/TESTING/LIN/cerrsyx.f           |   4 +-
 lapack-netlib/TESTING/LIN/cgtt01.f            |   2 +-
 lapack-netlib/TESTING/LIN/cgtt02.f            |   4 +-
 lapack-netlib/TESTING/LIN/chet01_3.f          |   2 +-
 lapack-netlib/TESTING/LIN/clqt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/cptt01.f            |   2 +-
 lapack-netlib/TESTING/LIN/cptt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/cqlt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/cqrt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/crqt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/csyt01_3.f          |   2 +-
 lapack-netlib/TESTING/LIN/dchktp.f            |   2 +-
 lapack-netlib/TESTING/LIN/ddrvab.f            |   2 +-
 lapack-netlib/TESTING/LIN/ddrvac.f            |   2 +-
 lapack-netlib/TESTING/LIN/derrsy.f            |   4 +-
 lapack-netlib/TESTING/LIN/derrsyx.f           |   4 +-
 lapack-netlib/TESTING/LIN/dgtt01.f            |   2 +-
 lapack-netlib/TESTING/LIN/dgtt02.f            |   4 +-
 lapack-netlib/TESTING/LIN/dlqt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/dptt01.f            |   2 +-
 lapack-netlib/TESTING/LIN/dptt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/dqlt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/dqrt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/drqt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/dsyt01_3.f          |   2 +-
 lapack-netlib/TESTING/LIN/schktp.f            |   2 +-
 lapack-netlib/TESTING/LIN/serrsy.f            |   4 +-
 lapack-netlib/TESTING/LIN/serrsyx.f           |   4 +-
 lapack-netlib/TESTING/LIN/sgtt01.f            |   2 +-
 lapack-netlib/TESTING/LIN/sgtt02.f            |   4 +-
 lapack-netlib/TESTING/LIN/slqt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/sptt01.f            |   2 +-
 lapack-netlib/TESTING/LIN/sptt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/sqlt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/sqrt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/srqt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/ssyt01_3.f          |   2 +-
 lapack-netlib/TESTING/LIN/zchktp.f            |   2 +-
 lapack-netlib/TESTING/LIN/zdrvab.f            |   2 +-
 lapack-netlib/TESTING/LIN/zdrvac.f            |   2 +-
 lapack-netlib/TESTING/LIN/zerrhe.f            |   4 +-
 lapack-netlib/TESTING/LIN/zerrhex.f           |   4 +-
 lapack-netlib/TESTING/LIN/zerrsy.f            |   4 +-
 lapack-netlib/TESTING/LIN/zerrsyx.f           |   4 +-
 lapack-netlib/TESTING/LIN/zgtt01.f            |   2 +-
 lapack-netlib/TESTING/LIN/zgtt02.f            |   4 +-
 lapack-netlib/TESTING/LIN/zhet01_3.f          |   2 +-
 lapack-netlib/TESTING/LIN/zlqt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/zptt01.f            |   2 +-
 lapack-netlib/TESTING/LIN/zptt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/zqlt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/zqrt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/zrqt02.f            |   2 +-
 lapack-netlib/TESTING/LIN/zsyt01_3.f          |   2 +-
 198 files changed, 636 insertions(+), 414 deletions(-)

diff --git a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake
index 585ca26e7..17c0df80e 100644
--- a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake
+++ b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake
@@ -1,4 +1,4 @@
-# This module perdorms several try-compiles to determine the default integer
+# This module performs several try-compiles to determine the default integer
 # size being used by the fortran compiler
 #
 # After execution, the following variables are set.  If they are un set then
diff --git a/lapack-netlib/CMAKE/Findcodecov.cmake b/lapack-netlib/CMAKE/Findcodecov.cmake
index 384064007..93db45130 100644
--- a/lapack-netlib/CMAKE/Findcodecov.cmake
+++ b/lapack-netlib/CMAKE/Findcodecov.cmake
@@ -36,7 +36,7 @@ function(add_coverage TNAME)
 endfunction()
 
 
-# Find the reuired flags foreach language.
+# Find the required flags foreach language.
 set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET})
 set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY})
 
@@ -118,7 +118,7 @@ function (codecov_path_of_source FILE RETURN_VAR)
 
   # If expression was found, SOURCEFILE is a generator-expression for an
   # object library. Currently we found no way to call this function automatic
-  # for the referenced target, so it must be called in the directoryso of the
+  # for the referenced target, so it must be called in the directory of the
   # object library definition.
   if(NOT "${_source}" STREQUAL "")
     set(${RETURN_VAR} "" PARENT_SCOPE)
diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt
index b704e72c5..fefaa8b89 100644
--- a/lapack-netlib/CMakeLists.txt
+++ b/lapack-netlib/CMakeLists.txt
@@ -1,16 +1,20 @@
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.2)
 
 project(LAPACK Fortran C)
 
 set(LAPACK_MAJOR_VERSION 3)
-set(LAPACK_MINOR_VERSION 9)
+set(LAPACK_MINOR_VERSION 11)
 set(LAPACK_PATCH_VERSION 0)
 set(
   LAPACK_VERSION
   ${LAPACK_MAJOR_VERSION}.${LAPACK_MINOR_VERSION}.${LAPACK_PATCH_VERSION}
   )
 
-# Add the CMake directory for custon CMake modules
+# Allow setting a prefix for the library names
+set(CMAKE_STATIC_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}")
+set(CMAKE_SHARED_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}")
+
+# Add the CMake directory for custom CMake modules
 set(CMAKE_MODULE_PATH "${LAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH})
 
 # Export all symbols on Windows when building shared libraries
@@ -41,6 +45,40 @@ if(_is_coverage_build)
   find_package(codecov)
 endif()
 
+# By default test Fortran compiler complex abs and complex division
+option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF)
+if( TEST_FORTRAN_COMPILER )
+
+  add_executable( test_zcomplexabs ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f )
+  add_custom_target( run_test_zcomplexabs
+    COMMAND test_zcomplexabs 2> test_zcomplexabs.err
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
+    COMMENT "Running test_zcomplexabs in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexabs.err"
+    SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f )
+
+  add_executable( test_zcomplexdiv ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f )
+  add_custom_target( run_test_zcomplexdiv
+    COMMAND test_zcomplexdiv 2> test_zcomplexdiv.err
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
+    COMMENT "Running test_zcomplexdiv in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexdiv.err"
+    SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f )
+
+  add_executable( test_zcomplexmult ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f )
+  add_custom_target( run_test_zcomplexmult
+    COMMAND test_zcomplexmult 2> test_zcomplexmult.err
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
+    COMMENT "Running test_zcomplexmult in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexmult.err"
+    SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f )
+
+  add_executable( test_zminMax ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f )
+  add_custom_target( run_test_zminMax
+    COMMAND test_zminMax 2> test_zminMax.err
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
+    COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err"
+    SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f )
+    
+endif()
+
 # By default static library
 option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
 
@@ -89,12 +127,57 @@ configure_file(
 include(PreventInSourceBuilds)
 include(PreventInBuildInstalls)
 
+# Check if recursive flag exists
+include(CheckFortranCompilerFlag)
+if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang)
+  check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag)
+elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
+  check_fortran_compiler_flag("-frecursive" _frecursiveFlag)
+elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
+  check_fortran_compiler_flag("-recursive" _recursiveFlag)
+elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
+  check_fortran_compiler_flag("-qrecur" _qrecurFlag)
+elseif(CMAKE_Fortran_COMPILER_ID STREQUAL NAG)
+  check_fortran_compiler_flag("-recursive" _recursiveFlag)
+else()
+  message(WARNING "Fortran local arrays should be allocated on the stack."
+    " Please use a compiler which guarantees that feature."
+    " See https://github.com/Reference-LAPACK/lapack/pull/188 and references therein.")
+endif()
+
+# Add recursive flag
+if(_MrecursiveFlag)
+  string(REGEX MATCH "-Mrecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
+  if(NOT output_test)
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive"
+      CACHE STRING "Recursive flag must be set" FORCE)
+  endif()
+elseif(_frecursiveFlag)
+  string(REGEX MATCH "-frecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
+  if(NOT output_test)
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive"
+      CACHE STRING "Recursive flag must be set" FORCE)
+  endif()
+elseif(_recursiveFlag)
+  string(REGEX MATCH "-recursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
+  if(NOT output_test)
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive"
+      CACHE STRING "Recursive flag must be set" FORCE)
+  endif()
+elseif(_qrecurFlag)
+  string(REGEX MATCH "-qrecur" output_test <string> "${CMAKE_Fortran_FLAGS}")
+  if(NOT output_test)
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur"
+      CACHE STRING "Recursive flag must be set" FORCE)
+  endif()
+endif()
+
 if(UNIX)
   if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
     set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
   endif()
   if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none")
+    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict")
   endif()
 # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
 # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
@@ -128,6 +211,22 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq)
   endif()
 endif()
 
+# Add option to enable flat namespace for symbol resolution on macOS
+if(APPLE)
+  option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF)
+
+  if(USE_FLAT_NAMESPACE)
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-flat_namespace")
+    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-flat_namespace")
+  else()
+    if(BUILD_SHARED_LIBS AND BUILD_TESTING)
+      message(WARNING
+        "LAPACK test suite might fail with shared libraries and the default two-level namespace. "
+        "Disable shared libraries or enable flat namespace for symbol resolution via -DUSE_FLAT_NAMESPACE=ON.")
+    endif()
+  endif()
+endif()
 
 # --------------------------------------------------
 set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKLIB}-targets)
@@ -149,13 +248,14 @@ option(BUILD_TESTING "Build tests" ${_is_coverage_build})
 include(CTest)
 message(STATUS "Build tests: ${BUILD_TESTING}")
 
-# lapack_testing.py uses features from python 2.7 and greater
 if(BUILD_TESTING)
-  set(_msg "Looking for Python >= 2.7 needed for summary tests")
+  set(_msg "Looking for Python3 needed for summary tests")
   message(STATUS "${_msg}")
-  find_package(PythonInterp 2.7 QUIET)
-  if(PYTHONINTERP_FOUND)
-    message(STATUS "${_msg} - found (${PYTHON_VERSION_STRING})")
+  # find_package(PythonInterp 3) cannot be used because /usr/bin/python may be
+  # a Python2 interpreter.
+  find_program(PYTHON_EXECUTABLE python3)
+  if(PYTHON_EXECUTABLE)
+    message(STATUS "${_msg} - found")
   else()
     message(STATUS "${_msg} - not found (skipping summary tests)")
   endif()
@@ -177,7 +277,7 @@ CheckLAPACKCompilerFlags()
 # Check second function
 
 include(CheckTimeFunction)
-set(TIME_FUNC NONE ${TIME_FUNC})
+set(TIME_FUNC NONE)
 CHECK_TIME_FUNCTION(NONE TIME_FUNC)
 CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
 CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
@@ -210,6 +310,7 @@ if(NOT (BUILD_SINGLE OR BUILD_DOUBLE OR BUILD_COMPLEX OR BUILD_COMPLEX16))
   BUILD_SINGLE, BUILD_DOUBLE, BUILD_COMPLEX, BUILD_COMPLEX16.")
 endif()
 
+
 # --------------------------------------------------
 # Subdirectories that need to be processed
 option(USE_OPTIMIZED_BLAS "Whether or not to use an optimized BLAS library instead of included netlib BLAS" OFF)
@@ -325,35 +426,80 @@ option(LAPACKE_WITH_TMG "Build LAPACKE with tmglib routines" OFF)
 if(LAPACKE_WITH_TMG)
   set(LAPACKE ON)
 endif()
-if(BUILD_TESTING OR LAPACKE_WITH_TMG) #already included, avoid double inclusion
+
+# TMGLIB
+# Cache export target
+set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME})
+if(BUILD_TESTING OR LAPACKE_WITH_TMG)
+  if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG)
+    set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
+    # Check if dlatms (part of tmg) is found
+    CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND)
+    unset(CMAKE_REQUIRED_LIBRARIES)
+    if(NOT LAPACK_WITH_TMGLIB_FOUND)
+      # Build and install TMG as part of LAPACKE targets (as opposed to LAPACK
+      # targets)
+      set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKELIB}-targets)
+    endif()
+  endif()
   add_subdirectory(TESTING/MATGEN)
 endif()
+# Reset export target
+set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE})
+unset(LAPACK_INSTALL_EXPORT_NAME_CACHE)
 
 if(LAPACKE)
   add_subdirectory(LAPACKE)
 endif()
 
+
 #-------------------------------------
 # BLAS++ / LAPACK++
 option(BLAS++ "Build BLAS++" OFF)
 option(LAPACK++ "Build LAPACK++" OFF)
- 
- 
+
+
 function(_display_cpp_implementation_msg name)
   string(TOLOWER ${name} name_lc)
   message(STATUS "${name}++ enable")
   message(STATUS "----------------")
   message(STATUS "Thank you for your interest in ${name}++, a newly developed C++ API for ${name} library")
   message(STATUS "The objective of ${name}++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc.")
-  message(STATUS "We are still working on integrating ${name}++ in our library. For the moment, you can download directly ${name_lc}++ from https://bitbucket.org/icl/${name_lc}pp")
   message(STATUS "For support ${name}++ related question, please email: slate-user@icl.utk.edu")
   message(STATUS "----------------")
 endfunction()
-if(BLAS++)
+if (BLAS++)
   _display_cpp_implementation_msg("BLAS")
+  include(ExternalProject)
+  ExternalProject_Add(blaspp
+	  URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz
+	  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp
+	  BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
+	  INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+	  )
+  ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES})
 endif()
-if(LAPACK++)
+if (LAPACK++)
+	message (STATUS "linking lapack++ against ${LAPACK_LIBRARIES}")
   _display_cpp_implementation_msg("LAPACK")
+  include(ExternalProject)
+  if (BUILD_SHARED_LIBS)
+  ExternalProject_Add(lapackpp
+	  URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
+	  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}  ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
+	  BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
+	  INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+	  )
+  else ()
+# FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files
+  ExternalProject_Add(lapackpp
+	  URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
+	  CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran"  -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}  ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
+	  BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
+	  INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+  )
+  endif()
+  ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
 endif()
 
 # --------------------------------------------------
@@ -370,7 +516,7 @@ set(CPACK_MONOLITHIC_INSTALL ON)
 set(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK")
 if(WIN32 AND NOT UNIX)
   # There is a bug in NSI that does not handle full unix paths properly. Make
-  # sure there is at least one set of four (4) backlasshes.
+  # sure there is at least one set of four (4) backslashes.
   set(CPACK_NSIS_HELP_LINK "http:\\\\\\\\http://icl.cs.utk.edu/lapack-forum")
   set(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\www.netlib.org/lapack")
   set(CPACK_NSIS_CONTACT "lapack@eecs.utk.edu")
@@ -396,10 +542,6 @@ if(NOT LATESTLAPACK_FOUND)
   set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKLIB})
 endif()
 
-if(BUILD_TESTING OR LAPACKE_WITH_TMG)
-  set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB})
-endif()
-
 # Export lapack targets, not including lapacke, from the
 # install tree, if any.
 set(_lapack_config_install_guard_target "")
@@ -424,6 +566,10 @@ if(LAPACKE)
   set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKELIB})
 endif()
 
+if(NOT LAPACK_WITH_TMGLIB_FOUND AND LAPACKE_WITH_TMG)
+  set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB})
+endif()
+
 # Export lapack and lapacke targets from the build tree, if any.
 set(_lapack_config_build_guard_target "")
 if(ALL_TARGETS)
@@ -461,4 +607,114 @@ install(FILES
   DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION}
   COMPONENT Development
   )
-  
+if (LAPACK++)
+  install(
+  DIRECTORY "${LAPACK_BINARY_DIR}/lib/"
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  FILES_MATCHING REGEX "liblapackpp.(a|so)$"
+  )
+  install(
+    DIRECTORY "${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp/include/"
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    FILES_MATCHING REGEX "\\.(h|hh)$"
+  )
+  write_basic_package_version_file(
+    "lapackppConfigVersion.cmake"
+    VERSION 2020.10.02
+    COMPATIBILITY AnyNewerVersion
+  )
+  install(
+    FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfig.cmake"
+          "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfigVersion.cmake"
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/"
+  )
+
+endif()
+if (BLAS++)
+  write_basic_package_version_file(
+    "blasppConfigVersion.cmake"
+    VERSION 2020.10.02
+    COMPATIBILITY AnyNewerVersion
+  )
+  install(
+    FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfig.cmake"
+          "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfigVersion.cmake"
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/"
+  )
+  install(
+  DIRECTORY "${LAPACK_BINARY_DIR}/lib/"
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  FILES_MATCHING REGEX "libblaspp.(a|so)$"
+  )
+  install(
+    DIRECTORY "${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp/include/"
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    FILES_MATCHING REGEX "\\.(h|hh)$"
+  )
+endif()
+
+# --------------------------------------------------
+# Generate MAN and/or HTML Documentation
+option(BUILD_HTML_DOCUMENTATION "Create and install the HTML based API
+documentation (requires Doxygen) - command: make html" OFF)
+option(BUILD_MAN_DOCUMENTATION "Create and install the MAN based documentation (requires Doxygen) - command: make man" OFF)
+message(STATUS "Build html documentation: ${BUILD_HTML_DOCUMENTATION}")
+message(STATUS "Build man documentation: ${BUILD_MAN_DOCUMENTATION}")
+
+if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
+    find_package(Doxygen)
+    if(NOT DOXYGEN_FOUND)
+         message(WARNING "Doxygen is needed to build the documentation.")
+
+    else()
+
+    set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage")
+    set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION})
+    set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS)
+    set(PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
+    set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES)
+    set(DOXYGEN_SOURCE_BROWSER YES)
+    set(DISTRIBUTE_GROUP_DOC YES)
+    set(DOXYGEN_CREATE_SUBDIRS YES)
+    set(DOXYGEN_SEPARATE_MEMBER_PAGES YES)
+    set(DOXYGEN_EXTRACT_ALL YES)
+    set(DOXYGEN_FILE_PATTERNS "*.f;*.c;*.h")
+    set(DOXYGEN_RECURSIVE YES)
+    set(DOXYGEN_GENERATE_TREEVIEW YES)
+    set(DOXYGEN_INTERACTIVE_SVG YES)
+    set(DOXYGEN_QUIET YES)
+    set(DOXYGEN_WARNINGS NO)
+    set(DOXYGEN_GENERATE_HTML NO)
+    set(DOXYGEN_GENERATE_MAN NO)
+
+
+    if (BUILD_HTML_DOCUMENTATION)
+        set(DOXYGEN_GENERATE_HTML YES)
+        set(DOXYGEN_HTML_OUTPUT explore-html)
+        set(DOXYGEN_INLINE_SOURCES YES)
+        set(DOXYGEN_CALL_GRAPH YES)
+        set(DOXYGEN_CALLER_GRAPH YES)
+
+        doxygen_add_docs(
+            html
+            ${PROJECT_SOURCE_DIR}
+            COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)"
+        )
+    endif()
+    if (BUILD_MAN_DOCUMENTATION)
+        set(DOXYGEN_GENERATE_MAN YES)
+        set(DOXYGEN_EXCLUDE SRC/VARIANTS)
+        set(DOXYGEN_MAN_LINKS YES)
+        set(DOXYGEN_INLINE_SOURCES NO)
+        set(DOXYGEN_CALL_GRAPH NO)
+        set(DOXYGEN_CALLER_GRAPH NO)
+
+        doxygen_add_docs(
+            man
+            ${PROJECT_SOURCE_DIR}
+            COMMENT "Generating man LAPACK documentation"
+        )
+    endif()
+
+    endif()
+endif()
diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex
index 794c2a7aa..668ca8d68 100644
--- a/lapack-netlib/DOCS/lawn81.tex
+++ b/lapack-netlib/DOCS/lawn81.tex
@@ -575,7 +575,7 @@ There are six machine-dependent functions in the test and timing
 package, at least three of which must be installed.  They are
 
 \begin{tabbing}
-MONOMO  \=  DOUBLE PRECYSION  \=  \kill
+MONOMO  \=  DOUBLE PRECISION  \=  \kill
 LSAME   \>  LOGICAL      \> Test if two characters are the same regardless of case \\
 SLAMCH  \>  REAL  \> Determine machine-dependent parameters \\
 DLAMCH  \>  DOUBLE PRECISION \> Determine machine-dependent parameters \\
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c
index 2adf71493..b32fc2f9e 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c
@@ -54,7 +54,7 @@ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n,
         }
     } else if ( matrix_layout == LAPACK_ROW_MAJOR ) {
         /* TODO: interchange loops for performance.
-         * This is just reference impemeltation.
+         * This is just reference implementation.
          */
         for( j = 0; j < MIN( n, ldin ); j++ ) {
             for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 );
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c
index 6624936a6..89e421eae 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c
@@ -54,7 +54,7 @@ void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n,
         }
     } else if ( matrix_layout == LAPACK_ROW_MAJOR ) {
         /* TODO: interchange loops for performance.
-         * This is just reference impemeltation.
+         * This is just reference implementation.
          */
         for( j = 0; j < MIN( n, ldin ); j++ ) {
             for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 );
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c
index 046fdb9ca..a90c9617a 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c
@@ -54,7 +54,7 @@ void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n,
         }
     } else if ( matrix_layout == LAPACK_ROW_MAJOR ) {
         /* TODO: interchange loops for performance.
-         * This is just reference impemeltation.
+         * This is just reference implementation.
          */
         for( j = 0; j < MIN( n, ldin ); j++ ) {
             for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 );
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c
index 32a9de379..63323b1c3 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c
@@ -54,7 +54,7 @@ void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n,
         }
     } else if ( matrix_layout == LAPACK_ROW_MAJOR ) {
         /* TODO: interchange loops for performance.
-         * This is just reference impemeltation
+         * This is just reference implementation
          */
         for( j = 0; j < MIN( n, ldin ); j++ ) {
             for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 );
diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvd.f b/lapack-netlib/SRC/DEPRECATED/cggsvd.f
index e6fdb47e5..515ac8d49 100644
--- a/lapack-netlib/SRC/DEPRECATED/cggsvd.f
+++ b/lapack-netlib/SRC/DEPRECATED/cggsvd.f
@@ -107,7 +107,7 @@
 *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
 *> A and B implicitly gives the SVD of A*inv(B):
 *>                      A*inv(B) = U*(D1*inv(D2))*V**H.
-*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also
+*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also
 *> equal to the CS decomposition of A and B. Furthermore, the GSVD can
 *> be used to derive the solution of the eigenvalue problem:
 *>                      A**H*A x = lambda* B**H*B x.
diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvd.f b/lapack-netlib/SRC/DEPRECATED/zggsvd.f
index c0b9247a6..8a41e36c6 100644
--- a/lapack-netlib/SRC/DEPRECATED/zggsvd.f
+++ b/lapack-netlib/SRC/DEPRECATED/zggsvd.f
@@ -106,7 +106,7 @@
 *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
 *> A and B implicitly gives the SVD of A*inv(B):
 *>                      A*inv(B) = U*(D1*inv(D2))*V**H.
-*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also
+*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also
 *> equal to the CS decomposition of A and B. Furthermore, the GSVD can
 *> be used to derive the solution of the eigenvalue problem:
 *>                      A**H*A x = lambda* B**H*B x.
diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f
index 2b7b2dc70..1fc75613e 100644
--- a/lapack-netlib/SRC/cgejsv.f
+++ b/lapack-netlib/SRC/cgejsv.f
@@ -252,7 +252,7 @@
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
 *>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
-*>                         then V is used as workspace if the pprocedure
+*>                         then V is used as workspace if the procedure
 *>                         replaces A with A^*. In that case, [U] is computed
 *>                         in V as right singular vectors of A^* and then
 *>                         copied back to the U array. This 'W' option is just
diff --git a/lapack-netlib/SRC/cgesvdq.f b/lapack-netlib/SRC/cgesvdq.f
index 1a587eca3..95091c2c0 100644
--- a/lapack-netlib/SRC/cgesvdq.f
+++ b/lapack-netlib/SRC/cgesvdq.f
@@ -363,7 +363,7 @@
 *>   an optimal implementation would do all necessary scaling before calling
 *>   CGESVD and the scaling in CGESVD can be switched off.
 *>   3. Other comments related to code optimization are given in comments in the
-*>   code, enlosed in [[double brackets]].
+*>   code, enclosed in [[double brackets]].
 *> \endverbatim
 *
 *> \par Bugs, examples and comments
diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f
index 9e6053013..7426ecdf1 100644
--- a/lapack-netlib/SRC/cgsvj0.f
+++ b/lapack-netlib/SRC/cgsvj0.f
@@ -52,10 +52,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f
index 6d68e920d..18dd690cc 100644
--- a/lapack-netlib/SRC/cgsvj1.f
+++ b/lapack-netlib/SRC/cgsvj1.f
@@ -75,10 +75,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f
index e474c6fad..36970a329 100644
--- a/lapack-netlib/SRC/chesv_aa_2stage.f
+++ b/lapack-netlib/SRC/chesv_aa_2stage.f
@@ -87,7 +87,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/chetf2_rk.f b/lapack-netlib/SRC/chetf2_rk.f
index 4bb032382..e687ec64a 100644
--- a/lapack-netlib/SRC/chetf2_rk.f
+++ b/lapack-netlib/SRC/chetf2_rk.f
@@ -480,7 +480,7 @@
                   A( J, K ) = CONJG( A( P, J ) )
                   A( P, J ) = T
    14          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = CONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( K, K ) )
@@ -508,7 +508,7 @@
                   A( J, KK ) = CONJG( A( KP, J ) )
                   A( KP, J ) = T
    15          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = CONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( KK, KK ) )
@@ -834,7 +834,7 @@
                   A( J, K ) = CONJG( A( P, J ) )
                   A( P, J ) = T
    44          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = CONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( K, K ) )
@@ -862,7 +862,7 @@
                   A( J, KK ) = CONJG( A( KP, J ) )
                   A( KP, J ) = T
    45          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = CONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( KK, KK ) )
diff --git a/lapack-netlib/SRC/chetf2_rook.f b/lapack-netlib/SRC/chetf2_rook.f
index ee4eaf68f..49fba1bda 100644
--- a/lapack-netlib/SRC/chetf2_rook.f
+++ b/lapack-netlib/SRC/chetf2_rook.f
@@ -420,7 +420,7 @@
                   A( J, K ) = CONJG( A( P, J ) )
                   A( P, J ) = T
    14          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = CONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( K, K ) )
@@ -441,7 +441,7 @@
                   A( J, KK ) = CONJG( A( KP, J ) )
                   A( KP, J ) = T
    15          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = CONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( KK, KK ) )
@@ -733,7 +733,7 @@
                   A( J, K ) = CONJG( A( P, J ) )
                   A( P, J ) = T
    44          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = CONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( K, K ) )
@@ -754,7 +754,7 @@
                   A( J, KK ) = CONJG( A( KP, J ) )
                   A( KP, J ) = T
    45          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = CONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( KK, KK ) )
diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f
index 54567b8cd..d9e4fbd19 100644
--- a/lapack-netlib/SRC/chetrf_aa.f
+++ b/lapack-netlib/SRC/chetrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f
index 20a9859e8..400efdf26 100644
--- a/lapack-netlib/SRC/chetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/chetrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/cla_gbrfsx_extended.f b/lapack-netlib/SRC/cla_gbrfsx_extended.f
index 6d43c8325..e8b96fd6e 100644
--- a/lapack-netlib/SRC/cla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/cla_gbrfsx_extended.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU,
+*       SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU,
 *                                       NRHS, AB, LDAB, AFB, LDAFB, IPIV,
 *                                       COLEQU, C, B, LDB, Y, LDY,
 *                                       BERR_OUT, N_NORMS, ERR_BNDS_NORM,
@@ -400,7 +400,7 @@
 *> \ingroup complexGBcomputational
 *
 *  =====================================================================
-      SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU,
+      SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU,
      $                                NRHS, AB, LDAB, AFB, LDAFB, IPIV,
      $                                COLEQU, C, B, LDB, Y, LDY,
      $                                BERR_OUT, N_NORMS, ERR_BNDS_NORM,
@@ -651,7 +651,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/cla_gerfsx_extended.f b/lapack-netlib/SRC/cla_gerfsx_extended.f
index 4b1031101..e524db5ad 100644
--- a/lapack-netlib/SRC/cla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/cla_gerfsx_extended.f
@@ -637,7 +637,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/cla_herfsx_extended.f b/lapack-netlib/SRC/cla_herfsx_extended.f
index 6d007ef58..19f845692 100644
--- a/lapack-netlib/SRC/cla_herfsx_extended.f
+++ b/lapack-netlib/SRC/cla_herfsx_extended.f
@@ -654,7 +654,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/cla_porfsx_extended.f b/lapack-netlib/SRC/cla_porfsx_extended.f
index 9ced9b1b9..2dafecb35 100644
--- a/lapack-netlib/SRC/cla_porfsx_extended.f
+++ b/lapack-netlib/SRC/cla_porfsx_extended.f
@@ -625,7 +625,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/cla_syrfsx_extended.f b/lapack-netlib/SRC/cla_syrfsx_extended.f
index 4fe538a98..95f969731 100644
--- a/lapack-netlib/SRC/cla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/cla_syrfsx_extended.f
@@ -654,7 +654,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/claed7.f b/lapack-netlib/SRC/claed7.f
index 9d2f97141..72f1417d4 100644
--- a/lapack-netlib/SRC/claed7.f
+++ b/lapack-netlib/SRC/claed7.f
@@ -363,7 +363,7 @@
             RETURN
          END IF
 *
-*     Prepare the INDXQ sorting premutation.
+*     Prepare the INDXQ sorting permutation.
 *
          N1 = K
          N2 = N - K
diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f
index 6de40e06c..c6cc5847d 100644
--- a/lapack-netlib/SRC/claqz0.f
+++ b/lapack-netlib/SRC/claqz0.f
@@ -89,7 +89,7 @@
 *>      Anal., 29(2006), pp. 199--227.
 *>
 *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
-*>      multipole rational QZ method with agressive early deflation"
+*>      multipole rational QZ method with aggressive early deflation"
 *> \endverbatim
 *
 *  Arguments:
@@ -310,7 +310,7 @@
       CHARACTER :: JBCMPZ*3
 
 *     External Functions
-      EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD,
+      EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET,
      $            CLARTG, CROT
       REAL, EXTERNAL :: SLAMCH, CLANHS
       LOGICAL, EXTERNAL :: LSAME
@@ -462,7 +462,6 @@
 *     Get machine constants
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N )/ULP )
 
@@ -533,7 +532,7 @@
          DO WHILE ( K.GE.ISTART2 )
 
             IF( ABS( B( K, K ) ) .LT. BTOL ) THEN
-*              A diagonal element of B is negligable, move it
+*              A diagonal element of B is negligible, move it
 *              to the top and deflate it
                
                DO K2 = K, ISTART2+1, -1
diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f
index ee6959ed8..5f042e345 100644
--- a/lapack-netlib/SRC/clarfb_gett.f
+++ b/lapack-netlib/SRC/clarfb_gett.f
@@ -452,7 +452,7 @@
          IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
-*           V1 is not an identy matrix, but unit lower-triangular
+*           V1 is not an identity matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
diff --git a/lapack-netlib/SRC/clatdf.f b/lapack-netlib/SRC/clatdf.f
index 5445e387e..1e7d71669 100644
--- a/lapack-netlib/SRC/clatdf.f
+++ b/lapack-netlib/SRC/clatdf.f
@@ -227,7 +227,7 @@
             BM = RHS( J ) - CONE
             SPLUS = ONE
 *
-*           Lockahead for L- part RHS(1:N-1) = +-1
+*           Look-ahead for L- part RHS(1:N-1) = +-1
 *           SPLUS and SMIN computed more efficiently than in BSOLVE[1].
 *
             SPLUS = SPLUS + REAL( CDOTC( N-J, Z( J+1, J ), 1, Z( J+1,
diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f
index a902f1ed0..0502f6898 100644
--- a/lapack-netlib/SRC/clatrs3.f
+++ b/lapack-netlib/SRC/clatrs3.f
@@ -577,7 +577,7 @@
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
-*              rescale the column segments, if necesssary. Then
+*              rescale the column segments, if necessary. Then
 *              the linear update is safely executed.
 *
                DO KK = 1, K2-K1
diff --git a/lapack-netlib/SRC/csyconvf.f b/lapack-netlib/SRC/csyconvf.f
index f3b46f0b4..febbc411c 100644
--- a/lapack-netlib/SRC/csyconvf.f
+++ b/lapack-netlib/SRC/csyconvf.f
@@ -39,7 +39,7 @@
 *> CSYTRF provided on entry in parameter A into the factorization
 *> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored
 *> on exit in parameters A and E. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in CSYTRF into
+*> the interchanges stored in IPIV from the format used in CSYTRF into
 *> the format used in CSYTRF_RK (or CSYTRF_BK).
 *>
 *> If parameter WAY = 'R':
@@ -48,7 +48,7 @@
 *> (or CSYTRF_BK) provided on entry in parameters A and E into
 *> the factorization output format used in CSYTRF that is stored
 *> on exit in parameter A. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in CSYTRF_RK
+*> the interchanges stored in IPIV from the format used in CSYTRF_RK
 *> (or CSYTRF_BK) into the format used in CSYTRF.
 *>
 *> CSYCONVF can also convert in Hermitian matrix case, i.e. between
@@ -325,7 +325,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -469,7 +469,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -535,7 +535,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/csyconvf_rook.f b/lapack-netlib/SRC/csyconvf_rook.f
index a5b9d82da..0da34e0fa 100644
--- a/lapack-netlib/SRC/csyconvf_rook.f
+++ b/lapack-netlib/SRC/csyconvf_rook.f
@@ -520,7 +520,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f
index d8881a748..22227505c 100644
--- a/lapack-netlib/SRC/csysv_aa_2stage.f
+++ b/lapack-netlib/SRC/csysv_aa_2stage.f
@@ -87,7 +87,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f
index b1165a425..c5467bf01 100644
--- a/lapack-netlib/SRC/csytrf_aa.f
+++ b/lapack-netlib/SRC/csytrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f
index 8a2cfd7bc..b21df8cd3 100644
--- a/lapack-netlib/SRC/csytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/csytrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/ctrexc.f b/lapack-netlib/SRC/ctrexc.f
index ea64ddf9b..2bc0348fb 100644
--- a/lapack-netlib/SRC/ctrexc.f
+++ b/lapack-netlib/SRC/ctrexc.f
@@ -40,7 +40,7 @@
 *>
 *> The Schur form T is reordered by a unitary similarity transformation
 *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
-*> postmultplying it with Z.
+*> postmultiplying it with Z.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/cunbdb1.f b/lapack-netlib/SRC/cunbdb1.f
index 740e38a85..80faa8808 100644
--- a/lapack-netlib/SRC/cunbdb1.f
+++ b/lapack-netlib/SRC/cunbdb1.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> CUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f
index b45db6100..94b9fdbf9 100644
--- a/lapack-netlib/SRC/cunbdb2.f
+++ b/lapack-netlib/SRC/cunbdb2.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> CUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/cunbdb3.f b/lapack-netlib/SRC/cunbdb3.f
index b532bfbc8..f942bc698 100644
--- a/lapack-netlib/SRC/cunbdb3.f
+++ b/lapack-netlib/SRC/cunbdb3.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> CUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f
index 117f23d08..a551c184e 100644
--- a/lapack-netlib/SRC/cunbdb4.f
+++ b/lapack-netlib/SRC/cunbdb4.f
@@ -38,7 +38,7 @@
 *>\verbatim
 *>
 *> CUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dbdsvdx.f b/lapack-netlib/SRC/dbdsvdx.f
index 4ccd4edad..4668a88f2 100644
--- a/lapack-netlib/SRC/dbdsvdx.f
+++ b/lapack-netlib/SRC/dbdsvdx.f
@@ -45,7 +45,7 @@
 *>
 *>  Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ]
 *>  and superdiagonal E = [ e_1 e_2 ... e_N-1 ], DBDSVDX computes the
-*>  singular value decompositon of B through the eigenvalues and
+*>  singular value decomposition of B through the eigenvalues and
 *>  eigenvectors of the N*2-by-N*2 tridiagonal matrix
 *>
 *>        |  0  d_1                |
diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f
index 798e9154d..1db85e9c2 100644
--- a/lapack-netlib/SRC/dgejsv.f
+++ b/lapack-netlib/SRC/dgejsv.f
@@ -253,7 +253,7 @@
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
 *>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
-*>                         then V is used as workspace if the pprocedure
+*>                         then V is used as workspace if the procedure
 *>                         replaces A with A^t. In that case, [U] is computed
 *>                         in V as right singular vectors of A^t and then
 *>                         copied back to the U array. This 'W' option is just
diff --git a/lapack-netlib/SRC/dgesvdq.f b/lapack-netlib/SRC/dgesvdq.f
index 6f9ac703e..a514def8b 100644
--- a/lapack-netlib/SRC/dgesvdq.f
+++ b/lapack-netlib/SRC/dgesvdq.f
@@ -365,7 +365,7 @@
 *>   an optimal implementation would do all necessary scaling before calling
 *>   CGESVD and the scaling in CGESVD can be switched off.
 *>   3. Other comments related to code optimization are given in comments in the
-*>   code, enlosed in [[double brackets]].
+*>   code, enclosed in [[double brackets]].
 *> \endverbatim
 *
 *> \par Bugs, examples and comments
diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f
index 586723e64..1008aa8c6 100644
--- a/lapack-netlib/SRC/dgsvj0.f
+++ b/lapack-netlib/SRC/dgsvj0.f
@@ -52,10 +52,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f
index d757bb927..f1a099b2a 100644
--- a/lapack-netlib/SRC/dgsvj1.f
+++ b/lapack-netlib/SRC/dgsvj1.f
@@ -75,10 +75,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dla_gbrfsx_extended.f b/lapack-netlib/SRC/dla_gbrfsx_extended.f
index 5454b1bfc..b331178a9 100644
--- a/lapack-netlib/SRC/dla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/dla_gbrfsx_extended.f
@@ -645,7 +645,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/dla_gerfsx_extended.f b/lapack-netlib/SRC/dla_gerfsx_extended.f
index 92b0d76d4..bf8260d04 100644
--- a/lapack-netlib/SRC/dla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/dla_gerfsx_extended.f
@@ -625,7 +625,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL DAXPY( N, 1.0D+0, DY, 1, Y( 1, J ), 1 )
diff --git a/lapack-netlib/SRC/dla_porfsx_extended.f b/lapack-netlib/SRC/dla_porfsx_extended.f
index 5c8850fef..94d1087cd 100644
--- a/lapack-netlib/SRC/dla_porfsx_extended.f
+++ b/lapack-netlib/SRC/dla_porfsx_extended.f
@@ -617,7 +617,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/dla_syrfsx_extended.f b/lapack-netlib/SRC/dla_syrfsx_extended.f
index e1cde6fc2..2e06a622b 100644
--- a/lapack-netlib/SRC/dla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/dla_syrfsx_extended.f
@@ -647,7 +647,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f
index c4cb95fd3..84cb96bcb 100644
--- a/lapack-netlib/SRC/dlaqz0.f
+++ b/lapack-netlib/SRC/dlaqz0.f
@@ -102,7 +102,7 @@
 *>      Anal., 29(2006), pp. 199--227.
 *>
 *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
-*>      multipole rational QZ method with agressive early deflation"
+*>      multipole rational QZ method with aggressive early deflation"
 *> \endverbatim
 *
 *  Arguments:
@@ -332,7 +332,7 @@
       CHARACTER :: JBCMPZ*3
 
 *     External Functions
-      EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD,
+      EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4,
      $            DLARTG, DROT
       DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS
       LOGICAL, EXTERNAL :: LSAME
@@ -482,7 +482,6 @@
 *     Get machine constants
       SAFMIN = DLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL DLABAD( SAFMIN, SAFMAX )
       ULP = DLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( DBLE( N )/ULP )
 
@@ -567,7 +566,7 @@
          DO WHILE ( K.GE.ISTART2 )
 
             IF( ABS( B( K, K ) ) .LT. BTOL ) THEN
-*              A diagonal element of B is negligable, move it
+*              A diagonal element of B is negligible, move it
 *              to the top and deflate it
                
                DO K2 = K, ISTART2+1, -1
diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f
index 10ab6461e..2c7ea59b0 100644
--- a/lapack-netlib/SRC/dlarfb_gett.f
+++ b/lapack-netlib/SRC/dlarfb_gett.f
@@ -451,7 +451,7 @@
          IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
-*           V1 is not an identy matrix, but unit lower-triangular
+*           V1 is not an identity matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f
index b4a98bc78..e6d78b672 100644
--- a/lapack-netlib/SRC/dlatrs3.f
+++ b/lapack-netlib/SRC/dlatrs3.f
@@ -574,7 +574,7 @@
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
-*              rescale the column segments, if necesssary. Then
+*              rescale the column segments, if necessary. Then
 *              the linear update is safely executed.
 *
                DO KK = 1, K2-K1
diff --git a/lapack-netlib/SRC/dorbdb1.f b/lapack-netlib/SRC/dorbdb1.f
index dae18f5df..b5b2d1362 100644
--- a/lapack-netlib/SRC/dorbdb1.f
+++ b/lapack-netlib/SRC/dorbdb1.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> DORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f
index a0dacbb16..0b4ad732c 100644
--- a/lapack-netlib/SRC/dorbdb2.f
+++ b/lapack-netlib/SRC/dorbdb2.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> DORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dorbdb3.f b/lapack-netlib/SRC/dorbdb3.f
index 7b3727956..79b10a5d3 100644
--- a/lapack-netlib/SRC/dorbdb3.f
+++ b/lapack-netlib/SRC/dorbdb3.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> DORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f
index 08604be45..985be3277 100644
--- a/lapack-netlib/SRC/dorbdb4.f
+++ b/lapack-netlib/SRC/dorbdb4.f
@@ -38,7 +38,7 @@
 *>\verbatim
 *>
 *> DORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dsyconvf.f b/lapack-netlib/SRC/dsyconvf.f
index 9e7a5af0e..005c98a03 100644
--- a/lapack-netlib/SRC/dsyconvf.f
+++ b/lapack-netlib/SRC/dsyconvf.f
@@ -39,7 +39,7 @@
 *> DSYTRF provided on entry in parameter A into the factorization
 *> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored
 *> on exit in parameters A and E. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in DSYTRF into
+*> the interchanges stored in IPIV from the format used in DSYTRF into
 *> the format used in DSYTRF_RK (or DSYTRF_BK).
 *>
 *> If parameter WAY = 'R':
@@ -48,7 +48,7 @@
 *> (or DSYTRF_BK) provided on entry in parameters A and E into
 *> the factorization output format used in DSYTRF that is stored
 *> on exit in parameter A. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in DSYTRF_RK
+*> the interchanges stored in IPIV from the format used in DSYTRF_RK
 *> (or DSYTRF_BK) into the format used in DSYTRF.
 *> \endverbatim
 *
@@ -322,7 +322,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -466,7 +466,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -532,7 +532,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/dsyconvf_rook.f b/lapack-netlib/SRC/dsyconvf_rook.f
index d7f529358..c3f2083bd 100644
--- a/lapack-netlib/SRC/dsyconvf_rook.f
+++ b/lapack-netlib/SRC/dsyconvf_rook.f
@@ -517,7 +517,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f
index 1100702ba..72fbe1e9a 100644
--- a/lapack-netlib/SRC/dsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/dsysv_aa_2stage.f
@@ -89,7 +89,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f
index 346737953..9a0b26ce5 100644
--- a/lapack-netlib/SRC/dsytrf_aa.f
+++ b/lapack-netlib/SRC/dsytrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f
index b7f5f07c2..c65bd86e6 100644
--- a/lapack-netlib/SRC/dsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/sbdsvdx.f b/lapack-netlib/SRC/sbdsvdx.f
index dcd554f1d..d7cb2dc83 100644
--- a/lapack-netlib/SRC/sbdsvdx.f
+++ b/lapack-netlib/SRC/sbdsvdx.f
@@ -45,7 +45,7 @@
 *>
 *>  Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ]
 *>  and superdiagonal E = [ e_1 e_2 ... e_N-1 ], SBDSVDX computes the
-*>  singular value decompositon of B through the eigenvalues and
+*>  singular value decomposition of B through the eigenvalues and
 *>  eigenvectors of the N*2-by-N*2 tridiagonal matrix
 *>
 *>        |  0  d_1                |
diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f
index 1333e54fb..42d2b8313 100644
--- a/lapack-netlib/SRC/sgejsv.f
+++ b/lapack-netlib/SRC/sgejsv.f
@@ -253,7 +253,7 @@
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
 *>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
-*>                         then V is used as workspace if the pprocedure
+*>                         then V is used as workspace if the procedure
 *>                         replaces A with A^t. In that case, [U] is computed
 *>                         in V as right singular vectors of A^t and then
 *>                         copied back to the U array. This 'W' option is just
diff --git a/lapack-netlib/SRC/sgesvdq.f b/lapack-netlib/SRC/sgesvdq.f
index ebbfc70b1..6fb328b8c 100644
--- a/lapack-netlib/SRC/sgesvdq.f
+++ b/lapack-netlib/SRC/sgesvdq.f
@@ -365,7 +365,7 @@
 *>   an optimal implementation would do all necessary scaling before calling
 *>   CGESVD and the scaling in CGESVD can be switched off.
 *>   3. Other comments related to code optimization are given in comments in the
-*>   code, enlosed in [[double brackets]].
+*>   code, enclosed in [[double brackets]].
 *> \endverbatim
 *
 *> \par Bugs, examples and comments
diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f
index 9249f4219..7e5fddeb0 100644
--- a/lapack-netlib/SRC/sgsvj0.f
+++ b/lapack-netlib/SRC/sgsvj0.f
@@ -52,10 +52,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f
index 79fd4d2d3..f1d1f7239 100644
--- a/lapack-netlib/SRC/sgsvj1.f
+++ b/lapack-netlib/SRC/sgsvj1.f
@@ -75,10 +75,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sla_gbrfsx_extended.f b/lapack-netlib/SRC/sla_gbrfsx_extended.f
index 499d6bf90..c79330cb5 100644
--- a/lapack-netlib/SRC/sla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/sla_gbrfsx_extended.f
@@ -644,7 +644,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/sla_gerfsx_extended.f b/lapack-netlib/SRC/sla_gerfsx_extended.f
index de05d8eb4..1a19a5071 100644
--- a/lapack-netlib/SRC/sla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/sla_gerfsx_extended.f
@@ -628,7 +628,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL SAXPY( N, 1.0, DY, 1, Y( 1, J ), 1 )
diff --git a/lapack-netlib/SRC/sla_porfsx_extended.f b/lapack-netlib/SRC/sla_porfsx_extended.f
index ada4cad21..9f33b14dd 100644
--- a/lapack-netlib/SRC/sla_porfsx_extended.f
+++ b/lapack-netlib/SRC/sla_porfsx_extended.f
@@ -617,7 +617,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/sla_syrfsx_extended.f b/lapack-netlib/SRC/sla_syrfsx_extended.f
index d5096be02..2fa3b1c50 100644
--- a/lapack-netlib/SRC/sla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/sla_syrfsx_extended.f
@@ -647,7 +647,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f
index 2e06f9d42..8b2d3286e 100644
--- a/lapack-netlib/SRC/slaqz0.f
+++ b/lapack-netlib/SRC/slaqz0.f
@@ -100,7 +100,7 @@
 *>      Anal., 29(2006), pp. 199--227.
 *>
 *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
-*>      multipole rational QZ method with agressive early deflation"
+*>      multipole rational QZ method with aggressive early deflation"
 *> \endverbatim
 *
 *  Arguments:
@@ -329,7 +329,7 @@
       CHARACTER :: JBCMPZ*3
 
 *     External Functions
-      EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD,
+      EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET,
      $            SLARTG, SROT
       REAL, EXTERNAL :: SLAMCH, SLANHS
       LOGICAL, EXTERNAL :: LSAME
@@ -479,7 +479,6 @@
 *     Get machine constants
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N )/ULP )
 
@@ -564,7 +563,7 @@
          DO WHILE ( K.GE.ISTART2 )
 
             IF( ABS( B( K, K ) ) .LT. BTOL ) THEN
-*              A diagonal element of B is negligable, move it
+*              A diagonal element of B is negligible, move it
 *              to the top and deflate it
                
                DO K2 = K, ISTART2+1, -1
diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f
index 7719f2965..f1fdef790 100644
--- a/lapack-netlib/SRC/slarfb_gett.f
+++ b/lapack-netlib/SRC/slarfb_gett.f
@@ -451,7 +451,7 @@
          IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
-*           V1 is not an identy matrix, but unit lower-triangular
+*           V1 is not an identity matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f
index c3a08e524..8f0c4bf16 100644
--- a/lapack-netlib/SRC/slatrs3.f
+++ b/lapack-netlib/SRC/slatrs3.f
@@ -574,7 +574,7 @@
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
-*              rescale the column segments, if necesssary. Then
+*              rescale the column segments, if necessary. Then
 *              the linear update is safely executed.
 *
                DO KK = 1, K2-K1
diff --git a/lapack-netlib/SRC/sorbdb1.f b/lapack-netlib/SRC/sorbdb1.f
index c860f4366..191e5742a 100644
--- a/lapack-netlib/SRC/sorbdb1.f
+++ b/lapack-netlib/SRC/sorbdb1.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> SORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f
index 484d352f8..b2ff34bb1 100644
--- a/lapack-netlib/SRC/sorbdb2.f
+++ b/lapack-netlib/SRC/sorbdb2.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> SORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/sorbdb3.f b/lapack-netlib/SRC/sorbdb3.f
index 6209b24ee..99478c5d0 100644
--- a/lapack-netlib/SRC/sorbdb3.f
+++ b/lapack-netlib/SRC/sorbdb3.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> SORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f
index bf60fb7bb..0fef5b759 100644
--- a/lapack-netlib/SRC/sorbdb4.f
+++ b/lapack-netlib/SRC/sorbdb4.f
@@ -38,7 +38,7 @@
 *>\verbatim
 *>
 *> SORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/ssyconvf.f b/lapack-netlib/SRC/ssyconvf.f
index 6defc1f0e..af55da51a 100644
--- a/lapack-netlib/SRC/ssyconvf.f
+++ b/lapack-netlib/SRC/ssyconvf.f
@@ -39,7 +39,7 @@
 *> SSYTRF provided on entry in parameter A into the factorization
 *> output format used in SSYTRF_RK (or SSYTRF_BK) that is stored
 *> on exit in parameters A and E. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in SSYTRF into
+*> the interchanges stored in IPIV from the format used in SSYTRF into
 *> the format used in SSYTRF_RK (or SSYTRF_BK).
 *>
 *> If parameter WAY = 'R':
@@ -48,7 +48,7 @@
 *> (or SSYTRF_BK) provided on entry in parameters A and E into
 *> the factorization output format used in SSYTRF that is stored
 *> on exit in parameter A. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in SSYTRF_RK
+*> the interchanges stored in IPIV from the format used in SSYTRF_RK
 *> (or SSYTRF_BK) into the format used in SSYTRF.
 *> \endverbatim
 *
@@ -322,7 +322,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -466,7 +466,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -532,7 +532,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/ssyconvf_rook.f b/lapack-netlib/SRC/ssyconvf_rook.f
index c59f257bb..efd7c57fe 100644
--- a/lapack-netlib/SRC/ssyconvf_rook.f
+++ b/lapack-netlib/SRC/ssyconvf_rook.f
@@ -517,7 +517,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f
index b9eac8342..aa862f14b 100644
--- a/lapack-netlib/SRC/ssysv_aa_2stage.f
+++ b/lapack-netlib/SRC/ssysv_aa_2stage.f
@@ -88,7 +88,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f
index 13e849cdc..4ba026fc8 100644
--- a/lapack-netlib/SRC/ssytrf_aa.f
+++ b/lapack-netlib/SRC/ssytrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f
index 94b3aa4b5..07357f2ab 100644
--- a/lapack-netlib/SRC/ssytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f
index a36a9188a..b4bc531ab 100644
--- a/lapack-netlib/SRC/zgejsv.f
+++ b/lapack-netlib/SRC/zgejsv.f
@@ -252,7 +252,7 @@
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
 *>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
-*>                         then V is used as workspace if the pprocedure
+*>                         then V is used as workspace if the procedure
 *>                         replaces A with A^*. In that case, [U] is computed
 *>                         in V as right singular vectors of A^* and then
 *>                         copied back to the U array. This 'W' option is just
diff --git a/lapack-netlib/SRC/zgesvdq.f b/lapack-netlib/SRC/zgesvdq.f
index 05eb722f8..b990f7389 100644
--- a/lapack-netlib/SRC/zgesvdq.f
+++ b/lapack-netlib/SRC/zgesvdq.f
@@ -363,7 +363,7 @@
 *>   an optimal implementation would do all necessary scaling before calling
 *>   CGESVD and the scaling in CGESVD can be switched off.
 *>   3. Other comments related to code optimization are given in comments in the
-*>   code, enlosed in [[double brackets]].
+*>   code, enclosed in [[double brackets]].
 *> \endverbatim
 *
 *> \par Bugs, examples and comments
diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f
index 11bfbd8dc..075c243c2 100644
--- a/lapack-netlib/SRC/zgsvj0.f
+++ b/lapack-netlib/SRC/zgsvj0.f
@@ -52,10 +52,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f
index efe0384ef..ba71e155f 100644
--- a/lapack-netlib/SRC/zgsvj1.f
+++ b/lapack-netlib/SRC/zgsvj1.f
@@ -75,10 +75,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f
index ee1596855..79c01c546 100644
--- a/lapack-netlib/SRC/zhesv_aa_2stage.f
+++ b/lapack-netlib/SRC/zhesv_aa_2stage.f
@@ -88,7 +88,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zhetf2_rk.f b/lapack-netlib/SRC/zhetf2_rk.f
index 050c7993d..87df901aa 100644
--- a/lapack-netlib/SRC/zhetf2_rk.f
+++ b/lapack-netlib/SRC/zhetf2_rk.f
@@ -480,7 +480,7 @@
                   A( J, K ) = DCONJG( A( P, J ) )
                   A( P, J ) = T
    14          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = DCONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( K, K ) )
@@ -508,7 +508,7 @@
                   A( J, KK ) = DCONJG( A( KP, J ) )
                   A( KP, J ) = T
    15          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = DCONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( KK, KK ) )
@@ -834,7 +834,7 @@
                   A( J, K ) = DCONJG( A( P, J ) )
                   A( P, J ) = T
    44          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = DCONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( K, K ) )
@@ -862,7 +862,7 @@
                   A( J, KK ) = DCONJG( A( KP, J ) )
                   A( KP, J ) = T
    45          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = DCONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( KK, KK ) )
diff --git a/lapack-netlib/SRC/zhetf2_rook.f b/lapack-netlib/SRC/zhetf2_rook.f
index 94bb29736..91172f601 100644
--- a/lapack-netlib/SRC/zhetf2_rook.f
+++ b/lapack-netlib/SRC/zhetf2_rook.f
@@ -420,7 +420,7 @@
                   A( J, K ) = DCONJG( A( P, J ) )
                   A( P, J ) = T
    14          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = DCONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( K, K ) )
@@ -441,7 +441,7 @@
                   A( J, KK ) = DCONJG( A( KP, J ) )
                   A( KP, J ) = T
    15          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = DCONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( KK, KK ) )
@@ -733,7 +733,7 @@
                   A( J, K ) = DCONJG( A( P, J ) )
                   A( P, J ) = T
    44          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = DCONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( K, K ) )
@@ -754,7 +754,7 @@
                   A( J, KK ) = DCONJG( A( KP, J ) )
                   A( KP, J ) = T
    45          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = DCONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( KK, KK ) )
diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f
index 56722e7e6..537c16e8c 100644
--- a/lapack-netlib/SRC/zhetrf_aa.f
+++ b/lapack-netlib/SRC/zhetrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f
index 92f1c09b3..477602b5e 100644
--- a/lapack-netlib/SRC/zhetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zla_gbrfsx_extended.f b/lapack-netlib/SRC/zla_gbrfsx_extended.f
index fe4d635b1..2f57b7682 100644
--- a/lapack-netlib/SRC/zla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/zla_gbrfsx_extended.f
@@ -651,7 +651,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zla_gerfsx_extended.f b/lapack-netlib/SRC/zla_gerfsx_extended.f
index 9d618f294..22e45c5a9 100644
--- a/lapack-netlib/SRC/zla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/zla_gerfsx_extended.f
@@ -636,7 +636,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zla_herfsx_extended.f b/lapack-netlib/SRC/zla_herfsx_extended.f
index a55dd9431..689460a02 100644
--- a/lapack-netlib/SRC/zla_herfsx_extended.f
+++ b/lapack-netlib/SRC/zla_herfsx_extended.f
@@ -655,7 +655,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zla_porfsx_extended.f b/lapack-netlib/SRC/zla_porfsx_extended.f
index 12e05e049..e853494fc 100644
--- a/lapack-netlib/SRC/zla_porfsx_extended.f
+++ b/lapack-netlib/SRC/zla_porfsx_extended.f
@@ -626,7 +626,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zla_syrfsx_extended.f b/lapack-netlib/SRC/zla_syrfsx_extended.f
index d6c241499..fb1b9e2d7 100644
--- a/lapack-netlib/SRC/zla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/zla_syrfsx_extended.f
@@ -655,7 +655,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zlaed7.f b/lapack-netlib/SRC/zlaed7.f
index 83f32d8b8..86e5ec6b5 100644
--- a/lapack-netlib/SRC/zlaed7.f
+++ b/lapack-netlib/SRC/zlaed7.f
@@ -363,7 +363,7 @@
             RETURN
          END IF
 *
-*     Prepare the INDXQ sorting premutation.
+*     Prepare the INDXQ sorting permutation.
 *
          N1 = K
          N2 = N - K
diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f
index 3e20200ed..dcb28850a 100644
--- a/lapack-netlib/SRC/zlaqz0.f
+++ b/lapack-netlib/SRC/zlaqz0.f
@@ -89,7 +89,7 @@
 *>      Anal., 29(2006), pp. 199--227.
 *>
 *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
-*>      multipole rational QZ method with agressive early deflation"
+*>      multipole rational QZ method with aggressive early deflation"
 *> \endverbatim
 *
 *  Arguments:
@@ -312,7 +312,7 @@
       CHARACTER :: JBCMPZ*3
 
 *     External Functions
-      EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD,
+      EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET,
      $            ZLARTG, ZROT
       DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS
       LOGICAL, EXTERNAL :: LSAME
@@ -464,7 +464,6 @@
 *     Get machine constants
       SAFMIN = DLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL DLABAD( SAFMIN, SAFMAX )
       ULP = DLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( DBLE( N )/ULP )
 
@@ -535,7 +534,7 @@
          DO WHILE ( K.GE.ISTART2 )
 
             IF( ABS( B( K, K ) ) .LT. BTOL ) THEN
-*              A diagonal element of B is negligable, move it
+*              A diagonal element of B is negligible, move it
 *              to the top and deflate it
                
                DO K2 = K, ISTART2+1, -1
diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f
index 4a3c4dcf1..17d4b33aa 100644
--- a/lapack-netlib/SRC/zlarfb_gett.f
+++ b/lapack-netlib/SRC/zlarfb_gett.f
@@ -452,7 +452,7 @@
          IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
-*           V1 is not an identy matrix, but unit lower-triangular
+*           V1 is not an identity matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
diff --git a/lapack-netlib/SRC/zlatdf.f b/lapack-netlib/SRC/zlatdf.f
index 25e71edce..41714a3e0 100644
--- a/lapack-netlib/SRC/zlatdf.f
+++ b/lapack-netlib/SRC/zlatdf.f
@@ -227,7 +227,7 @@
             BM = RHS( J ) - CONE
             SPLUS = ONE
 *
-*           Lockahead for L- part RHS(1:N-1) = +-1
+*           Look-ahead for L- part RHS(1:N-1) = +-1
 *           SPLUS and SMIN computed more efficiently than in BSOLVE[1].
 *
             SPLUS = SPLUS + DBLE( ZDOTC( N-J, Z( J+1, J ), 1, Z( J+1,
diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f
index fc1be0517..231a17274 100644
--- a/lapack-netlib/SRC/zlatrs3.f
+++ b/lapack-netlib/SRC/zlatrs3.f
@@ -577,7 +577,7 @@
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
-*              rescale the column segments, if necesssary. Then
+*              rescale the column segments, if necessary. Then
 *              the linear update is safely executed.
 *
                DO KK = 1, K2 - K1
diff --git a/lapack-netlib/SRC/zsyconvf.f b/lapack-netlib/SRC/zsyconvf.f
index eb49b0f3d..0958a5f77 100644
--- a/lapack-netlib/SRC/zsyconvf.f
+++ b/lapack-netlib/SRC/zsyconvf.f
@@ -39,7 +39,7 @@
 *> ZSYTRF provided on entry in parameter A into the factorization
 *> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored
 *> on exit in parameters A and E. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in ZSYTRF into
+*> the interchanges stored in IPIV from the format used in ZSYTRF into
 *> the format used in ZSYTRF_RK (or ZSYTRF_BK).
 *>
 *> If parameter WAY = 'R':
@@ -48,7 +48,7 @@
 *> (or ZSYTRF_BK) provided on entry in parameters A and E into
 *> the factorization output format used in ZSYTRF that is stored
 *> on exit in parameter A. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in ZSYTRF_RK
+*> the interchanges stored in IPIV from the format used in ZSYTRF_RK
 *> (or ZSYTRF_BK) into the format used in ZSYTRF.
 *>
 *> ZSYCONVF can also convert in Hermitian matrix case, i.e. between
@@ -325,7 +325,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -469,7 +469,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -535,7 +535,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/zsyconvf_rook.f b/lapack-netlib/SRC/zsyconvf_rook.f
index 3cfa694c3..62cca060b 100644
--- a/lapack-netlib/SRC/zsyconvf_rook.f
+++ b/lapack-netlib/SRC/zsyconvf_rook.f
@@ -520,7 +520,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f
index 4f19630a9..701d73a38 100644
--- a/lapack-netlib/SRC/zsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/zsysv_aa_2stage.f
@@ -87,7 +87,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zsytrf_aa.f b/lapack-netlib/SRC/zsytrf_aa.f
index 73d257cfe..ddb19925a 100644
--- a/lapack-netlib/SRC/zsytrf_aa.f
+++ b/lapack-netlib/SRC/zsytrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f
index b731cb7a2..95b9fda0d 100644
--- a/lapack-netlib/SRC/zsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/ztrexc.f b/lapack-netlib/SRC/ztrexc.f
index b92e63efa..32aae51e4 100644
--- a/lapack-netlib/SRC/ztrexc.f
+++ b/lapack-netlib/SRC/ztrexc.f
@@ -40,7 +40,7 @@
 *>
 *> The Schur form T is reordered by a unitary similarity transformation
 *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
-*> postmultplying it with Z.
+*> postmultiplying it with Z.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zunbdb1.f b/lapack-netlib/SRC/zunbdb1.f
index 767788a74..2fae170de 100644
--- a/lapack-netlib/SRC/zunbdb1.f
+++ b/lapack-netlib/SRC/zunbdb1.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> ZUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f
index 46b08aa1e..28e78fc23 100644
--- a/lapack-netlib/SRC/zunbdb2.f
+++ b/lapack-netlib/SRC/zunbdb2.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> ZUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/zunbdb3.f b/lapack-netlib/SRC/zunbdb3.f
index 74083e41a..9f32a7a88 100644
--- a/lapack-netlib/SRC/zunbdb3.f
+++ b/lapack-netlib/SRC/zunbdb3.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> ZUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f
index 4672cfa67..a1db5eb79 100644
--- a/lapack-netlib/SRC/zunbdb4.f
+++ b/lapack-netlib/SRC/zunbdb4.f
@@ -38,7 +38,7 @@
 *>\verbatim
 *>
 *> ZUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/TESTING/EIG/cchkst.f b/lapack-netlib/TESTING/EIG/cchkst.f
index 95747d051..ed535e91f 100644
--- a/lapack-netlib/TESTING/EIG/cchkst.f
+++ b/lapack-netlib/TESTING/EIG/cchkst.f
@@ -364,7 +364,7 @@
 *> \verbatim
 *>          D1 is REAL array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by CSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by CSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -665,8 +665,7 @@
       EXTERNAL           CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY,
      $                   CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR,
      $                   CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, CUPGTR,
-     $                   SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF,
-     $                   XERBLA
+     $                   SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, CONJG, INT, LOG, MAX, MIN, REAL, SQRT
@@ -733,7 +732,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f
index e4deb8ac8..668b51a36 100644
--- a/lapack-netlib/TESTING/EIG/cchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f
@@ -385,7 +385,7 @@
 *> \verbatim
 *>          D1 is REAL array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by CSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by CSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -683,10 +683,10 @@
       EXTERNAL           ILAENV, SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF,
-     $                   XERBLA, CCOPY, CHET21, CHETRD, CHPT21, CHPTRD,
-     $                   CLACPY, CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC,
-     $                   CSTEMR, CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR,
+      EXTERNAL           SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA,
+     $                   CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY,
+     $                   CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR,
+     $                   CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR,
      $                   CUPGTR, CHETRD_2STAGE, SLASET
 *     ..
 *     .. Intrinsic Functions ..
@@ -754,7 +754,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/cdrgsx.f b/lapack-netlib/TESTING/EIG/cdrgsx.f
index c5baeef5e..1729770a2 100644
--- a/lapack-netlib/TESTING/EIG/cdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/cdrgsx.f
@@ -395,7 +395,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, CGESVD, CGET51, CGGESX, CLACPY, CLAKF2,
-     $                   CLASET, CLATM5, SLABAD, XERBLA
+     $                   CLASET, CLATM5, XERBLA
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            FS
@@ -478,7 +478,6 @@
       ULPINV = ONE / ULP
       SMLNUM = SLAMCH( 'S' ) / ULP
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       THRSH2 = TEN*THRESH
       NTESTT = 0
       NERRS = 0
@@ -917,7 +916,7 @@
      $      / '  2:  A and B are upper triangular matrices, ',
      $      / '  3:  A and B are as type 2, but each second diagonal ',
      $      'block in A_11 and ', /
-     $      '      each third diaongal block in A_22 are 2x2 blocks,',
+     $      '      each third diagonal block in A_22 are 2x2 blocks,',
      $      / '  4:  A and B are block diagonal matrices, ',
      $      / '  5:  (A,B) has potentially close or common ',
      $      'eigenvalues.', / )
diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f
index d15b39d01..729976738 100644
--- a/lapack-netlib/TESTING/EIG/cdrvsg.f
+++ b/lapack-netlib/TESTING/EIG/cdrvsg.f
@@ -236,7 +236,7 @@
 *>
 *>  B       COMPLEX array, dimension (LDB , max(NN))
 *>          Used to hold the Hermitian positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -420,7 +420,7 @@
 *     .. External Subroutines ..
       EXTERNAL           CHBGV, CHBGVD, CHBGVX, CHEGV, CHEGVD, CHEGVX,
      $                   CHPGV, CHPGVD, CHPGVX, CLACPY, CLASET, CLATMR,
-     $                   CLATMS, CSGT01, SLABAD, SLAFTS, SLASUM, XERBLA
+     $                   CLATMS, CSGT01, SLAFTS, SLASUM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, REAL, SQRT
@@ -481,7 +481,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f
index 8b8553773..f7d323247 100644
--- a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f
+++ b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f
@@ -242,7 +242,7 @@
 *>
 *>  B       COMPLEX array, dimension (LDB , max(NN))
 *>          Used to hold the Hermitian positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -426,7 +426,7 @@
       EXTERNAL           LSAME, SLAMCH, SLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD,
+      EXTERNAL           SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD,
      $                   CHBGVX, CHEGV, CHEGVD, CHEGVX, CHPGV, CHPGVD,
      $                   CHPGVX, CLACPY, CLASET, CLATMR, CLATMS, CSGT01,
      $                   CHEGV_2STAGE
@@ -490,7 +490,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/cdrvst.f b/lapack-netlib/TESTING/EIG/cdrvst.f
index 9c129c0e8..205f06f0d 100644
--- a/lapack-netlib/TESTING/EIG/cdrvst.f
+++ b/lapack-netlib/TESTING/EIG/cdrvst.f
@@ -204,7 +204,7 @@
 *>          Not modified.
 *>
 *>  D1      REAL array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by CSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by CSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -393,8 +393,8 @@
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, CHBEV, CHBEVD, CHBEVX, CHEEV, CHEEVD,
      $                   CHEEVR, CHEEVX, CHET21, CHET22, CHPEV, CHPEVD,
-     $                   CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLABAD,
-     $                   SLAFTS, XERBLA
+     $                   CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLAFTS,
+     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, INT, LOG, MAX, MIN, REAL, SQRT
@@ -451,7 +451,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/cdrvst2stg.f b/lapack-netlib/TESTING/EIG/cdrvst2stg.f
index 954c7fb87..258f1f370 100644
--- a/lapack-netlib/TESTING/EIG/cdrvst2stg.f
+++ b/lapack-netlib/TESTING/EIG/cdrvst2stg.f
@@ -204,7 +204,7 @@
 *>          Not modified.
 *>
 *>  D1      REAL             array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by CSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by CSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -391,7 +391,7 @@
       EXTERNAL           SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, SLABAD, SLAFTS, XERBLA, CHBEV, CHBEVD,
+      EXTERNAL           ALASVM, SLAFTS, XERBLA, CHBEV, CHBEVD,
      $                   CHBEVX, CHEEV, CHEEVD, CHEEVR, CHEEVX, CHET21,
      $                   CHET22, CHPEV, CHPEVD, CHPEVX, CLACPY, CLASET,
      $                   CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE,
@@ -453,7 +453,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f
index eef34b44b..18b5cad36 100644
--- a/lapack-netlib/TESTING/EIG/cerrst.f
+++ b/lapack-netlib/TESTING/EIG/cerrst.f
@@ -628,56 +628,56 @@
          SRNAMT = 'CHEEVX_2STAGE'
          INFOT = 1
          CALL CHEEVX_2STAGE( '/', 'A', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 1
          CALL CHEEVX_2STAGE( 'V', 'A', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 2
          CALL CHEEVX_2STAGE( 'N', '/', 'U', 0, A, 1,
-     $                0.0D0, 1.0D0, 1, 0, 0.0D0,
+     $                0.0, 1.0, 1, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 3
          CALL CHEEVX_2STAGE( 'N', 'A', '/', 0, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          INFOT = 4
          CALL CHEEVX_2STAGE( 'N', 'A', 'U', -1, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 6
          CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 2, W, 3, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CHEEVX_2STAGE( 'N', 'V', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 9
          CALL CHEEVX_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 10
          CALL CHEEVX_2STAGE( 'N', 'I', 'U', 2, A, 2,
-     $                0.0D0, 0.0D0, 2, 1, 0.0D0,
+     $                0.0, 0.0, 2, 1, 0.0,
      $                M, X, Z, 2, W, 3, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 15
          CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 0, W, 3, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 17
          CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 2, W, 0, RW, IW, I1, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          NT = NT + 11
@@ -755,79 +755,79 @@
          N = 1
          INFOT = 1
          CALL CHEEVR_2STAGE( '/', 'A', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 1
          CALL CHEEVR_2STAGE( 'V', 'A', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 2
          CALL CHEEVR_2STAGE( 'N', '/', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 3
          CALL CHEEVR_2STAGE( 'N', 'A', '/', -1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N,
      $                IW( 2*N+1 ), 10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 4
          CALL CHEEVR_2STAGE( 'N', 'A', 'U', -1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N,
      $                IW( 2*N+1 ), 10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 6
          CALL CHEEVR_2STAGE( 'N', 'A', 'U', 2, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CHEEVR_2STAGE( 'N', 'V', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 9
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 0, 1, 0.0D0,
+     $                0.0, 0.0, 0, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 10
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 2, A, 2,
-     $                0.0D0, 0.0D0, 2, 1, 0.0D0,
+     $                0.0, 0.0, 2, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 15
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 0, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1,
      $                INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
@@ -1259,65 +1259,65 @@
          SRNAMT = 'CHBEVX_2STAGE'
          INFOT = 1
          CALL CHBEVX_2STAGE( '/', 'A', 'U', 0, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          INFOT = 1
          CALL CHBEVX_2STAGE( 'V', 'A', 'U', 0, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 2
          CALL CHBEVX_2STAGE( 'N', '/', 'U', 0, 0, A, 1, Q, 1,
-     $                       0.0D0, 1.0D0, 1, 0, 0.0D0,
+     $                       0.0, 1.0, 1, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 3
          CALL CHBEVX_2STAGE( 'N', 'A', '/', 0, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          INFOT = 4
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', -1, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', 0, -1, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 7
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 1, A, 1, Q, 2,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 2, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
 *         INFOT = 9
 *         CALL CHBEVX_2STAGE( 'V', 'A', 'U', 2, 0, A, 1, Q, 1,
-*     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+*     $                       0.0, 0.0, 0, 0, 0.0,
 *     $                       M, X, Z, 2, W, 0, RW, IW, I3, INFO )
 *         CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL CHBEVX_2STAGE( 'N', 'V', 'U', 1, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 12
          CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 13
          CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 1, 2, 0.0D0,
+     $                       0.0, 0.0, 1, 2, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 0, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          NT = NT + 12
diff --git a/lapack-netlib/TESTING/EIG/dchkst.f b/lapack-netlib/TESTING/EIG/dchkst.f
index 2e04f68c5..6e02c84fb 100644
--- a/lapack-netlib/TESTING/EIG/dchkst.f
+++ b/lapack-netlib/TESTING/EIG/dchkst.f
@@ -363,7 +363,7 @@
 *> \verbatim
 *>          D1 is DOUBLE PRECISION array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by DSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by DSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -645,10 +645,10 @@
       EXTERNAL           ILAENV, DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR,
-     $                   DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD,
-     $                   DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR,
-     $                   DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA
+      EXTERNAL           DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS,
+     $                   DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ,
+     $                   DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF,
+     $                   DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, INT, LOG, MAX, MIN, SQRT
@@ -715,7 +715,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f
index 2c98b802d..0fbebc8b7 100644
--- a/lapack-netlib/TESTING/EIG/dchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f
@@ -384,7 +384,7 @@
 *> \verbatim
 *>          D1 is DOUBLE PRECISION array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by DSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by DSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -666,10 +666,10 @@
       EXTERNAL           ILAENV, DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR,
-     $                   DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD,
-     $                   DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR,
-     $                   DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA,
+      EXTERNAL           DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS,
+     $                   DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ,
+     $                   DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF,
+     $                   DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA,
      $                   DSYTRD_2STAGE
 *     ..
 *     .. Intrinsic Functions ..
@@ -737,7 +737,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/ddrgsx.f b/lapack-netlib/TESTING/EIG/ddrgsx.f
index b3f5e23f4..82470ead3 100644
--- a/lapack-netlib/TESTING/EIG/ddrgsx.f
+++ b/lapack-netlib/TESTING/EIG/ddrgsx.f
@@ -400,7 +400,7 @@
       EXTERNAL           DLCTSX, ILAENV, DLAMCH, DLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DGESVD, DGET51, DGET53, DGGESX, DLABAD,
+      EXTERNAL           ALASVM, DGESVD, DGET51, DGET53, DGGESX,
      $                   DLACPY, DLAKF2, DLASET, DLATM5, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@@ -478,7 +478,6 @@
       ULPINV = ONE / ULP
       SMLNUM = DLAMCH( 'S' ) / ULP
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       THRSH2 = TEN*THRESH
       NTESTT = 0
       NERRS = 0
@@ -984,7 +983,7 @@
      $      / '  2:  A and B are upper triangular matrices, ',
      $      / '  3:  A and B are as type 2, but each second diagonal ',
      $      'block in A_11 and ', /
-     $      '      each third diaongal block in A_22 are 2x2 blocks,',
+     $      '      each third diagonal block in A_22 are 2x2 blocks,',
      $      / '  4:  A and B are block diagonal matrices, ',
      $      / '  5:  (A,B) has potentially close or common ',
      $      'eigenvalues.', / )
diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f
index 2e9d3c643..72c373086 100644
--- a/lapack-netlib/TESTING/EIG/ddrvsg.f
+++ b/lapack-netlib/TESTING/EIG/ddrvsg.f
@@ -234,7 +234,7 @@
 *>
 *>  B       DOUBLE PRECISION array, dimension (LDB , max(NN))
 *>          Used to hold the symmetric positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -399,7 +399,7 @@
       EXTERNAL           LSAME, DLAMCH, DLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR,
+      EXTERNAL           DLACPY, DLAFTS, DLASET, DLASUM, DLATMR,
      $                   DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV,
      $                   DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA
 *     ..
@@ -460,7 +460,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f
index 196c6b48e..0fe31cab1 100644
--- a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f
+++ b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f
@@ -240,7 +240,7 @@
 *>
 *>  B       DOUBLE PRECISION array, dimension (LDB , max(NN))
 *>          Used to hold the symmetric positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -408,7 +408,7 @@
       EXTERNAL           LSAME, DLAMCH, DLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR,
+      EXTERNAL           DLACPY, DLAFTS, DLASET, DLASUM, DLATMR,
      $                   DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV,
      $                   DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA,
      $                   DSYGV_2STAGE
@@ -470,7 +470,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/ddrvst.f b/lapack-netlib/TESTING/EIG/ddrvst.f
index a25077018..805fd8271 100644
--- a/lapack-netlib/TESTING/EIG/ddrvst.f
+++ b/lapack-netlib/TESTING/EIG/ddrvst.f
@@ -214,7 +214,7 @@
 *>          Not modified.
 *>
 *>  D1      DOUBLE PRECISION array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by DSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by DSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -502,11 +502,11 @@
       EXTERNAL           DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR,
-     $                   DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD,
-     $                   DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21,
-     $                   DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21,
-     $                   DSYT22, XERBLA
+      EXTERNAL           ALASVM, DLACPY, DLAFTS, DLASET, DLATMR, DLATMS,
+     $                   DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, DSPEVX,
+     $                   DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, DSTT22,
+     $                   DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, DSYT22,
+     $                   XERBLA
 *     ..
 *     .. Scalars in Common ..
       CHARACTER*32       SRNAMT
@@ -574,7 +574,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/ddrvst2stg.f b/lapack-netlib/TESTING/EIG/ddrvst2stg.f
index c9a2632bb..e38671e38 100644
--- a/lapack-netlib/TESTING/EIG/ddrvst2stg.f
+++ b/lapack-netlib/TESTING/EIG/ddrvst2stg.f
@@ -214,7 +214,7 @@
 *>          Not modified.
 *>
 *>  D1      DOUBLE PRECISION array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by DSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by DSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -502,7 +502,7 @@
       EXTERNAL           DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR,
+      EXTERNAL           ALASVM, DLACPY, DLAFTS, DLASET, DLATMR,
      $                   DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD,
      $                   DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21,
      $                   DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21,
@@ -577,7 +577,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/schkst.f b/lapack-netlib/TESTING/EIG/schkst.f
index 10622d77a..aecbdfe93 100644
--- a/lapack-netlib/TESTING/EIG/schkst.f
+++ b/lapack-netlib/TESTING/EIG/schkst.f
@@ -363,7 +363,7 @@
 *> \verbatim
 *>          D1 is REAL array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by SSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by SSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -645,10 +645,10 @@
       EXTERNAL           ILAENV, SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR,
-     $                   SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD,
-     $                   SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR,
-     $                   SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA
+      EXTERNAL           SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS,
+     $                   SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ,
+     $                   SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF,
+     $                   SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, INT, LOG, MAX, MIN, REAL, SQRT
@@ -715,7 +715,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f
index ac5a3fc39..49899a660 100644
--- a/lapack-netlib/TESTING/EIG/schkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/schkst2stg.f
@@ -384,7 +384,7 @@
 *> \verbatim
 *>          D1 is REAL array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by SSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by SSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -666,10 +666,10 @@
       EXTERNAL           ILAENV, SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR,
-     $                   SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD,
-     $                   SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR,
-     $                   SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA,
+      EXTERNAL           SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS,
+     $                   SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ,
+     $                   SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF,
+     $                   SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA,
      $                   SSYTRD_2STAGE
 *     ..
 *     .. Intrinsic Functions ..
@@ -737,7 +737,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/sdrgsx.f b/lapack-netlib/TESTING/EIG/sdrgsx.f
index ce0d59214..2015cac1c 100644
--- a/lapack-netlib/TESTING/EIG/sdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/sdrgsx.f
@@ -400,7 +400,7 @@
       EXTERNAL           SLCTSX, ILAENV, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, SGESVD, SGET51, SGET53, SGGESX, SLABAD,
+      EXTERNAL           ALASVM, SGESVD, SGET51, SGET53, SGGESX,
      $                   SLACPY, SLAKF2, SLASET, SLATM5, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@@ -479,7 +479,6 @@ c        MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 )
       ULPINV = ONE / ULP
       SMLNUM = SLAMCH( 'S' ) / ULP
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       THRSH2 = TEN*THRESH
       NTESTT = 0
       NERRS = 0
@@ -985,7 +984,7 @@ c        MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 )
      $      / '  2:  A and B are upper triangular matrices, ',
      $      / '  3:  A and B are as type 2, but each second diagonal ',
      $      'block in A_11 and ', /
-     $      '      each third diaongal block in A_22 are 2x2 blocks,',
+     $      '      each third diagonal block in A_22 are 2x2 blocks,',
      $      / '  4:  A and B are block diagonal matrices, ',
      $      / '  5:  (A,B) has potentially close or common ',
      $      'eigenvalues.', / )
diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f
index 877579bcd..0c82b6f49 100644
--- a/lapack-netlib/TESTING/EIG/sdrvsg.f
+++ b/lapack-netlib/TESTING/EIG/sdrvsg.f
@@ -234,7 +234,7 @@
 *>
 *>  B       REAL array, dimension (LDB , max(NN))
 *>          Used to hold the symmetric positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -399,7 +399,7 @@
       EXTERNAL           LSAME, SLAMCH, SLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR,
+      EXTERNAL           SLACPY, SLAFTS, SLASET, SLASUM, SLATMR,
      $                   SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV,
      $                   SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA
 *     ..
@@ -460,7 +460,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f
index ebd169977..38ed61628 100644
--- a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f
+++ b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f
@@ -240,7 +240,7 @@
 *>
 *>  B       REAL             array, dimension (LDB , max(NN))
 *>          Used to hold the symmetric positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -408,7 +408,7 @@
       EXTERNAL           LSAME, SLAMCH, SLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR,
+      EXTERNAL           SLACPY, SLAFTS, SLASET, SLASUM, SLATMR,
      $                   SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV,
      $                   SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA,
      $                   SSYGV_2STAGE
@@ -470,7 +470,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/sdrvst.f b/lapack-netlib/TESTING/EIG/sdrvst.f
index ea0cf66f9..be6d33cee 100644
--- a/lapack-netlib/TESTING/EIG/sdrvst.f
+++ b/lapack-netlib/TESTING/EIG/sdrvst.f
@@ -214,7 +214,7 @@
 *>          Not modified.
 *>
 *>  D1      REAL array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by SSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by SSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -502,11 +502,11 @@
       EXTERNAL           SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR,
-     $                   SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD,
-     $                   SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21,
-     $                   SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21,
-     $                   SSYT22, XERBLA
+      EXTERNAL           ALASVM, SLACPY, SLAFTS, SLASET, SLATMR, SLATMS,
+     $                   SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, SSPEVX,
+     $                   SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, SSTT22,
+     $                   SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, SSYT22,
+     $                   XERBLA
 *     ..
 *     .. Scalars in Common ..
       CHARACTER*32       SRNAMT
@@ -574,7 +574,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/sdrvst2stg.f b/lapack-netlib/TESTING/EIG/sdrvst2stg.f
index a13a58b48..e05ec0749 100644
--- a/lapack-netlib/TESTING/EIG/sdrvst2stg.f
+++ b/lapack-netlib/TESTING/EIG/sdrvst2stg.f
@@ -214,7 +214,7 @@
 *>          Not modified.
 *>
 *>  D1      REAL             array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by SSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by SSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -502,7 +502,7 @@
       EXTERNAL           SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR,
+      EXTERNAL           ALASVM, SLACPY, SLAFTS, SLASET, SLATMR,
      $                   SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD,
      $                   SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21,
      $                   SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21,
@@ -577,7 +577,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/zchkst.f b/lapack-netlib/TESTING/EIG/zchkst.f
index 60496dde1..b77d94438 100644
--- a/lapack-netlib/TESTING/EIG/zchkst.f
+++ b/lapack-netlib/TESTING/EIG/zchkst.f
@@ -364,7 +364,7 @@
 *> \verbatim
 *>          D1 is DOUBLE PRECISION array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by ZSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by ZSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -662,11 +662,10 @@
       EXTERNAL           ILAENV, DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF,
-     $                   XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD,
-     $                   ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC,
-     $                   ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR,
-     $                   ZUPGTR
+      EXTERNAL           DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA,
+     $                   ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY,
+     $                   ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR,
+     $                   ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT
@@ -733,7 +732,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f
index b1ef80816..3b333ef1c 100644
--- a/lapack-netlib/TESTING/EIG/zchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f
@@ -385,7 +385,7 @@
 *> \verbatim
 *>          D1 is DOUBLE PRECISION array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by ZSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by ZSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -683,11 +683,11 @@
       EXTERNAL           ILAENV, DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF,
-     $                   XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD,
-     $                   ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC,
-     $                   ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR,
-     $                   ZUPGTR, ZHETRD_2STAGE, DLASET
+      EXTERNAL           DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA,
+     $                   ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY,
+     $                   ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR,
+     $                   ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR,
+     $                   ZHETRD_2STAGE, DLASET
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT
@@ -754,7 +754,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/zdrgsx.f b/lapack-netlib/TESTING/EIG/zdrgsx.f
index 096ed3c22..a486873fa 100644
--- a/lapack-netlib/TESTING/EIG/zdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/zdrgsx.f
@@ -394,7 +394,7 @@
       EXTERNAL           ZLCTSX, ILAENV, DLAMCH, ZLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, XERBLA, ZGESVD, ZGET51, ZGGESX,
+      EXTERNAL           ALASVM, XERBLA, ZGESVD, ZGET51, ZGGESX,
      $                   ZLACPY, ZLAKF2, ZLASET, ZLATM5
 *     ..
 *     .. Scalars in Common ..
@@ -479,7 +479,6 @@
       ULPINV = ONE / ULP
       SMLNUM = DLAMCH( 'S' ) / ULP
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       THRSH2 = TEN*THRESH
       NTESTT = 0
       NERRS = 0
@@ -918,7 +917,7 @@
      $      / '  2:  A and B are upper triangular matrices, ',
      $      / '  3:  A and B are as type 2, but each second diagonal ',
      $      'block in A_11 and ', /
-     $      '      each third diaongal block in A_22 are 2x2 blocks,',
+     $      '      each third diagonal block in A_22 are 2x2 blocks,',
      $      / '  4:  A and B are block diagonal matrices, ',
      $      / '  5:  (A,B) has potentially close or common ',
      $      'eigenvalues.', / )
diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f
index 71f1d6371..1796805ea 100644
--- a/lapack-netlib/TESTING/EIG/zdrvsg.f
+++ b/lapack-netlib/TESTING/EIG/zdrvsg.f
@@ -236,7 +236,7 @@
 *>
 *>  B       COMPLEX*16 array, dimension (LDB , max(NN))
 *>          Used to hold the Hermitian positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -418,7 +418,7 @@
       EXTERNAL           LSAME, DLAMCH, DLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD,
+      EXTERNAL           DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD,
      $                   ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD,
      $                   ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01
 *     ..
@@ -481,7 +481,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f
index 4bdf2849e..c5ef4ce70 100644
--- a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f
+++ b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f
@@ -242,7 +242,7 @@
 *>
 *>  B       COMPLEX*16 array, dimension (LDB , max(NN))
 *>          Used to hold the Hermitian positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -426,7 +426,7 @@
       EXTERNAL           LSAME, DLAMCH, DLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD,
+      EXTERNAL           DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD,
      $                   ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD,
      $                   ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01,
      $                   ZHEGV_2STAGE
@@ -490,7 +490,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/zdrvst.f b/lapack-netlib/TESTING/EIG/zdrvst.f
index 384e58de1..f838f0af2 100644
--- a/lapack-netlib/TESTING/EIG/zdrvst.f
+++ b/lapack-netlib/TESTING/EIG/zdrvst.f
@@ -204,7 +204,7 @@
 *>          Not modified.
 *>
 *>  D1      DOUBLE PRECISION array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by ZSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by ZSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -391,10 +391,10 @@
       EXTERNAL           DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD,
-     $                   ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21,
-     $                   ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET,
-     $                   ZLATMR, ZLATMS
+      EXTERNAL           ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD, ZHBEVX,
+     $                   ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, ZHET22,
+     $                   ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, ZLATMR,
+     $                   ZLATMS
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, INT, LOG, MAX, MIN, SQRT
@@ -451,7 +451,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/zdrvst2stg.f b/lapack-netlib/TESTING/EIG/zdrvst2stg.f
index 4a88e5218..4b989b460 100644
--- a/lapack-netlib/TESTING/EIG/zdrvst2stg.f
+++ b/lapack-netlib/TESTING/EIG/zdrvst2stg.f
@@ -204,7 +204,7 @@
 *>          Not modified.
 *>
 *>  D1      DOUBLE PRECISION array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by ZSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by ZSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -391,7 +391,7 @@
       EXTERNAL           DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD,
+      EXTERNAL           ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD,
      $                   ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21,
      $                   ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET,
      $                   ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE,
@@ -453,7 +453,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f
index f0423a23b..dd75394b3 100644
--- a/lapack-netlib/TESTING/LIN/alahd.f
+++ b/lapack-netlib/TESTING/LIN/alahd.f
@@ -777,7 +777,7 @@
      $       'triangular-pentagonal matrices' )
  8004 FORMAT( / 1X, A3, ':  TS factorization for ',
      $       'tall-skinny or short-wide matrices' )
- 8005 FORMAT( / 1X, A3, ':  Householder recostruction from TSQR',
+ 8005 FORMAT( / 1X, A3, ':  Householder reconstruction from TSQR',
      $       ' factorization output ', /,' for tall-skinny matrices.' )
 *
 *     GE matrix types
diff --git a/lapack-netlib/TESTING/LIN/cchktp.f b/lapack-netlib/TESTING/LIN/cchktp.f
index 18242ff54..e14f1062e 100644
--- a/lapack-netlib/TESTING/LIN/cchktp.f
+++ b/lapack-netlib/TESTING/LIN/cchktp.f
@@ -87,7 +87,7 @@
 *> \verbatim
 *>          NMAX is INTEGER
 *>          The leading dimension of the work arrays.  NMAX >= the
-*>          maximumm value of N in NVAL.
+*>          maximum value of N in NVAL.
 *> \endverbatim
 *>
 *> \param[out] AP
diff --git a/lapack-netlib/TESTING/LIN/cerrhe.f b/lapack-netlib/TESTING/LIN/cerrhe.f
index d0c5cf6b5..1798aed57 100644
--- a/lapack-netlib/TESTING/LIN/cerrhe.f
+++ b/lapack-netlib/TESTING/LIN/cerrhe.f
@@ -133,7 +133,7 @@
       IF( LSAMEN( 2, C2, 'HE' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite matrix with patrial
+*        of a Hermitian indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CHETRF
@@ -576,7 +576,7 @@
          CALL CHKXER( 'CHETRS_AA_STAGE', INFOT, NOUT, LERR, OK )
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite packed matrix with patrial
+*        of a Hermitian indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
diff --git a/lapack-netlib/TESTING/LIN/cerrhex.f b/lapack-netlib/TESTING/LIN/cerrhex.f
index b6c889798..83b3a92ad 100644
--- a/lapack-netlib/TESTING/LIN/cerrhex.f
+++ b/lapack-netlib/TESTING/LIN/cerrhex.f
@@ -137,7 +137,7 @@
       IF( LSAMEN( 2, C2, 'HE' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite matrix with patrial
+*        of a Hermitian indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CHETRF
@@ -523,7 +523,7 @@
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *     Test error exits of the routines that use factorization
-*     of a Hermitian indefinite packed matrix with patrial
+*     of a Hermitian indefinite packed matrix with partial
 *     (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CHPTRF
diff --git a/lapack-netlib/TESTING/LIN/cerrsy.f b/lapack-netlib/TESTING/LIN/cerrsy.f
index a236d2d18..6d284be98 100644
--- a/lapack-netlib/TESTING/LIN/cerrsy.f
+++ b/lapack-netlib/TESTING/LIN/cerrsy.f
@@ -130,7 +130,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CSYTRF
@@ -469,7 +469,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CSPTRF
diff --git a/lapack-netlib/TESTING/LIN/cerrsyx.f b/lapack-netlib/TESTING/LIN/cerrsyx.f
index 34972668e..c8d269c84 100644
--- a/lapack-netlib/TESTING/LIN/cerrsyx.f
+++ b/lapack-netlib/TESTING/LIN/cerrsyx.f
@@ -135,7 +135,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CSYTRF
@@ -521,7 +521,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CSPTRF
diff --git a/lapack-netlib/TESTING/LIN/cgtt01.f b/lapack-netlib/TESTING/LIN/cgtt01.f
index e504515c5..64dc5f8fa 100644
--- a/lapack-netlib/TESTING/LIN/cgtt01.f
+++ b/lapack-netlib/TESTING/LIN/cgtt01.f
@@ -39,7 +39,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/cgtt02.f b/lapack-netlib/TESTING/LIN/cgtt02.f
index 702e66eed..8b951acd5 100644
--- a/lapack-netlib/TESTING/LIN/cgtt02.f
+++ b/lapack-netlib/TESTING/LIN/cgtt02.f
@@ -40,14 +40,14 @@
 *> \verbatim
 *>          TRANS is CHARACTER
 *>          Specifies the form of the residual.
-*>          = 'N':  B - A * X     (No transpose)
+*>          = 'N':  B - A    * X  (No transpose)
 *>          = 'T':  B - A**T * X  (Transpose)
 *>          = 'C':  B - A**H * X  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/chet01_3.f b/lapack-netlib/TESTING/LIN/chet01_3.f
index 58104a357..7e07b5790 100644
--- a/lapack-netlib/TESTING/LIN/chet01_3.f
+++ b/lapack-netlib/TESTING/LIN/chet01_3.f
@@ -188,7 +188,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/clqt02.f b/lapack-netlib/TESTING/LIN/clqt02.f
index 24eb05399..20297f6cc 100644
--- a/lapack-netlib/TESTING/LIN/clqt02.f
+++ b/lapack-netlib/TESTING/LIN/clqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> CLQT02 tests CUNGLQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the LQ factorization of an m-by-n matrix A, CLQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/cptt01.f b/lapack-netlib/TESTING/LIN/cptt01.f
index e4520ec3d..3b117ad28 100644
--- a/lapack-netlib/TESTING/LIN/cptt01.f
+++ b/lapack-netlib/TESTING/LIN/cptt01.f
@@ -36,7 +36,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/cptt02.f b/lapack-netlib/TESTING/LIN/cptt02.f
index da4f0e854..ffaef89bd 100644
--- a/lapack-netlib/TESTING/LIN/cptt02.f
+++ b/lapack-netlib/TESTING/LIN/cptt02.f
@@ -46,7 +46,7 @@
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/cqlt02.f b/lapack-netlib/TESTING/LIN/cqlt02.f
index fc4685aa5..53080cd17 100644
--- a/lapack-netlib/TESTING/LIN/cqlt02.f
+++ b/lapack-netlib/TESTING/LIN/cqlt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> CQLT02 tests CUNGQL, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QL factorization of an m-by-n matrix A, CQLT02 generates
diff --git a/lapack-netlib/TESTING/LIN/cqrt02.f b/lapack-netlib/TESTING/LIN/cqrt02.f
index 62f176cd8..db22496cf 100644
--- a/lapack-netlib/TESTING/LIN/cqrt02.f
+++ b/lapack-netlib/TESTING/LIN/cqrt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> CQRT02 tests CUNGQR, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QR factorization of an m-by-n matrix A, CQRT02 generates
diff --git a/lapack-netlib/TESTING/LIN/crqt02.f b/lapack-netlib/TESTING/LIN/crqt02.f
index 8625c2f91..81c805dc4 100644
--- a/lapack-netlib/TESTING/LIN/crqt02.f
+++ b/lapack-netlib/TESTING/LIN/crqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> CRQT02 tests CUNGRQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the RQ factorization of an m-by-n matrix A, CRQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/csyt01_3.f b/lapack-netlib/TESTING/LIN/csyt01_3.f
index f0b316cef..879749a10 100644
--- a/lapack-netlib/TESTING/LIN/csyt01_3.f
+++ b/lapack-netlib/TESTING/LIN/csyt01_3.f
@@ -188,7 +188,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/dchktp.f b/lapack-netlib/TESTING/LIN/dchktp.f
index 9af6150ca..6db5b1376 100644
--- a/lapack-netlib/TESTING/LIN/dchktp.f
+++ b/lapack-netlib/TESTING/LIN/dchktp.f
@@ -86,7 +86,7 @@
 *> \verbatim
 *>          NMAX is INTEGER
 *>          The leading dimension of the work arrays.  NMAX >= the
-*>          maximumm value of N in NVAL.
+*>          maximum value of N in NVAL.
 *> \endverbatim
 *>
 *> \param[out] AP
diff --git a/lapack-netlib/TESTING/LIN/ddrvab.f b/lapack-netlib/TESTING/LIN/ddrvab.f
index 9110d8334..5fd32b951 100644
--- a/lapack-netlib/TESTING/LIN/ddrvab.f
+++ b/lapack-netlib/TESTING/LIN/ddrvab.f
@@ -346,7 +346,7 @@
                CALL DGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK,
      $                      LDA, RWORK, RESULT( 1 ) )
 *
-*              Check if the test passes the tesing.
+*              Check if the test passes the testing.
 *              Print information about the tests that did not
 *              pass the testing.
 *
diff --git a/lapack-netlib/TESTING/LIN/ddrvac.f b/lapack-netlib/TESTING/LIN/ddrvac.f
index bd463cee4..3ecbc6a23 100644
--- a/lapack-netlib/TESTING/LIN/ddrvac.f
+++ b/lapack-netlib/TESTING/LIN/ddrvac.f
@@ -365,7 +365,7 @@
                   CALL DPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK,
      $               LDA, RWORK, RESULT( 1 ) )
 *
-*                 Check if the test passes the tesing.
+*                 Check if the test passes the testing.
 *                 Print information about the tests that did not
 *                 pass the testing.
 *
diff --git a/lapack-netlib/TESTING/LIN/derrsy.f b/lapack-netlib/TESTING/LIN/derrsy.f
index af15a4b8e..eb08d4c7b 100644
--- a/lapack-netlib/TESTING/LIN/derrsy.f
+++ b/lapack-netlib/TESTING/LIN/derrsy.f
@@ -133,7 +133,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        DSYTRF
@@ -581,7 +581,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        DSPTRF
diff --git a/lapack-netlib/TESTING/LIN/derrsyx.f b/lapack-netlib/TESTING/LIN/derrsyx.f
index c2d14caab..495302158 100644
--- a/lapack-netlib/TESTING/LIN/derrsyx.f
+++ b/lapack-netlib/TESTING/LIN/derrsyx.f
@@ -138,7 +138,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        DSYTRF
@@ -528,7 +528,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        DSPTRF
diff --git a/lapack-netlib/TESTING/LIN/dgtt01.f b/lapack-netlib/TESTING/LIN/dgtt01.f
index 3b5ff9e4c..6fd754db2 100644
--- a/lapack-netlib/TESTING/LIN/dgtt01.f
+++ b/lapack-netlib/TESTING/LIN/dgtt01.f
@@ -39,7 +39,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/dgtt02.f b/lapack-netlib/TESTING/LIN/dgtt02.f
index b3268b138..4fecff86e 100644
--- a/lapack-netlib/TESTING/LIN/dgtt02.f
+++ b/lapack-netlib/TESTING/LIN/dgtt02.f
@@ -41,14 +41,14 @@
 *> \verbatim
 *>          TRANS is CHARACTER
 *>          Specifies the form of the residual.
-*>          = 'N':  B - A * X  (No transpose)
+*>          = 'N':  B - A    * X  (No transpose)
 *>          = 'T':  B - A**T * X  (Transpose)
 *>          = 'C':  B - A**H * X  (Conjugate transpose = Transpose)
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/dlqt02.f b/lapack-netlib/TESTING/LIN/dlqt02.f
index 158cd0b79..672d82bbc 100644
--- a/lapack-netlib/TESTING/LIN/dlqt02.f
+++ b/lapack-netlib/TESTING/LIN/dlqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> DLQT02 tests DORGLQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the LQ factorization of an m-by-n matrix A, DLQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/dptt01.f b/lapack-netlib/TESTING/LIN/dptt01.f
index 2c6c440ff..9e9c6bd85 100644
--- a/lapack-netlib/TESTING/LIN/dptt01.f
+++ b/lapack-netlib/TESTING/LIN/dptt01.f
@@ -35,7 +35,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/dptt02.f b/lapack-netlib/TESTING/LIN/dptt02.f
index a4802c696..e045b8779 100644
--- a/lapack-netlib/TESTING/LIN/dptt02.f
+++ b/lapack-netlib/TESTING/LIN/dptt02.f
@@ -35,7 +35,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/dqlt02.f b/lapack-netlib/TESTING/LIN/dqlt02.f
index 950cfe67b..7799e0858 100644
--- a/lapack-netlib/TESTING/LIN/dqlt02.f
+++ b/lapack-netlib/TESTING/LIN/dqlt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> DQLT02 tests DORGQL, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QL factorization of an m-by-n matrix A, DQLT02 generates
diff --git a/lapack-netlib/TESTING/LIN/dqrt02.f b/lapack-netlib/TESTING/LIN/dqrt02.f
index d0e4349d1..d41c7b324 100644
--- a/lapack-netlib/TESTING/LIN/dqrt02.f
+++ b/lapack-netlib/TESTING/LIN/dqrt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> DQRT02 tests DORGQR, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QR factorization of an m-by-n matrix A, DQRT02 generates
diff --git a/lapack-netlib/TESTING/LIN/drqt02.f b/lapack-netlib/TESTING/LIN/drqt02.f
index 7fbb6a6d4..faf639109 100644
--- a/lapack-netlib/TESTING/LIN/drqt02.f
+++ b/lapack-netlib/TESTING/LIN/drqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> DRQT02 tests DORGRQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the RQ factorization of an m-by-n matrix A, DRQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/dsyt01_3.f b/lapack-netlib/TESTING/LIN/dsyt01_3.f
index 2a7d9d142..060a9caed 100644
--- a/lapack-netlib/TESTING/LIN/dsyt01_3.f
+++ b/lapack-netlib/TESTING/LIN/dsyt01_3.f
@@ -183,7 +183,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL DSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/schktp.f b/lapack-netlib/TESTING/LIN/schktp.f
index ff05c1d97..a5243f651 100644
--- a/lapack-netlib/TESTING/LIN/schktp.f
+++ b/lapack-netlib/TESTING/LIN/schktp.f
@@ -86,7 +86,7 @@
 *> \verbatim
 *>          NMAX is INTEGER
 *>          The leading dimension of the work arrays.  NMAX >= the
-*>          maximumm value of N in NVAL.
+*>          maximum value of N in NVAL.
 *> \endverbatim
 *>
 *> \param[out] AP
diff --git a/lapack-netlib/TESTING/LIN/serrsy.f b/lapack-netlib/TESTING/LIN/serrsy.f
index c562b417b..ab422ba5c 100644
--- a/lapack-netlib/TESTING/LIN/serrsy.f
+++ b/lapack-netlib/TESTING/LIN/serrsy.f
@@ -133,7 +133,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        SSYTRF
@@ -581,7 +581,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        SSPTRF
diff --git a/lapack-netlib/TESTING/LIN/serrsyx.f b/lapack-netlib/TESTING/LIN/serrsyx.f
index ed47c37bd..cffd28a52 100644
--- a/lapack-netlib/TESTING/LIN/serrsyx.f
+++ b/lapack-netlib/TESTING/LIN/serrsyx.f
@@ -137,7 +137,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        SSYTRF
@@ -527,7 +527,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        SSPTRF
diff --git a/lapack-netlib/TESTING/LIN/sgtt01.f b/lapack-netlib/TESTING/LIN/sgtt01.f
index 5d88c91f7..5d639af99 100644
--- a/lapack-netlib/TESTING/LIN/sgtt01.f
+++ b/lapack-netlib/TESTING/LIN/sgtt01.f
@@ -39,7 +39,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/sgtt02.f b/lapack-netlib/TESTING/LIN/sgtt02.f
index a4eb09f6b..6daea295f 100644
--- a/lapack-netlib/TESTING/LIN/sgtt02.f
+++ b/lapack-netlib/TESTING/LIN/sgtt02.f
@@ -41,14 +41,14 @@
 *> \verbatim
 *>          TRANS is CHARACTER
 *>          Specifies the form of the residual.
-*>          = 'N':  B - A * X  (No transpose)
+*>          = 'N':  B - A    * X  (No transpose)
 *>          = 'T':  B - A**T * X  (Transpose)
 *>          = 'C':  B - A**H * X  (Conjugate transpose = Transpose)
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/slqt02.f b/lapack-netlib/TESTING/LIN/slqt02.f
index d59a8a01c..1c7bd4f0d 100644
--- a/lapack-netlib/TESTING/LIN/slqt02.f
+++ b/lapack-netlib/TESTING/LIN/slqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> SLQT02 tests SORGLQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the LQ factorization of an m-by-n matrix A, SLQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/sptt01.f b/lapack-netlib/TESTING/LIN/sptt01.f
index fef18dd2b..5b33bfad5 100644
--- a/lapack-netlib/TESTING/LIN/sptt01.f
+++ b/lapack-netlib/TESTING/LIN/sptt01.f
@@ -35,7 +35,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/sptt02.f b/lapack-netlib/TESTING/LIN/sptt02.f
index 51a6a074a..2bd5535aa 100644
--- a/lapack-netlib/TESTING/LIN/sptt02.f
+++ b/lapack-netlib/TESTING/LIN/sptt02.f
@@ -35,7 +35,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/sqlt02.f b/lapack-netlib/TESTING/LIN/sqlt02.f
index 5d381b950..f26352eb3 100644
--- a/lapack-netlib/TESTING/LIN/sqlt02.f
+++ b/lapack-netlib/TESTING/LIN/sqlt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> SQLT02 tests SORGQL, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QL factorization of an m-by-n matrix A, SQLT02 generates
diff --git a/lapack-netlib/TESTING/LIN/sqrt02.f b/lapack-netlib/TESTING/LIN/sqrt02.f
index 72163f0a9..44b9c6270 100644
--- a/lapack-netlib/TESTING/LIN/sqrt02.f
+++ b/lapack-netlib/TESTING/LIN/sqrt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> SQRT02 tests SORGQR, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QR factorization of an m-by-n matrix A, SQRT02 generates
diff --git a/lapack-netlib/TESTING/LIN/srqt02.f b/lapack-netlib/TESTING/LIN/srqt02.f
index ca0594f7a..a33c98ba4 100644
--- a/lapack-netlib/TESTING/LIN/srqt02.f
+++ b/lapack-netlib/TESTING/LIN/srqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> SRQT02 tests SORGRQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the RQ factorization of an m-by-n matrix A, SRQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/ssyt01_3.f b/lapack-netlib/TESTING/LIN/ssyt01_3.f
index 147553db9..951fcb7d6 100644
--- a/lapack-netlib/TESTING/LIN/ssyt01_3.f
+++ b/lapack-netlib/TESTING/LIN/ssyt01_3.f
@@ -183,7 +183,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL SSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/zchktp.f b/lapack-netlib/TESTING/LIN/zchktp.f
index 1798c24e7..ab46f5ce5 100644
--- a/lapack-netlib/TESTING/LIN/zchktp.f
+++ b/lapack-netlib/TESTING/LIN/zchktp.f
@@ -87,7 +87,7 @@
 *> \verbatim
 *>          NMAX is INTEGER
 *>          The leading dimension of the work arrays.  NMAX >= the
-*>          maximumm value of N in NVAL.
+*>          maximum value of N in NVAL.
 *> \endverbatim
 *>
 *> \param[out] AP
diff --git a/lapack-netlib/TESTING/LIN/zdrvab.f b/lapack-netlib/TESTING/LIN/zdrvab.f
index 130515959..772eb08af 100644
--- a/lapack-netlib/TESTING/LIN/zdrvab.f
+++ b/lapack-netlib/TESTING/LIN/zdrvab.f
@@ -348,7 +348,7 @@
                CALL ZGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK,
      $                      LDA, RWORK, RESULT( 1 ) )
 *
-*              Check if the test passes the tesing.
+*              Check if the test passes the testing.
 *              Print information about the tests that did not
 *              pass the testing.
 *
diff --git a/lapack-netlib/TESTING/LIN/zdrvac.f b/lapack-netlib/TESTING/LIN/zdrvac.f
index 20f8eb1e5..bbf73a263 100644
--- a/lapack-netlib/TESTING/LIN/zdrvac.f
+++ b/lapack-netlib/TESTING/LIN/zdrvac.f
@@ -367,7 +367,7 @@
                   CALL ZPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK,
      $               LDA, RWORK, RESULT( 1 ) )
 *
-*                 Check if the test passes the tesing.
+*                 Check if the test passes the testing.
 *                 Print information about the tests that did not
 *                 pass the testing.
 *
diff --git a/lapack-netlib/TESTING/LIN/zerrhe.f b/lapack-netlib/TESTING/LIN/zerrhe.f
index e49e5037c..40dd25d75 100644
--- a/lapack-netlib/TESTING/LIN/zerrhe.f
+++ b/lapack-netlib/TESTING/LIN/zerrhe.f
@@ -135,7 +135,7 @@
       IF( LSAMEN( 2, C2, 'HE' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite matrix with patrial
+*        of a Hermitian indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZHETRF
@@ -580,7 +580,7 @@
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite packed matrix with patrial
+*        of a Hermitian indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZHPTRF
diff --git a/lapack-netlib/TESTING/LIN/zerrhex.f b/lapack-netlib/TESTING/LIN/zerrhex.f
index b6e8b77ef..bdcdefff7 100644
--- a/lapack-netlib/TESTING/LIN/zerrhex.f
+++ b/lapack-netlib/TESTING/LIN/zerrhex.f
@@ -138,7 +138,7 @@
       OK = .TRUE.
 *
 *     Test error exits of the routines that use factorization
-*     of a Hermitian indefinite matrix with patrial
+*     of a Hermitian indefinite matrix with partial
 *     (Bunch-Kaufman) diagonal pivoting method.
 *
       IF( LSAMEN( 2, C2, 'HE' ) ) THEN
@@ -526,7 +526,7 @@
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite packed matrix with patrial
+*        of a Hermitian indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZHPTRF
diff --git a/lapack-netlib/TESTING/LIN/zerrsy.f b/lapack-netlib/TESTING/LIN/zerrsy.f
index a50f9d24e..932df1936 100644
--- a/lapack-netlib/TESTING/LIN/zerrsy.f
+++ b/lapack-netlib/TESTING/LIN/zerrsy.f
@@ -132,7 +132,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZSYTRF
@@ -471,7 +471,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        ZSPTRF
diff --git a/lapack-netlib/TESTING/LIN/zerrsyx.f b/lapack-netlib/TESTING/LIN/zerrsyx.f
index 23d2a5a2b..9d5e71288 100644
--- a/lapack-netlib/TESTING/LIN/zerrsyx.f
+++ b/lapack-netlib/TESTING/LIN/zerrsyx.f
@@ -139,7 +139,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZSYTRF
@@ -525,7 +525,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        ZSPTRF
diff --git a/lapack-netlib/TESTING/LIN/zgtt01.f b/lapack-netlib/TESTING/LIN/zgtt01.f
index c63fdbc25..a8ac13d2d 100644
--- a/lapack-netlib/TESTING/LIN/zgtt01.f
+++ b/lapack-netlib/TESTING/LIN/zgtt01.f
@@ -39,7 +39,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/zgtt02.f b/lapack-netlib/TESTING/LIN/zgtt02.f
index 7362967be..f86fe9244 100644
--- a/lapack-netlib/TESTING/LIN/zgtt02.f
+++ b/lapack-netlib/TESTING/LIN/zgtt02.f
@@ -40,14 +40,14 @@
 *> \verbatim
 *>          TRANS is CHARACTER
 *>          Specifies the form of the residual.
-*>          = 'N':  B - A * X     (No transpose)
+*>          = 'N':  B - A    * X  (No transpose)
 *>          = 'T':  B - A**T * X  (Transpose)
 *>          = 'C':  B - A**H * X  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/zhet01_3.f b/lapack-netlib/TESTING/LIN/zhet01_3.f
index 0a76404d6..5beed9042 100644
--- a/lapack-netlib/TESTING/LIN/zhet01_3.f
+++ b/lapack-netlib/TESTING/LIN/zhet01_3.f
@@ -188,7 +188,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/zlqt02.f b/lapack-netlib/TESTING/LIN/zlqt02.f
index c55d76ccd..9ba98ff1f 100644
--- a/lapack-netlib/TESTING/LIN/zlqt02.f
+++ b/lapack-netlib/TESTING/LIN/zlqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> ZLQT02 tests ZUNGLQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the LQ factorization of an m-by-n matrix A, ZLQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/zptt01.f b/lapack-netlib/TESTING/LIN/zptt01.f
index e842c7e46..9f2359c2a 100644
--- a/lapack-netlib/TESTING/LIN/zptt01.f
+++ b/lapack-netlib/TESTING/LIN/zptt01.f
@@ -36,7 +36,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/zptt02.f b/lapack-netlib/TESTING/LIN/zptt02.f
index 6e3a8aed0..8a1e2961c 100644
--- a/lapack-netlib/TESTING/LIN/zptt02.f
+++ b/lapack-netlib/TESTING/LIN/zptt02.f
@@ -46,7 +46,7 @@
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/zqlt02.f b/lapack-netlib/TESTING/LIN/zqlt02.f
index 1f84cfa5a..7cb889931 100644
--- a/lapack-netlib/TESTING/LIN/zqlt02.f
+++ b/lapack-netlib/TESTING/LIN/zqlt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> ZQLT02 tests ZUNGQL, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QL factorization of an m-by-n matrix A, ZQLT02 generates
diff --git a/lapack-netlib/TESTING/LIN/zqrt02.f b/lapack-netlib/TESTING/LIN/zqrt02.f
index 2dbefaf84..a32703c49 100644
--- a/lapack-netlib/TESTING/LIN/zqrt02.f
+++ b/lapack-netlib/TESTING/LIN/zqrt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> ZQRT02 tests ZUNGQR, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QR factorization of an m-by-n matrix A, ZQRT02 generates
diff --git a/lapack-netlib/TESTING/LIN/zrqt02.f b/lapack-netlib/TESTING/LIN/zrqt02.f
index 548321d00..c44e04cb6 100644
--- a/lapack-netlib/TESTING/LIN/zrqt02.f
+++ b/lapack-netlib/TESTING/LIN/zrqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> ZRQT02 tests ZUNGRQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the RQ factorization of an m-by-n matrix A, ZRQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/zsyt01_3.f b/lapack-netlib/TESTING/LIN/zsyt01_3.f
index 202488db9..a572353f2 100644
--- a/lapack-netlib/TESTING/LIN/zsyt01_3.f
+++ b/lapack-netlib/TESTING/LIN/zsyt01_3.f
@@ -188,7 +188,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *

From 0c38ebd5990f7c44529cabbbadf7806eb6afa0c3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 17:31:55 +0200
Subject: [PATCH 078/718] Extend tests for error exit sysv/sytd2/gehd2
 (Reference-LAPACK PR 795)

---
 lapack-netlib/TESTING/EIG/cerrhs.f | 70 +++++++++++++++++++++++++++--
 lapack-netlib/TESTING/EIG/cerrst.f | 18 +++++++-
 lapack-netlib/TESTING/EIG/derrhs.f | 72 ++++++++++++++++++++++++++++--
 lapack-netlib/TESTING/EIG/derrst.f | 24 +++++++---
 lapack-netlib/TESTING/EIG/serrhs.f | 72 ++++++++++++++++++++++++++++--
 lapack-netlib/TESTING/EIG/serrst.f | 18 +++++++-
 lapack-netlib/TESTING/EIG/zerrhs.f | 70 +++++++++++++++++++++++++++--
 lapack-netlib/TESTING/EIG/zerrst.f | 18 +++++++-
 8 files changed, 337 insertions(+), 25 deletions(-)

diff --git a/lapack-netlib/TESTING/EIG/cerrhs.f b/lapack-netlib/TESTING/EIG/cerrhs.f
index 0568a6d78..2dd86b8c9 100644
--- a/lapack-netlib/TESTING/EIG/cerrhs.f
+++ b/lapack-netlib/TESTING/EIG/cerrhs.f
@@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CUNGHR,
-*> CUNMHR, CHSEQR, CHSEIN, and CTREVC.
+*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CGEHD2,
+*> CUNGHR, CUNMHR, CHSEQR, CHSEIN, CTREVC, and CTREVC3.
 *> \endverbatim
 *
 *  Arguments:
@@ -86,7 +86,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, CGEBAK, CGEBAL, CGEHRD, CHSEIN, CHSEQR,
-     $                   CUNGHR, CUNMHR, CTREVC
+     $                   CUNGHR, CUNMHR, CTREVC, CTREVC3, CGEHD2
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL
@@ -193,6 +193,29 @@
          CALL CHKXER( 'CGEHRD', INFOT, NOUT, LERR, OK )
          NT = NT + 7
 *
+*        CGEHD2
+*
+         SRNAMT = 'CGEHD2'
+         INFOT = 1
+         CALL CGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CGEHD2( 0, 0, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CGEHD2( 0, 2, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL CGEHD2( 1, 1, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL CGEHD2( 0, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CGEHD2( 2, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 6
+*
 *        CUNGHR
 *
          SRNAMT = 'CUNGHR'
@@ -398,6 +421,47 @@
      $                RW, INFO )
          CALL CHKXER( 'CTREVC', INFOT, NOUT, LERR, OK )
          NT = NT + 7
+*
+*        CTREVC3
+*
+         SRNAMT = 'CTREVC3'
+         INFOT = 1
+         CALL CTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL CTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL CTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 2, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 16
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         NT = NT + 9
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f
index 18b5cad36..ba97afbe5 100644
--- a/lapack-netlib/TESTING/EIG/cerrst.f
+++ b/lapack-netlib/TESTING/EIG/cerrst.f
@@ -21,7 +21,7 @@
 *>
 *> \verbatim
 *>
-*> CERRST tests the error exits for CHETRD, CUNGTR, CUNMTR, CHPTRD,
+*> CERRST tests the error exits for CHETRD, CHETD2, CUNGTR, CUNMTR, CHPTRD,
 *> CUNGTR, CUPMTR, CSTEQR, CSTEIN, CPTEQR, CHBTRD,
 *> CHEEV, CHEEVX, CHEEVD, CHBEV, CHBEVX, CHBEVD,
 *> CHPEV, CHPEVX, CHPEVD, and CSTEDC.
@@ -94,7 +94,7 @@
       EXTERNAL           CHBEV, CHBEVD, CHBEVX, CHBTRD, CHEEV, CHEEVD,
      $                   CHEEVR, CHEEVX, CHETRD, CHKXER, CHPEV, CHPEVD,
      $                   CHPEVX, CHPTRD, CPTEQR, CSTEDC, CSTEIN, CSTEQR,
-     $                   CUNGTR, CUNMTR, CUPGTR, CUPMTR,
+     $                   CUNGTR, CUNMTR, CUPGTR, CUPMTR, CHETD2,
      $                   CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE,
      $                   CHEEV_2STAGE, CHBEV_2STAGE, CHBEVD_2STAGE,
      $                   CHBEVX_2STAGE, CHETRD_2STAGE, CHETRD_HE2HB,
@@ -156,6 +156,20 @@
          CALL CHKXER( 'CHETRD', INFOT, NOUT, LERR, OK )
          NT = NT + 4
 *
+*        CHETD2
+*
+         SRNAMT = 'CHETD2'
+         INFOT = 1
+         CALL CHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL CHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 3
+*
 *        CHETRD_2STAGE
 *
          SRNAMT = 'CHETRD_2STAGE'
diff --git a/lapack-netlib/TESTING/EIG/derrhs.f b/lapack-netlib/TESTING/EIG/derrhs.f
index fec41c0f0..583bebc9b 100644
--- a/lapack-netlib/TESTING/EIG/derrhs.f
+++ b/lapack-netlib/TESTING/EIG/derrhs.f
@@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> DERRHS tests the error exits for DGEBAK, SGEBAL, SGEHRD, DORGHR,
-*> DORMHR, DHSEQR, SHSEIN, and DTREVC.
+*> DERRHS tests the error exits for DGEBAK, DGEBAL, DGEHRD, DGEHD2,
+*> DORGHR, DORMHR, DHSEQR, DHSEIN, DTREVC, and DTREVC3.
 *> \endverbatim
 *
 *  Arguments:
@@ -86,7 +86,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, DGEBAK, DGEBAL, DGEHRD, DHSEIN, DHSEQR,
-     $                   DORGHR, DORMHR, DTREVC
+     $                   DORGHR, DORMHR, DTREVC, DTREVC3, DGEHD2
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE
@@ -194,6 +194,29 @@
          CALL CHKXER( 'DGEHRD', INFOT, NOUT, LERR, OK )
          NT = NT + 7
 *
+*        DGEHD2
+*
+         SRNAMT = 'DGEHD2'
+         INFOT = 1
+         CALL DGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DGEHD2( 0, 0, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DGEHD2( 0, 2, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL DGEHD2( 1, 1, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL DGEHD2( 0, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL DGEHD2( 2, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 6
+*
 *        DORGHR
 *
          SRNAMT = 'DORGHR'
@@ -328,7 +351,11 @@
          CALL DHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
      $                INFO )
          CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK )
-         NT = NT + 9
+         INFOT = 13
+         CALL DHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
+     $                INFO )
+         CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK )
+         NT = NT + 10
 *
 *        DHSEIN
 *
@@ -399,6 +426,43 @@
      $                INFO )
          CALL CHKXER( 'DTREVC', INFOT, NOUT, LERR, OK )
          NT = NT + 7
+*
+*        DTREVC3
+*
+         SRNAMT = 'DTREVC3'
+         INFOT = 1
+         CALL DTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL DTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL DTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL DTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 2, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         NT = NT + 8
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f
index f297e5a7d..a55b6eea9 100644
--- a/lapack-netlib/TESTING/EIG/derrst.f
+++ b/lapack-netlib/TESTING/EIG/derrst.f
@@ -21,10 +21,10 @@
 *>
 *> \verbatim
 *>
-*> DERRST tests the error exits for DSYTRD, DORGTR, DORMTR, DSPTRD,
-*> DOPGTR, DOPMTR, DSTEQR, SSTERF, SSTEBZ, SSTEIN, DPTEQR, DSBTRD,
-*> DSYEV, SSYEVX, SSYEVD, DSBEV, SSBEVX, SSBEVD,
-*> DSPEV, SSPEVX, SSPEVD, DSTEV, SSTEVX, SSTEVD, and SSTEDC.
+*> DERRST tests the error exits for DSYTRD, DSYTD2, DORGTR, DORMTR, DSPTRD,
+*> DOPGTR, DOPMTR, DSTEQR, DSTERF, DSTEBZ, DSTEIN, DPTEQR, DSBTRD,
+*> DSYEV, DSYEVX, DSYEVD, DSBEV, DSBEVX, DSBEVD,
+*> DSPEV, DSPEVX, DSPEVD, DSTEV, DSTEVX, DSTEVD, and DSTEDC.
 *> DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE,
 *> DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE,
 *> DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB,
@@ -95,7 +95,7 @@
      $                   DSBEV, DSBEVD, DSBEVX, DSBTRD, DSPEV, DSPEVD,
      $                   DSPEVX, DSPTRD, DSTEBZ, DSTEDC, DSTEIN, DSTEQR,
      $                   DSTERF, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSYEV,
-     $                   DSYEVD, DSYEVR, DSYEVX, DSYTRD,
+     $                   DSYEVD, DSYEVR, DSYEVX, DSYTRD, DSYTD2,
      $                   DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE,
      $                   DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE,
      $                   DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB,
@@ -157,6 +157,20 @@
          CALL CHKXER( 'DSYTRD', INFOT, NOUT, LERR, OK )
          NT = NT + 4
 *
+*        DSYTD2
+*
+         SRNAMT = 'DSYTD2'
+         INFOT = 1
+         CALL DSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 3
+*
 *        DSYTRD_2STAGE
 *
          SRNAMT = 'DSYTRD_2STAGE'
diff --git a/lapack-netlib/TESTING/EIG/serrhs.f b/lapack-netlib/TESTING/EIG/serrhs.f
index 8f0ff98a1..89b7303cd 100644
--- a/lapack-netlib/TESTING/EIG/serrhs.f
+++ b/lapack-netlib/TESTING/EIG/serrhs.f
@@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SORGHR,
-*> SORMHR, SHSEQR, SHSEIN, and STREVC.
+*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SGEHD2,
+*> SORGHR, SORMHR, SHSEQR, SHSEIN, STREVC, and STREVC3.
 *> \endverbatim
 *
 *  Arguments:
@@ -85,7 +85,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, SGEBAK, SGEBAL, SGEHRD, SHSEIN, SHSEQR,
-     $                   SORGHR, SORMHR, STREVC
+     $                   SORGHR, SORMHR, STREVC, STREVC3, SGEHD2
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL
@@ -193,6 +193,29 @@
          CALL CHKXER( 'SGEHRD', INFOT, NOUT, LERR, OK )
          NT = NT + 7
 *
+*        SGEHD2
+*
+         SRNAMT = 'SGEHD2'
+         INFOT = 1
+         CALL SGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL SGEHD2( 0, 0, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL SGEHD2( 0, 2, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL SGEHD2( 1, 1, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL SGEHD2( 0, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SGEHD2( 2, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 6
+*
 *        SORGHR
 *
          SRNAMT = 'SORGHR'
@@ -327,7 +350,11 @@
          CALL SHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
      $                INFO )
          CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK )
-         NT = NT + 9
+         INFOT = 13
+         CALL SHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
+     $                INFO )
+         CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK )
+         NT = NT + 10
 *
 *        SHSEIN
 *
@@ -398,6 +425,43 @@
      $                INFO )
          CALL CHKXER( 'STREVC', INFOT, NOUT, LERR, OK )
          NT = NT + 7
+*
+*        STREVC3
+*
+         SRNAMT = 'STREVC3'
+         INFOT = 1
+         CALL STREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL STREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL STREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL STREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL STREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 2, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         NT = NT + 8
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f
index afb6d4faf..8c9c0f306 100644
--- a/lapack-netlib/TESTING/EIG/serrst.f
+++ b/lapack-netlib/TESTING/EIG/serrst.f
@@ -21,7 +21,7 @@
 *>
 *> \verbatim
 *>
-*> SERRST tests the error exits for SSYTRD, SORGTR, SORMTR, SSPTRD,
+*> SERRST tests the error exits for SSYTRD, SSYTD2, SORGTR, SORMTR, SSPTRD,
 *> SOPGTR, SOPMTR, SSTEQR, SSTERF, SSTEBZ, SSTEIN, SPTEQR, SSBTRD,
 *> SSYEV, SSYEVX, SSYEVD, SSBEV, SSBEVX, SSBEVD,
 *> SSPEV, SSPEVX, SSPEVD, SSTEV, SSTEVX, SSTEVD, and SSTEDC.
@@ -95,7 +95,7 @@
      $                   SSBEV, SSBEVD, SSBEVX, SSBTRD, SSPEV, SSPEVD,
      $                   SSPEVX, SSPTRD, SSTEBZ, SSTEDC, SSTEIN, SSTEQR,
      $                   SSTERF, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSYEV,
-     $                   SSYEVD, SSYEVR, SSYEVX, SSYTRD,
+     $                   SSYEVD, SSYEVR, SSYEVX, SSYTRD, SSYTD2,
      $                   SSYEVD_2STAGE, SSYEVR_2STAGE, SSYEVX_2STAGE,
      $                   SSYEV_2STAGE, SSBEV_2STAGE, SSBEVD_2STAGE,
      $                   SSBEVX_2STAGE, SSYTRD_2STAGE, SSYTRD_SY2SB,
@@ -157,6 +157,20 @@
          CALL CHKXER( 'SSYTRD', INFOT, NOUT, LERR, OK )
          NT = NT + 4
 *
+*        SSYTD2
+*
+         SRNAMT = 'SSYTD2'
+         INFOT = 1
+         CALL SSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 3
+*
 *        SSYTRD_2STAGE
 *
          SRNAMT = 'SSYTRD_2STAGE'
diff --git a/lapack-netlib/TESTING/EIG/zerrhs.f b/lapack-netlib/TESTING/EIG/zerrhs.f
index 582338947..3a70a556e 100644
--- a/lapack-netlib/TESTING/EIG/zerrhs.f
+++ b/lapack-netlib/TESTING/EIG/zerrhs.f
@@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> ZERRHS tests the error exits for ZGEBAK, CGEBAL, CGEHRD, ZUNGHR,
-*> ZUNMHR, ZHSEQR, CHSEIN, and ZTREVC.
+*> ZERRHS tests the error exits for ZGEBAK, ZGEBAL, ZGEHRD, ZGEHD2,
+*> ZUNGHR, ZUNMHR, ZHSEQR, ZHSEIN, ZTREVC, and ZTREVC3.
 *> \endverbatim
 *
 *  Arguments:
@@ -86,7 +86,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, ZGEBAK, ZGEBAL, ZGEHRD, ZHSEIN, ZHSEQR,
-     $                   ZTREVC, ZUNGHR, ZUNMHR
+     $                   ZUNGHR, ZUNMHR, ZTREVC, ZTREVC3
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE
@@ -193,6 +193,29 @@
          CALL CHKXER( 'ZGEHRD', INFOT, NOUT, LERR, OK )
          NT = NT + 7
 *
+*        ZGEHD2
+*
+         SRNAMT = 'ZGEHD2'
+         INFOT = 1
+         CALL ZGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZGEHD2( 0, 0, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZGEHD2( 0, 2, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZGEHD2( 1, 1, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZGEHD2( 0, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZGEHD2( 2, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 6
+*
 *        ZUNGHR
 *
          SRNAMT = 'ZUNGHR'
@@ -389,6 +412,47 @@
      $                INFO )
          CALL CHKXER( 'ZTREVC', INFOT, NOUT, LERR, OK )
          NT = NT + 7
+*
+*        ZTREVC3
+*
+         SRNAMT = 'ZTREVC3'
+         INFOT = 1
+         CALL ZTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL ZTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL ZTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 2, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 16
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         NT = NT + 9
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f
index 5b0e6f820..948f94bc2 100644
--- a/lapack-netlib/TESTING/EIG/zerrst.f
+++ b/lapack-netlib/TESTING/EIG/zerrst.f
@@ -23,7 +23,7 @@
 *>
 *> \verbatim
 *>
-*> ZERRST tests the error exits for ZHETRD, ZUNGTR, CUNMTR, ZHPTRD,
+*> ZERRST tests the error exits for ZHETRD, ZHETD2, ZUNGTR, CUNMTR, ZHPTRD,
 *> ZUNGTR, ZUPMTR, ZSTEQR, CSTEIN, ZPTEQR, ZHBTRD,
 *> ZHEEV, CHEEVX, CHEEVD, ZHBEV, CHBEVX, CHBEVD,
 *> ZHPEV, CHPEVX, CHPEVD, and ZSTEDC.
@@ -95,7 +95,7 @@
       EXTERNAL           CHKXER, ZHBEV, ZHBEVD, ZHBEVX, ZHBTRD, ZHEEV,
      $                   ZHEEVD, ZHEEVR, ZHEEVX, ZHETRD, ZHPEV, ZHPEVD,
      $                   ZHPEVX, ZHPTRD, ZPTEQR, ZSTEDC, ZSTEIN, ZSTEQR,
-     $                   ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR,
+     $                   ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR, ZHETD2,
      $                   ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE,
      $                   ZHEEV_2STAGE, ZHBEV_2STAGE, ZHBEVD_2STAGE,
      $                   ZHBEVX_2STAGE, ZHETRD_2STAGE
@@ -156,6 +156,20 @@
          CALL CHKXER( 'ZHETRD', INFOT, NOUT, LERR, OK )
          NT = NT + 4
 *
+*        ZHETD2
+*
+         SRNAMT = 'ZHETD2'
+         INFOT = 1
+         CALL ZHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 3
+*
 *        ZHETRD_2STAGE
 *
          SRNAMT = 'ZHETRD_2STAGE'

From ca1791cfebe72ad5f92b538cc6a2c2a33cee72f1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 17:35:00 +0200
Subject: [PATCH 079/718] Extend tests for error exit sysv/sytd2/gehd2
 (Reference-LAPACK PR 795)

---
 lapack-netlib/TESTING/LIN/cerrvx.f           | 57 +++++++++++--
 lapack-netlib/TESTING/LIN/derrvx.f           | 44 ++++++----
 lapack-netlib/TESTING/LIN/serrvx.f           | 24 +++++-
 lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f |  2 +-
 lapack-netlib/TESTING/LIN/zerrvx.f           | 87 +++++++++++++++-----
 5 files changed, 167 insertions(+), 47 deletions(-)

diff --git a/lapack-netlib/TESTING/LIN/cerrvx.f b/lapack-netlib/TESTING/LIN/cerrvx.f
index 701abd161..548d09d0c 100644
--- a/lapack-netlib/TESTING/LIN/cerrvx.f
+++ b/lapack-netlib/TESTING/LIN/cerrvx.f
@@ -87,7 +87,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGBSV, CGBSVX, CGESV, CGESVX, CGTSV, CGTSVX,
-     $                   CHESV, CHESV_RK ,CHESV_ROOK, CHESVX, CHKXER,
+     $                   CHESV, CHESV_RK, CHESV_ROOK, CHESVX, CHKXER,
      $                   CHPSV, CHPSVX, CPBSV, CPBSVX, CPOSV, CPOSVX,
      $                   CPPSV, CPPSVX, CPTSV, CPTSVX, CSPSV, CSPSVX,
      $                   CSYSV, CSYSV_AA, CSYSV_RK, CSYSV_ROOK,
@@ -651,6 +651,9 @@
          INFOT = 3
          CALL CHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK )
@@ -710,9 +713,15 @@
          INFOT = 3
          CALL CHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL CHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN
 *
@@ -733,16 +742,44 @@
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+*
+      ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
+*
+*        CSYSV_AASEN
+*
+         SRNAMT = 'CSYSV_AA'
+         INFOT = 1
+         CALL CSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL CSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL CSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL CSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
 *
@@ -763,14 +800,18 @@
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
 *
@@ -834,6 +875,9 @@
          INFOT = 3
          CALL CSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK )
@@ -898,6 +942,9 @@
          INFOT = 3
          CALL CSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK )
diff --git a/lapack-netlib/TESTING/LIN/derrvx.f b/lapack-netlib/TESTING/LIN/derrvx.f
index 447b00bc4..f2d29f7a3 100644
--- a/lapack-netlib/TESTING/LIN/derrvx.f
+++ b/lapack-netlib/TESTING/LIN/derrvx.f
@@ -699,21 +699,27 @@
 *
       ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
 *
-*        DSYSV_AA
-*
-        SRNAMT = 'DSYSV_AA'
-        INFOT = 1
-        CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 2
-        CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 3
-        CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 8
-        CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+*        DSYSV_AASEN
+*
+         SRNAMT = 'DSYSV_AA'
+         INFOT = 1
+         CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL DSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL DSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
 *
@@ -734,14 +740,18 @@
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
 *
diff --git a/lapack-netlib/TESTING/LIN/serrvx.f b/lapack-netlib/TESTING/LIN/serrvx.f
index c3db47332..440f9113e 100644
--- a/lapack-netlib/TESTING/LIN/serrvx.f
+++ b/lapack-netlib/TESTING/LIN/serrvx.f
@@ -582,6 +582,9 @@
          INFOT = 3
          CALL SSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL SSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK )
@@ -647,6 +650,9 @@
          INFOT = 3
          CALL SSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL SSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK )
@@ -694,7 +700,7 @@
 *
       ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
 *
-*        SSYSV_AA
+*        SSYSV_AASEN
 *
          SRNAMT = 'SSYSV_AA'
          INFOT = 1
@@ -706,13 +712,19 @@
          INFOT = 3
          CALL SSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL SSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL SSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
 *
-*        DSYSV_AASEN_2STAGE
+*        SSYSV_AASEN_2STAGE
 *
          SRNAMT = 'SSYSV_AA_2STAGE'
          INFOT = 1
@@ -729,14 +741,18 @@
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
 *
diff --git a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f
index 21497477c..5406a76bf 100644
--- a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f
@@ -229,7 +229,7 @@
 *     Test path
 *
       PATH( 1: 1 ) = 'Zomplex precision'
-      PATH( 2: 3 ) = 'H2'
+      PATH( 2: 3 ) = 'S2'
 *
 *     Path to generate matrices
 *
diff --git a/lapack-netlib/TESTING/LIN/zerrvx.f b/lapack-netlib/TESTING/LIN/zerrvx.f
index ea7823df3..80b3aaf4a 100644
--- a/lapack-netlib/TESTING/LIN/zerrvx.f
+++ b/lapack-netlib/TESTING/LIN/zerrvx.f
@@ -653,6 +653,9 @@
          INFOT = 3
          CALL ZHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL ZHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK )
@@ -700,21 +703,27 @@
 *
       ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN
 *
-*        ZHESV_AA
-*
-        SRNAMT = 'ZHESV_AA'
-        INFOT = 1
-        CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 2
-        CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 3
-        CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 8
-        CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+*        ZHESV_AASEN
+*
+         SRNAMT = 'ZHESV_AA'
+         INFOT = 1
+         CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL ZHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN
 *
@@ -735,16 +744,44 @@
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+*
+      ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
+*
+*        ZSYSV_AASEN
+*
+         SRNAMT = 'ZSYSV_AA'
+         INFOT = 1
+         CALL ZSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL ZSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL ZSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
 *
@@ -765,17 +802,21 @@
          CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-**
+*
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *        ZHPSV
@@ -836,6 +877,9 @@
          INFOT = 3
          CALL ZSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL ZSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK )
@@ -900,6 +944,9 @@
          INFOT = 3
          CALL ZSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL ZSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK )

From 2bbd61950e504e9ae3b0df2a1c6c20946bc4183d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 23:35:57 +0200
Subject: [PATCH 080/718] Add interfaces for [cz]unhr_col and [sd]orhr_col
 (Reference-LAPACK PR 827)

---
 lapack-netlib/LAPACKE/include/lapacke.h | 36 ++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 9998b1504..9bd228064 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -12727,7 +12727,41 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i
                                lapack_int lda, lapack_complex_double* tb,
                                lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2,
                                lapack_complex_double* b, lapack_int ldb );
-                   
+
+//LAPACK 3.10.0
+lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, float* a,
+                              lapack_int lda, float* t,
+                              lapack_int ldt, float* d );
+lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, float* a,
+                                   lapack_int lda, float* t,
+                                   lapack_int ldt, float* d );
+lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, double* a,
+                              lapack_int lda, double* t,
+                              lapack_int ldt, double* d );
+lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, double* a,
+                                   lapack_int lda, double* t,
+                                   lapack_int ldt, double* d );
+lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, lapack_complex_float* a,
+                              lapack_int lda, lapack_complex_float* t,
+                              lapack_int ldt, lapack_complex_float* d );
+lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, lapack_complex_float* a,
+                                   lapack_int lda, lapack_complex_float* t,
+                                   lapack_int ldt, lapack_complex_float* d );
+lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, lapack_complex_double* a,
+                              lapack_int lda, lapack_complex_double* t,
+                              lapack_int ldt, lapack_complex_double* d );
+lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, lapack_complex_double* a,
+                                   lapack_int lda, lapack_complex_double* t,
+                                   lapack_int ldt, lapack_complex_double* d );
+
 /* APIs for set/get nancheck flags */
 void LAPACKE_set_nancheck( int flag );
 int LAPACKE_get_nancheck( void );

From bc967e770906a74b6a8de8701b9f22f3815c3d91 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 23:41:46 +0200
Subject: [PATCH 081/718] Add interfaces for [cz]unhr_col and [sd]orhr_col
 (Reference-LAPACK PR 827)

---
 lapack-netlib/LAPACKE/include/lapack.h | 32 ++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index 3e7f9de5b..a0fcaa259 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -11517,6 +11517,22 @@ void LAPACK_sorgtsqr_row(
     float* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_dorhr_col LAPACK_GLOBAL(dorhr_col,DORHR_COL)
+void LAPACK_dorhr_col(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* nb, double* A,
+    lapack_int const* lda, double* T,
+    lapack_int const* ldt, double* D,
+    lapack_int* info );
+
+#define LAPACK_sorhr_col LAPACK_GLOBAL(sorhr_col,SORHR_COL)
+void LAPACK_sorhr_col(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* nb, float* A,
+    lapack_int const* lda, float* T,
+    lapack_int const* ldt, float* D,
+    lapack_int* info );
+
 #define LAPACK_dormbr_base LAPACK_GLOBAL(dormbr,DORMBR)
 void LAPACK_dormbr_base(
     char const* vect, char const* side, char const* trans,
@@ -22708,6 +22724,22 @@ void LAPACK_zungtsqr_row(
     lapack_complex_double* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_cunhr_col LAPACK_GLOBAL(cunhr_col,CUNHR_COL)
+void LAPACK_cunhr_col(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* nb, lapack_complex_float* A,
+    lapack_int const* lda, lapack_complex_float* T,
+    lapack_int const* ldt, lapack_complex_float* D,
+    lapack_int* info );
+
+#define LAPACK_zunhr_col LAPACK_GLOBAL(zunhr_col,ZUNHR_COL)
+void LAPACK_zunhr_col(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* nb, lapack_complex_double* A,
+    lapack_int const* lda, lapack_complex_double* T,
+    lapack_int const* ldt, lapack_complex_double* D,
+    lapack_int* info );
+
 #define LAPACK_cunmbr_base LAPACK_GLOBAL(cunmbr,CUNMBR)
 void LAPACK_cunmbr_base(
     char const* vect, char const* side, char const* trans,

From 4f10899f5aeb7308fe87998c0f61ee2bd5026807 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 23:43:18 +0200
Subject: [PATCH 082/718] Add interfaces for [cz]unhr_col and [sd]orhr_col
 (Reference-LAPACK PR 827)

---
 lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c | 24 +++++++
 .../LAPACKE/src/lapacke_cunhr_col_work.c      | 69 +++++++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c | 24 +++++++
 .../LAPACKE/src/lapacke_dorhr_col_work.c      | 69 +++++++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c | 24 +++++++
 .../LAPACKE/src/lapacke_sorhr_col_work.c      | 69 +++++++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c | 24 +++++++
 .../LAPACKE/src/lapacke_zunhr_col_work.c      | 69 +++++++++++++++++++
 8 files changed, 372 insertions(+)
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c

diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c
new file mode 100644
index 000000000..7ed1ad4c4
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, lapack_complex_float* a,
+                              lapack_int lda, lapack_complex_float* t,
+                              lapack_int ldt, lapack_complex_float* d)
+{
+  lapack_int info = 0;
+  if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+    LAPACKE_xerbla( "LAPACKE_cunhr_col", -1 );
+    return -1;
+  }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+  if( LAPACKE_get_nancheck() ) {
+    /* Optionally check input matrices for NaNs */
+    if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+      return -5;
+    }
+  }
+#endif
+  /* Call middle-level interface */
+  info = LAPACKE_cunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+  return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c
new file mode 100644
index 000000000..76b8366f0
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, lapack_complex_float* a,
+                                   lapack_int lda, lapack_complex_float* t,
+                                   lapack_int ldt, lapack_complex_float* d )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_cunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldt_t = MAX(1,MIN(nb,n));
+        lapack_complex_float* a_t = NULL;
+        lapack_complex_float* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+            return info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)
+            LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_float*)
+            LAPACKE_malloc( sizeof(lapack_complex_float) *
+                            ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t,
+                           ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c
new file mode 100644
index 000000000..1f37725e9
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, double* a,
+                              lapack_int lda, double* t,
+                              lapack_int ldt, double* d)
+{
+  lapack_int info = 0;
+  if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+    LAPACKE_xerbla( "LAPACKE_dorhr_col", -1 );
+    return -1;
+  }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+  if( LAPACKE_get_nancheck() ) {
+    /* Optionally check input matrices for NaNs */
+    if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
+      return -5;
+    }
+  }
+#endif
+  /* Call middle-level interface */
+  info = LAPACKE_dorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+  return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c
new file mode 100644
index 000000000..28b80cc02
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, double* a,
+                                   lapack_int lda, double* t,
+                                   lapack_int ldt, double* d )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldt_t = MAX(1,MIN(nb,n));
+        double* a_t = NULL;
+        double* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+            return info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)
+            LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (double*)
+            LAPACKE_malloc( sizeof(double) *
+                            ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t,
+                           ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c
new file mode 100644
index 000000000..60e6e7951
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, float* a,
+                              lapack_int lda, float* t,
+                              lapack_int ldt, float* d)
+{
+  lapack_int info = 0;
+  if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+    LAPACKE_xerbla( "LAPACKE_sorhr_col", -1 );
+    return -1;
+  }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+  if( LAPACKE_get_nancheck() ) {
+    /* Optionally check input matrices for NaNs */
+    if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) {
+      return -5;
+    }
+  }
+#endif
+  /* Call middle-level interface */
+  info = LAPACKE_sorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+  return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c
new file mode 100644
index 000000000..56d6a965e
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, float* a,
+                                   lapack_int lda, float* t,
+                                   lapack_int ldt, float* d )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_sorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldt_t = MAX(1,MIN(nb,n));
+        float* a_t = NULL;
+        float* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info );
+            return info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (float*)
+            LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (float*)
+            LAPACKE_malloc( sizeof(float) *
+                            ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_sorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t,
+                           ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c
new file mode 100644
index 000000000..7e2507daf
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, lapack_complex_double* a,
+                              lapack_int lda, lapack_complex_double* t,
+                              lapack_int ldt, lapack_complex_double* d)
+{
+  lapack_int info = 0;
+  if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+    LAPACKE_xerbla( "LAPACKE_zunhr_col", -1 );
+    return -1;
+  }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+  if( LAPACKE_get_nancheck() ) {
+    /* Optionally check input matrices for NaNs */
+    if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) {
+      return -5;
+    }
+  }
+#endif
+  /* Call middle-level interface */
+  info = LAPACKE_zunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+  return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c
new file mode 100644
index 000000000..b5e640177
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, lapack_complex_double* a,
+                                   lapack_int lda, lapack_complex_double* t,
+                                   lapack_int ldt, lapack_complex_double* d )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_zunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldt_t = MAX(1,MIN(nb,n));
+        lapack_complex_double* a_t = NULL;
+        lapack_complex_double* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info );
+            return info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_double*)
+            LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_double*)
+            LAPACKE_malloc( sizeof(lapack_complex_double) *
+                            ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_zunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, ldt, n, t_t, ldt_t, t,
+                           ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info );
+    }
+    return info;
+}

From 2182cc0ff5bf5b77701d6aab19e4942ef2c6061b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 23:48:21 +0200
Subject: [PATCH 083/718] Add interfaces for [cz]unhr_col and [sd]orhr_col
 (Reference-LAPACK PR 827)

---
 lapack-netlib/LAPACKE/src/CMakeLists.txt | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lapack-netlib/LAPACKE/src/CMakeLists.txt b/lapack-netlib/LAPACKE/src/CMakeLists.txt
index 4171a3bd4..89890f923 100644
--- a/lapack-netlib/LAPACKE/src/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/src/CMakeLists.txt
@@ -589,6 +589,8 @@ lapacke_cungtr.c
 lapacke_cungtr_work.c
 lapacke_cungtsqr_row.c
 lapacke_cungtsqr_row_work.c
+lapacke_cunhr_col.c
+lapacke_cunhr_col_work.c
 lapacke_cunmbr.c
 lapacke_cunmbr_work.c
 lapacke_cunmhr.c
@@ -857,6 +859,8 @@ lapacke_dorgtr.c
 lapacke_dorgtr_work.c
 lapacke_dorgtsqr_row.c
 lapacke_dorgtsqr_row_work.c
+lapacke_dorhr_col.c
+lapacke_dorhr_col_work.c
 lapacke_dormbr.c
 lapacke_dormbr_work.c
 lapacke_dormhr.c
@@ -1432,6 +1436,8 @@ lapacke_sorgtr.c
 lapacke_sorgtr_work.c
 lapacke_sorgtsqr_row.c
 lapacke_sorgtsqr_row_work.c
+lapacke_sorhr_col.c
+lapacke_sorhr_col_work.c
 lapacke_sormbr.c
 lapacke_sormbr_work.c
 lapacke_sormhr.c
@@ -2346,6 +2352,8 @@ lapacke_zungtr.c
 lapacke_zungtr_work.c
 lapacke_zungtsqr_row.c
 lapacke_zungtsqr_row_work.c
+lapacke_zunhr_col.c
+lapacke_zunhr_col_work.c
 lapacke_zunmbr.c
 lapacke_zunmbr_work.c
 lapacke_zunmhr.c

From 58ac660d078e44fa33a8bbce4c6268b54c3cf63a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 23:52:07 +0200
Subject: [PATCH 084/718] Add interfaces for [cz]unhr_col and [sd]orhr_col
 (Reference-LAPACK PR 827)

---
 lapack-netlib/LAPACKE/src/Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile
index 9c02c1445..d35e2a2d7 100644
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@@ -640,6 +640,8 @@ lapacke_cungtr.o \
 lapacke_cungtr_work.o \
 lapacke_cungtsqr_row.o \
 lapacke_cungtsqr_row_work.o \
+lapacke_cunhr_col.o \
+lapacke_cunhr_col_work.o \
 lapacke_cunmbr.o \
 lapacke_cunmbr_work.o \
 lapacke_cunmhr.o \
@@ -912,6 +914,8 @@ lapacke_dorgtr.o \
 lapacke_dorgtr_work.o \
 lapacke_dorgtsqr_row.o \
 lapacke_dorgtsqr_row_work.o \
+lapacke_dorhr_col.o \
+lapacke_dorhr_col_work.o \
 lapacke_dormbr.o \
 lapacke_dormbr_work.o \
 lapacke_dormhr.o \
@@ -1486,6 +1490,8 @@ lapacke_sorgtr.o \
 lapacke_sorgtr_work.o \
 lapacke_sorgtsqr_row.o \
 lapacke_sorgtsqr_row_work.o \
+lapacke_sorhr_col.o \
+lapacke_sorhr_col_work.o \
 lapacke_sormbr.o \
 lapacke_sormbr_work.o \
 lapacke_sormhr.o \
@@ -2404,6 +2410,8 @@ lapacke_zungtr.o \
 lapacke_zungtr_work.o \
 lapacke_zungtsqr_row.o \
 lapacke_zungtsqr_row_work.o \
+lapacke_zunhr_col.o \
+lapacke_zunhr_col_work.o \
 lapacke_zunmbr.o \
 lapacke_zunmbr_work.o \
 lapacke_zunmhr.o \

From a789211a2e1b60ef4c197709fca22606be01417e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 18 May 2023 23:56:27 +0200
Subject: [PATCH 085/718] Add interfaces for [cz]unhr_col and [sd]orhr_col
 (Reference-LAPACK PR 827)

---
 cmake/lapacke.cmake | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake
index be6a286fe..c8cfa7778 100644
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@@ -590,6 +590,8 @@ set(CSRC
   lapacke_cungtr_work.c
   lapacke_cungtsqr_row.c
   lapacke_cungtsqr_row_work.c
+  lapacke_cunhr_col.c
+  lapacke_cunhr_col_work.c
   lapacke_cunmbr.c
   lapacke_cunmbr_work.c
   lapacke_cunmhr.c
@@ -862,6 +864,8 @@ set(DSRC
   lapacke_dorgtr_work.c
   lapacke_dorgtsqr_row.c
   lapacke_dorgtsqr_row_work.c
+  lapacke_dorhr_col.c
+  lapacke_dorhr_col_work.c
   lapacke_dormbr.c
   lapacke_dormbr_work.c
   lapacke_dormhr.c
@@ -1435,6 +1439,8 @@ set(SSRC
   lapacke_sorgtr_work.c
   lapacke_sorgtsqr_row.c
   lapacke_sorgtsqr_row_work.c
+  lapacke_sorhr_col.c
+  lapacke_sorhr_col_work.c
   lapacke_sormbr.c
   lapacke_sormbr_work.c
   lapacke_sormhr.c
@@ -2351,6 +2357,8 @@ set(ZSRC
   lapacke_zungtr_work.c
   lapacke_zungtsqr_row.c
   lapacke_zungtsqr_row_work.c
+  lapacke_zunhr_col.c
+  lapacke_zunhr_col_work.c
   lapacke_zunmbr.c
   lapacke_zunmbr_work.c
   lapacke_zunmhr.c

From f249ccb741653b8f72783bdd20912fc2baef50ca Mon Sep 17 00:00:00 2001
From: Honglin Zhu <zhuhonglin.zhl@alibaba-inc.com>
Date: Thu, 18 May 2023 23:51:37 +0800
Subject: [PATCH 086/718] Fix spr sbgemm error

---
 cpuid_x86.c                                  |  8 ++++++--
 kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c | 11 +++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 69cbba90e..c2486e380 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1479,6 +1479,8 @@ int get_cpuname(void){
           else
 	    return CPUTYPE_NEHALEM;
 	case 15: // Sapphire Rapids
+	  if(support_amx_bf16())
+	    return CPUTYPE_SAPPHIRERAPIDS;
 	  if(support_avx512_bf16())
             return CPUTYPE_COOPERLAKE;	
           if(support_avx512())
@@ -1845,7 +1847,8 @@ static char *cpuname[] = {
   "ZEN",
   "SKYLAKEX",
   "DHYANA",
-  "COOPERLAKE"
+  "COOPERLAKE",
+  "SAPPHIRERAPIDS",
 };
 
 static char *lowercpuname[] = {
@@ -1902,7 +1905,8 @@ static char *lowercpuname[] = {
   "zen",
   "skylakex",
   "dhyana",
-  "cooperlake"
+  "cooperlake",
+  "sapphirerapids",
 };
 
 static char *corename[] = {
diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
index 90e0a32c7..5ee3c8532 100644
--- a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
+++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
@@ -97,33 +97,32 @@ typedef struct {
 #define T_C10	6
 #define T_C11	7
 
-// FIXME: gcc11 seem have problem in tile load/store address calc,
-// need to multiply with element size (2 or 4) here.
+
 #define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2)
 #define LOAD_A_TAIL(M, N) {\
 	__m256i ymm = _mm256_loadu_epi16(ptr_a##M); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
-	_tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \
+	_tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \
 }
 #define MASK_LOAD_A_TAIL(M, N) {\
 	__m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
-	_tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \
+	_tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \
 }
 #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2)
 #define LOAD_B_TAIL(M, N) {\
 	__m256i ymm = _mm256_loadu_epi16(ptr_b##N); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
-	_tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \
+	_tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \
 }
 #define MASK_LOAD_B_TAIL(M, N) {\
 	__m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
-	_tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \
+	_tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \
 }
 
 #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N)

From 0b83088887af276562728162c01f26da8e85ab22 Mon Sep 17 00:00:00 2001
From: Honglin Zhu <zhuhonglin.zhl@alibaba-inc.com>
Date: Fri, 19 May 2023 00:19:50 +0800
Subject: [PATCH 087/718] spr dynamic arch support

---
 Makefile.system                     |  2 +-
 driver/others/dynamic.c             | 34 ++++++++++++++++++++++++++++-
 kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 10 ++++++++-
 3 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 343b94bb3..e37b8b45a 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -645,7 +645,7 @@ DYNAMIC_CORE += HASWELL ZEN
 endif
 ifneq ($(NO_AVX512), 1)
 ifneq ($(NO_AVX2), 1)
-DYNAMIC_CORE += SKYLAKEX COOPERLAKE
+DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
 endif
 endif
 endif
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index f61930983..dee5538fa 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -268,9 +268,11 @@ extern gotoblas_t  gotoblas_ZEN;
 #ifndef NO_AVX512
 extern gotoblas_t  gotoblas_SKYLAKEX;
 extern gotoblas_t  gotoblas_COOPERLAKE;
+extern gotoblas_t  gotoblas_SAPPHIRERAPIDS;
 #else
 #define gotoblas_SKYLAKEX gotoblas_HASWELL
 #define gotoblas_COOPERLAKE gotoblas_HASWELL
+#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
 #endif
 #endif
 #else
@@ -279,6 +281,7 @@ extern gotoblas_t  gotoblas_COOPERLAKE;
 #define gotoblas_HASWELL gotoblas_NEHALEM
 #define gotoblas_SKYLAKEX gotoblas_NEHALEM
 #define gotoblas_COOPERLAKE gotoblas_NEHALEM
+#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
 #define gotoblas_BULLDOZER gotoblas_BARCELONA
 #define gotoblas_PILEDRIVER gotoblas_BARCELONA
 #define gotoblas_STEAMROLLER gotoblas_BARCELONA
@@ -378,6 +381,31 @@ int support_avx512_bf16(){
 #endif
 }
 
+#define BIT_AMX_TILE	0x01000000
+#define BIT_AMX_BF16	0x00400000
+#define BIT_AMX_ENBD	0x00060000
+
+int support_amx_bf16() {
+#if !defined(NO_AVX) && !defined(NO_AVX512)
+  int eax, ebx, ecx, edx;
+  int ret=0;
+
+  if (!support_avx512())
+    return 0;
+  // CPUID.7.0:EDX indicates AMX support
+  cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+  if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
+    // CPUID.D.0:EAX[17:18] indicates AMX enabled
+    cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+    if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
+      ret = 1;
+  }
+  return ret;
+#else
+  return 0;
+#endif
+}
+
 extern void openblas_warning(int verbose, const char * msg);
 #define FALLBACK_VERBOSE 1
 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
@@ -689,6 +717,8 @@ static gotoblas_t *get_coretype(void){
 	  }
 	}
 	if (model == 15){          // Sapphire Rapids
+	   if(support_amx_bf16())
+	     return &gotoblas_SAPPHIRERAPIDS;
 	   if(support_avx512_bf16())
              return &gotoblas_COOPERLAKE;
           if (support_avx512()) 
@@ -941,7 +971,8 @@ static char *corename[] = {
     "Excavator",
     "Zen",
     "SkylakeX",
-    "Cooperlake"
+    "Cooperlake",
+    "SapphireRapids"
 };
 
 char *gotoblas_corename(void) {
@@ -1006,6 +1037,7 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_ZEN)          return corename[23];
   if (gotoblas == &gotoblas_SKYLAKEX)     return corename[24];
   if (gotoblas == &gotoblas_COOPERLAKE)   return corename[25];
+  if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26];
   return corename[0];
 }
 
diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
index 88f574668..d101503bb 100644
--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
@@ -1,6 +1,14 @@
 include $(KERNELDIR)/KERNEL.COOPERLAKE
 
-SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c
+undefine SBGEMM_SMALL_M_PERMIT
+undefine SBGEMM_SMALL_K_NN
+undefine SBGEMM_SMALL_K_B0_NN
+undefine SBGEMM_SMALL_K_NT
+undefine SBGEMM_SMALL_K_B0_NT
+undefine SBGEMM_SMALL_K_TN
+undefine SBGEMM_SMALL_K_B0_TN
+undefine SBGEMM_SMALL_K_TT
+undefine SBGEMM_SMALL_K_B0_TT
 
 SBGEMM_BETA     = sgemm_beta_skylakex.c
 SBGEMMKERNEL    = sbgemm_kernel_16x16_spr.c

From 90f041e348af30de319ddbb0e1b4ade6d74fc380 Mon Sep 17 00:00:00 2001
From: Honglin Zhu <zhuhonglin.zhl@alibaba-inc.com>
Date: Fri, 19 May 2023 00:21:16 +0800
Subject: [PATCH 088/718] Invoke the syscall to allow the use of amx tiles

---
 common_param.h        |  1 +
 interface/gemm.c      | 31 +++++++++++++++++++++++++++++++
 kernel/setparam-ref.c |  7 +++++++
 3 files changed, 39 insertions(+)

diff --git a/common_param.h b/common_param.h
index 8b39ca2fc..c082d248e 100644
--- a/common_param.h
+++ b/common_param.h
@@ -53,6 +53,7 @@ typedef struct {
   int sbgemm_p, sbgemm_q, sbgemm_r;
   int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
   int sbgemm_align_k;
+  int need_amxtile_permission;  // 0 default, 1 for device support amx.
 
   void   (*sbstobf16_k) (BLASLONG, float    *, BLASLONG, bfloat16 *, BLASLONG);
   void   (*sbdtobf16_k) (BLASLONG, double   *, BLASLONG, bfloat16 *, BLASLONG);
diff --git a/interface/gemm.c b/interface/gemm.c
index 71cc77a1b..285b99eb9 100644
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = {
 #endif
 #endif
 
+#if defined(__linux__) && defined(BFLOAT16)
+#define XFEATURE_XTILEDATA 18
+#define ARCH_REQ_XCOMP_PERM 0x1023
+static int openblas_amxtile_permission = 0;
+static int init_amxtile_permission() {
+  long status =
+      syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
+  if (status != 0) {
+    fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, "
+                    "Intel Sapphier Rapids), skip sbgemm calculation\n");
+    return -1;
+  }
+  openblas_amxtile_permission = 1;
+  return 0;
+}
+#endif
+
 #ifndef CBLAS
 
 void NAME(char *TRANSA, char *TRANSB,
@@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 
 #endif
 
+#if defined(__linux__) && defined(BFLOAT16)
+#if defined(DYNAMIC_ARCH)
+  if (gotoblas->need_amxtile_permission &&
+      openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
+    return;
+  }
+#endif
+#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS)
+  if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
+    return;
+  }
+#endif
+#endif  // defined(__linux__) && defined(BFLOAT16)
+
   if ((args.m == 0) || (args.n == 0)) return;
 
 #if 0
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index 7832c0a87..4c361f155 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -66,6 +66,7 @@ gotoblas_t TABLE_NAME = {
 #endif
 
   SBGEMM_ALIGN_K,
+  0, // need_amxtile_permission
 
   sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
 
@@ -1809,6 +1810,12 @@ static void init_parameter(void) {
 
 #endif
 
+#ifdef SAPPHIRERAPIDS
+#if (BUILD_BFLOAT16 == 1)
+  TABLE_NAME.need_amxtile_permission = 1;
+#endif
+#endif
+
 #if BUILD_COMPLEX==1
 #ifdef CGEMM3M_DEFAULT_P
   TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;

From 18d7afe69daa196902cd68b63cc381aaafc9d26e Mon Sep 17 00:00:00 2001
From: sh-zheng <2294474733@qq.com>
Date: Sat, 20 May 2023 01:19:44 +0800
Subject: [PATCH 089/718] Add rvv support for zsymv and active rvv support for
 zhemv

---
 kernel/riscv64/KERNEL.x280    |  17 ++-
 kernel/riscv64/zhemv_LM_rvv.c | 198 +++++++++++++++++++++++++++++++++
 kernel/riscv64/zhemv_UV_rvv.c | 199 ++++++++++++++++++++++++++++++++++
 kernel/riscv64/zsymv_L_rvv.c  | 179 ++++++++++++++++++++++++++++++
 kernel/riscv64/zsymv_U_rvv.c  | 177 ++++++++++++++++++++++++++++++
 5 files changed, 766 insertions(+), 4 deletions(-)
 create mode 100644 kernel/riscv64/zhemv_LM_rvv.c
 create mode 100644 kernel/riscv64/zhemv_UV_rvv.c
 create mode 100644 kernel/riscv64/zsymv_L_rvv.c
 create mode 100644 kernel/riscv64/zsymv_U_rvv.c

diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280
index 217d8534e..86708fe01 100644
--- a/kernel/riscv64/KERNEL.x280
+++ b/kernel/riscv64/KERNEL.x280
@@ -225,10 +225,19 @@ SSYMV_U_KERNEL =  symv_U_rvv.c
 SSYMV_L_KERNEL =  symv_L_rvv.c
 DSYMV_U_KERNEL =  symv_U_rvv.c 
 DSYMV_L_KERNEL =  symv_L_rvv.c
-CSYMV_U_KERNEL =  ../generic/zsymv_k.c
-CSYMV_L_KERNEL =  ../generic/zsymv_k.c
-ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
-ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
+CSYMV_U_KERNEL =  zsymv_U_rvv.c
+CSYMV_L_KERNEL =  zsymv_L_rvv.c
+ZSYMV_U_KERNEL =  zsymv_U_rvv.c
+ZSYMV_L_KERNEL =  zsymv_L_rvv.c
+
+CHEMV_L_KERNEL =  zhemv_LM_rvv.c
+CHEMV_M_KERNEL =  zhemv_LM_rvv.c
+CHEMV_U_KERNEL =  zhemv_UV_rvv.c
+CHEMV_V_KERNEL =  zhemv_UV_rvv.c
+ZHEMV_L_KERNEL =  zhemv_LM_rvv.c
+ZHEMV_M_KERNEL =  zhemv_LM_rvv.c
+ZHEMV_U_KERNEL =  zhemv_UV_rvv.c
+ZHEMV_V_KERNEL =  zhemv_UV_rvv.c
 
 ZHEMMLTCOPY_M    =  zhemm_ltcopy_rvv_v1.c
 ZHEMMUTCOPY_M    =  zhemm_utcopy_rvv_v1.c
diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c
new file mode 100644
index 000000000..e025120e5
--- /dev/null
+++ b/kernel/riscv64/zhemv_LM_rvv.c
@@ -0,0 +1,198 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
+        BLASLONG i, j, k;
+        BLASLONG ix, iy, ia;
+        BLASLONG jx, jy, ja;
+        FLOAT temp_r1, temp_i1;
+        FLOAT temp_r2, temp_i2;
+        FLOAT *a_ptr = a;
+        unsigned int gvl = 0;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+
+        FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
+        BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2;
+
+        BLASLONG inc_x2 = incx * 2;
+        BLASLONG inc_y2 = incy * 2;
+        stride_x = inc_x2 * sizeof(FLOAT);
+        stride_y = inc_y2 * sizeof(FLOAT);
+        stride_a = 2 * sizeof(FLOAT);
+        lda2 = lda * 2;
+
+        jx = 0;
+        jy = 0;
+        ja = 0;
+        for(j = 0; j < offset; j++){
+                temp_r1 = alpha_r * x[jx]   - alpha_i * x[jx+1];;
+                temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
+                temp_r2 = 0;
+                temp_i2 = 0;
+                y[jy] += temp_r1 * a_ptr[ja];
+                y[jy+1] += temp_i1 * a_ptr[ja];
+                ix = jx + inc_x2;
+                iy = jy + inc_y2;
+                ia = ja + 2;
+                i = j + 1;
+                len = m - i;
+                if(len > 0){
+                        gvl = VSETVL(len);
+                        inc_xv = incx * gvl * 2;
+                        inc_yv = incy * gvl * 2;
+                        inc_av = gvl * 2;
+                        vr0 = VFMVVF_FLOAT(0, gvl);
+                        vr1 = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < len / gvl; k++){
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+
+#endif
+                                i += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                                ia += inc_av;
+                        }
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
+                        temp_r2 = VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+                        temp_i2 = VFMVFS_FLOAT(v_res);
+                        if(i < m){
+				                gvl = VSETVL(m-i);
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV
+                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else
+                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+#endif
+
+                                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
+                                temp_r2 += VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+                                temp_i2 += VFMVFS_FLOAT(v_res);
+                        }
+                }
+		y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
+		y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+		jx    += inc_x2;
+		jy    += inc_y2;
+		ja    += 2;
+		a_ptr += lda2;
+        }
+	return(0);
+}
diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c
new file mode 100644
index 000000000..0e1ea5436
--- /dev/null
+++ b/kernel/riscv64/zhemv_UV_rvv.c
@@ -0,0 +1,199 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
+        BLASLONG i, j, k;
+        BLASLONG ix, iy, ia;
+        BLASLONG jx, jy, ja;
+        FLOAT temp_r1, temp_i1;
+        FLOAT temp_r2, temp_i2;
+        FLOAT *a_ptr = a;
+        unsigned int gvl = 0;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+
+        FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
+        BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2;
+
+        BLASLONG inc_x2 = incx * 2;
+        BLASLONG inc_y2 = incy * 2;
+        stride_x = inc_x2 * sizeof(FLOAT);
+        stride_y = inc_y2 * sizeof(FLOAT);
+        stride_a = 2 * sizeof(FLOAT);
+        lda2 = lda * 2;
+
+        BLASLONG m1 = m - offset;
+        a_ptr = a + m1 * lda2;
+        jx = m1 * inc_x2;
+        jy = m1 * inc_y2;
+        ja = m1 * 2;
+        for(j = m1; j < m; j++){
+                temp_r1 = alpha_r * x[jx]   - alpha_i * x[jx+1];;
+                temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
+                temp_r2 = 0;
+                temp_i2 = 0;
+                ix = 0;
+                iy = 0;
+                ia = 0;
+                i = 0;
+                if(j > 0){
+                        gvl = VSETVL(j);
+                        inc_xv = incx * gvl * 2;
+                        inc_yv = incy * gvl * 2;
+                        inc_av = gvl * 2;
+                        vr0 = VFMVVF_FLOAT(0, gvl);
+                        vr1 = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < j / gvl; k++){
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+
+#endif
+                                i += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                                ia += inc_av;
+                        }
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
+                        temp_r2 = VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+                        temp_i2 = VFMVFS_FLOAT(v_res);
+                        if(i < j){
+				                gvl = VSETVL(j-i);
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV
+                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else
+                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+#endif
+
+                                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
+                                temp_r2 += VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+                                temp_i2 += VFMVFS_FLOAT(v_res);
+                        }
+                }
+                y[jy] += temp_r1 * a_ptr[ja];
+                y[jy+1] += temp_i1 * a_ptr[ja];
+		y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
+		y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+		jx    += inc_x2;
+		jy    += inc_y2;
+		ja    += 2;
+		a_ptr += lda2;
+        }
+	return(0);
+}
diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c
new file mode 100644
index 000000000..3bf621094
--- /dev/null
+++ b/kernel/riscv64/zsymv_L_rvv.c
@@ -0,0 +1,179 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
+#define VFNEGV_FLOAT __riscv_vfneg_v_f32mf4
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
+#define VFNEGV_FLOAT __riscv_vfneg_v_f64mf4
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
+	  FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;
+        BLASLONG jx,jy;
+        FLOAT temp1[2];
+        FLOAT temp2[2];
+        FLOAT *a_ptr = a;
+        BLASLONG gvl = VSETVL_MAX;
+        FLOAT_V_T_M1 v_res, v_z0;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+
+        FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv, len;
+
+        stride_x = 2 * inc_x * sizeof(FLOAT);
+        stride_y = 2 * inc_y * sizeof(FLOAT);
+        jx = 0;
+        jy = 0;
+        for (j=0; j<offset; j++)
+        {
+                temp1[0] = alpha_r * x[2 * jx] - alpha_i * x[2 * jx + 1];
+                temp1[1] = alpha_r * x[2 * jx + 1] + alpha_i * x[2 * jx];
+                temp2[0] = 0;
+                temp2[1] = 0;
+
+		y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1];
+		y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1];
+
+                ix = jx + inc_x;
+                iy = jy + inc_y;
+                i = j + 1;
+                len = m - i;
+                if(len > 0){
+                        gvl = VSETVL(len);
+                        inc_xv = inc_x * gvl;
+                        inc_yv = inc_y * gvl;
+                        vr_r = VFMVVF_FLOAT(0, gvl);
+                        vr_i = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < len / gvl; k++){
+                                va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
+                                va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
+
+                                vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
+                                vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
+                                
+                                vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
+                                vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
+                                
+                                VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
+                                VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
+
+                                vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
+                                vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
+                                vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl);
+                                vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
+                                vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl);
+                                vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
+
+                                i += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
+                        temp2[0] = VFMVFS_FLOAT_M1(v_res);
+                        v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
+                        temp2[1] = VFMVFS_FLOAT_M1(v_res);
+
+                        if(i < m){
+                                gvl = VSETVL(m-i);
+                                vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
+                                vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
+                                va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
+                                va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
+
+                                vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
+                                vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
+                                
+                                VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
+                                VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
+
+                                vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
+                                vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
+                                vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl);
+                                vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
+                                vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl);
+                                vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
+                                
+                                v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
+                                temp2[0] += VFMVFS_FLOAT_M1(v_res);
+                                v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
+                                temp2[1] += VFMVFS_FLOAT_M1(v_res);
+                        }
+                }
+                y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1];
+                y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0];
+
+                jx    += inc_x;
+                jy    += inc_y;
+                a_ptr += 2 * lda;
+        }
+                
+        return(0);
+}
+
diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c
new file mode 100644
index 000000000..de1564f75
--- /dev/null
+++ b/kernel/riscv64/zsymv_U_rvv.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
+#endif
+
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
+	  FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;
+        BLASLONG jx,jy;
+        FLOAT temp1[2];
+        FLOAT temp2[2];
+        FLOAT *a_ptr = a;
+        BLASLONG gvl = VSETVL_MAX;
+        FLOAT_V_T_M1 v_res, v_z0;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+
+
+        FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+
+        BLASLONG m1 = m - offset;
+        jx = m1 * inc_x;
+        jy = m1 * inc_y;
+        a_ptr += m1 * lda;
+        stride_x = 2 * inc_x * sizeof(FLOAT);
+        stride_y = 2 * inc_y * sizeof(FLOAT);
+        for (j=m1; j<m; j++)
+        {
+                temp1[0] = alpha_r * x[2 * jx] - alpha_i * x[2 * jx + 1];
+                temp1[1] = alpha_r * x[2 * jx + 1] + alpha_i * x[2 * jx];
+                temp2[0] = 0;
+                temp2[1] = 0;
+                if(j > 0){
+                         ix = 0;
+                         iy = 0;
+                         i = 0;
+                         gvl = VSETVL(j);
+                         inc_xv = inc_x * gvl;
+                         inc_yv = inc_y * gvl;
+                         vr_r = VFMVVF_FLOAT(0, gvl);
+                         vr_i = VFMVVF_FLOAT(0, gvl);
+                         for(k = 0; k < j / gvl; k++){
+                                 va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
+                                 va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
+
+                                 vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
+                                 vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
+                                 
+                                 vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
+                                 vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
+                                
+                                 VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
+                                 VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
+
+                                 vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
+                                 vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
+                                 vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl);
+                                 vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
+                                 vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl);
+                                 vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
+
+                                 i += gvl;
+                                 ix += inc_xv;
+                                 iy += inc_yv;
+                         }
+                         v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
+                         temp2[0] = VFMVFS_FLOAT_M1(v_res);
+                         v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
+                         temp2[1] = VFMVFS_FLOAT_M1(v_res);
+
+                         if(i < j){
+                                 gvl = VSETVL(j-i);
+                                 vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
+                                 vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
+                                 
+                                 va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
+                                 va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
+
+                                 vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
+                                 vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
+                                
+                                 VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
+                                 VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
+
+                                 vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
+                                 vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
+                                 vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl);
+                                 vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
+                                 vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl);
+                                 vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
+                                
+                                 v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
+                                 temp2[0] += VFMVFS_FLOAT_M1(v_res);
+                                 v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
+                                 temp2[1] += VFMVFS_FLOAT_M1(v_res);
+                        }
+                }
+
+		 y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1];
+		 y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1] + alpha_r * temp2[1] + alpha_i * temp2[0];
+
+                a_ptr += 2 * lda;
+                jx    += inc_x;
+                jy    += inc_y;
+        }
+        
+        return(0);
+}
+

From a76afdc0474eb35bd8986a78683b3131767e0a30 Mon Sep 17 00:00:00 2001
From: Honglin Zhu <zhuhonglin.zhl@alibaba-inc.com>
Date: Fri, 19 May 2023 16:22:01 +0800
Subject: [PATCH 090/718] Compatible with older version of GNU make

---
 kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
index d101503bb..3a832e917 100644
--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
@@ -1,14 +1,14 @@
 include $(KERNELDIR)/KERNEL.COOPERLAKE
 
-undefine SBGEMM_SMALL_M_PERMIT
-undefine SBGEMM_SMALL_K_NN
-undefine SBGEMM_SMALL_K_B0_NN
-undefine SBGEMM_SMALL_K_NT
-undefine SBGEMM_SMALL_K_B0_NT
-undefine SBGEMM_SMALL_K_TN
-undefine SBGEMM_SMALL_K_B0_TN
-undefine SBGEMM_SMALL_K_TT
-undefine SBGEMM_SMALL_K_B0_TT
+SBGEMM_SMALL_M_PERMIT =
+SBGEMM_SMALL_K_NN     =
+SBGEMM_SMALL_K_B0_NN  =
+SBGEMM_SMALL_K_NT     =
+SBGEMM_SMALL_K_B0_NT  =
+SBGEMM_SMALL_K_TN     =
+SBGEMM_SMALL_K_B0_TN  =
+SBGEMM_SMALL_K_TT     =
+SBGEMM_SMALL_K_B0_TT  =
 
 SBGEMM_BETA     = sgemm_beta_skylakex.c
 SBGEMMKERNEL    = sbgemm_kernel_16x16_spr.c

From d96d386d138a78e806c133b9b61d4b1e37163d1b Mon Sep 17 00:00:00 2001
From: Honglin Zhu <zhuhonglin.zhl@alibaba-inc.com>
Date: Sat, 20 May 2023 14:13:49 +0800
Subject: [PATCH 091/718] Add CMake dynamic build for Sapphire Rapids

---
 cmake/arch.cmake  | 2 +-
 cmake/utils.cmake | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 8521f3988..f70019800 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -82,7 +82,7 @@ if (DYNAMIC_ARCH)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
     endif ()
     if (NOT NO_AVX512)
-      set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
+      set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
       string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
     endif ()
     if (DYNAMIC_LIST)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 56c1cb060..5e8ba866b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN)
       #message(STATUS "skipping ${makefile_line}")
       continue ()
     endif ()
+
+    # Example 1: SBGEMM_SMALL_M_PERMIT =
+    # Unset the variable
+    string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}")
+    if (NOT "${line_match}" STREQUAL "")
+      set(var_name ${CMAKE_MATCH_1})
+      unset(${var_name})
+    endif()
+
     string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
     if (NOT "${line_match}" STREQUAL "")
       #message(STATUS "match on ${line_match}")

From 9e80a194d6da44bae67091edbf2d3989d1542d8a Mon Sep 17 00:00:00 2001
From: Honglin Zhu <zhuhonglin.zhl@alibaba-inc.com>
Date: Sun, 21 May 2023 19:52:58 +0800
Subject: [PATCH 092/718] Fix dynamic_list build and gcc version check error

---
 driver/others/dynamic.c | 13 +++++++++++++
 kernel/Makefile         |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index dee5538fa..8e0f53f74 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE;
 #else
 #define gotoblas_COOPERLAKE gotoblas_PRESCOTT
 #endif
+#ifdef DYN_SAPPHIRERAPIDS
+extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
+#elif defined(DYN_SKYLAKEX)
+#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX
+#elif defined(DYN_HASWELL)
+#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
+#else
+#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT
+#endif
 
 
 #else // not DYNAMIC_LIST
diff --git a/kernel/Makefile b/kernel/Makefile
index 977886044..d426a1bdb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -33,7 +33,7 @@ endif
 ifdef TARGET_CORE
 ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
- ifeq ($(GCCVERSIONGTEQ10), 1) 
+ ifeq ($(GCCVERSIONGTEQ11), 1) 
   override CFLAGS += -march=sapphirerapids
  else 
   override CFLAGS += -march=skylake-avx512 -mavx512f

From 71e4125795cf4a873f486fbb6e8a7711d717e803 Mon Sep 17 00:00:00 2001
From: Honglin Zhu <zhuhonglin.zhl@alibaba-inc.com>
Date: Mon, 22 May 2023 21:59:59 +0800
Subject: [PATCH 093/718] Fix syscall error on non-x86 platform

---
 interface/gemm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/interface/gemm.c b/interface/gemm.c
index 285b99eb9..4778b641b 100644
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -154,7 +154,7 @@ static size_t zgemm_small_kernel_b0[] = {
 #endif
 #endif
 
-#if defined(__linux__) && defined(BFLOAT16)
+#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
 #define XFEATURE_XTILEDATA 18
 #define ARCH_REQ_XCOMP_PERM 0x1023
 static int openblas_amxtile_permission = 0;
@@ -472,7 +472,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 
 #endif
 
-#if defined(__linux__) && defined(BFLOAT16)
+#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
 #if defined(DYNAMIC_ARCH)
   if (gotoblas->need_amxtile_permission &&
       openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
@@ -484,7 +484,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
     return;
   }
 #endif
-#endif  // defined(__linux__) && defined(BFLOAT16)
+#endif  // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
 
   if ((args.m == 0) || (args.n == 0)) return;
 

From d3bf5a5401e623e107a23fb70151c7102cbd14c7 Mon Sep 17 00:00:00 2001
From: sh-zheng <2294474733@qq.com>
Date: Mon, 22 May 2023 22:39:45 +0800
Subject: [PATCH 094/718] Combine two reduction operations of zhe/symv into
 one, with tail undisturbed setted.

---
 kernel/riscv64/zhemv_LM_rvv.c | 68 +++++++++++++++++------------------
 kernel/riscv64/zhemv_UV_rvv.c | 68 +++++++++++++++++------------------
 kernel/riscv64/zsymv_L_rvv.c  | 50 +++++++++++++-------------
 kernel/riscv64/zsymv_U_rvv.c  | 52 +++++++++++++--------------
 4 files changed, 119 insertions(+), 119 deletions(-)

diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c
index e025120e5..95c6a377c 100644
--- a/kernel/riscv64/zhemv_LM_rvv.c
+++ b/kernel/riscv64/zhemv_LM_rvv.c
@@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
 #else
 #define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define VSETVL_MAX __riscv_vsetvlmax_e64m1()
@@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -143,49 +147,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 iy += inc_yv;
                                 ia += inc_av;
                         }
-                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
-                        temp_r2 = VFMVFS_FLOAT(v_res);
-                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
-                        temp_i2 = VFMVFS_FLOAT(v_res);
+
                         if(i < m){
-				                gvl = VSETVL(m-i);
-                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
-                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
-                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
-                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+				unsigned int gvl_rem = VSETVL(m-i);
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem);
 #ifndef HEMVREV
-                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
-                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
-                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
-                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
 #else
-                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
-                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
-                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
-                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
 #endif
-                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
-                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem);
 
-                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem);
 #ifndef HEMVREV
-                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
-                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
-                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
-                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                                vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
 #else
-                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
-                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
-                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
-                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                                vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
 #endif
-
-                                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
-                                temp_r2 += VFMVFS_FLOAT(v_res);
-                                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
-                                temp_i2 += VFMVFS_FLOAT(v_res);
                         }
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
+                        temp_r2 = VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+                        temp_i2 = VFMVFS_FLOAT(v_res);
                 }
 		y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
 		y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c
index 0e1ea5436..ec06622fc 100644
--- a/kernel/riscv64/zhemv_UV_rvv.c
+++ b/kernel/riscv64/zhemv_UV_rvv.c
@@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
 #else
 #define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define VSETVL_MAX __riscv_vsetvlmax_e64m1()
@@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -142,49 +146,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 iy += inc_yv;
                                 ia += inc_av;
                         }
-                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
-                        temp_r2 = VFMVFS_FLOAT(v_res);
-                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
-                        temp_i2 = VFMVFS_FLOAT(v_res);
+
                         if(i < j){
-				                gvl = VSETVL(j-i);
-                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
-                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
-                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
-                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+				unsigned int gvl_rem = VSETVL(j-i);
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem);
 #ifndef HEMVREV
-                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
-                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
-                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
-                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
 #else
-                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
-                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
-                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
-                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
 #endif
-                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
-                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem);
 
-                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem);
 #ifndef HEMVREV
-                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
-                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
-                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
-                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                                vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
 #else
-                                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
-                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
-                                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
-                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                                vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
 #endif
-
-                                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
-                                temp_r2 += VFMVFS_FLOAT(v_res);
-                                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
-                                temp_i2 += VFMVFS_FLOAT(v_res);
                         }
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
+                        temp_r2 = VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+                        temp_i2 = VFMVFS_FLOAT(v_res);
                 }
                 y[jy] += temp_r1 * a_ptr[ja];
                 y[jy+1] += temp_i1 * a_ptr[ja];
diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c
index 3bf621094..cefdea7f6 100644
--- a/kernel/riscv64/zsymv_L_rvv.c
+++ b/kernel/riscv64/zsymv_L_rvv.c
@@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
@@ -57,6 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
@@ -133,38 +137,34 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
                                 ix += inc_xv;
                                 iy += inc_yv;
                         }
-                        v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
-                        temp2[0] = VFMVFS_FLOAT_M1(v_res);
-                        v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
-                        temp2[1] = VFMVFS_FLOAT_M1(v_res);
 
                         if(i < m){
-                                gvl = VSETVL(m-i);
-                                vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
-                                vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
-                                va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
-                                va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
+                                unsigned int gvl_rem = VSETVL(m-i);
+                                vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem);
+                                vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem);
+                                va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem);
+                                va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem);
 
-                                vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
-                                vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
-                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
-                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
+                                vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem);
+                                vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem);
                                 
-                                VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
-                                VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
+                                VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem);
+                                VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem);
 
-                                vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
-                                vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
-                                vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl);
-                                vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
-                                vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl);
-                                vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
+                                vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem);
+                                vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem);
+                                vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem);
+                                vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem);
+                                vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem);
+                                vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem);
                                 
-                                v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
-                                temp2[0] += VFMVFS_FLOAT_M1(v_res);
-                                v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
-                                temp2[1] += VFMVFS_FLOAT_M1(v_res);
                         }
+                        v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
+                        temp2[0] = VFMVFS_FLOAT_M1(v_res);
+                        v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
+                        temp2[1] = VFMVFS_FLOAT_M1(v_res);
                 }
                 y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1];
                 y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0];
diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c
index de1564f75..67b5a649c 100644
--- a/kernel/riscv64/zsymv_U_rvv.c
+++ b/kernel/riscv64/zsymv_U_rvv.c
@@ -38,6 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
@@ -56,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
@@ -129,39 +133,35 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
                                  ix += inc_xv;
                                  iy += inc_yv;
                          }
-                         v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
-                         temp2[0] = VFMVFS_FLOAT_M1(v_res);
-                         v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
-                         temp2[1] = VFMVFS_FLOAT_M1(v_res);
 
                          if(i < j){
-                                 gvl = VSETVL(j-i);
-                                 vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
-                                 vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
+                                 unsigned int gvl_rem = VSETVL(j-i);
+                                 vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem);
+                                 vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem);
                                  
-                                 va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
-                                 va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
+                                 va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem);
+                                 va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem);
 
-                                 vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
-                                 vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
-                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
-                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
+                                 vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem);
+                                 vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem);
                                 
-                                 VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
-                                 VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
+                                 VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem);
+                                 VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem);
 
-                                 vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
-                                 vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
-                                 vr_r = VFMULVV_FLOAT(vx_r, va_r, gvl);
-                                 vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
-                                 vr_i = VFMULVV_FLOAT(vx_r, va_i, gvl);
-                                 vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
+                                 vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem);
+                                 vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem);
+                                 vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem);
+                                 vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem);
+                                 vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem);
+                                 vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem);
                                 
-                                 v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
-                                 temp2[0] += VFMVFS_FLOAT_M1(v_res);
-                                 v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
-                                 temp2[1] += VFMVFS_FLOAT_M1(v_res);
-                        }
+                         }
+                         v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl);
+                         temp2[0] = VFMVFS_FLOAT_M1(v_res);
+                         v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl);
+                         temp2[1] = VFMVFS_FLOAT_M1(v_res);
                 }
 
 		 y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1];

From c0da3af13ba13f42628136503e1047d206c7257f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 24 May 2023 11:37:32 +0200
Subject: [PATCH 095/718] Fix segfault when executing with row major

---
 lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
index 962624d21..3f8f0cf17 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_cgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
         }
         /* Transpose output matrices */
         LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
-        LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t,
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t,
                            ldt );
         /* Release memory and exit */
         LAPACKE_free( t_t );

From b0a72586a32a06b725c99c1ff84ff0f9d7b54eec Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 24 May 2023 11:39:20 +0200
Subject: [PATCH 096/718] fix segfault when executing with row major

---
 lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c
index 8a4c7cead..000c94e0b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c
@@ -80,7 +80,7 @@ lapack_int LAPACKE_dgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
         }
         /* Transpose output matrices */
         LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
-        LAPACKE_dge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t,
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t,
                            ldt );
         /* Release memory and exit */
         LAPACKE_free( t_t );

From abbc387145670cba345edd02ecfee7310e7422de Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 24 May 2023 11:40:25 +0200
Subject: [PATCH 097/718] fix segfault when executing with row major

---
 lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c
index 00ca6e501..d357845ae 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c
@@ -79,7 +79,7 @@ lapack_int LAPACKE_sgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
         }
         /* Transpose output matrices */
         LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
-        LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t,
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t,
                            ldt );
         /* Release memory and exit */
         LAPACKE_free( t_t );

From 0d72705adc8bc917bc4ecc1b299ba7cad750e999 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 24 May 2023 11:41:28 +0200
Subject: [PATCH 098/718] fix segfault when executing with row major
 (Reference-LAPACK 768)

---
 lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c
index 8f6f1ed0b..633111533 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_zgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
         }
         /* Transpose output matrices */
         LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
-        LAPACKE_zge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t,
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t,
                            ldt );
         /* Release memory and exit */
         LAPACKE_free( t_t );

From e2f4adfc608a47fab120dd164c2712e2c18b759f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 24 May 2023 16:51:14 +0200
Subject: [PATCH 099/718] fix scaling (Reference-LAPACK PR 830)

---
 lapack-netlib/SRC/dtgsna.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/dtgsna.f b/lapack-netlib/SRC/dtgsna.f
index b9b3ad8af..013dc91bd 100644
--- a/lapack-netlib/SRC/dtgsna.f
+++ b/lapack-netlib/SRC/dtgsna.f
@@ -632,8 +632,8 @@
                C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA )
                C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI
                ROOT1 = C1 + SQRT( C1*C1-4.0D0*C2 )
-               ROOT2 = C2 / ROOT1
                ROOT1 = ROOT1 / TWO
+               ROOT2 = C2 / ROOT1
                COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) )
             END IF
 *

From 912bfd9bfbabef50cd0399c142c52b7cc350291c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 24 May 2023 16:52:56 +0200
Subject: [PATCH 100/718] fix scaling (Reference-LAPACK PR 830)

---
 lapack-netlib/SRC/stgsna.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/stgsna.f b/lapack-netlib/SRC/stgsna.f
index 40f822915..430f3c4b7 100644
--- a/lapack-netlib/SRC/stgsna.f
+++ b/lapack-netlib/SRC/stgsna.f
@@ -632,8 +632,8 @@
                C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA )
                C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI
                ROOT1 = C1 + SQRT( C1*C1-4.0*C2 )
-               ROOT2 = C2 / ROOT1
                ROOT1 = ROOT1 / TWO
+               ROOT2 = C2 / ROOT1
                COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) )
             END IF
 *

From e728744bd66bb9c10f24cc1daa0c65a535bab533 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 24 May 2023 22:30:45 +0200
Subject: [PATCH 101/718] Add comments on handling of zero-sized B matrix in
 ?GELSY (Reference-LAPACK PR 833) (#4056)

* Add comment related to zero-sized matrix B
---
 lapack-netlib/SRC/cgelsy.f | 2 ++
 lapack-netlib/SRC/dgelsy.f | 2 ++
 lapack-netlib/SRC/sgelsy.f | 2 ++
 lapack-netlib/SRC/zgelsy.f | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/lapack-netlib/SRC/cgelsy.f b/lapack-netlib/SRC/cgelsy.f
index 67140f191..64bb10023 100644
--- a/lapack-netlib/SRC/cgelsy.f
+++ b/lapack-netlib/SRC/cgelsy.f
@@ -116,6 +116,7 @@
 *>          B is COMPLEX array, dimension (LDB,NRHS)
 *>          On entry, the M-by-NRHS right hand side matrix B.
 *>          On exit, the N-by-NRHS solution matrix X.
+*>          If M = 0 or N = 0, B is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -148,6 +149,7 @@
 *>          The effective rank of A, i.e., the order of the submatrix
 *>          R11.  This is the same as the order of the submatrix T11
 *>          in the complete orthogonal factorization of A.
+*>          If NRHS = 0, RANK = 0 on output.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/dgelsy.f b/lapack-netlib/SRC/dgelsy.f
index aebab9264..e9fcd9682 100644
--- a/lapack-netlib/SRC/dgelsy.f
+++ b/lapack-netlib/SRC/dgelsy.f
@@ -115,6 +115,7 @@
 *>          B is DOUBLE PRECISION array, dimension (LDB,NRHS)
 *>          On entry, the M-by-NRHS right hand side matrix B.
 *>          On exit, the N-by-NRHS solution matrix X.
+*>          If M = 0 or N = 0, B is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -147,6 +148,7 @@
 *>          The effective rank of A, i.e., the order of the submatrix
 *>          R11.  This is the same as the order of the submatrix T11
 *>          in the complete orthogonal factorization of A.
+*>          If NRHS = 0, RANK = 0 on output.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/sgelsy.f b/lapack-netlib/SRC/sgelsy.f
index 9c60f78a7..89dd39e80 100644
--- a/lapack-netlib/SRC/sgelsy.f
+++ b/lapack-netlib/SRC/sgelsy.f
@@ -115,6 +115,7 @@
 *>          B is REAL array, dimension (LDB,NRHS)
 *>          On entry, the M-by-NRHS right hand side matrix B.
 *>          On exit, the N-by-NRHS solution matrix X.
+*>          If M = 0 or N = 0, B is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -147,6 +148,7 @@
 *>          The effective rank of A, i.e., the order of the submatrix
 *>          R11.  This is the same as the order of the submatrix T11
 *>          in the complete orthogonal factorization of A.
+*>          If NRHS = 0, RANK = 0 on output.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/zgelsy.f b/lapack-netlib/SRC/zgelsy.f
index 65fa87ae9..497becf8b 100644
--- a/lapack-netlib/SRC/zgelsy.f
+++ b/lapack-netlib/SRC/zgelsy.f
@@ -116,6 +116,7 @@
 *>          B is COMPLEX*16 array, dimension (LDB,NRHS)
 *>          On entry, the M-by-NRHS right hand side matrix B.
 *>          On exit, the N-by-NRHS solution matrix X.
+*>          If M = 0 or N = 0, B is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -148,6 +149,7 @@
 *>          The effective rank of A, i.e., the order of the submatrix
 *>          R11.  This is the same as the order of the submatrix T11
 *>          in the complete orthogonal factorization of A.
+*>          If NRHS = 0, RANK = 0 on output.
 *> \endverbatim
 *>
 *> \param[out] WORK

From 490ce5199b7e13fea34e203eddceff4cd350d213 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 25 May 2023 10:22:24 +0200
Subject: [PATCH 102/718] Update brief description of function purpose
 (Reference-LAPACK 831)

---
 lapack-netlib/SRC/DEPRECATED/cgegs.f | 2 +-
 lapack-netlib/SRC/DEPRECATED/cgegv.f | 2 +-
 lapack-netlib/SRC/DEPRECATED/dgegs.f | 2 +-
 lapack-netlib/SRC/DEPRECATED/dgegv.f | 2 +-
 lapack-netlib/SRC/DEPRECATED/sgegs.f | 2 +-
 lapack-netlib/SRC/DEPRECATED/sgegv.f | 2 +-
 lapack-netlib/SRC/DEPRECATED/zgegs.f | 2 +-
 lapack-netlib/SRC/DEPRECATED/zgegv.f | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.f b/lapack-netlib/SRC/DEPRECATED/cgegs.f
index 1f0791a20..b6adf9111 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgegs.f
+++ b/lapack-netlib/SRC/DEPRECATED/cgegs.f
@@ -1,4 +1,4 @@
-*> \brief <b> CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> CGEGS computes the eigenvalues, Schur form, and, optionally, the left and or/right Schur vectors of a complex matrix pair (A,B)</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/cgegv.f b/lapack-netlib/SRC/DEPRECATED/cgegv.f
index ba810ddef..d2b254255 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgegv.f
+++ b/lapack-netlib/SRC/DEPRECATED/cgegv.f
@@ -1,4 +1,4 @@
-*> \brief <b> CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> CGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B).</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/dgegs.f b/lapack-netlib/SRC/DEPRECATED/dgegs.f
index 0ac0112c2..02e9fdcb2 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgegs.f
+++ b/lapack-netlib/SRC/DEPRECATED/dgegs.f
@@ -1,4 +1,4 @@
-*> \brief <b> DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> DGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B)</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/dgegv.f b/lapack-netlib/SRC/DEPRECATED/dgegv.f
index 7e81c85a9..0b5c48922 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgegv.f
+++ b/lapack-netlib/SRC/DEPRECATED/dgegv.f
@@ -1,4 +1,4 @@
-*> \brief <b> DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> DGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B).</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/sgegs.f b/lapack-netlib/SRC/DEPRECATED/sgegs.f
index 2ed9ad942..11ecc67ac 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgegs.f
+++ b/lapack-netlib/SRC/DEPRECATED/sgegs.f
@@ -1,4 +1,4 @@
-*> \brief <b> SGEGS computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> SGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B)</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/sgegv.f b/lapack-netlib/SRC/DEPRECATED/sgegv.f
index 7a179a499..97556e371 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgegv.f
+++ b/lapack-netlib/SRC/DEPRECATED/sgegv.f
@@ -1,4 +1,4 @@
-*> \brief <b> SGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> SGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B).</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/zgegs.f b/lapack-netlib/SRC/DEPRECATED/zgegs.f
index c5cdd26e5..23f8d43d1 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgegs.f
+++ b/lapack-netlib/SRC/DEPRECATED/zgegs.f
@@ -1,4 +1,4 @@
-*> \brief <b> ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> ZGEGS computes the eigenvalues, Schur form, and, optionally, the left and or/right Schur vectors of a complex matrix pair (A,B)</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/zgegv.f b/lapack-netlib/SRC/DEPRECATED/zgegv.f
index aa4ab3f71..542d3f4ff 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgegv.f
+++ b/lapack-netlib/SRC/DEPRECATED/zgegv.f
@@ -1,4 +1,4 @@
-*> \brief <b> ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> ZGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B).</b>
 *
 *  =========== DOCUMENTATION ===========
 *

From 447368783aca067d6bbbc057f4528023eb3567bd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 25 May 2023 13:06:15 +0200
Subject: [PATCH 103/718] Add quick return for N < 1 (Reference-LAPACK PR 837)

---
 lapack-netlib/SRC/dlaruv.f | 5 +++++
 lapack-netlib/SRC/slaruv.f | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/lapack-netlib/SRC/dlaruv.f b/lapack-netlib/SRC/dlaruv.f
index 0f5c9541d..1a4ce4009 100644
--- a/lapack-netlib/SRC/dlaruv.f
+++ b/lapack-netlib/SRC/dlaruv.f
@@ -382,6 +382,11 @@
      $                   1537 /
 *     ..
 *     .. Executable Statements ..
+*
+*     Quick return for N < 1
+      IF ( N < 1 ) THEN
+         RETURN
+      END IF
 *
       I1 = ISEED( 1 )
       I2 = ISEED( 2 )
diff --git a/lapack-netlib/SRC/slaruv.f b/lapack-netlib/SRC/slaruv.f
index c25dc2e2f..cd37a1c47 100644
--- a/lapack-netlib/SRC/slaruv.f
+++ b/lapack-netlib/SRC/slaruv.f
@@ -382,6 +382,11 @@
      $                   1537 /
 *     ..
 *     .. Executable Statements ..
+*
+*     Quick return for N < 1
+      IF ( N < 1 ) THEN
+         RETURN
+      END IF
 *
       I1 = ISEED( 1 )
       I2 = ISEED( 2 )

From e09055330e88cecda4d45e380637a60e230f3c68 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 25 May 2023 14:14:17 +0200
Subject: [PATCH 104/718] run apt-get update before fetching cross-compilers

---
 .github/workflows/dynamic_arch.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index b537c8494..c16f87954 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -310,6 +310,7 @@ jobs:
 
       - name: Install Dependencies
         run: |
+          sudo apt-get update
           sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
 
       - name: Compilation cache

From 899c3a6f6a801fef080962221755eac8d543d5df Mon Sep 17 00:00:00 2001
From: Angelika Schwarz <17718454+angsch@users.noreply.github.com>
Date: Thu, 27 Apr 2023 17:59:15 +0200
Subject: [PATCH 105/718] Improve input argument checks of gemmt

* Fix return value for invalid info
* Add missing checks for ldA, ldB
* Use reference-LAPACK like checks (ie ld=0,nrows=0 is invalid)
---
 interface/gemmt.c | 73 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 52 insertions(+), 21 deletions(-)

diff --git a/interface/gemmt.c b/interface/gemmt.c
index cebc7918d..046432670 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -77,6 +77,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 	blasint info;
 
 	char transA, transB, Uplo;
+	blasint nrowa, nrowb;
 	IFLOAT *buffer;
 	IFLOAT *aa, *bb;
 	FLOAT *cc;
@@ -155,22 +156,31 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 	if (Uplo == 'L')
 		uplo = 1;
 
+	nrowa = m;
+	if (transa) nrowa = k;
+	nrowb = k;
+	if (transb) nrowb = m;
+
 	info = 0;
 
-	if (uplo < 0)
-		info = 14;
-	if (ldc < m)
+	if (ldc < MAX(1, m))
 		info = 13;
+	if (ldb < MAX(1, nrowa))
+		info = 10;
+	if (lda < MAX(1, nrowb))
+		info = 8;
 	if (k < 0)
 		info = 5;
 	if (m < 0)
-		info = 3;
+		info = 4;
 	if (transb < 0)
-		info = 2;
+		info = 3;
 	if (transa < 0)
+		info = 2;
+	if (uplo < 0)
 		info = 1;
 
-	if (info) {
+	if (info != 0) {
 		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
 		return;
 	}
@@ -205,11 +215,14 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 	PRINT_DEBUG_CNAME;
 
+	uplo = -1;
 	transa = -1;
 	transb = -1;
 	info = 0;
 
 	if (order == CblasColMajor) {
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
 
 		if (TransA == CblasNoTrans)
 			transa = 0;
@@ -249,15 +262,27 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 		info = -1;
 
-		if (ldc < m)
+		blasint nrowa, nrowb;
+		nrowa = m;
+		if (transa) nrowa = k;
+		nrowb = k;
+		if (transb) nrowb = m;
+
+		if (ldc < MAX(1, m))
 			info = 13;
+		if (ldb < MAX(1, nrowb))
+			info = 10;
+		if (lda < MAX(1, nrowa))
+			info = 8;
 		if (k < 0)
 			info = 5;
 		if (m < 0)
-			info = 3;
+			info = 4;
 		if (transb < 0)
-			info = 2;
+			info = 3;
 		if (transa < 0)
+			info = 2;
+		if (uplo < 0)
 			info = 1;
 	}
 
@@ -269,6 +294,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 		lda = LDB;
 		ldb = LDA;
 
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
 		if (TransB == CblasNoTrans)
 			transa = 0;
 		if (TransB == CblasTrans)
@@ -302,27 +330,30 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 		info = -1;
 
-		if (ldc < m)
+		blasint ncola, ncolb;
+		ncola = k;
+		if (transa) ncola = m;
+		ncolb = m;
+		if (transb) ncolb = k;
+
+		if (ldc < MAX(1,m))
 			info = 13;
+		if (ldb < MAX(1, ncolb))
+			info = 10;
+		if (lda < MAX(1, ncola))
+			info = 8;
 		if (k < 0)
 			info = 5;
 		if (m < 0)
-			info = 3;
+			info = 4;
 		if (transb < 0)
-			info = 2;
+			info = 3;
 		if (transa < 0)
+			info = 2;
+		if (uplo < 0)
 			info = 1;
-
 	}
 
-	uplo = -1;
-	if (Uplo == CblasUpper)
-		uplo = 0;
-	if (Uplo == CblasLower)
-		uplo = 1;
-	if (uplo < 0)
-		info = 14;
-
 	if (info >= 0) {
 		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
 		return;

From a757e1486f5899beb6488f9c7080fd4554b78632 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 31 May 2023 08:44:20 +0200
Subject: [PATCH 106/718] Fix potential segfault from not allocating work array
 (Reference-LAPACK 836)

---
 lapack-netlib/LAPACKE/src/lapacke_dgeesx.c | 15 ++++++++-------
 lapack-netlib/LAPACKE/src/lapacke_sgeesx.c | 15 ++++++++-------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
index 7796edffc..0460b6406 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
@@ -80,10 +80,13 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort,
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
         iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
-        if( iwork == NULL ) {
-            info = LAPACK_WORK_MEMORY_ERROR;
-            goto exit_level_1;
-        }
+    }
+    else {
+        iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 );
+    }
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
     }
     work = (double*)LAPACKE_malloc( sizeof(double) * lwork );
     if( work == NULL ) {
@@ -97,9 +100,7 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort,
     /* Release memory and exit */
     LAPACKE_free( work );
 exit_level_2:
-    if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
-        LAPACKE_free( iwork );
-    }
+    LAPACKE_free( iwork );
 exit_level_1:
     if( LAPACKE_lsame( sort, 's' ) ) {
         LAPACKE_free( bwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
index f7be44297..d2555ecc8 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
@@ -80,10 +80,13 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort,
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
         iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
-        if( iwork == NULL ) {
-            info = LAPACK_WORK_MEMORY_ERROR;
-            goto exit_level_1;
-        }
+    }
+    else {
+        iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 );
+    }
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
     }
     work = (float*)LAPACKE_malloc( sizeof(float) * lwork );
     if( work == NULL ) {
@@ -97,9 +100,7 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort,
     /* Release memory and exit */
     LAPACKE_free( work );
 exit_level_2:
-    if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
-        LAPACKE_free( iwork );
-    }
+    LAPACKE_free( iwork );
 exit_level_1:
     if( LAPACKE_lsame( sort, 's' ) ) {
         LAPACKE_free( bwork );

From 92720c2f1ef8b52d57bd01e07044e61e94ce2094 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 31 May 2023 11:33:32 +0200
Subject: [PATCH 107/718] Remove unused variable SMIN (Reference-LAPACK 832)

---
 lapack-netlib/SRC/cbdsqr.f | 16 +++++++---------
 lapack-netlib/SRC/dbdsqr.f | 16 +++++++---------
 lapack-netlib/SRC/sbdsqr.f | 16 +++++++---------
 lapack-netlib/SRC/zbdsqr.f | 16 +++++++---------
 4 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/lapack-netlib/SRC/cbdsqr.f b/lapack-netlib/SRC/cbdsqr.f
index 1d7c4e09d..40706644e 100644
--- a/lapack-netlib/SRC/cbdsqr.f
+++ b/lapack-netlib/SRC/cbdsqr.f
@@ -259,7 +259,7 @@
      $                   NM12, NM13, OLDLL, OLDM
       REAL               ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
-     $                   SINR, SLL, SMAX, SMIN, SMINL, SMINOA,
+     $                   SINR, SLL, SMAX, SMIN, SMINOA,
      $                   SN, THRESH, TOL, TOLMUL, UNFL
 *     ..
 *     .. External Functions ..
@@ -372,7 +372,7 @@
       DO 30 I = 1, N - 1
          SMAX = MAX( SMAX, ABS( E( I ) ) )
    30 CONTINUE
-      SMINL = ZERO
+      SMIN = ZERO
       IF( TOL.GE.ZERO ) THEN
 *
 *        Relative accuracy desired
@@ -426,7 +426,6 @@
       IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH )
      $   D( M ) = ZERO
       SMAX = ABS( D( M ) )
-      SMIN = SMAX
       DO 70 LLL = 1, M - 1
          LL = M - LLL
          ABSS = ABS( D( LL ) )
@@ -435,7 +434,6 @@
      $      D( LL ) = ZERO
          IF( ABSE.LE.THRESH )
      $      GO TO 80
-         SMIN = MIN( SMIN, ABSS )
          SMAX = MAX( SMAX, ABSS, ABSE )
    70 CONTINUE
       LL = 0
@@ -517,14 +515,14 @@
 *           apply convergence criterion forward
 *
             MU = ABS( D( LL ) )
-            SMINL = MU
+            SMIN = MU
             DO 100 LLL = LL, M - 1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   100       CONTINUE
          END IF
 *
@@ -545,14 +543,14 @@
 *           apply convergence criterion backward
 *
             MU = ABS( D( M ) )
-            SMINL = MU
+            SMIN = MU
             DO 110 LLL = M - 1, LL, -1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   110       CONTINUE
          END IF
       END IF
@@ -562,7 +560,7 @@
 *     Compute shift.  First, test if shifting would ruin relative
 *     accuracy, and if so set the shift to zero.
 *
-      IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE.
+      IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE.
      $    MAX( EPS, HNDRTH*TOL ) ) THEN
 *
 *        Use a zero shift to avoid loss of relative accuracy
diff --git a/lapack-netlib/SRC/dbdsqr.f b/lapack-netlib/SRC/dbdsqr.f
index c220a5875..bc697a007 100644
--- a/lapack-netlib/SRC/dbdsqr.f
+++ b/lapack-netlib/SRC/dbdsqr.f
@@ -278,7 +278,7 @@
      $                   MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM
       DOUBLE PRECISION   ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
-     $                   SINR, SLL, SMAX, SMIN, SMINL, SMINOA,
+     $                   SINR, SLL, SMAX, SMIN, SMINOA,
      $                   SN, THRESH, TOL, TOLMUL, UNFL
 *     ..
 *     .. External Functions ..
@@ -391,7 +391,7 @@
       DO 30 I = 1, N - 1
          SMAX = MAX( SMAX, ABS( E( I ) ) )
    30 CONTINUE
-      SMINL = ZERO
+      SMIN = ZERO
       IF( TOL.GE.ZERO ) THEN
 *
 *        Relative accuracy desired
@@ -451,7 +451,6 @@
       IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH )
      $   D( M ) = ZERO
       SMAX = ABS( D( M ) )
-      SMIN = SMAX
       DO 70 LLL = 1, M - 1
          LL = M - LLL
          ABSS = ABS( D( LL ) )
@@ -460,7 +459,6 @@
      $      D( LL ) = ZERO
          IF( ABSE.LE.THRESH )
      $      GO TO 80
-         SMIN = MIN( SMIN, ABSS )
          SMAX = MAX( SMAX, ABSS, ABSE )
    70 CONTINUE
       LL = 0
@@ -542,14 +540,14 @@
 *           apply convergence criterion forward
 *
             MU = ABS( D( LL ) )
-            SMINL = MU
+            SMIN = MU
             DO 100 LLL = LL, M - 1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   100       CONTINUE
          END IF
 *
@@ -570,14 +568,14 @@
 *           apply convergence criterion backward
 *
             MU = ABS( D( M ) )
-            SMINL = MU
+            SMIN = MU
             DO 110 LLL = M - 1, LL, -1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   110       CONTINUE
          END IF
       END IF
@@ -587,7 +585,7 @@
 *     Compute shift.  First, test if shifting would ruin relative
 *     accuracy, and if so set the shift to zero.
 *
-      IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE.
+      IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE.
      $    MAX( EPS, HNDRTH*TOL ) ) THEN
 *
 *        Use a zero shift to avoid loss of relative accuracy
diff --git a/lapack-netlib/SRC/sbdsqr.f b/lapack-netlib/SRC/sbdsqr.f
index c798baaf5..880f0607b 100644
--- a/lapack-netlib/SRC/sbdsqr.f
+++ b/lapack-netlib/SRC/sbdsqr.f
@@ -277,7 +277,7 @@
      $                   MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM
       REAL               ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
-     $                   SINR, SLL, SMAX, SMIN, SMINL,  SMINOA,
+     $                   SINR, SLL, SMAX, SMIN, SMINOA,
      $                   SN, THRESH, TOL, TOLMUL, UNFL
 *     ..
 *     .. External Functions ..
@@ -390,7 +390,7 @@
       DO 30 I = 1, N - 1
          SMAX = MAX( SMAX, ABS( E( I ) ) )
    30 CONTINUE
-      SMINL = ZERO
+      SMIN = ZERO
       IF( TOL.GE.ZERO ) THEN
 *
 *        Relative accuracy desired
@@ -450,7 +450,6 @@
       IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH )
      $   D( M ) = ZERO
       SMAX = ABS( D( M ) )
-      SMIN = SMAX
       DO 70 LLL = 1, M - 1
          LL = M - LLL
          ABSS = ABS( D( LL ) )
@@ -459,7 +458,6 @@
      $      D( LL ) = ZERO
          IF( ABSE.LE.THRESH )
      $      GO TO 80
-         SMIN = MIN( SMIN, ABSS )
          SMAX = MAX( SMAX, ABSS, ABSE )
    70 CONTINUE
       LL = 0
@@ -541,14 +539,14 @@
 *           apply convergence criterion forward
 *
             MU = ABS( D( LL ) )
-            SMINL = MU
+            SMIN = MU
             DO 100 LLL = LL, M - 1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   100       CONTINUE
          END IF
 *
@@ -569,14 +567,14 @@
 *           apply convergence criterion backward
 *
             MU = ABS( D( M ) )
-            SMINL = MU
+            SMIN = MU
             DO 110 LLL = M - 1, LL, -1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   110       CONTINUE
          END IF
       END IF
@@ -586,7 +584,7 @@
 *     Compute shift.  First, test if shifting would ruin relative
 *     accuracy, and if so set the shift to zero.
 *
-      IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE.
+      IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE.
      $    MAX( EPS, HNDRTH*TOL ) ) THEN
 *
 *        Use a zero shift to avoid loss of relative accuracy
diff --git a/lapack-netlib/SRC/zbdsqr.f b/lapack-netlib/SRC/zbdsqr.f
index 2ad6142a7..faedafc3c 100644
--- a/lapack-netlib/SRC/zbdsqr.f
+++ b/lapack-netlib/SRC/zbdsqr.f
@@ -259,7 +259,7 @@
      $                   NM12, NM13, OLDLL, OLDM
       DOUBLE PRECISION   ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
-     $                   SINR, SLL, SMAX, SMIN, SMINL, SMINOA,
+     $                   SINR, SLL, SMAX, SMIN, SMINOA,
      $                   SN, THRESH, TOL, TOLMUL, UNFL
 *     ..
 *     .. External Functions ..
@@ -372,7 +372,7 @@
       DO 30 I = 1, N - 1
          SMAX = MAX( SMAX, ABS( E( I ) ) )
    30 CONTINUE
-      SMINL = ZERO
+      SMIN = ZERO
       IF( TOL.GE.ZERO ) THEN
 *
 *        Relative accuracy desired
@@ -426,7 +426,6 @@
       IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH )
      $   D( M ) = ZERO
       SMAX = ABS( D( M ) )
-      SMIN = SMAX
       DO 70 LLL = 1, M - 1
          LL = M - LLL
          ABSS = ABS( D( LL ) )
@@ -435,7 +434,6 @@
      $      D( LL ) = ZERO
          IF( ABSE.LE.THRESH )
      $      GO TO 80
-         SMIN = MIN( SMIN, ABSS )
          SMAX = MAX( SMAX, ABSS, ABSE )
    70 CONTINUE
       LL = 0
@@ -517,14 +515,14 @@
 *           apply convergence criterion forward
 *
             MU = ABS( D( LL ) )
-            SMINL = MU
+            SMIN = MU
             DO 100 LLL = LL, M - 1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   100       CONTINUE
          END IF
 *
@@ -545,14 +543,14 @@
 *           apply convergence criterion backward
 *
             MU = ABS( D( M ) )
-            SMINL = MU
+            SMIN = MU
             DO 110 LLL = M - 1, LL, -1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   110       CONTINUE
          END IF
       END IF
@@ -562,7 +560,7 @@
 *     Compute shift.  First, test if shifting would ruin relative
 *     accuracy, and if so set the shift to zero.
 *
-      IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE.
+      IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE.
      $    MAX( EPS, HNDRTH*TOL ) ) THEN
 *
 *        Use a zero shift to avoid loss of relative accuracy

From 11d867731ae0c0925276cbcbdaeafa2087cd8061 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 31 May 2023 19:35:16 +0200
Subject: [PATCH 108/718] Add Apple m1/iphoneos (cross)build in Cirrus CI
 (#4052)

* Add Apple m1/iphoneos (cross)build
---
 .cirrus.yml | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/.cirrus.yml b/.cirrus.yml
index 7ca1cc7bc..9aeb787b1 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -10,6 +10,18 @@ task:
   - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
   - make TARGET=VORTEX USE_OPENMP=1 CC=clang
 
+task:
+  name: AppleM1/LLVM/CMAKE
+  compile_script:
+  - brew install llvm
+  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+  - mkdir build
+  - cd build
+  - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
+  - make
+  
 macos_instance:
   image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4
 task:
@@ -38,6 +50,23 @@ task:
 #    path: "libopenblas*"
 #    type: application/octet-streamm
 
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4
+task:
+  name: AppleM1/LLVM armv8-ios xbuild
+  compile_script:
+  - #brew install llvm
+  - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" 
+  - find /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms -name "IPhoneOS*sdk"
+  - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS15.5.sdk -arch arm64 -miphoneos-version-min=10.0"
+  - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
+  always:
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
 
 task:
   name: NeoverseN1

From b1f6d90a114bda8c3d6813737ce9faae3699c43f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 3 Jun 2023 23:21:39 +0200
Subject: [PATCH 109/718] Drop extraneous escaping in the basename call

---
 c_check | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/c_check b/c_check
index 8b633ab8b..aeb9779e4 100755
--- a/c_check
+++ b/c_check
@@ -35,7 +35,7 @@ if [ "`dirname \"$compiler_name\"`" != '.' ]; then
     cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
 fi
 
-bn=`basename \"$compiler_name\"`
+bn=`basename "$compiler_name"`
 
 case "$bn" in
     *-*) if [ "$bn" != '-' ]; then

From 0954746380a54c4f5c45f892dabef6b7c9aa93c3 Mon Sep 17 00:00:00 2001
From: Heller Zheng <heller.zheng@microchip.com>
Date: Sun, 4 Jun 2023 20:06:58 -0700
Subject: [PATCH 110/718] remove argument unused during compilation. fix wrong
 vr = VFMVVF_FLOAT(0, vl);

---
 Makefile.riscv64            | 2 +-
 kernel/riscv64/symv_L_rvv.c | 4 ----
 kernel/riscv64/symv_U_rvv.c | 4 ----
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/Makefile.riscv64 b/Makefile.riscv64
index d091984a6..ce7a27141 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -3,7 +3,7 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif
 ifeq ($(CORE), x280)
-CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -mllvm --riscv-v-vector-bits-min=512 -ffast-math
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_GENERIC)
diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c
index e87ab22ae..b27db2e37 100644
--- a/kernel/riscv64/symv_L_rvv.c
+++ b/kernel/riscv64/symv_L_rvv.c
@@ -94,7 +94,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                         for (k = (m-i); k > 0; k -= vl, i += vl)
                         {
                                 vl = VSETVL(k);
-                                vr = VFMVVF_FLOAT(0, vl);
                                 va = VLEV_FLOAT(&a_ptr[i], vl);
                                 vy = VLEV_FLOAT(&y[i], vl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
@@ -125,7 +124,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                         {
                                 vl = VSETVL(k);
                                 inc_yv = inc_y * vl;
-                                vr = VFMVVF_FLOAT(0, vl);
                                 va = VLEV_FLOAT(&a_ptr[i], vl);
                                 vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
@@ -157,7 +155,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                         for (k = (m-i); k > 0; k -= vl, i += vl)
                         {
                                 vl = VSETVL(k);
-                                vr = VFMVVF_FLOAT(0, vl);
                                 inc_xv = inc_x * vl;
 
                                 va = VLEV_FLOAT(&a_ptr[i], vl);
@@ -197,7 +194,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vl = VSETVL(k);
                                 inc_xv = inc_x * vl;
                                 inc_yv = inc_y * vl;
-                                vr = VFMVVF_FLOAT(0, vl);
                                 
                                 va = VLEV_FLOAT(&a_ptr[i], vl);
                                 vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c
index 3fbc33c89..7e45b1a01 100644
--- a/kernel/riscv64/symv_U_rvv.c
+++ b/kernel/riscv64/symv_U_rvv.c
@@ -95,7 +95,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                         for (k = j; k > 0; k -= vl, i += vl)
                         {
                                 vl = VSETVL(k);
-                                vr = VFMVVF_FLOAT(0, vl);
                                 vy = VLEV_FLOAT(&y[i], vl);
                                 va = VLEV_FLOAT(&a_ptr[i], vl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
@@ -125,7 +124,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                         {
                                 vl = VSETVL(k);
                                 inc_yv = inc_y * vl;
-                                vr = VFMVVF_FLOAT(0, vl);
                                 vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
                                 va = VLEV_FLOAT(&a_ptr[i], vl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
@@ -158,7 +156,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                         {
                                 vl = VSETVL(k);
                                 inc_xv = inc_x * vl;
-                                vr = VFMVVF_FLOAT(0, vl);
 
                                 vy = VLEV_FLOAT(&y[i], vl);
                                 va = VLEV_FLOAT(&a_ptr[i], vl);
@@ -197,7 +194,6 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 vl = VSETVL(k);
                                 inc_xv = inc_x * vl;
                                 inc_yv = inc_y * vl;
-                                vr = VFMVVF_FLOAT(0, vl);
                                 vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
                                 va = VLEV_FLOAT(&a_ptr[i], vl);
                                 vy = VFMACCVF_FLOAT(vy, temp1, va, vl);

From f981e37ddafe1f756d0fe176152d39642f4877e9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 5 Jun 2023 20:34:31 +0200
Subject: [PATCH 111/718] Treat f95 as gfortran

---
 cmake/fc.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index e615e148e..5e32ef129 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -38,7 +38,7 @@ if (${F_COMPILER} STREQUAL "G95")
   endif ()
 endif ()
 
-if (${F_COMPILER} STREQUAL "GFORTRAN")
+if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95")
   set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
   # ensure reentrancy of lapack codes
   set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")

From 67d1e72e8b8343d8dd83b011234582c0e6009bbb Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Thu, 8 Jun 2023 20:20:47 +0800
Subject: [PATCH 112/718] LoongArch64: Add ABI detection for loongarch64

If lp64d ABI is supported, it is used; otherwise,
it falls back to the lp64 ABI.
---
 Makefile.system | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 343b94bb3..7dc17c486 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -932,8 +932,12 @@ BINARY_DEFINED = 1
 endif
 
 ifeq ($(ARCH), loongarch64)
-CCOMMON_OPT += -march=loongarch64 -mabi=lp64
-FCOMMON_OPT += -march=loongarch64 -mabi=lp64
+LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
+ifneq ($(LA64_ABI), lp64d)
+LA64_ABI=lp64
+endif
+CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
+FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
 endif
 
 endif

From 2993a732625fb81b79cab46a39585fdcc4894445 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 9 Jun 2023 18:56:13 +0200
Subject: [PATCH 113/718] Add NVHPC(nvc)

---
 cmake/cc.cmake | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 83b8d15ab..aeaa76710 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -65,6 +65,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
   endif ()
 endif ()
 
+if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
+  if (POWER)
+    set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
+  else ()
+    set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
+  endif ()
+endif ()
+
 if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
   if (BINARY64)
     set(CCOMMON_OPT "${CCOMMON_OPT} -m64")

From f8c8b22740d8e9817e33d73fe4aef766eb1fb8eb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 9 Jun 2023 18:58:52 +0200
Subject: [PATCH 114/718] Add NVHPC(pgf95)

---
 cmake/fc.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index 5e32ef129..a67760885 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -121,7 +121,7 @@ if (${F_COMPILER} STREQUAL "IBM")
   endif ()
 endif ()
 
-if (${F_COMPILER} STREQUAL "PGI")
+if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
   set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
   set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
   if (BINARY64)

From d2144b2981a77ec52fc78d9b8d0c6a8f5c08db57 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 9 Jun 2023 19:01:15 +0200
Subject: [PATCH 115/718] Add NVHPC

---
 cmake/system.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 631e7fe69..3dc6c863e 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -55,7 +55,7 @@ if (DEFINED TARGET)
 endif ()
 
 # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
-if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
+if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC"))
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
 endif ()
 

From ee80dd8371f749b2bd38c13f5eca5c7b2963e624 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 10 Jun 2023 00:40:47 +0200
Subject: [PATCH 116/718] Ignore LAPACK's la_constants.mod

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 0fe20ecaa..1195bc9b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ lapack-3.4.2
 lapack-3.4.2.tgz
 lapack-netlib/make.inc
 lapack-netlib/lapacke/include/lapacke_mangling.h
+lapack-netlib/SRC/la_constants.mod
 lapack-netlib/TESTING/testing_results.txt
 lapack-netlib/INSTALL/test*
 lapack-netlib/TESTING/xeigtstc

From 58b88aa5f0ffddedec963d067acd102ef413c8e2 Mon Sep 17 00:00:00 2001
From: Manjul Mohan <manjul@linux.vnet.ibm.com>
Date: Mon, 12 Jun 2023 01:08:59 -0400
Subject: [PATCH 117/718] POWER10: Fix compiler warnings

This patch removes the warning messages related to unused variables in
sbgemm_kernel_power10.c.

Signed-off-by: Manjul Mohan <manjul@linux.vnet.ibm.com>
---
 kernel/power/sbgemm_kernel_power10.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c
index 134929ec1..c3fa67cf6 100644
--- a/kernel/power/sbgemm_kernel_power10.c
+++ b/kernel/power/sbgemm_kernel_power10.c
@@ -336,7 +336,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
       if (m & 1)
 	{
 	  IFLOAT *BO = B;
-	  v2sf_t *rowC;
 	  v4sf_t result[4], res[4];
 	  __vector_quad acc0, acc1;
 	  __builtin_mma_xxsetaccz (&acc0);
@@ -492,7 +491,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
 	      if (k > 1)
 		l = (k / 2) << 3;
 	      vec_t *rowA = (vec_t *) & (AO[l << 1]);
-	      vec_t *rowB = (vec_t *) & (BO[l]);
 	      vector short rowB_mrg =
 		{ BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 };
 	      MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero));
@@ -570,7 +568,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
       if (m & 1)
 	{
 	  IFLOAT *BO = B;
-	  v2sf_t *rowC;
 	  v4sf_t result[4], res[4];
 	  __vector_quad acc0;
 	  BLASLONG l = 0;

From ff618ac475bbbc301c64fa85e2f426af2320dc91 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 13 Jun 2023 19:16:06 +0200
Subject: [PATCH 118/718] Rework removal of compiler options

---
 c_check | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/c_check b/c_check
index aeb9779e4..2e0e377bd 100755
--- a/c_check
+++ b/c_check
@@ -35,12 +35,12 @@ if [ "`dirname \"$compiler_name\"`" != '.' ]; then
     cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
 fi
 
-bn=`basename "$compiler_name"`
+cn= `echo $compiler_name | sed -e 's/ -.*//'`
+bn=`basename "$cn"`
 
 case "$bn" in
     *-*) if [ "$bn" != '-' ]; then
            cross_suffix="$cross_suffix${bn%-*}-"
-	   cross_suffix=`echo $cross_suffix|sed -e 's/ -$//'`
          fi
 esac
 

From 369cc0fa025e5ee14f60a61d4b574add7f6dc28e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 14 Jun 2023 20:58:23 +0200
Subject: [PATCH 119/718] CirrusCI: Add basic FreeBSD build (#4082)

* Add basic FreeBSD build
---
 .cirrus.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.cirrus.yml b/.cirrus.yml
index 9aeb787b1..36bfaaa1f 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -82,6 +82,15 @@ task:
     cpu: 8
   compile_script:
   - make USE_OPENMP=1
+
+FreeBSD_task:
+  freebsd_instance:
+    image_family: freebsd-13-2
+  install_script:
+  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
+  compile_script:
+  - ls -l /usr/local/lib
+  - gmake CC=gcc 
   
 #task:
 #  name: Windows/LLVM16     --- too slow ---

From c94cfef30a236f3fee14882fc453104a50a0a972 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 17 Jun 2023 11:18:21 +0200
Subject: [PATCH 120/718] CirrusCI:  Add INTERFACE64 jobs on MacOS, FreeBSD,
 and on Linux arm64 (#4085)

* Add INTERFACE64 jobs on MacOS, FreeBSD, and on Linux arm64
---
 .cirrus.yml | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 36bfaaa1f..bef2b1e86 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -10,6 +10,15 @@ task:
   - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
   - make TARGET=VORTEX USE_OPENMP=1 CC=clang
 
+task:
+  name: AppleM1/LLVM/ILP64
+  compile_script:
+  - brew install llvm
+  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+  - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
+
 task:
   name: AppleM1/LLVM/CMAKE
   compile_script:
@@ -74,7 +83,14 @@ task:
     image: node:latest
   compile_script:
   - make
-  
+
+  task:
+  name: NeoverseN1-ILP64
+  arm_container:
+    image: node:latest
+  compile_script:
+  - make INTERFACE64=1
+
 task:
   name: NeoverseN1-OMP
   arm_container:
@@ -84,6 +100,7 @@ task:
   - make USE_OPENMP=1
 
 FreeBSD_task:
+  name: FreeBSD-gcc12
   freebsd_instance:
     image_family: freebsd-13-2
   install_script:
@@ -91,7 +108,18 @@ FreeBSD_task:
   compile_script:
   - ls -l /usr/local/lib
   - gmake CC=gcc 
-  
+
+
+FreeBSD_task:
+  name: freebsd-gcc12-ilp64
+  freebsd_instance:
+    image_family: freebsd-13-2
+  install_script:
+  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
+  compile_script:
+  - ls -l /usr/local/lib
+  - gmake CC=gcc INTERFACE64=1
+
 #task:
 #  name: Windows/LLVM16     --- too slow ---
 #  windows_container:

From c677978789eefd93f87a9e07f8f14743ce3179d9 Mon Sep 17 00:00:00 2001
From: sL1pKn07 <sl1pkn07@gmail.com>
Date: Sat, 17 Jun 2023 15:17:45 +0200
Subject: [PATCH 121/718] Install .cmake files in CMAKE_INSTALL_LIBDIR This
 avoid problems when build and install 64 and 32 bits libs at same time,
 because both overwrite the .cmake files if install it in an non-architecture
 depend path like is `share/cmake/foo` is.

cmake now pick the correct .cmake files when try to add to another project as depencies
---
 CMakeLists.txt | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc964b76e..86758d8b6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -321,7 +321,7 @@ if (NOT NOFORTRAN)
  if (NOT ONLY_CBLAS)
   # Build test and ctest
   add_subdirectory(test)
- endif() 
+ endif()
   if (BUILD_TESTING)
     add_subdirectory(lapack-netlib/TESTING)
   endif()
@@ -438,7 +438,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
   endif()
   if (NOT USE_PERL)
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
-	  COMMAND  ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def 
+	  COMMAND  ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
     COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )
@@ -549,9 +549,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/
 install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
 
 
-# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
 set(PN OpenBLAS)
-set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
+set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}")
 configure_package_config_file(cmake/${PN}Config.cmake.in
                               "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
                               INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})

From c7cf1ae95adc2b0a82dfaf7f5c6be70272579e61 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 08:46:27 +0200
Subject: [PATCH 122/718] Remove LL/sceil (same as CEILING intrinsic)
 (Reference-LAPACK PR847)

---
 lapack-netlib/SRC/VARIANTS/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile
index 25d8ee175..35e50cbc2 100644
--- a/lapack-netlib/SRC/VARIANTS/Makefile
+++ b/lapack-netlib/SRC/VARIANTS/Makefile
@@ -28,7 +28,7 @@ LULL = lu/LL/cgetrf.o lu/LL/dgetrf.o lu/LL/sgetrf.o lu/LL/zgetrf.o
 
 LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o
 
-QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o
+QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o
 
 
 .PHONY: all

From 5c742ad44cad8d613127f470d1cfa56c40567ba6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 09:59:44 +0200
Subject: [PATCH 123/718] Fix description (Reference-LAPACK PR847)

---
 lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
index 16d250c3f..f9384b416 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
@@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: right looking block version of the algorithm, calli
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> CPOTRF computes the Cholesky factorization of a real Hermitian
+C> CPOTRF computes the Cholesky factorization of a complex Hermitian
 C> positive definite matrix A.
 C>
 C> The factorization has the form

From ce7f4adb4270fe8ccaf28000e2371e9d456b5bdf Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 10:00:49 +0200
Subject: [PATCH 124/718] Fix description (Reference-LAPACK PR 847)

---
 lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
index 76d8bab61..bef27afa7 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
@@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: right looking block version of the algorithm, calli
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> ZPOTRF computes the Cholesky factorization of a real Hermitian
+C> ZPOTRF computes the Cholesky factorization of a complex Hermitian
 C> positive definite matrix A.
 C>
 C> The factorization has the form

From 8b32cebfcc4d3fb5ac914564991915c5effb4199 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 10:02:12 +0200
Subject: [PATCH 125/718] Fix description (Reference-LAPACK PR 847)

---
 lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
index d6149c08f..62d0a2580 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
@@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: top-looking block version of the algorithm, calling
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> CPOTRF computes the Cholesky factorization of a real symmetric
+C> CPOTRF computes the Cholesky factorization of a complex Hermitian
 C> positive definite matrix A.
 C>
 C> The factorization has the form

From 35f2ce8340a3186c56debb09b3fa99d85d9af0b8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 10:03:43 +0200
Subject: [PATCH 126/718] Fix description (Reference-LAPACK PR 847)

---
 lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
index 62d0a2580..c810a1533 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
@@ -55,7 +55,7 @@ C>
 C> \param[in,out] A
 C> \verbatim
 C>          A is COMPLEX array, dimension (LDA,N)
-C>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+C>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
 C>          N-by-N upper triangular part of A contains the upper
 C>          triangular part of the matrix A, and the strictly lower
 C>          triangular part of A is not referenced.  If UPLO = 'L', the

From 2dae4a34a942b44ada32a3c4e6d7db524cfe9001 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 10:05:03 +0200
Subject: [PATCH 127/718] Fix description (Reference-LAPACK PR 847)

---
 lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
index f8b9e253c..449c7ac95 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
@@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: top-looking block version of the algorithm, calling
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> ZPOTRF computes the Cholesky factorization of a real symmetric
+C> ZPOTRF computes the Cholesky factorization of a complex Hermitian
 C> positive definite matrix A.
 C>
 C> The factorization has the form
@@ -55,7 +55,7 @@ C>
 C> \param[in,out] A
 C> \verbatim
 C>          A is COMPLEX*16 array, dimension (LDA,N)
-C>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+C>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
 C>          N-by-N upper triangular part of A contains the upper
 C>          triangular part of the matrix A, and the strictly lower
 C>          triangular part of A is not referenced.  If UPLO = 'L', the

From f330f862f9057ebf7698cb84a1007a65e6c9a038 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 10:08:36 +0200
Subject: [PATCH 128/718] Delete and replace with CEILING intrinsic
 (Reference-LAPACK PR 847)

---
 lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f | 86 ------------------------
 1 file changed, 86 deletions(-)
 delete mode 100644 lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f

diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f
deleted file mode 100644
index a007360ba..000000000
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f
+++ /dev/null
@@ -1,86 +0,0 @@
-C> \brief \b SCEIL
-*
-*  =========== DOCUMENTATION ===========
-*
-* Online html documentation available at
-*            http://www.netlib.org/lapack/explore-html/
-*
-*  Definition:
-*  ===========
-*
-*       REAL FUNCTION SCEIL( A )
-*
-*       .. Scalar Arguments ..
-*       REAL A
-*       ..
-*
-*    =====================================================================
-*
-*       .. Intrinsic Functions ..
-* 	      INTRINSIC          INT
-*       ..
-*       .. Executable Statements ..*
-*
-*       IF (A-INT(A).EQ.0) THEN
-*           SCEIL = A
-*       ELSE IF (A.GT.0) THEN
-*           SCEIL = INT(A)+1;
-*       ELSE
-*           SCEIL = INT(A)
-*       END IF
-*
-*       RETURN
-*
-*       END
-*  Purpose
-*  =======
-*
-C>\details \b Purpose:
-C>\verbatim
-C>\endverbatim
-*
-*  Arguments:
-*  ==========
-*
-*
-*  Authors:
-*  ========
-*
-C> \author Univ. of Tennessee
-C> \author Univ. of California Berkeley
-C> \author Univ. of Colorado Denver
-C> \author NAG Ltd.
-*
-C> \date December 2016
-*
-C> \ingroup variantsOTHERcomputational
-*
-*  =====================================================================
-      REAL FUNCTION SCEIL( A )
-*
-*  -- LAPACK computational routine --
-*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
-*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*
-*     .. Scalar Arguments ..*
-      REAL A
-*     ..
-*
-*  =====================================================================
-*
-*     .. Intrinsic Functions ..
-	      INTRINSIC          INT
-*     ..
-*     .. Executable Statements ..*
-*
-      IF (A-INT(A).EQ.0) THEN
-          SCEIL = A
-      ELSE IF (A.GT.0) THEN
-          SCEIL = INT(A)+1;
-      ELSE
-          SCEIL = INT(A)
-      END IF
-
-      RETURN
-*
-      END

From f524594d27cb1e9c6ea847f42df260eeb2a34071 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 10:11:30 +0200
Subject: [PATCH 129/718] Replace SCEIL with CEILING intrinsic
 (Reference-LAPACK PR 847)

---
 lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f | 13 ++++++-------
 lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f | 11 +++++------
 lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f | 11 +++++------
 lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f | 13 ++++++-------
 4 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
index 46eaf33b9..743731a00 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
@@ -23,7 +23,7 @@ C> \brief \b CGEQRF VARIANT: left-looking Level 3 BLAS version of the algorithm.
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> CGEQRF computes a QR factorization of a real M-by-N matrix A:
+C> CGEQRF computes a QR factorization of a complex M-by-N matrix A:
 C> A = Q * R.
 C>
 C> This is the left-looking Level 3 BLAS version of the algorithm.
@@ -172,12 +172,11 @@ C>
       EXTERNAL           CGEQR2, CLARFB, CLARFT, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          CEILING, MAX, MIN, REAL
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SCEIL
-      EXTERNAL           ILAENV, SCEIL
+      EXTERNAL           ILAENV
 *     ..
 *     .. Executable Statements ..
 
@@ -205,13 +204,13 @@ C>
 *
 *     So here 4 x 4 is the last T stored in the workspace
 *
-      NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB
+      NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB
 
 *
 *     optimal workspace = space for dlarfb + space for normal T's + space for the last T
 *
       LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB))
-      LLWORK = SCEIL(REAL(LLWORK)/REAL(NB))
+      LLWORK = CEILING(REAL(LLWORK)/REAL(NB))
 
       IF( K.EQ.0 ) THEN
 
@@ -230,7 +229,7 @@ C>
 
       ELSE
 
-          LBWORK = SCEIL(REAL(K)/REAL(NB))*NB
+          LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
           WORK( 1 ) = LWKOPT
 
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
index 55cab8b23..bbdd46113 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
@@ -172,12 +172,11 @@ C>
       EXTERNAL           DGEQR2, DLARFB, DLARFT, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          CEILING, MAX, MIN, REAL
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SCEIL
-      EXTERNAL           ILAENV, SCEIL
+      EXTERNAL           ILAENV
 *     ..
 *     .. Executable Statements ..
 
@@ -205,13 +204,13 @@ C>
 *
 *     So here 4 x 4 is the last T stored in the workspace
 *
-      NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB
+      NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB
 
 *
 *     optimal workspace = space for dlarfb + space for normal T's + space for the last T
 *
       LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB))
-      LLWORK = SCEIL(REAL(LLWORK)/REAL(NB))
+      LLWORK = CEILING(REAL(LLWORK)/REAL(NB))
 
       IF( K.EQ.0 ) THEN
 
@@ -230,7 +229,7 @@ C>
 
       ELSE
 
-          LBWORK = SCEIL(REAL(K)/REAL(NB))*NB
+          LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
           WORK( 1 ) = LWKOPT
 
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
index d2ad13ced..bf68d635b 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
@@ -172,12 +172,11 @@ C>
       EXTERNAL           SGEQR2, SLARFB, SLARFT, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          CEILING, MAX, MIN, REAL
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SCEIL
-      EXTERNAL           ILAENV, SCEIL
+      EXTERNAL           ILAENV
 *     ..
 *     .. Executable Statements ..
 
@@ -205,13 +204,13 @@ C>
 *
 *     So here 4 x 4 is the last T stored in the workspace
 *
-      NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB
+      NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB
 
 *
 *     optimal workspace = space for dlarfb + space for normal T's + space for the last T
 *
       LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB))
-      LLWORK = SCEIL(REAL(LLWORK)/REAL(NB))
+      LLWORK = CEILING(REAL(LLWORK)/REAL(NB))
 
       IF( K.EQ.0 ) THEN
 
@@ -230,7 +229,7 @@ C>
 
       ELSE
 
-          LBWORK = SCEIL(REAL(K)/REAL(NB))*NB
+          LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
           WORK( 1 ) = LWKOPT
 
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
index 623b88a8a..06918568e 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
@@ -23,7 +23,7 @@ C> \brief \b ZGEQRF VARIANT: left-looking Level 3 BLAS of the algorithm.
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> ZGEQRF computes a QR factorization of a real M-by-N matrix A:
+C> ZGEQRF computes a QR factorization of a complex M-by-N matrix A:
 C> A = Q * R.
 C>
 C> This is the left-looking Level 3 BLAS version of the algorithm.
@@ -172,12 +172,11 @@ C>
       EXTERNAL           ZGEQR2, ZLARFB, ZLARFT, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          CEILING, MAX, MIN, REAL
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SCEIL
-      EXTERNAL           ILAENV, SCEIL
+      EXTERNAL           ILAENV
 *     ..
 *     .. Executable Statements ..
 
@@ -205,13 +204,13 @@ C>
 *
 *     So here 4 x 4 is the last T stored in the workspace
 *
-      NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB
+      NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB
 
 *
 *     optimal workspace = space for dlarfb + space for normal T's + space for the last T
 *
       LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB))
-      LLWORK = SCEIL(REAL(LLWORK)/REAL(NB))
+      LLWORK = CEILING(REAL(LLWORK)/REAL(NB))
 
       IF( K.EQ.0 ) THEN
 
@@ -230,7 +229,7 @@ C>
 
       ELSE
 
-          LBWORK = SCEIL(REAL(K)/REAL(NB))*NB
+          LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
           WORK( 1 ) = LWKOPT
 

From 0e55702b80bde454dfd109149ef50f35ef0ff085 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 11:17:21 +0200
Subject: [PATCH 130/718] Use dynamic allocation in ?SYL01 tests
 (Reference-LAPACK PR 854)

---
 lapack-netlib/TESTING/EIG/csyl01.f | 33 +++++++++++++++++++++++-----
 lapack-netlib/TESTING/EIG/dsyl01.f | 35 +++++++++++++++++++++++++-----
 lapack-netlib/TESTING/EIG/ssyl01.f | 35 +++++++++++++++++++++++++-----
 lapack-netlib/TESTING/EIG/zsyl01.f | 33 +++++++++++++++++++++++-----
 4 files changed, 114 insertions(+), 22 deletions(-)

diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f
index 82d790daa..8a3cd1ae5 100644
--- a/lapack-netlib/TESTING/EIG/csyl01.f
+++ b/lapack-netlib/TESTING/EIG/csyl01.f
@@ -120,14 +120,16 @@
       COMPLEX            RMUL
 *     ..
 *     .. Local Arrays ..
-      COMPLEX            A( MAXM, MAXM ), B( MAXN, MAXN ),
-     $                   C( MAXM, MAXN ), CC( MAXM, MAXN ),
-     $                   X( MAXM, MAXN ),
-     $                   DUML( MAXM ), DUMR( MAXN ),
+      COMPLEX            DUML( MAXM ), DUMR( MAXN ),
      $                   D( MAX( MAXM, MAXN ) )
-      REAL               SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 )
+      REAL               DUM( MAXN ), VM( 2 )
       INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 )
 *     ..
+*     .. Allocatable Arrays ..
+      INTEGER            AllocateStatus
+      COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X
+      REAL,    DIMENSION(:,:), ALLOCATABLE :: SWORK
+*     ..
 *     .. External Functions ..
       LOGICAL            SISNAN
       REAL               SLAMCH, CLANGE
@@ -139,6 +141,20 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, MAX
 *     ..
+*     .. Allocate memory dynamically ..
+      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+*     ..
 *     .. Executable Statements ..
 *
 *     Get machine parameters
@@ -286,6 +302,13 @@
             END DO
          END DO
       END DO
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (CC, STAT = AllocateStatus)
+      DEALLOCATE (X, STAT = AllocateStatus)
+      DEALLOCATE (SWORK, STAT = AllocateStatus)
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f
index 782d2cd42..0ea481382 100644
--- a/lapack-netlib/TESTING/EIG/dsyl01.f
+++ b/lapack-netlib/TESTING/EIG/dsyl01.f
@@ -117,13 +117,15 @@
      $                   SCALE, SCALE3, SMLNUM, TNRM, XNRM
 *     ..
 *     .. Local Arrays ..
-      DOUBLE PRECISION   A( MAXM, MAXM ), B( MAXN, MAXN ),
-     $                   C( MAXM, MAXN ), CC( MAXM, MAXN ),
-     $                   X( MAXM, MAXN ),
-     $                   DUML( MAXM ), DUMR( MAXN ),
+      DOUBLE PRECISION   DUML( MAXM ), DUMR( MAXN ),
      $                   D( MAX( MAXM, MAXN ) ), DUM( MAXN ),
-     $                   SWORK( LDSWORK, 126 ), VM( 2 )
-      INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 )
+     $                   VM( 2 )
+      INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 )
+*     ..
+*     .. Allocatable Arrays ..
+      INTEGER            AllocateStatus
+      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X,
+     $                   SWORK
 *     ..
 *     .. External Functions ..
       LOGICAL            DISNAN
@@ -136,6 +138,20 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, MAX
 *     ..
+*     .. Allocate memory dynamically ..
+      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( SWORK( LDSWORK, 126 ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+*     ..
 *     .. Executable Statements ..
 *
 *     Get machine parameters
@@ -280,6 +296,13 @@
             END DO
          END DO
       END DO
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (CC, STAT = AllocateStatus)
+      DEALLOCATE (X, STAT = AllocateStatus)
+      DEALLOCATE (SWORK, STAT = AllocateStatus)
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f
index 22d089dc8..fda30a3c0 100644
--- a/lapack-netlib/TESTING/EIG/ssyl01.f
+++ b/lapack-netlib/TESTING/EIG/ssyl01.f
@@ -117,13 +117,15 @@
      $                   SCALE, SCALE3, SMLNUM, TNRM, XNRM
 *     ..
 *     .. Local Arrays ..
-      REAL               A( MAXM, MAXM ), B( MAXN, MAXN ),
-     $                   C( MAXM, MAXN ), CC( MAXM, MAXN ),
-     $                   X( MAXM, MAXN ),
-     $                   DUML( MAXM ), DUMR( MAXN ),
+      REAL               DUML( MAXM ), DUMR( MAXN ),
      $                   D( MAX( MAXM, MAXN ) ), DUM( MAXN ),
-     $                   SWORK( LDSWORK, 54 ), VM( 2 )
-      INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 )
+     $                   VM( 2 )
+      INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 )
+*     ..
+*     .. Allocatable Arrays ..
+      INTEGER            AllocateStatus
+      REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X,
+     $                   SWORK
 *     ..
 *     .. External Functions ..
       LOGICAL            SISNAN
@@ -136,6 +138,20 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, MAX
 *     ..
+*     .. Allocate memory dynamically ..
+      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+*     ..
 *     .. Executable Statements ..
 *
 *     Get machine parameters
@@ -280,6 +296,13 @@
             END DO
          END DO
       END DO
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (CC, STAT = AllocateStatus)
+      DEALLOCATE (X, STAT = AllocateStatus)
+      DEALLOCATE (SWORK, STAT = AllocateStatus)
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f
index 329f39dc4..5d26d494c 100644
--- a/lapack-netlib/TESTING/EIG/zsyl01.f
+++ b/lapack-netlib/TESTING/EIG/zsyl01.f
@@ -120,14 +120,16 @@
       COMPLEX*16         RMUL
 *     ..
 *     .. Local Arrays ..
-      COMPLEX*16         A( MAXM, MAXM ), B( MAXN, MAXN ),
-     $                   C( MAXM, MAXN ), CC( MAXM, MAXN ),
-     $                   X( MAXM, MAXN ),
-     $                   DUML( MAXM ), DUMR( MAXN ),
+      COMPLEX*16         DUML( MAXM ), DUMR( MAXN ),
      $                   D( MAX( MAXM, MAXN ) )
-      DOUBLE PRECISION   SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 )
+      DOUBLE PRECISION   DUM( MAXN ), VM( 2 )
       INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 )
 *     ..
+*     .. Allocatable Arrays ..
+      INTEGER            AllocateStatus
+      COMPLEX*16,       DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X
+      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: SWORK
+*     ..
 *     .. External Functions ..
       LOGICAL            DISNAN
       DOUBLE PRECISION   DLAMCH, ZLANGE
@@ -139,6 +141,20 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, MAX, SQRT
 *     ..
+*     .. Allocate memory dynamically ..
+      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( SWORK( LDSWORK, 103 ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+*     ..
 *     .. Executable Statements ..
 *
 *     Get machine parameters
@@ -286,6 +302,13 @@
             END DO
          END DO
       END DO
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (CC, STAT = AllocateStatus)
+      DEALLOCATE (X, STAT = AllocateStatus)
+      DEALLOCATE (SWORK, STAT = AllocateStatus)
 *
       RETURN
 *

From 73b30b1dec1d0e9e50e3486854574b5f6097cefd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 18 Jun 2023 17:46:29 +0200
Subject: [PATCH 131/718] Fix VLEV_FLOAT/VSEV_FLOAT macros to compile with
 t-head 2.6.1

---
 kernel/riscv64/amax_vector.c     |  8 ++++----
 kernel/riscv64/amin_vector.c     |  8 ++++----
 kernel/riscv64/asum_vector.c     |  8 ++++----
 kernel/riscv64/axpby_vector.c    | 16 ++++++++--------
 kernel/riscv64/axpy_vector.c     | 16 ++++++++--------
 kernel/riscv64/copy_vector.c     | 16 ++++++++--------
 kernel/riscv64/dot_vector.c      |  8 ++++----
 kernel/riscv64/gemv_n_vector.c   | 16 ++++++++--------
 kernel/riscv64/gemv_t_vector.c   |  8 ++++----
 kernel/riscv64/iamax_vector.c    |  8 ++++----
 kernel/riscv64/iamin_vector.c    |  8 ++++----
 kernel/riscv64/imax_vector.c     |  8 ++++----
 kernel/riscv64/imin_vector.c     |  8 ++++----
 kernel/riscv64/izamax_vector.c   |  4 ++--
 kernel/riscv64/izamin_vector.c   |  4 ++--
 kernel/riscv64/max_vector.c      |  8 ++++----
 kernel/riscv64/min_vector.c      |  8 ++++----
 kernel/riscv64/nrm2_vector.c     |  8 ++++----
 kernel/riscv64/rot_vector.c      | 16 ++++++++--------
 kernel/riscv64/scal_vector.c     | 16 ++++++++--------
 kernel/riscv64/swap_vector.c     | 16 ++++++++--------
 kernel/riscv64/symv_L_vector.c   | 16 ++++++++--------
 kernel/riscv64/symv_U_vector.c   | 16 ++++++++--------
 kernel/riscv64/zamax_vector.c    |  4 ++--
 kernel/riscv64/zamin_vector.c    |  4 ++--
 kernel/riscv64/zasum_vector.c    |  8 ++++----
 kernel/riscv64/zaxpby_vector.c   |  8 ++++----
 kernel/riscv64/zaxpy_vector.c    |  8 ++++----
 kernel/riscv64/zcopy_vector.c    |  8 ++++----
 kernel/riscv64/zdot_vector.c     |  8 ++++----
 kernel/riscv64/zgemv_n_vector.c  | 16 ++++++++--------
 kernel/riscv64/zgemv_t_vector.c  |  4 ++--
 kernel/riscv64/zhemv_LM_vector.c |  8 ++++----
 kernel/riscv64/zhemv_UV_vector.c |  8 ++++----
 kernel/riscv64/znrm2_vector.c    |  8 ++++----
 kernel/riscv64/zrot_vector.c     | 16 ++++++++--------
 kernel/riscv64/zscal_vector.c    |  8 ++++----
 kernel/riscv64/zswap_vector.c    | 16 ++++++++--------
 38 files changed, 190 insertions(+), 190 deletions(-)

diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c
index b778d3e55..1b7799340 100644
--- a/kernel/riscv64/amax_vector.c
+++ b/kernel/riscv64/amax_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
index fd2f83dc9..f9b7defae 100644
--- a/kernel/riscv64/amin_vector.c
+++ b/kernel/riscv64/amin_vector.c
@@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c
index a82275153..fc73362bc 100644
--- a/kernel/riscv64/asum_vector.c
+++ b/kernel/riscv64/asum_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c
index 988c57ec2..676dfd474 100644
--- a/kernel/riscv64/axpby_vector.c
+++ b/kernel/riscv64/axpby_vector.c
@@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c
index 98b9f6814..6f921f2d6 100644
--- a/kernel/riscv64/axpy_vector.c
+++ b/kernel/riscv64/axpy_vector.c
@@ -30,18 +30,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #endif
 
diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c
index a46136d6c..fee5e195d 100644
--- a/kernel/riscv64/copy_vector.c
+++ b/kernel/riscv64/copy_vector.c
@@ -28,17 +28,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m8(n)
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c
index 64efc6c40..f47e0c0b5 100644
--- a/kernel/riscv64/dot_vector.c
+++ b/kernel/riscv64/dot_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -45,8 +45,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c
index 32ca8618b..bb9ab8e5a 100644
--- a/kernel/riscv64/gemv_n_vector.c
+++ b/kernel/riscv64/gemv_n_vector.c
@@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #endif
 
diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c
index 7683641fa..7d0b70cbb 100644
--- a/kernel/riscv64/gemv_t_vector.c
+++ b/kernel/riscv64/gemv_t_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -46,8 +46,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c
index ecb4cd7a9..9fea522f7 100644
--- a/kernel/riscv64/iamax_vector.c
+++ b/kernel/riscv64/iamax_vector.c
@@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
@@ -60,8 +60,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
index c72bb94cc..4e81e7848 100644
--- a/kernel/riscv64/iamin_vector.c
+++ b/kernel/riscv64/iamin_vector.c
@@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
@@ -61,8 +61,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c
index c2d787ab8..ca48a3c48 100644
--- a/kernel/riscv64/imax_vector.c
+++ b/kernel/riscv64/imax_vector.c
@@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
@@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c
index dfe9a3310..2a677098d 100644
--- a/kernel/riscv64/imin_vector.c
+++ b/kernel/riscv64/imin_vector.c
@@ -36,8 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
@@ -59,8 +59,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c
index fdbdc3ae8..66a101566 100644
--- a/kernel/riscv64/izamax_vector.c
+++ b/kernel/riscv64/izamax_vector.c
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
@@ -63,7 +63,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c
index 59c720310..818193a9e 100644
--- a/kernel/riscv64/izamin_vector.c
+++ b/kernel/riscv64/izamin_vector.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
@@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c
index b988513c9..7f31e9a53 100644
--- a/kernel/riscv64/max_vector.c
+++ b/kernel/riscv64/max_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
@@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c
index be0803df6..14b7e01ed 100644
--- a/kernel/riscv64/min_vector.c
+++ b/kernel/riscv64/min_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
@@ -44,8 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c
index 2a83e2a52..cf6fdb741 100644
--- a/kernel/riscv64/nrm2_vector.c
+++ b/kernel/riscv64/nrm2_vector.c
@@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -55,8 +55,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c
index 9b48d1c69..43a65e552 100644
--- a/kernel/riscv64/rot_vector.c
+++ b/kernel/riscv64/rot_vector.c
@@ -31,10 +31,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFMSACVF_FLOAT vfmsac_vf_f32m4
@@ -42,10 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFMSACVF_FLOAT vfmsac_vf_f64m4
diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c
index 7a3153b7c..8b9ef5a3e 100644
--- a/kernel/riscv64/scal_vector.c
+++ b/kernel/riscv64/scal_vector.c
@@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m8(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #define VFMULVF_FLOAT vfmul_vf_f32m8
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #define VFMULVF_FLOAT vfmul_vf_f64m8
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
 #endif
diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c
index d9421e2f1..b16592808 100644
--- a/kernel/riscv64/swap_vector.c
+++ b/kernel/riscv64/swap_vector.c
@@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m8(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c
index 6588f4dda..58ec17b03 100644
--- a/kernel/riscv64/symv_L_vector.c
+++ b/kernel/riscv64/symv_L_vector.c
@@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,10 +48,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c
index 31104eae6..34ff0e30a 100644
--- a/kernel/riscv64/symv_U_vector.c
+++ b/kernel/riscv64/symv_U_vector.c
@@ -32,10 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c
index 9dbeba90f..bfb282ae0 100644
--- a/kernel/riscv64/zamax_vector.c
+++ b/kernel/riscv64/zamax_vector.c
@@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c
index dc58075ac..d9eca7f10 100644
--- a/kernel/riscv64/zamin_vector.c
+++ b/kernel/riscv64/zamin_vector.c
@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c
index 8386ab62e..0d1cc42f1 100644
--- a/kernel/riscv64/zasum_vector.c
+++ b/kernel/riscv64/zasum_vector.c
@@ -34,8 +34,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
 #define MASK_T vbool4_t
 #define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
@@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
 #define MASK_T vbool8_t
 #define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c
index 3eca20415..5e6034ac5 100644
--- a/kernel/riscv64/zaxpby_vector.c
+++ b/kernel/riscv64/zaxpby_vector.c
@@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
@@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
index 303d3541e..4ccfe4a81 100644
--- a/kernel/riscv64/zaxpy_vector.c
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
 #endif
diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c
index 600f02bba..55a480a35 100644
--- a/kernel/riscv64/zcopy_vector.c
+++ b/kernel/riscv64/zcopy_vector.c
@@ -29,13 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #endif
 
 
diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c
index ec38ed9d2..0900206b3 100644
--- a/kernel/riscv64/zdot_vector.c
+++ b/kernel/riscv64/zdot_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c
index b5ee1f054..3095c28f9 100644
--- a/kernel/riscv64/zgemv_n_vector.c
+++ b/kernel/riscv64/zgemv_n_vector.c
@@ -29,19 +29,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n) vsetvl_e32m4(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
 #else
 #define VSETVL(n) vsetvl_e64m4(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
 #endif
diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c
index e930dc2a2..a7a8a5279 100644
--- a/kernel/riscv64/zgemv_t_vector.c
+++ b/kernel/riscv64/zgemv_t_vector.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c
index 275ee9131..0a284a999 100644
--- a/kernel/riscv64/zhemv_LM_vector.c
+++ b/kernel/riscv64/zhemv_LM_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c
index 2f46977d4..33b7c9c25 100644
--- a/kernel/riscv64/zhemv_UV_vector.c
+++ b/kernel/riscv64/zhemv_UV_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
@@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
index 59d0e219d..cadabdb75 100644
--- a/kernel/riscv64/znrm2_vector.c
+++ b/kernel/riscv64/znrm2_vector.c
@@ -32,8 +32,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
 #define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m4
 #define VFMVVF_FLOAT vfmv_v_f_f32m4
@@ -53,8 +53,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
 #define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m4
 #define VFMVVF_FLOAT vfmv_v_f_f64m4
diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c
index 2fdd8135a..858dfd173 100644
--- a/kernel/riscv64/zrot_vector.c
+++ b/kernel/riscv64/zrot_vector.c
@@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLEV_FLOAT vle32_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSEV_FLOAT vse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
@@ -41,10 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLEV_FLOAT vle64_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSEV_FLOAT vse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 64323aa3a..d275b75f8 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -30,8 +30,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m4(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT vlse32_v_f32m4
+#define VSSEV_FLOAT vsse32_v_f32m4
 #define VFMACCVF_FLOAT vfmacc_vf_f32m4
 #define VFMULVF_FLOAT vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
@@ -40,8 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e64m4(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT vlse64_v_f64m4
+#define VSSEV_FLOAT vsse64_v_f64m4
 #define VFMACCVF_FLOAT vfmacc_vf_f64m4
 #define VFMULVF_FLOAT vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
index 7550294b5..c1dcaccab 100644
--- a/kernel/riscv64/zswap_vector.c
+++ b/kernel/riscv64/zswap_vector.c
@@ -31,18 +31,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n) vsetvl_e32m8(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
+#define VSEV_FLOAT vse32_v_f32m8
+#define VSSEV_FLOAT vsse32_v_f32m8
 #else
 #define VSETVL(n) vsetvl_e64m8(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
+#define VSEV_FLOAT vse64_v_f64m8
+#define VSSEV_FLOAT vsse64_v_f64m8
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)

From 655058dd2177ce07d3a250568992473fd7347683 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 19 Jun 2023 14:58:35 +0200
Subject: [PATCH 132/718] fix spurious compiler error and more extraneous
 quotes

---
 c_check | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/c_check b/c_check
index 2e0e377bd..7c8494e4a 100755
--- a/c_check
+++ b/c_check
@@ -31,11 +31,11 @@ flags="$*"
 
 cross_suffix=""
 
-if [ "`dirname \"$compiler_name\"`" != '.' ]; then
-    cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
+if [ "`dirname "$compiler_name"`" != '.' ]; then
+    cross_suffix="$cross_suffix`dirname "$compiler_name"`/"
 fi
 
-cn= `echo $compiler_name | sed -e 's/ -.*//'`
+cn=`echo $compiler_name | sed -e 's/ -.*//'`
 bn=`basename "$cn"`
 
 case "$bn" in

From 8d57af540b12dc864d49dc45ebd8dc449b10e41b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 10:42:46 +0200
Subject: [PATCH 133/718] Add Dynamic Mode Decomposition functions
 (Reference-LAPACK PR 736)

---
 lapack-netlib/SRC/cgedmd.f90  |  995 +++++++++++++++++++++++++++++++
 lapack-netlib/SRC/cgedmdq.f90 |  689 +++++++++++++++++++++
 lapack-netlib/SRC/dgedmd.f90  | 1054 +++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/dgedmdq.f90 |  704 ++++++++++++++++++++++
 lapack-netlib/SRC/sgedmd.f90  | 1054 +++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/sgedmdq.f90 |  703 ++++++++++++++++++++++
 lapack-netlib/SRC/zgedmd.f90  |  996 +++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zgedmdq.f90 |  689 +++++++++++++++++++++
 8 files changed, 6884 insertions(+)
 create mode 100644 lapack-netlib/SRC/cgedmd.f90
 create mode 100644 lapack-netlib/SRC/cgedmdq.f90
 create mode 100644 lapack-netlib/SRC/dgedmd.f90
 create mode 100644 lapack-netlib/SRC/dgedmdq.f90
 create mode 100644 lapack-netlib/SRC/sgedmd.f90
 create mode 100644 lapack-netlib/SRC/sgedmdq.f90
 create mode 100644 lapack-netlib/SRC/zgedmd.f90
 create mode 100644 lapack-netlib/SRC/zgedmdq.f90

diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90
new file mode 100644
index 000000000..499489270
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmd.f90
@@ -0,0 +1,995 @@
+      SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+                         K, EIGS, Z, LDZ, RES, B,    LDB,   &
+                         W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+                         RWORK, LRWORK, IWORK, LIWORK, INFO )
+!   March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32
+!.....
+!     Scalar arguments
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LIWORK, LRWORK, LZWORK
+      INTEGER,       INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN)   ::    TOL
+!     Array arguments
+      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                         W(LDW,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+      INTEGER,          INTENT(OUT)   :: IWORK(*)
+!............................................................
+!     Purpose
+!     =======
+!     CGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, CGEDMD computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular
+!     vectors of X. Optionally, CGEDMD returns the residuals
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!......................................................................
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations
+!     expressed in this material are those of the author and
+!     do not necessarily reflect the views of the DARPA SBIR
+!     Program Office
+!============================================================
+!     Distribution Statement A:
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================
+!......................................................................
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix.
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'N' :: No data scaling.
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product X(:,1:K)*W, where X
+!            contains a POD basis (leading left singular vectors
+!            of the data matrix X) and W contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of K, X, W, Z.
+!     'N' :: The eigenvectors are not computed.
+!.....
+!     JOBR (input) CHARACTER*1
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will be
+!            computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.
+!.....
+!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: CGESVD (the QR SVD algorithm)
+!     2 :: CGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M>= 0
+!     The state space dimension (the row dimension of X, Y).
+!.....
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshot pairs
+!     (the number of columns of X and Y).
+!.....
+!     X (input/output) COMPLEX(KIND=WP) M-by-N array
+!   > On entry, X contains the data snapshot matrix X. It is
+!     assumed that the column norms of X are in the range of
+!     the normalized floating point numbers.
+!   < On exit, the leading K columns of X contain a POD basis,
+!     i.e. the leading K left singular vectors of the input
+!     data matrix X, U(:,1:K). All N columns of X contain all
+!     left singular vectors of the input matrix X.
+!     See the descriptions of K, Z and W.
+!.....
+!     LDX (input) INTEGER, LDX >= M
+!     The leading dimension of the array X.
+!.....
+!     Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array
+!   > On entry, Y contains the data snapshot matrix Y
+!   < On exit,
+!     If JOBR == 'R', the leading K columns of Y  contain
+!     the residual vectors for the computed Ritz pairs.
+!     See the description of RES.
+!     If JOBR == 'N', Y contains the original input data,
+!                     scaled according to the value of JOBS.
+!.....
+!     LDY (input) INTEGER , LDY >= M
+!     The leading dimension of the array Y.
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.
+!     The numerical rank can be enforced by using positive
+!     value of NRNK as follows:
+!     0 < NRNK <= N :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the descriptions of TOL and  K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.
+!.....
+!     K (output) INTEGER,  0 <= K <= N
+!     The dimension of the POD basis for the data snapshot
+!     matrix X and the number of the computed Ritz pairs.
+!     The value of K is determined according to the rule set
+!     by the parameters NRNK and TOL.
+!     See the descriptions of NRNK and TOL.
+!.....
+!     EIGS (output) COMPLEX(KIND=WP) N-by-1 array
+!     The leading K (K<=N) entries of EIGS contain
+!     the computed eigenvalues (Ritz values).
+!     See the descriptions of K, and Z.
+!.....
+!     Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array
+!     If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i)
+!     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!     the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
+!     is an eigenvector corresponding to EIGS(i). The columns
+!     of W(1:k,1:K) are the computed eigenvectors of the
+!     K-by-K Rayleigh quotient.
+!     See the descriptions of EIGS, X and W.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) N-by-1 array
+!     RES(1:K) contains the residuals for the K computed
+!     Ritz pairs,
+!     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
+!     See the description of EIGS and Z.
+!.....
+!     B (output) COMPLEX(KIND=WP)  M-by-N array.
+!     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further
+!     details in the provided references.
+!     If JOBF == 'E', B(1:M,1:K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.
+!     If JOBF =='N', then B is not referenced.
+!     See the descriptions of X, W, K.
+!.....
+!     LDB (input) INTEGER, LDB >= M
+!     The leading dimension of the array B.
+!.....
+!     W (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!     On exit, W(1:K,1:K) contains the K computed
+!     eigenvectors of the matrix Rayleigh quotient.
+!     The Ritz vectors (returned in Z) are the
+!     product of X (containing a POD basis for the input
+!     matrix X) and W. See the descriptions of K, S, X and Z.
+!     W is also used as a workspace to temporarily store the
+!     right singular vectors of X.
+!.....
+!     LDW (input) INTEGER, LDW >= N
+!     The leading dimension of the array W.
+!.....
+!     S (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by CGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N
+!     The leading dimension of the array S.
+!.....
+!     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+!     ZWORK is used as complex workspace in the complex SVD, as
+!     specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing
+!     the eigenvalues of a Rayleigh quotient.
+!     If the call to CGEDMD is only workspace query, then
+!     ZWORK(1) contains the minimal complex workspace length and
+!     ZWORK(2) is the optimal complex workspace length.
+!     Hence, the length of work is at least 2.
+!     See the description of LZWORK.
+!.....
+!     LZWORK (input) INTEGER
+!     The minimal length of the workspace vector ZWORK.
+!     LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV),
+!     where LZWORK_CGEEV = MAX( 1, 2*N )  and the minimal
+!     LZWORK_SVD is calculated as follows
+!     If WHTSVD == 1 :: CGESVD ::
+!        LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+!     If WHTSVD == 2 :: CGESDD ::
+!        LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+!     If WHTSVD == 3 :: CGESVDQ ::
+!        LZWORK_SVD = obtainable by a query
+!     If WHTSVD == 4 :: CGEJSV ::
+!        LZWORK_SVD = obtainable by a query
+!     If on entry LZWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths and returns them in
+!     LZWORK(1) and LZWORK(2), respectively.
+!.....
+!     RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
+!     On exit, RWORK(1:N) contains the singular values of
+!     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!     If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain
+!     scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
+!     and Y to avoid overflow in the SVD of X.
+!     This may be of interest if the scaling option is off
+!     and as many as possible smallest eigenvalues are
+!     desired to the highest feasible accuracy.
+!     If the call to CGEDMD is only workspace query, then
+!     RWORK(1) contains the minimal workspace length.
+!     See the description of LRWORK.
+!.....
+!     LRWORK (input) INTEGER
+!     The minimal length of the workspace vector RWORK.
+!     LRWORK is calculated as follows:
+!     LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where
+!     LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace
+!     for the SVD subroutine determined by the input parameter
+!     WHTSVD.
+!     If WHTSVD == 1 :: CGESVD ::
+!        LRWORK_SVD = 5*MIN(M,N)
+!     If WHTSVD == 2 :: CGESDD ::
+!        LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
+!        2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
+!     If WHTSVD == 3 :: CGESVDQ ::
+!        LRWORK_SVD = obtainable by a query
+!     If WHTSVD == 4 :: CGEJSV ::
+!        LRWORK_SVD = obtainable by a query
+!     If on entry LRWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     real workspace length and returns it in RWORK(1).
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for  ZWORK, RWORK and
+!     IWORK. See the descriptions of ZWORK, RWORK and IWORK.
+!.....
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+      COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,   &
+                       SSUM,  XSCL1,  XSCL2
+      INTEGER       ::  i,  j, IMINWR,  INFO1, INFO2,   &
+                        LWRKEV, LWRSDD, LWRSVD, LWRSVJ, &
+                       LWRSVQ, MLWORK, MWRKEV, MWRSDD, &
+                       MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, &
+                       OLWORK, MLRWRK
+      LOGICAL       ::  BADXY, LQUERY, SCCOLX, SCCOLY, &
+                        WNTEX, WNTREF, WNTRES, WNTVEC
+      CHARACTER     ::  JOBZL, T_OR_N
+      CHARACTER     ::  JSVOPT
+!
+!     Local arrays
+!     ~~~~~~~~~~~~
+      REAL(KIND=WP) :: RDUMMY(2)
+
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2
+      EXTERNAL      CLANGE, SLAMCH, SCNRM2, ICAMAX
+      INTEGER                               ICAMAX
+      LOGICAL       SISNAN, LSAME
+      EXTERNAL      SISNAN, LSAME
+
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      CAXPY,  CGEMM,  CSSCAL
+      EXTERNAL      CGEEV,  CGEJSV, CGESDD, CGESVD, CGESVDQ, &
+                    CLACPY, CLASCL, CLASSQ, XERBLA
+
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC     FLOAT, INT, MAX, SQRT
+!............................................................
+!
+!    Test the input arguments
+!
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C')
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      INFO   = 0
+      LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) &
+                                  .OR. ( LRWORK == -1 ) )
+!
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. &
+                                  LSAME(JOBS,'N')) )   THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N')        &
+                              .OR. LSAME(JOBZ,'F')) )  THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.  &
+                ( WNTRES .AND. (.NOT.WNTVEC) ) )       THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -4
+      ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR.  &
+                      (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN
+          INFO = -5
+      ELSE IF ( M < 0 )   THEN
+          INFO = -6
+      ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN
+          INFO = -7
+      ELSE IF ( LDX < M ) THEN
+          INFO = -9
+      ELSE IF ( LDY < M ) THEN
+          INFO = -11
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. &
+                ((NRNK >= 1).AND.(NRNK <=N ))) )      THEN
+          INFO = -12
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) )  THEN
+          INFO = -13
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -17
+      ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN
+          INFO = -20
+      ELSE IF ( LDW < N ) THEN
+          INFO = -22
+      ELSE IF ( LDS < N ) THEN
+          INFO = -24
+      END IF
+!
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( N == 0 ) THEN
+             ! Quick return. All output except K is void.
+             ! INFO=1 signals the void input.
+             ! In case of a workspace query, the default
+             ! minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+                IWORK(1) = 1
+                RWORK(1) = 1
+                ZWORK(1) = 2
+                ZWORK(2) = 2
+            ELSE
+               K   =  0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+
+         IMINWR = 1
+         MLRWRK = MAX(1,N)
+         MLWORK = 2
+         OLWORK = 2
+         SELECT CASE ( WHTSVD )
+         CASE (1)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of CGESVD:
+             ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+             MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+             MLWORK = MAX(MLWORK,MWRSVD)
+             MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N))
+             IF ( LQUERY ) THEN
+                CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, &
+                     B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 )
+                LWRSVD = INT( ZWORK(1) )
+                OLWORK = MAX(OLWORK,LWRSVD)
+             END IF
+         CASE (2)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of CGESDD:
+             ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N).
+             ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N)
+             ! In LAPACK 3.10.1 RWORK is defined differently.
+             ! Below we take max over the two versions.
+             ! IMINWR = 8*MIN(M,N)
+             MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+             MLWORK = MAX(MLWORK,MWRSDD)
+             IMINWR = 8*MIN(M,N)
+             MLRWRK = MAX( MLRWRK,  N +                    &
+                      MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), &
+                           5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), &
+                           2*MAX(M,N)*MIN(M,N)+            &
+                           2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
+             IF ( LQUERY ) THEN
+                CALL CGESDD( 'O', M, N, X, LDX, RWORK, B,     &
+                     LDB, W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 )
+                LWRSDD = MAX(MWRSDD,INT( ZWORK(1) ))
+                OLWORK = MAX(OLWORK,LWRSDD)
+             END IF
+         CASE (3)
+             CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                  X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK,  &
+                  IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVQ = INT(ZWORK(2))
+             MLWORK = MAX(MLWORK,MWRSVQ)
+             MLRWRK = MAX(MLRWRK,N + INT(RDUMMY(1)))
+             IF ( LQUERY ) THEN
+                LWRSVQ = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,LWRSVQ)
+             END IF
+         CASE (4)
+             JSVOPT = 'J'
+             CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, &
+                   N, X, LDX, RWORK, Z, LDZ, W, LDW,       &
+                   ZWORK, -1, RDUMMY, -1, IWORK, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVJ = INT(ZWORK(2))
+             MLWORK = MAX(MLWORK,MWRSVJ)
+             MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1))))
+             IF ( LQUERY ) THEN
+                LWRSVJ = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,LWRSVJ)
+             END IF
+         END SELECT
+         IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN
+             JOBZL = 'V'
+         ELSE
+             JOBZL = 'N'
+         END IF
+         ! Workspace calculation to the CGEEV call
+         MWRKEV = MAX( 1, 2*N )
+         MLWORK = MAX(MLWORK,MWRKEV)
+         MLRWRK = MAX(MLRWRK,N+2*N)
+         IF ( LQUERY ) THEN
+             CALL CGEEV( 'N', JOBZL, N, S, LDS, EIGS, &
+              W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 ) ! LAPACK CALL
+                LWRKEV = INT(ZWORK(1))
+                OLWORK = MAX( OLWORK, LWRKEV )
+                OLWORK = MAX( 2, OLWORK )
+         END IF
+!
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30
+         IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28
+         IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26
+
+      END IF
+!
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'CGEDMD', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          RWORK(1) = MLRWRK
+          ZWORK(1) = MLWORK
+          ZWORK(2) = OLWORK
+          RETURN
+      END IF
+!............................................................
+!
+      OFL   = SLAMCH('O')*SLAMCH('P')
+      SMALL = SLAMCH('S')
+      BADXY = .FALSE.
+!
+!     <1> Optional scaling of the snapshots (columns of X, Y)
+!     ==========================================================
+      IF ( SCCOLX ) THEN
+          ! The columns of X will be normalized.
+          ! To prevent overflows, the column norms of X are
+          ! carefully computed using CLASSQ.
+          K = 0
+          DO i = 1, N
+            !WORK(i) = SCNRM2( M, X(1,i), 1 )
+            SCALE  = ZERO
+            CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM )
+            IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -8
+                CALL XERBLA('CGEDMD',-INFO)
+            END IF
+            IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of X(:,i) overflows. First, X(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2.
+!                 Next, the norm of X(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of X(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, X(1,i), LDX, INFO2 )
+                  RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  RWORK(i) =   SCALE * ROOTSC
+                  CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, &
+                               X(1,i), LDX, INFO2 )             ! LAPACK CALL
+!                 X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               RWORK(i) = ZERO
+               K = K + 1
+            END IF
+          END DO
+          IF ( K == N ) THEN
+          ! All columns of X are zero. Return error code -8.
+          ! (the 8th input variable had an illegal value)
+          K = 0
+          INFO = -8
+          CALL XERBLA('CGEDMD',-INFO)
+          RETURN
+          END IF
+          DO i = 1, N
+!           Now, apply the same scaling to the columns of Y.
+            IF ( RWORK(i) >  ZERO ) THEN
+                CALL CSSCAL( M, ONE/RWORK(i), Y(1,i), 1 ) ! BLAS CALL
+!               Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)      ! INTRINSIC
+            ELSE IF ( RWORK(i) < ZERO ) THEN
+                CALL CLASCL( 'G', 0, 0, -RWORK(i),          &
+                     ONE/FLOAT(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL
+            ELSE IF ( ABS(Y(ICAMAX(M, Y(1,i),1),i ))  &
+                                            /= ZERO ) THEN
+!               X(:,i) is zero vector. For consistency,
+!               Y(:,i) should also be zero. If Y(:,i) is not
+!               zero, then the data might be inconsistent or
+!               corrupted. If JOBS == 'C', Y(:,i) is set to
+!               zero and a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+                IF ( LSAME(JOBS,'C')) &
+                CALL CSSCAL( M, ZERO, Y(1,i), 1 )  ! BLAS CALL
+            END IF
+          END DO
+      END IF
+  !
+      IF ( SCCOLY ) THEN
+          ! The columns of Y will be normalized.
+          ! To prevent overflows, the column norms of Y are
+          ! carefully computed using CLASSQ.
+          DO i = 1, N
+            !RWORK(i) = SCNRM2( M, Y(1,i), 1 )
+            SCALE  = ZERO
+            CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+            IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -10
+                CALL XERBLA('CGEDMD',-INFO)
+            END IF
+            IF ( SCALE /= ZERO  .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of Y(:,i) overflows. First, Y(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!                 Next, the norm of Y(:,i) is stored without
+!                 overflow as RWORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of Y(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, Y(1,i), LDY, INFO2 )
+                  RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) )
+               ELSE
+!                 Y(:,i) will be scaled to unit 2-norm
+                  RWORK(i) =   SCALE * ROOTSC
+                  CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, &
+                               Y(1,i), LDY, INFO2 )              ! LAPACK CALL
+!                 Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               RWORK(i) = ZERO
+            END IF
+         END DO
+         DO i = 1, N
+!           Now, apply the same scaling to the columns of X.
+            IF ( RWORK(i) >  ZERO ) THEN
+                CALL CSSCAL( M, ONE/RWORK(i), X(1,i), 1 )  ! BLAS CALL
+!               X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)      ! INTRINSIC
+            ELSE IF ( RWORK(i) < ZERO ) THEN
+                CALL CLASCL( 'G', 0, 0, -RWORK(i),          &
+                     ONE/FLOAT(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL
+            ELSE IF ( ABS(X(ICAMAX(M, X(1,i),1),i ))  &
+                                           /= ZERO ) THEN
+!               Y(:,i) is zero vector.  If X(:,i) is not
+!               zero, then a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+            END IF
+         END DO
+       END IF
+!
+!     <2> SVD of the data snapshot matrix X.
+!     =====================================
+!     The left singular vectors are stored in the array X.
+!     The right singular vectors are in the array W.
+!     The array W will later on contain the eigenvectors
+!     of a Rayleigh quotient.
+      NUMRNK = N
+      SELECT CASE ( WHTSVD )
+         CASE (1)
+             CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, &
+                  LDB, W, LDW, ZWORK, LZWORK,  RWORK(N+1), INFO1 ) ! LAPACK CALL
+             T_OR_N = 'C'
+         CASE (2)
+            CALL CGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, &
+                 LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 )   ! LAPACK CALL
+            T_OR_N = 'C'
+         CASE (3)
+              CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                   X, LDX, RWORK, Z, LDZ, W, LDW, &
+                   NUMRNK, IWORK, LIWORK, ZWORK,     &
+                   LZWORK, RWORK(N+1), LRWORK-N, INFO1)     ! LAPACK CALL
+              CALL CLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX )   ! LAPACK CALL
+         T_OR_N = 'C'
+         CASE (4)
+              CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, &
+                   N, X, LDX, RWORK, Z, LDZ, W, LDW, &
+                   ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 )    ! LAPACK CALL
+              CALL CLACPY( 'A', M, N, Z, LDZ, X, LDX )   ! LAPACK CALL
+              T_OR_N = 'N'
+              XSCL1 = RWORK(N+1)
+              XSCL2 = RWORK(N+2)
+              IF ( XSCL1 /=  XSCL2 ) THEN
+                 ! This is an exceptional situation. If the
+                 ! data matrices are not scaled and the
+                 ! largest singular value of X overflows.
+                 ! In that case CGEJSV can return the SVD
+                 ! in scaled form. The scaling factor can be used
+                 ! to rescale the data (X and Y).
+                 CALL CLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2  )
+              END IF
+      END SELECT
+!
+      IF ( INFO1 > 0 ) THEN
+         ! The SVD selected subroutine did not converge.
+         ! Return with an error code.
+         INFO = 2
+         RETURN
+      END IF
+!
+      IF ( RWORK(1) == ZERO ) THEN
+          ! The largest computed singular value of (scaled)
+          ! X is zero. Return error code -8
+          ! (the 8th input variable had an illegal value).
+          K = 0
+          INFO = -8
+          CALL XERBLA('CGEDMD',-INFO)
+          RETURN
+      END IF
+!
+      !<3> Determine the numerical rank of the data
+      !    snapshots matrix X. This depends on the
+      !    parameters NRNK and TOL.
+
+      SELECT CASE ( NRNK )
+          CASE ( -1 )
+               K = 1
+               DO i = 2, NUMRNK
+                 IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. &
+                      ( RWORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE ( -2 )
+               K = 1
+               DO i = 1, NUMRNK-1
+                 IF ( ( RWORK(i+1) <= RWORK(i)*TOL  ) .OR. &
+                      ( RWORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE DEFAULT
+               K = 1
+               DO i = 2, NRNK
+                  IF ( RWORK(i) <= SMALL ) EXIT
+                  K = K + 1
+               END DO
+          END SELECT
+      !   Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !   snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^H * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^H is in
+      !    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+           CALL CSSCAL( N, ONE/RWORK(i), W(1,i), 1 )   ! BLAS CALL
+           ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)      ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that CGESVD, CGESVDQ and CGESDD return the
+          ! adjoint matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL DSCAL( N, ONE/RWORK(i), W(i,1), LDW )  ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC
+          !END DO
+          DO i = 1, K
+              RWORK(N+i) = ONE/RWORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                 W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside CGEDMD).
+          CALL CGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, &
+                      LDW, ZZERO, Z, LDZ )                       ! BLAS CALL
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRINSIC, for T_OR_N=='T'
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRINSIC, for T_OR_N=='N'
+          !
+          ! At this point Z contains
+          ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+          ! this is needed for computing the residuals.
+          ! This matrix is  returned in the array B and
+          ! it can be used to compute refined Ritz vectors.
+          CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+          ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+          CALL CGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, &
+                      LDZ, ZZERO, S, LDS )                       ! BLAS CALL
+          ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC
+          ! At this point S = U^H * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL CGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, &
+                   ZZERO, Z, LDZ )                                  ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! INTRINSIC
+        !
+        CALL CGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, &
+                    LDW, ZZERO, S, LDS )                        ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^H * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^H is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+          IF ( LSAME(T_OR_N, 'N') ) THEN
+              CALL CLACPY( 'A', N, K, W, LDW, Z, LDZ )
+          ELSE
+              CALL CLACPY( 'A', K, N, W, LDW, Z, LDZ )
+          END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !   right eigenvectors of the Rayleigh quotient.
+      !
+      CALL CGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, &
+           LDW, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 )  ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient.  See the description of Z.
+      ! Also, see the description of CGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! CGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and,
+      ! the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+          IF ( WNTRES ) THEN
+              IF ( WNTREF ) THEN
+                ! Here, if the refinement is requested, we have
+                ! A*U(:,1:K) already computed and stored in Z.
+                ! For the residuals, need Y = A * U(:,1;K) * W.
+                CALL CGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, &
+                           LDW, ZZERO, Y, LDY )              ! BLAS CALL
+                ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC
+                ! This frees Z; Y contains A * U(:,1:K) * W.
+              ELSE
+                ! Compute S = V_k * Sigma_k^(-1) * W, where
+                ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z
+                CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                           W, LDW, ZZERO, S, LDS)
+                ! Then, compute Z = Y * S =
+                ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+                ! = A * U(:,1:K) * W(1:K,1:K)
+                CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                           LDS, ZZERO, Z, LDZ)
+                ! Save a copy of Z into Y and free Z for holding
+                ! the Ritz vectors.
+                CALL CLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+                IF ( WNTEX ) CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )
+              END IF
+          ELSE IF ( WNTEX ) THEN
+              ! Compute S = V_k * Sigma_k^(-1) * W, where
+                ! V_k * Sigma_k^(-1) is stored in Z
+                CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                           W, LDW, ZZERO, S, LDS)
+                ! Then, compute Z = Y * S =
+                ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+                ! = A * U(:,1:K) * W(1:K,1:K)
+                CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                           LDS, ZZERO, B, LDB)
+                ! The above call replaces the following two calls
+                ! that were used in the developing-testing phase.
+                ! CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                !           LDS, ZZERO, Z, LDZ)
+                ! Save a copy of Z into Y and free Z for holding
+                ! the Ritz vectors.
+                ! CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )
+          END IF
+!
+          ! Compute the Ritz vectors
+          IF ( WNTVEC ) CALL CGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, &
+                       ZZERO, Z, LDZ )                          ! BLAS CALL
+          ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC
+!
+          IF ( WNTRES ) THEN
+             DO i = 1, K
+                CALL CAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 )       ! BLAS CALL
+                ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)            ! INTRINSIC
+                RES(i) = SCNRM2( M, Y(1,i), 1)                        ! BLAS CALL
+             END DO
+          END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          RWORK(N+1) = XSCL1
+          RWORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+!     ......
+      END SUBROUTINE CGEDMD
+
diff --git a/lapack-netlib/SRC/cgedmdq.f90 b/lapack-netlib/SRC/cgedmdq.f90
new file mode 100644
index 000000000..52c1669c7
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmdq.f90
@@ -0,0 +1,689 @@
+SUBROUTINE CGEDMDQ( JOBS,  JOBZ, JOBR, JOBQ, JOBT, JOBF,   &
+                    WHTSVD,   M, N, F, LDF,  X, LDX,  Y,   &
+                    LDY,   NRNK,  TOL,   K,  EIGS,         &
+                    Z, LDZ, RES,  B,     LDB,   V, LDV,    & 
+                    S, LDS, ZWORK, LZWORK, WORK,  LWORK,   &
+                    IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE 
+      INTEGER, PARAMETER :: WP = real32
+!.....      
+!     Scalar arguments       
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,    &
+                                JOBT, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N,   LDF, LDX,  &
+                                LDY, NRNK, LDZ, LDB, LDV,  &
+                                LDS, LZWORK,  LWORK, LIWORK
+      INTEGER,   INTENT(OUT) :: INFO,   K      
+      REAL(KIND=WP), INTENT(IN)    ::   TOL     
+!     Array arguments      
+      COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*), &
+                                         Z(LDZ,*), B(LDB,*), &
+                                         V(LDV,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP), INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)  
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!.....      
+!     Purpose  
+!     =======
+!     CGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, CGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular 
+!     vectors of X. Optionally, CGEDMDQ returns the residuals 
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].      
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.      
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. 
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations 
+!     expressed in this material are those of the author and 
+!     do not necessarily reflect the views of the DARPA SBIR 
+!     Program Office.      
+!============================================================
+!     Distribution Statement A: 
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022      
+!============================================================      
+!......................................................................      
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y. 
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)    
+!     'N' :: No data scaling.   
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretised operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z.   
+!            Q is from the inital QR facorization.    
+!     'N' :: The eigenvectors are not computed.  
+!.....      
+!     JOBR (input) CHARACTER*1 
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1 
+!     Specifies whether to explicitly compute and return the
+!     unitary matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.       
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1 
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data 
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.       
+!     'N' :: The matrix R is not returned. 
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are 
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.   
+!     To be useful on exit, this option needs JOBQ='Q'.    
+!.....
+!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: CGESVD (the QR SVD algorithm)
+!     2 :: CGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger 
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0 
+!     The state space dimension (the number of rows of F).
+!.....      
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the 
+!     number of columns of F.
+!.....
+!     F (input/output) COMPLEX(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots 
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are 
+!     in the range of the normalized floating point numbers. 
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal 
+!     matrix/factor of the QR factorization of the initial 
+!     data snapshots matrix F. See the description of JOBQ. 
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the 
+!     Householder vectors, as returned by CGEQRF. The 
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). 
+!     See the description of ZWORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M 
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K) of the input data, pre-multiply with the 
+!     Q factor from the initial QR factorization. 
+!     See the descriptions of F, K, V  and Z.
+!.....      
+!     LDX (input) INTEGER, LDX >= N  
+!     The leading dimension of the array X. 
+!.....
+!     Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, 
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....      
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y.   
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.      
+!     The numerical rank can be enforced by using positive 
+!     value of NRNK as follows: 
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.  
+!.....
+!     K (output) INTEGER,  0 <= K <= N 
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the 
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and 
+!     TOL. See the descriptions of NRNK and TOL. 
+!.....
+!     EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N-1) entries of EIGS contain
+!     the computed eigenvalues (Ritz values).
+!     See the descriptions of K, and Z.
+!.....
+!     Z (workspace/output) COMPLEX(KIND=WP)  M-by-(N-1) array
+!     If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i)
+!     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!     Z*V, where Z contains orthonormal matrix (the product of
+!     Q from the initial QR factorization and the SVD/POD_basis
+!     returned by CGEDMD in X) and the second factor (the 
+!     eigenvectors of the Rayleigh quotient) is in the array V, 
+!     as returned by CGEDMD. That is,  X(:,1:K)*V(:,i)
+!     is an eigenvector corresponding to EIGS(i). The columns 
+!     of V(1:K,1:K) are the computed eigenvectors of the 
+!     K-by-K Rayleigh quotient.  
+!     See the descriptions of EIGS, X and V.      
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed 
+!     Ritz pairs, 
+!     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
+!     See the description of EIGS and Z.      
+!.....
+!     B (output) COMPLEX(KIND=WP)  MIN(M,N)-by-(N-1) array.
+!     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further 
+!     details in the provided references. 
+!     If JOBF == 'E', B(1:N,1;K) contains 
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.   
+!     In both cases, the content of B can be lifted to the 
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization. 
+!     Here A denotes a compression of the underlying operator.      
+!     See the descriptions of F and X.
+!     If JOBF =='N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....
+!     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) V contains the K eigenvectors of
+!     the Rayleigh quotient. The Ritz vectors
+!     (returned in Z) are the product of Q from the initial QR
+!     factorization (see the description of F) X (see the 
+!     description of X) and V.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....      
+!     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by CGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1        
+!     The leading dimension of the array S.
+!.....
+!     ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array
+!     On exit, 
+!     ZWORK(1:MIN(M,N)) contains the scalar factors of the 
+!     elementary reflectors as returned by CGEQRF of the 
+!     M-by-N input matrix F.   
+!     If the call to CGEDMDQ is only workspace query, then
+!     ZWORK(1) contains the minimal complex workspace length and
+!     ZWORK(2) is the optimal complex workspace length. 
+!     Hence, the length of work is at least 2.
+!     See the description of LZWORK.      
+!.....      
+!     LZWORK (input) INTEGER
+!     The minimal length of the  workspace vector ZWORK.
+!     LZWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for CGEQRF[M,N])
+!         MLWDMD = minimal workspace for CGEDMD (see the
+!                  description of LWORK in CGEDMD)
+!         MLWMQR = N (minimal workspace for 
+!                    ZUNMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for ZUNGQR[M,N,N])
+!         MINMN  = MIN(M,N)      
+!     Then
+!     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD)
+!     is further updated as follows:
+!        if   JOBZ == 'V' or JOBZ == 'F' THEN 
+!             LZWORK = MAX( LZWORK, MINMN+MLWMQR )
+!        if   JOBQ == 'Q' THEN
+!             LZWORK = MAX( ZLWORK, MINMN+MLWGQR)      
+!
+!.....      
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit,
+!     WORK(1:N-1) contains the singular values of 
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to CGEDMDQ is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the  workspace vector WORK.
+!     LWORK is the same as in CGEDMD, because in CGEDMDQ
+!     only CGEDMD requires real workspace for snapshots
+!     of dimensions MIN(M,N)-by-(N-1).
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.          
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     Let M1=MIN(M,N), N1=N-1. Then      
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!..... 
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.  
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~      
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!     COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+!      
+!     Local scalars      
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1,  MINMN, MLRWRK,   &
+                           MLWDMD, MLWGQR, MLWMQR, MLWORK,  & 
+                           MLWQR,  OLWDMD, OLWGQR, OLWMQR,  &
+                           OLWORK, OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!      
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME 
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      CGEQRF, CLACPY, CLASET, CUNGQR, & 
+                    CUNMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      CGEDMD 
+      
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC      MAX, MIN, INT         
+ !..........................................................  
+ !
+ !    Test the input arguments    
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' )
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTVCF = LSAME(JOBZ,'F')
+      WNTVCQ = LSAME(JOBZ,'Q')
+      WNTREF = LSAME(JOBF,'R') 
+      WNTEX  = LSAME(JOBF,'E')
+      WANTQ  = LSAME(JOBQ,'Q')
+      WNTTRF = LSAME(JOBT,'R')     
+      MINMN  = MIN(M,N)
+      INFO = 0 
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!       
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR.                &
+                                  LSAME(JOBS,'N')) )  THEN 
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ       &
+                              .OR. LSAME(JOBZ,'N')) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.    & 
+          ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN
+           INFO = -4                 
+      ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) )  THEN
+          INFO = -5
+       ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             & 
+                LSAME(JOBF,'N') ) )                     THEN
+          INFO = -6    
+      ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR.   &
+                       (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN
+          INFO = -7
+      ELSE IF ( M < 0 ) THEN
+          INFO = -8
+      ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN
+          INFO = -9
+      ELSE IF ( LDF < M ) THEN
+          INFO = -11
+      ELSE IF ( LDX < MINMN ) THEN
+          INFO = -13
+      ELSE IF ( LDY < MINMN ) THEN
+          INFO = -15
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR.    & 
+                       ((NRNK >= 1).AND.(NRNK <=N ))) )  THEN
+          INFO = -16
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN
+          INFO = -17
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -21
+      ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN
+          INFO = -24
+      ELSE IF ( LDV < N-1 ) THEN
+          INFO = -26
+      ELSE IF ( LDS < N-1 ) THEN
+          INFO = -28
+      END IF
+!      
+      IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN
+          JOBVL = 'V'
+      ELSE
+          JOBVL = 'N'
+      END IF     
+      IF ( INFO == 0 ) THEN  
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and 
+          ! determine minimal and optimal sizes of the 
+          ! workspace at any moment of the run.         
+         IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN
+             ! All output except K is void. INFO=1 signals
+             ! the void input. In case of a workspace query,
+             ! the minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN  
+               IWORK(1) = 1
+                WORK(1) = 2
+                WORK(2) = 2
+            ELSE                
+               K = 0
+            END IF             
+            INFO = 1  
+            RETURN
+         END IF     
+         
+         MLRWRK = 2
+         MLWORK = 2
+         OLWORK = 2 
+         IMINWR = 1
+         MLWQR  = MAX(1,N)  ! Minimal workspace length for CGEQRF.
+         MLWORK = MAX(MLWORK,MINMN + MLWQR) 
+
+         IF ( LQUERY ) THEN 
+             CALL CGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, &
+                          INFO1 )
+             OLWQR  = INT(ZWORK(1))
+             OLWORK = MAX(OLWORK,MINMN + OLWQR)           
+         END IF
+         CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& 
+                      N-1, X, LDX, Y, LDY, NRNK, TOL, K,     & 
+                      EIGS, Z, LDZ, RES,  B, LDB, V, LDV,    & 
+                      S, LDS, ZWORK, LZWORK, WORK, -1, IWORK,&
+                      LIWORK, INFO1 )
+         MLWDMD = INT(ZWORK(1))
+         MLWORK = MAX(MLWORK, MINMN + MLWDMD)
+         MLRWRK = MAX(MLRWRK, INT(WORK(1)))
+         IMINWR = MAX(IMINWR, IWORK(1))
+         IF ( LQUERY ) THEN 
+             OLWDMD = INT(ZWORK(2))
+             OLWORK = MAX(OLWORK, MINMN+OLWDMD)
+         END IF
+         IF ( WNTVEC .OR. WNTVCF ) THEN
+            MLWMQR = MAX(1,N) 
+            MLWORK = MAX(MLWORK, MINMN+MLWMQR)
+            IF ( LQUERY ) THEN
+               CALL CUNMQR( 'L','N', M, N, MINMN, F, LDF,  & 
+                            ZWORK, Z, LDZ, ZWORK, -1, INFO1 )
+               OLWMQR = INT(ZWORK(1))
+               OLWORK = MAX(OLWORK, MINMN+OLWMQR)
+            END IF
+         END IF  
+         IF ( WANTQ ) THEN
+            MLWGQR = MAX(1,N)
+            MLWORK = MAX(MLWORK, MINMN+MLWGQR)
+            IF ( LQUERY ) THEN 
+                CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, &
+                             ZWORK, -1, INFO1 )               
+                OLWGQR = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK, MINMN+OLWGQR)
+            END IF            
+         END IF          
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34
+         IF ( LWORK  < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32
+         IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30
+      END IF  
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'CGEDMDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          ZWORK(1) = MLWORK
+          ZWORK(2) = OLWORK
+          WORK(1)  = MLRWRK
+          WORK(2)  = MLRWRK
+          RETURN
+      END IF   
+!.....	  
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of lower dimensional subspace.
+!     For large scale computation with M >>N , at this place 
+!     one can use an out of core QRF.
+!   
+      CALL CGEQRF( M, N, F, LDF, ZWORK,                & 
+                   ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+!      
+!     Define X and Y as the snapshots representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL CLASET( 'L', MINMN, N-1, ZZERO,  ZZERO, X, LDX )
+      CALL CLACPY( 'U', MINMN, N-1, F,      LDF, X, LDX )
+      CALL CLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( M >= 3 ) THEN
+          CALL CLASET( 'L', MINMN-2, N-2, ZZERO,  ZZERO, &
+                       Y(3,1), LDY )  
+      END IF
+!
+!     Compute the DMD of the projected snapshot pairs (X,Y)   
+      CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, &
+                  N-1,  X, LDX, Y, LDY, NRNK,   TOL, K,    &
+                  EIGS, Z, LDZ, RES, B,  LDB,   V, LDV,    &
+                  S, LDS, ZWORK(MINMN+1), LZWORK-MINMN,    & 
+                  WORK,   LWORK, IWORK, LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code. See CGEDMD for details.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF    
+!      
+!     The Ritz vectors (Koopman modes) can be explicitly 
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.  
+        IF ( M > MINMN ) CALL CLASET( 'A', M-MINMN, K, ZZERO, &
+                                     ZZERO, Z(MINMN+1,1), LDZ )
+        CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z,  &
+             LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+      ELSE IF ( WNTVCF ) THEN   
+        !   Return the Ritz vectors (eigenvectors) in factored
+        !   form Z*V, where Z contains orthonormal matrix (the
+        !   product of Q from the initial QR factorization and 
+        !   the SVD/POD_basis returned by CGEDMD in X) and the 
+        !   second factor (the eigenvectors of the Rayleigh 
+        !   quotient) is in the array V, as returned by CGEDMD.
+        CALL CLACPY( 'A', N, K, X, LDX, Z, LDZ )
+        IF ( M > N ) CALL CLASET( 'A', M-N, K, ZZERO, ZZERO, & 
+                                 Z(N+1,1), LDZ )
+        CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, &
+                    LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+      END IF
+!     
+!     Some optional output variables:
+!
+!     The upper triangular factor R in the initial QR 
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to CGEDMDQ is to be 
+      
+!     followed by a streaming DMD that is implemented in a 
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y 
+         CALL CLASET( 'A', MINMN, N, ZZERO,  ZZERO, Y, LDY )
+         CALL CLACPY( 'U', MINMN, N, F, LDF,        Y, LDY )
+      END IF    
+!
+!     The orthonormal/unitary factor Q in the initial QR 
+!     factorization is optionally returned in the array F. 
+!     Same as with the triangular factor above, this is 
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN                   ! Q overwrites F 
+         CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK,     &
+                      ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )  
+      END IF
+!      
+      RETURN
+!      
+      END SUBROUTINE CGEDMDQ
+    
\ No newline at end of file
diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90
new file mode 100644
index 000000000..20424808f
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmd.f90
@@ -0,0 +1,1054 @@
+      SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+                         K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+                         B, LDB, W,  LDW,   S, LDS,        &
+                         WORK, LWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64
+!.....
+!     Scalar arguments
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LWORK,  LIWORK
+      INTEGER,   INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN)  :: TOL
+!     Array arguments
+      REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                      W(LDW,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!............................................................
+!     Purpose
+!     =======
+!     DGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, DGEDMD computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular
+!     vectors of X. Optionally, DGEDMD returns the residuals
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!......................................................................
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations
+!     expressed in this material are those of the author and
+!     do not necessarily reflect the views of the DARPA SBIR
+!     Program Office
+!============================================================
+!     Distribution Statement A:
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================
+!............................................................
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix.
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'N' :: No data scaling.
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product X(:,1:K)*W, where X
+!            contains a POD basis (leading left singular vectors
+!            of the data matrix X) and W contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of K, X, W, Z.
+!     'N' :: The eigenvectors are not computed.
+!.....
+!     JOBR (input) CHARACTER*1
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will be
+!            computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.
+!.....
+!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: DGESVD (the QR SVD algorithm)
+!     2 :: DGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M>= 0
+!     The state space dimension (the row dimension of X, Y).
+!.....
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshot pairs
+!     (the number of columns of X and Y).
+!.....
+!     X (input/output) REAL(KIND=WP) M-by-N array
+!     > On entry, X contains the data snapshot matrix X. It is
+!     assumed that the column norms of X are in the range of
+!     the normalized floating point numbers.
+!     < On exit, the leading K columns of X contain a POD basis,
+!     i.e. the leading K left singular vectors of the input
+!     data matrix X, U(:,1:K). All N columns of X contain all
+!     left singular vectors of the input matrix X.
+!     See the descriptions of K, Z and W.
+!.....
+!     LDX (input) INTEGER, LDX >= M
+!     The leading dimension of the array X.
+!.....
+!     Y (input/workspace/output) REAL(KIND=WP) M-by-N array
+!     > On entry, Y contains the data snapshot matrix Y
+!     < On exit,
+!     If JOBR == 'R', the leading K columns of Y  contain
+!     the residual vectors for the computed Ritz pairs.
+!     See the description of RES.
+!     If JOBR == 'N', Y contains the original input data,
+!                     scaled according to the value of JOBS.
+!.....
+!     LDY (input) INTEGER , LDY >= M
+!     The leading dimension of the array Y.
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1).
+!                  This option is recommended.
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.
+!
+!     The numerical rank can be enforced by using positive
+!     value of NRNK as follows:
+!     0 < NRNK <= N :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the descriptions of TOL and  K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.
+!.....
+!     K (output) INTEGER,  0 <= K <= N
+!     The dimension of the POD basis for the data snapshot
+!     matrix X and the number of the computed Ritz pairs.
+!     The value of K is determined according to the rule set
+!     by the parameters NRNK and TOL.
+!     See the descriptions of NRNK and TOL.
+!.....
+!     REIG (output) REAL(KIND=WP) N-by-1 array
+!     The leading K (K<=N) entries of REIG contain
+!     the real parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     See the descriptions of K, IMEIG, and Z.
+!.....
+!     IMEIG (output) REAL(KIND=WP) N-by-1 array
+!     The leading K (K<=N) entries of IMEIG contain
+!     the imaginary parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     The eigenvalues are determined as follows:
+!     If IMEIG(i) == 0, then the corresponding eigenvalue is
+!     real, LAMBDA(i) = REIG(i).
+!     If IMEIG(i)>0, then the corresponding complex
+!     conjugate pair of eigenvalues reads
+!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i)
+!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i)
+!     That is, complex conjugate pairs have consecutive
+!     indices (i,i+1), with the positive imaginary part
+!     listed first.
+!     See the descriptions of K, REIG, and Z.
+!.....
+!     Z (workspace/output) REAL(KIND=WP)  M-by-N array
+!     If JOBZ =='V' then
+!        Z contains real Ritz vectors as follows:
+!        If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!        the i-th Ritz value; ||Z(:,i)||_2=1.
+!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!        the Ritz values extracted from this subspace are
+!        REIG(i) + sqrt(-1)*IMEIG(i) and
+!        REIG(i) - sqrt(-1)*IMEIG(i).
+!        The corresponding eigenvectors are
+!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!        || Z(:,i:i+1)||_F = 1.
+!     If JOBZ == 'F', then the above descriptions hold for
+!     the columns of X(:,1:K)*W(1:K,1:K), where the columns
+!     of W(1:k,1:K) are the computed eigenvectors of the
+!     K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
+!     are similarly structured: If IMEIG(i) == 0 then
+!     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
+!     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
+!          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
+!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!     See the descriptions of REIG, IMEIG, X and W.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) N-by-1 array
+!     RES(1:K) contains the residuals for the K computed
+!     Ritz pairs.
+!     If LAMBDA(i) is real, then
+!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
+!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!     then
+!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!     It holds that
+!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!     See the description of REIG, IMEIG and Z.
+!.....
+!     B (output) REAL(KIND=WP)  M-by-N array.
+!     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further
+!     details in the provided references.
+!     If JOBF == 'E', B(1:M,1;K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.
+!     If JOBF =='N', then B is not referenced.
+!     See the descriptions of X, W, K.
+!.....
+!     LDB (input) INTEGER, LDB >= M
+!     The leading dimension of the array B.
+!.....
+!     W (workspace/output) REAL(KIND=WP) N-by-N array
+!     On exit, W(1:K,1:K) contains the K computed
+!     eigenvectors of the matrix Rayleigh quotient (real and
+!     imaginary parts for each complex conjugate pair of the
+!     eigenvalues). The Ritz vectors (returned in Z) are the
+!     product of X (containing a POD basis for the input
+!     matrix X) and W. See the descriptions of K, S, X and Z.
+!     W is also used as a workspace to temporarily store the
+!     right singular vectors of X.
+!.....
+!     LDW (input) INTEGER, LDW >= N
+!     The leading dimension of the array W.
+!.....
+!     S (workspace/output) REAL(KIND=WP) N-by-N array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by DGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N
+!     The leading dimension of the array S.
+!.....
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit, WORK(1:N) contains the singular values of
+!     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!     If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!     scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!     and Y to avoid overflow in the SVD of X.
+!     This may be of interest if the scaling option is off
+!     and as many as possible smallest eigenvalues are
+!     desired to the highest feasible accuracy.
+!     If the call to DGEDMD is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     leng of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the workspace vector WORK.
+!     LWORK is calculated as follows:
+!     If WHTSVD == 1 ::
+!        If JOBZ == 'V', then
+!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
+!        If JOBZ == 'N'  then
+!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
+!        Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
+!        workspace length of DGESVD.
+!     If WHTSVD == 2 ::
+!        If JOBZ == 'V', then
+!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
+!        If JOBZ == 'N', then
+!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
+!        Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
+!        minimal workspace length of DGESDD.
+!     If WHTSVD == 3 ::
+!        If JOBZ == 'V', then
+!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!        If JOBZ == 'N', then
+!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!        Here LWORK_SVD = N+M+MAX(3*N+1,
+!                        MAX(1,3*N+M,5*N),MAX(1,N))
+!        is the minimal workspace length of DGESVDQ.
+!     If WHTSVD == 4 ::
+!        If JOBZ == 'V', then
+!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!        If JOBZ == 'N', then
+!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!        Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
+!        minimal workspace length of DGEJSV.
+!     The above expressions are not simplified in order to
+!     make the usage of WORK more transparent, and for
+!     easier checking. In any case, LWORK >= 2.
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!.....
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      REAL(KIND=WP) :: OFL,    ROOTSC, SCALE,  SMALL,  &
+                       SSUM,   XSCL1,  XSCL2
+      INTEGER       :: i,   j, IMINWR,  INFO1, INFO2,  &
+                       LWRKEV, LWRSDD, LWRSVD,         &
+                       LWRSVQ, MLWORK, MWRKEV, MWRSDD, &
+                       MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, &
+                       OLWORK
+      LOGICAL       :: BADXY,  LQUERY, SCCOLX, SCCOLY, &
+                       WNTEX,  WNTREF, WNTRES, WNTVEC
+      CHARACTER     :: JOBZL,  T_OR_N
+      CHARACTER     :: JSVOPT
+
+!     Local arrays
+!     ~~~~~~~~~~~~
+      REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2)
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      REAL(KIND=WP) DLANGE, DLAMCH, DNRM2
+      EXTERNAL      DLANGE, DLAMCH, DNRM2, IDAMAX
+      INTEGER       IDAMAX
+      LOGICAL       DISNAN, LSAME
+      EXTERNAL      DISNAN, LSAME
+
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      DAXPY,  DGEMM,  DSCAL
+      EXTERNAL      DGEEV,  DGEJSV, DGESDD, DGESVD, DGESVDQ, &
+                    DLACPY, DLASCL, DLASSQ, XERBLA
+
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC     DBLE, INT, MAX, SQRT
+!............................................................
+!
+!    Test the input arguments
+!
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C')
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      INFO   = 0
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. &
+                                  LSAME(JOBS,'N')) )   THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N')        &
+                              .OR. LSAME(JOBZ,'F')) )  THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.  &
+                ( WNTRES .AND. (.NOT.WNTVEC) ) )       THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -4
+      ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR.  &
+                      (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN
+          INFO = -5
+      ELSE IF ( M < 0 )   THEN
+          INFO = -6
+      ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN
+          INFO = -7
+      ELSE IF ( LDX < M ) THEN
+          INFO = -9
+      ELSE IF ( LDY < M ) THEN
+          INFO = -11
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. &
+                ((NRNK >= 1).AND.(NRNK <=N ))) )      THEN
+          INFO = -12
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) )  THEN
+          INFO = -13
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -18
+      ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN
+          INFO = -21
+      ELSE IF ( LDW < N ) THEN
+          INFO = -23
+      ELSE IF ( LDS < N ) THEN
+          INFO = -25
+      END IF
+!
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( N == 0 ) THEN
+             ! Quick return. All output except K is void.
+             ! INFO=1 signals the void input.
+             ! In case of a workspace query, the default
+             ! minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+                IWORK(1) = 1
+                WORK(1)  = 2
+                WORK(2)  = 2
+            ELSE
+               K = 0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+         MLWORK = MAX(2,N)
+         OLWORK = MAX(2,N)
+         IMINWR = 1
+         SELECT CASE ( WHTSVD )
+         CASE (1)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of DGESVD:
+             ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             MLWORK = MAX(MLWORK,N + MWRSVD)
+             IF ( LQUERY ) THEN
+                CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, &
+                           B, LDB, W, LDW, RDUMMY, -1, INFO1 )
+                LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) )
+                OLWORK = MAX(OLWORK,N + LWRSVD)
+             END IF
+         CASE (2)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of DGESDD:
+             ! MWRSDD = 3*MIN(M,N)*MIN(M,N) +
+             ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) )
+             ! IMINWR = 8*MIN(M,N)
+             MWRSDD = 3*MIN(M,N)*MIN(M,N) +                &
+              MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) )
+             MLWORK = MAX(MLWORK,N + MWRSDD)
+             IMINWR = 8*MIN(M,N)
+             IF ( LQUERY ) THEN
+                CALL DGESDD( 'O', M, N, X, LDX, WORK, B,     &
+                     LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 )
+                LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) )
+                OLWORK = MAX(OLWORK,N + LWRSDD)
+             END IF
+         CASE (3)
+             !LWQP3 = 3*N+1
+             !LWORQ = MAX(N, 1)
+             !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2)
+             !MLWORK = N +  MWRSVQ
+             !IMINWR = M+N-1
+             CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                             X, LDX, WORK, Z, LDZ, W, LDW,   &
+                             NUMRNK, IWORK, LIWORK, RDUMMY,  &
+                             -1, RDUMMY2, -1, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVQ = INT(RDUMMY(2))
+             MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1)))
+             IF ( LQUERY ) THEN
+                LWRSVQ = MAX( MWRSVQ, INT(RDUMMY(1)) )
+                OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1)))
+             END IF
+         CASE (4)
+             JSVOPT = 'J'
+             !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! for JSVOPT='V'
+             MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 )
+             MLWORK = MAX(MLWORK,N+MWRSVJ)
+             IMINWR = MAX( 3, M+3*N )
+             IF ( LQUERY ) THEN
+                OLWORK =  MAX(OLWORK,N+MWRSVJ)
+             END IF
+         END SELECT
+         IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN
+             JOBZL = 'V'
+         ELSE
+             JOBZL = 'N'
+         END IF
+         ! Workspace calculation to the DGEEV call
+         IF ( LSAME(JOBZL,'V') ) THEN
+             MWRKEV = MAX( 1, 4*N )
+         ELSE
+             MWRKEV = MAX( 1, 3*N )
+         END IF
+         MLWORK = MAX(MLWORK,N+MWRKEV)
+         IF ( LQUERY ) THEN
+                CALL DGEEV( 'N', JOBZL, N, S, LDS, REIG, &
+                    IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 )
+                LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) )
+                OLWORK = MAX( OLWORK, N+LWRKEV )
+         END IF
+!
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29
+         IF (  LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27
+      END IF
+!
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'DGEDMD', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          WORK(1)  = MLWORK
+          WORK(2)  = OLWORK
+          RETURN
+      END IF
+!............................................................
+!
+      OFL   = DLAMCH('O')
+      SMALL = DLAMCH('S')
+      BADXY = .FALSE.
+!
+!     <1> Optional scaling of the snapshots (columns of X, Y)
+!     ==========================================================
+      IF ( SCCOLX ) THEN
+          ! The columns of X will be normalized.
+          ! To prevent overflows, the column norms of X are
+          ! carefully computed using DLASSQ.
+          K = 0
+          DO i = 1, N
+            !WORK(i) = DNRM2( M, X(1,i), 1 )
+            SCALE  = ZERO
+            CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM )
+            IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -8
+                CALL XERBLA('DGEDMD',-INFO)
+            END IF
+            IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of X(:,i) overflows. First, X(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2.
+!                 Next, the norm of X(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of X(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, X(1,i), M, INFO2 )
+                  WORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  WORK(i) =   SCALE * ROOTSC
+                  CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                               X(1,i), M, INFO2 )              ! LAPACK CALL
+!                 X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               WORK(i) = ZERO
+               K = K + 1
+            END IF
+          END DO
+          IF ( K == N ) THEN
+          ! All columns of X are zero. Return error code -8.
+          ! (the 8th input variable had an illegal value)
+          K = 0
+          INFO = -8
+          CALL XERBLA('DGEDMD',-INFO)
+          RETURN
+          END IF
+          DO i = 1, N
+!           Now, apply the same scaling to the columns of Y.
+            IF ( WORK(i) >  ZERO ) THEN
+                CALL DSCAL( M, ONE/WORK(i), Y(1,i), 1 )  ! BLAS CALL
+!               Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC
+            ELSE IF ( WORK(i) < ZERO ) THEN
+                CALL DLASCL( 'G', 0, 0, -WORK(i),          &
+                     ONE/DBLE(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL
+            ELSE IF ( Y(IDAMAX(M, Y(1,i),1),i )  &
+                                            /= ZERO ) THEN
+!               X(:,i) is zero vector. For consistency,
+!               Y(:,i) should also be zero. If Y(:,i) is not
+!               zero, then the data might be inconsistent or
+!               corrupted. If JOBS == 'C', Y(:,i) is set to
+!               zero and a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+                IF ( LSAME(JOBS,'C')) &
+                CALL DSCAL( M, ZERO, Y(1,i), 1 )  ! BLAS CALL
+            END IF
+          END DO
+      END IF
+  !
+      IF ( SCCOLY ) THEN
+          ! The columns of Y will be normalized.
+          ! To prevent overflows, the column norms of Y are
+          ! carefully computed using DLASSQ.
+          DO i = 1, N
+            !WORK(i) = DNRM2( M, Y(1,i), 1 )
+            SCALE  = ZERO
+            CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+            IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -10
+                CALL XERBLA('DGEDMD',-INFO)
+            END IF
+            IF ( SCALE /= ZERO  .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of Y(:,i) overflows. First, Y(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!                 Next, the norm of Y(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of Y(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, Y(1,i), M, INFO2 )
+                  WORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  WORK(i) =   SCALE * ROOTSC
+                  CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                               Y(1,i), M, INFO2 )              ! LAPACK CALL
+!                 Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               WORK(i) = ZERO
+            END IF
+         END DO
+         DO i = 1, N
+!           Now, apply the same scaling to the columns of X.
+            IF ( WORK(i) >  ZERO ) THEN
+                CALL DSCAL( M, ONE/WORK(i), X(1,i), 1 )  ! BLAS CALL
+!               X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC
+            ELSE IF ( WORK(i) < ZERO ) THEN
+                CALL DLASCL( 'G', 0, 0, -WORK(i),          &
+                     ONE/DBLE(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL
+            ELSE IF ( X(IDAMAX(M, X(1,i),1),i )  &
+                                           /= ZERO ) THEN
+!               Y(:,i) is zero vector.  If X(:,i) is not
+!               zero, then a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+            END IF
+         END DO
+       END IF
+!
+!     <2> SVD of the data snapshot matrix X.
+!     =====================================
+!     The left singular vectors are stored in the array X.
+!     The right singular vectors are in the array W.
+!     The array W will later on contain the eigenvectors
+!     of a Rayleigh quotient.
+      NUMRNK = N
+      SELECT CASE ( WHTSVD )
+         CASE (1)
+             CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, B, &
+                  LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL
+             T_OR_N = 'T'
+         CASE (2)
+            CALL DGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, &
+                 LDW, WORK(N+1), LWORK-N, IWORK, INFO1 )   ! LAPACK CALL
+            T_OR_N = 'T'
+         CASE (3)
+              CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                   X, LDX, WORK, Z, LDZ, W, LDW, &
+                   NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),&
+                   LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1)     ! LAPACK CALL
+              CALL DLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX )   ! LAPACK CALL
+         T_OR_N = 'T'
+         CASE (4)
+              CALL DGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, &
+                   N, X, LDX, WORK, Z, LDZ, W, LDW, &
+                   WORK(N+1), LWORK-N, IWORK, INFO1 )    ! LAPACK CALL
+              CALL DLACPY( 'A', M, N, Z, LDZ, X, LDX )   ! LAPACK CALL
+              T_OR_N = 'N'
+              XSCL1 = WORK(N+1)
+              XSCL2 = WORK(N+2)
+              IF ( XSCL1 /=  XSCL2 ) THEN
+                 ! This is an exceptional situation. If the
+                 ! data matrices are not scaled and the
+                 ! largest singular value of X overflows.
+                 ! In that case DGEJSV can return the SVD
+                 ! in scaled form. The scaling factor can be used
+                 ! to rescale the data (X and Y).
+                 CALL DLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2  )
+              END IF
+      END SELECT
+!
+      IF ( INFO1 > 0 ) THEN
+         ! The SVD selected subroutine did not converge.
+         ! Return with an error code.
+         INFO = 2
+         RETURN
+      END IF
+!
+      IF ( WORK(1) == ZERO ) THEN
+          ! The largest computed singular value of (scaled)
+          ! X is zero. Return error code -8
+          ! (the 8th input variable had an illegal value).
+          K = 0
+          INFO = -8
+          CALL XERBLA('DGEDMD',-INFO)
+          RETURN
+      END IF
+!
+      !<3> Determine the numerical rank of the data
+      !    snapshots matrix X. This depends on the
+      !    parameters NRNK and TOL.
+
+      SELECT CASE ( NRNK )
+          CASE ( -1 )
+               K = 1
+               DO i = 2, NUMRNK
+                 IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. &
+                      ( WORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE ( -2 )
+               K = 1
+               DO i = 1, NUMRNK-1
+                 IF ( ( WORK(i+1) <= WORK(i)*TOL  ) .OR. &
+                      ( WORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE DEFAULT
+               K = 1
+               DO i = 2, NRNK
+                  IF ( WORK(i) <= SMALL ) EXIT
+                  K = K + 1
+               END DO
+          END SELECT
+      !   Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !   snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^T * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^T is in
+      !    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+           CALL DSCAL( N, ONE/WORK(i), W(1,i), 1 )    ! BLAS CALL
+           ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i)      ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that DGESVD, DGESVDQ and DGESDD return the
+          ! transposed matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW )    ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N)      ! INTRINSIC
+          !END DO
+          DO i = 1, K
+              WORK(N+i) = ONE/WORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                 W(i,j) = (WORK(N+i))*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside DGEDMD).
+          CALL DGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, &
+                      LDW, ZERO, Z, LDZ )                        ! BLAS CALL
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRINSIC, for T_OR_N=='T'
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRINSIC, for T_OR_N=='N'
+          !
+          ! At this point Z contains
+          ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+          ! this is needed for computing the residuals.
+          ! This matrix is  returned in the array B and
+          ! it can be used to compute refined Ritz vectors.
+          CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+          ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+          CALL DGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, &
+                      LDZ, ZERO, S, LDS )                        ! BLAS CALL
+          ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC
+          ! At this point S = U^T * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL DGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, &
+                   ZERO, Z, LDZ )                                   ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! INTRINSIC
+        ! In the two DGEMM calls here, can use K for LDZ.
+        CALL DGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, &
+                    LDW, ZERO, S, LDS )                         ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^T * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^T is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+          IF ( LSAME(T_OR_N, 'N') ) THEN
+              CALL DLACPY( 'A', N, K, W, LDW, Z, LDZ )
+          ELSE
+              CALL DLACPY( 'A', K, N, W, LDW, Z, LDZ )
+          END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !   right eigenvectors of the Rayleigh quotient.
+      !
+      CALL DGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, &
+                  LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 )   ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient. Even in the case of complex spectrum, all
+      ! computation is done in real arithmetic. REIG and
+      ! IMEIG are the real and the imaginary parts of the
+      ! eigenvalues, so that the spectrum is given as
+      ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs
+      ! are listed at consecutive positions. For such a
+      ! complex conjugate pair of the eigenvalues, the
+      ! corresponding eigenvectors are also a complex
+      ! conjugate pair with the real and imaginary parts
+      ! stored column-wise in W at the corresponding
+      ! consecutive column indices. See the description of Z.
+      ! Also, see the description of DGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! DGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and,
+      ! the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+      IF ( WNTRES ) THEN
+          IF ( WNTREF ) THEN
+            ! Here, if the refinement is requested, we have
+            ! A*U(:,1:K) already computed and stored in Z.
+            ! For the residuals, need Y = A * U(:,1;K) * W.
+            CALL DGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, &
+                       LDW, ZERO, Y, LDY )               ! BLAS CALL
+            ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC
+            ! This frees Z; Y contains A * U(:,1:K) * W.
+          ELSE
+            ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS)
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+                       LDS, ZERO, Z, LDZ)
+            ! Save a copy of Z into Y and free Z for holding
+            ! the Ritz vectors.
+            CALL DLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+            IF ( WNTEX ) CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB )
+          END IF
+      ELSE IF ( WNTEX ) THEN
+          ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+                       LDS, ZERO, B, LDB )
+            ! The above call replaces the following two calls
+            ! that were used in the developing-testing phase.
+            ! CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+            !           LDS, ZERO, Z, LDZ)
+            ! Save a copy of Z into B and free Z for holding
+            ! the Ritz vectors.
+            ! CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB )
+      END IF
+!
+      ! Compute the real form of the Ritz vectors
+      IF ( WNTVEC ) CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, &
+                   ZERO, Z, LDZ )                           ! BLAS CALL
+      ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC
+!
+      IF ( WNTRES ) THEN
+         i = 1
+         DO WHILE ( i <= K )
+            IF ( IMEIG(i) == ZERO ) THEN
+                ! have a real eigenvalue with real eigenvector
+                CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 )       ! BLAS CALL
+                ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i)            ! INTRINSIC
+                RES(i) = DNRM2( M, Y(1,i), 1)                         ! BLAS CALL
+                i = i + 1
+            ELSE
+               ! Have a complex conjugate pair
+               ! REIG(i) +- sqrt(-1)*IMEIG(i).
+               ! Since all computation is done in real
+               ! arithmetic, the formula for the residual
+               ! is recast for real representation of the
+               ! complex conjugate eigenpair. See the
+               ! description of RES.
+               AB(1,1) =  REIG(i)
+               AB(2,1) = -IMEIG(i)
+               AB(1,2) =  IMEIG(i)
+               AB(2,2) =  REIG(i)
+               CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                           LDZ, AB, 2, ONE, Y(1,i), LDY )          ! BLAS CALL
+               ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+               RES(i)   = DLANGE( 'F', M, 2, Y(1,i), LDY, &
+                                  WORK(N+1) )                      ! LAPACK CALL
+               RES(i+1) = RES(i)
+               i = i + 2
+            END IF
+         END DO
+      END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          WORK(N+1) = XSCL1
+          WORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+!     ......
+      END SUBROUTINE DGEDMD
+
diff --git a/lapack-netlib/SRC/dgedmdq.f90 b/lapack-netlib/SRC/dgedmdq.f90
new file mode 100644
index 000000000..bedfba472
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmdq.f90
@@ -0,0 +1,704 @@
+SUBROUTINE DGEDMDQ( JOBS,  JOBZ, JOBR, JOBQ, JOBT, JOBF,   &
+                    WHTSVD,   M, N, F, LDF,  X, LDX,  Y,   &
+                    LDY,   NRNK,  TOL,   K,  REIG, IMEIG,  &
+                    Z, LDZ, RES,  B,     LDB,   V, LDV,    & 
+                    S, LDS, WORK, LWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64      
+!.....      
+!     Scalar arguments       
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,    &
+                                JOBT, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N,   LDF, LDX,  &
+                                LDY, NRNK, LDZ, LDB, LDV,  &
+                                LDS, LWORK,  LIWORK
+      INTEGER,   INTENT(OUT) :: INFO,    K      
+      REAL(KIND=WP), INTENT(IN)    ::   TOL     
+!     Array arguments      
+      REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      REAL(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*),  &
+                                      Z(LDZ,*), B(LDB,*),  &
+                                      V(LDV,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*),  &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)  
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!.....      
+!     Purpose  
+!     =======
+!     DGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, DGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular 
+!     vectors of X. Optionally, DGEDMDQ returns the residuals 
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].      
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.      
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. 
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations 
+!     expressed in this material are those of the author and 
+!     do not necessarily reflect the views of the DARPA SBIR 
+!     Program Office.      
+!============================================================
+!     Distribution Statement A: 
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022  
+!============================================================      
+!......................................................................      
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y. 
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)    
+!     'N' :: No data scaling.   
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretized operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z.  
+!            Q is from the initial QR factorization.      
+!     'N' :: The eigenvectors are not computed.  
+!.....      
+!     JOBR (input) CHARACTER*1 
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1 
+!     Specifies whether to explicitly compute and return the
+!     orthogonal matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.       
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1 
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data 
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.       
+!     'N' :: The matrix R is not returned.    
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are 
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.
+!     To be useful on exit, this option needs JOBQ='Q'.
+!.....      
+!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: DGESVD (the QR SVD algorithm)
+!     2 :: DGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger 
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0 
+!     The state space dimension (the number of rows of F).
+!.....      
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the 
+!     number of columns of F.
+!.....
+!     F (input/output) REAL(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots 
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are 
+!     in the range of the normalized floating point numbers. 
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal 
+!     matrix/factor of the QR factorization of the initial 
+!     data snapshots matrix F. See the description of JOBQ. 
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the 
+!     Householder vectors, as returned by DGEQRF. The 
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in WORK(1:N). 
+!     See the description of WORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M 
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K)of the input data, pre-multiply with the 
+!     Q factor from the initial QR factorization. 
+!     See the descriptions of F, K, V  and Z.
+!.....      
+!     LDX (input) INTEGER, LDX >= N  
+!     The leading dimension of the array X. 
+!.....
+!     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, 
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....      
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y.   
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.  
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.       
+!     The numerical rank can be enforced by using positive 
+!     value of NRNK as follows: 
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.  
+!.....
+!     K (output) INTEGER,  0 <= K <= N 
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the 
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and 
+!     TOL. See the descriptions of NRNK and TOL. 
+!.....
+!     REIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N) entries of REIG contain
+!     the real parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     See the descriptions of K, IMEIG, Z.
+!.....
+!     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<N) entries of REIG contain
+!     the imaginary parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     The eigenvalues are determined as follows:
+!     If IMEIG(i) == 0, then the corresponding eigenvalue is
+!     real, LAMBDA(i) = REIG(i).
+!     If IMEIG(i)>0, then the corresponding complex
+!     conjugate pair of eigenvalues reads
+!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i)
+!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i)
+!     That is, complex conjugate pairs have consequtive
+!     indices (i,i+1), with the positive imaginary part
+!     listed first.
+!     See the descriptions of K, REIG, Z.     
+!.....      
+!     Z (workspace/output) REAL(KIND=WP)  M-by-(N-1) array
+!     If JOBZ =='V' then
+!        Z contains real Ritz vectors as follows:
+!        If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!        the i-th Ritz value.
+!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!        the Ritz values extracted from this subspace are
+!        REIG(i) + sqrt(-1)*IMEIG(i) and
+!        REIG(i) - sqrt(-1)*IMEIG(i).
+!        The corresponding eigenvectors are
+!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!     If JOBZ == 'F', then the above descriptions hold for
+!     the columns of Z*V, where the columns of V are the
+!     eigenvectors of the K-by-K Rayleigh quotient, and Z is
+!     orthonormal. The columns of V are similarly structured:
+!     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if 
+!     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and
+!                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1)
+!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!     See the descriptions of REIG, IMEIG, X and V.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed 
+!     Ritz pairs.       
+!     If LAMBDA(i) is real, then
+!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
+!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!     then
+!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!     It holds that
+!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!     See the description of Z.
+!.....
+!     B (output) REAL(KIND=WP)  MIN(M,N)-by-(N-1) array.
+!     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further 
+!     details in the provided references. 
+!     If JOBF == 'E', B(1:N,1;K) contains 
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.   
+!     In both cases, the content of B can be lifted to the 
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization.
+!     Here A denotes a compression of the underlying operator.
+!     See the descriptions of F and X.
+!     If JOBF =='N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....      
+!     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) contains the K eigenvectors of
+!     the Rayleigh quotient. The eigenvectors of a complex
+!     conjugate pair of eigenvalues are returned in real form
+!     as explained in the description of Z. The Ritz vectors
+!     (returned in Z) are the product of X and V; see
+!     the descriptions of X and Z.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....      
+!     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by DGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1        
+!     The leading dimension of the array S.
+!.....
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit, 
+!     WORK(1:MIN(M,N)) contains the scalar factors of the 
+!     elementary reflectors as returned by DGEQRF of the 
+!     M-by-N input matrix F.
+!     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of 
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to DGEDMDQ is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the  workspace vector WORK.
+!     LWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for DGEQRF[M,N])
+!         MLWDMD = minimal workspace for DGEDMD (see the
+!                  description of LWORK in DGEDMD) for 
+!                  snapshots of dimensions MIN(M,N)-by-(N-1)
+!         MLWMQR = N (minimal workspace for 
+!                    DORMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for DORGQR[M,N,N])
+!     Then
+!     LWORK = MAX(N+MLWQR, N+MLWDMD)
+!     is updated as follows:
+!        if   JOBZ == 'V' or JOBZ == 'F' THEN 
+!             LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR )
+!        if   JOBQ == 'Q' THEN
+!             LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR)
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.          
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     Let M1=MIN(M,N), N1=N-1. Then    
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!..... 
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.  
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~      
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!      
+!     Local scalars      
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1,  MLWDMD, MLWGQR, &
+                           MLWMQR, MLWORK, MLWQR,  MINMN,  & 
+                           OLWDMD, OLWGQR, OLWMQR, OLWORK, &
+                           OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!      
+!     Local array      
+!     ~~~~~~~~~~~      
+      REAL(KIND=WP) :: RDUMMY(2)
+!      
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME 
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      DGEMM 
+      EXTERNAL      DGEQRF, DLACPY, DLASET, DORGQR, & 
+                    DORMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      DGEDMD 
+      
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC      MAX, MIN, INT         
+ !..........................................................  
+ !
+ !    Test the input arguments    
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' )
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTVCF = LSAME(JOBZ,'F')
+      WNTVCQ = LSAME(JOBZ,'Q')
+      WNTREF = LSAME(JOBF,'R') 
+      WNTEX  = LSAME(JOBF,'E')
+      WANTQ  = LSAME(JOBQ,'Q')
+      WNTTRF = LSAME(JOBT,'R')     
+      MINMN  = MIN(M,N)
+      INFO = 0 
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!       
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR.                &
+                                  LSAME(JOBS,'N')) )  THEN 
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ    &
+                              .OR. LSAME(JOBZ,'N')) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.    & 
+          ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN
+          INFO = -4                 
+      ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) )  THEN
+          INFO = -5
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             & 
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -6    
+      ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR.   &
+                       (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN
+          INFO = -7
+      ELSE IF ( M < 0 ) THEN
+          INFO = -8
+      ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN
+          INFO = -9
+      ELSE IF ( LDF < M ) THEN
+          INFO = -11
+      ELSE IF ( LDX < MINMN ) THEN
+          INFO = -13
+      ELSE IF ( LDY < MINMN ) THEN
+          INFO = -15
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR.    & 
+                       ((NRNK >= 1).AND.(NRNK <=N ))) )  THEN
+          INFO = -16
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN
+          INFO = -17
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -22
+      ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN
+          INFO = -25
+      ELSE IF ( LDV < N-1 ) THEN
+          INFO = -27
+      ELSE IF ( LDS < N-1 ) THEN
+          INFO = -29
+      END IF
+!      
+      IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN
+          JOBVL = 'V'
+      ELSE
+          JOBVL = 'N'
+      END IF     
+      IF ( INFO == 0 ) THEN  
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and 
+          ! determine minimal and optimal sizes of the 
+          ! workspace at any moment of the run.         
+         IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN
+             ! All output except K is void. INFO=1 signals
+             ! the void input. In case of a workspace query,
+             ! the minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN  
+               IWORK(1) = 1
+                WORK(1) = 2
+                WORK(2) = 2
+            ELSE                
+               K = 0
+            END IF             
+            INFO = 1  
+            RETURN
+         END IF     
+         MLWQR  = MAX(1,N)  ! Minimal workspace length for DGEQRF.
+         MLWORK = MINMN + MLWQR 
+         IF ( LQUERY ) THEN 
+             CALL DGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, &
+                          INFO1 )
+             OLWQR  = INT(RDUMMY(1))
+             OLWORK = MIN(M,N) + OLWQR           
+         END IF
+         CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& 
+                      N-1, X, LDX, Y, LDY, NRNK, TOL, K,     & 
+                      REIG, IMEIG, Z, LDZ, RES,  B, LDB,     & 
+                      V, LDV, S, LDS, WORK, -1, IWORK,       &
+                      LIWORK, INFO1 )
+         MLWDMD = INT(WORK(1))
+         MLWORK = MAX(MLWORK, MINMN + MLWDMD)
+         IMINWR = IWORK(1)
+         IF ( LQUERY ) THEN 
+             OLWDMD = INT(WORK(2))
+             OLWORK = MAX(OLWORK, MINMN+OLWDMD)
+         END IF
+         IF ( WNTVEC .OR. WNTVCF ) THEN
+            MLWMQR = MAX(1,N) 
+            MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR)
+            IF ( LQUERY ) THEN
+               CALL DORMQR( 'L','N', M, N, MINMN, F, LDF,  & 
+                            WORK, Z, LDZ, WORK, -1, INFO1 )
+               OLWMQR = INT(WORK(1))
+               OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR)
+            END IF
+         END IF  
+         IF ( WANTQ ) THEN
+            MLWGQR = N
+            MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR)
+            IF ( LQUERY ) THEN 
+                CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+                             WORK, -1, INFO1 )        
+                OLWGQR = INT(WORK(1))
+                OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR)
+            END IF            
+         END IF   
+         IMINWR = MAX( 1, IMINWR )
+         MLWORK = MAX( 2, MLWORK )      
+         IF (  LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -31
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33
+      END IF  
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'DGEDMDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          WORK(1)  = MLWORK
+          WORK(2)  = OLWORK
+          RETURN
+      END IF   
+!.....	  
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of lower dimensional subspace.
+!     For large scale computation with M >>N , at this place 
+!     one can use an out of core QRF.
+!   
+      CALL DGEQRF( M, N, F, LDF, WORK,               & 
+                   WORK(MINMN+1), LWORK-MINMN, INFO1 )
+!      
+!     Define X and Y as the snapshots representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL DLASET( 'L', MINMN, N-1, ZERO,  ZERO, X, LDX )
+      CALL DLACPY( 'U', MINMN, N-1, F,      LDF, X, LDX )
+      CALL DLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( M >= 3 ) THEN
+          CALL DLASET( 'L', MINMN-2, N-2, ZERO,  ZERO, &
+                       Y(3,1), LDY )  
+      END IF
+!
+!     Compute the DMD of the projected snapshot pairs (X,Y)   
+      CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,  &
+                   N-1, X, LDX, Y, LDY, NRNK,   TOL, K,     &
+                   REIG, IMEIG, Z, LDZ, RES, B, LDB, V,     &
+                   LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, & 
+                   IWORK, LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code. See DGEDMD for details.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF    
+!      
+!     The Ritz vectors (Koopman modes) can be explicitly 
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.  
+        IF ( M > MINMN ) CALL DLASET( 'A', M-MINMN, K, ZERO, &
+                                     ZERO, Z(MINMN+1,1), LDZ )
+        CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z,  &
+             LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      ELSE IF ( WNTVCF ) THEN   
+        !   Return the Ritz vectors (eigenvectors) in factored
+        !   form Z*V, where Z contains orthonormal matrix (the
+        !   product of Q from the initial QR factorization and 
+        !   the SVD/POD_basis returned by DGEDMD in X) and the 
+        !   second factor (the eigenvectors of the Rayleigh 
+        !   quotient) is in the array V, as returned by DGEDMD.
+        CALL DLACPY( 'A', N, K, X, LDX, Z, LDZ )
+        IF ( M > N ) CALL DLASET( 'A', M-N, K, ZERO, ZERO,   & 
+                                  Z(N+1,1), LDZ )
+        CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z,  &
+             LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      END IF
+!     
+!     Some optional output variables:
+!
+!     The upper triangular factor R in the initial QR 
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to DGEDMDQ is to be 
+!     followed by a streaming DMD that is implemented in a 
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y 
+         CALL DLASET( 'A', MINMN, N, ZERO,  ZERO, Y, LDY )
+         CALL DLACPY( 'U', MINMN, N, F, LDF,      Y, LDY )
+      END IF    
+!
+!     The orthonormal/orthogonal factor Q in the initial QR 
+!     factorization is optionally returned in the array F. 
+!     Same as with the triangular factor above, this is 
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN               ! Q overwrites F 
+         CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+              WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )  
+      END IF
+!      
+      RETURN
+!      
+      END SUBROUTINE DGEDMDQ
+    
\ No newline at end of file
diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90
new file mode 100644
index 000000000..49cb11527
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmd.f90
@@ -0,0 +1,1054 @@
+      SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+                         K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+                         B, LDB, W,  LDW,   S, LDS,        &
+                         WORK, LWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32
+!.....
+!     Scalar arguments
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LWORK,  LIWORK
+      INTEGER,   INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN) ::  TOL
+!     Array arguments
+      REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                      W(LDW,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!............................................................
+!     Purpose
+!     =======
+!     SGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, SGEDMD computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular
+!     vectors of X. Optionally, SGEDMD returns the residuals
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!......................................................................
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations
+!     expressed in this material are those of the author and
+!     do not necessarily reflect the views of the DARPA SBIR
+!     Program Office
+!============================================================
+!     Distribution Statement A:
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================
+!......................................................................
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix.
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'N' :: No data scaling.
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product X(:,1:K)*W, where X
+!            contains a POD basis (leading left singular vectors
+!            of the data matrix X) and W contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of K, X, W, Z.
+!     'N' :: The eigenvectors are not computed.
+!.....
+!     JOBR (input) CHARACTER*1
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will be
+!            computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.
+!.....
+!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: SGESVD (the QR SVD algorithm)
+!     2 :: SGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M>= 0
+!     The state space dimension (the row dimension of X, Y).
+!.....
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshot pairs
+!     (the number of columns of X and Y).
+!.....
+!     X (input/output) REAL(KIND=WP) M-by-N array
+!     > On entry, X contains the data snapshot matrix X. It is
+!     assumed that the column norms of X are in the range of
+!     the normalized floating point numbers.
+!     < On exit, the leading K columns of X contain a POD basis,
+!     i.e. the leading K left singular vectors of the input
+!     data matrix X, U(:,1:K). All N columns of X contain all
+!     left singular vectors of the input matrix X.
+!     See the descriptions of K, Z and W.
+!.....
+!     LDX (input) INTEGER, LDX >= M
+!     The leading dimension of the array X.
+!.....
+!     Y (input/workspace/output) REAL(KIND=WP) M-by-N array
+!     > On entry, Y contains the data snapshot matrix Y
+!     < On exit,
+!     If JOBR == 'R', the leading K columns of Y  contain
+!     the residual vectors for the computed Ritz pairs.
+!     See the description of RES.
+!     If JOBR == 'N', Y contains the original input data,
+!                     scaled according to the value of JOBS.
+!.....
+!     LDY (input) INTEGER , LDY >= M
+!     The leading dimension of the array Y.
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.
+!     The numerical rank can be enforced by using positive
+!     value of NRNK as follows:
+!     0 < NRNK <= N :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the descriptions of TOL and  K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.
+!.....
+!     K (output) INTEGER,  0 <= K <= N
+!     The dimension of the POD basis for the data snapshot
+!     matrix X and the number of the computed Ritz pairs.
+!     The value of K is determined according to the rule set
+!     by the parameters NRNK and TOL.
+!     See the descriptions of NRNK and TOL.
+!.....
+!     REIG (output) REAL(KIND=WP) N-by-1 array
+!     The leading K (K<=N) entries of REIG contain
+!     the real parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     See the descriptions of K, IMEIG, and Z.
+!.....
+!     IMEIG (output) REAL(KIND=WP) N-by-1 array
+!     The leading K (K<=N) entries of IMEIG contain
+!     the imaginary parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     The eigenvalues are determined as follows:
+!     If IMEIG(i) == 0, then the corresponding eigenvalue is
+!     real, LAMBDA(i) = REIG(i).
+!     If IMEIG(i)>0, then the corresponding complex
+!     conjugate pair of eigenvalues reads
+!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i)
+!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i)
+!     That is, complex conjugate pairs have consecutive
+!     indices (i,i+1), with the positive imaginary part
+!     listed first.
+!     See the descriptions of K, REIG, and Z.
+!.....
+!     Z (workspace/output) REAL(KIND=WP)  M-by-N array
+!     If JOBZ =='V' then
+!        Z contains real Ritz vectors as follows:
+!        If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!        the i-th Ritz value; ||Z(:,i)||_2=1.
+!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!        the Ritz values extracted from this subspace are
+!        REIG(i) + sqrt(-1)*IMEIG(i) and
+!        REIG(i) - sqrt(-1)*IMEIG(i).
+!        The corresponding eigenvectors are
+!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!        || Z(:,i:i+1)||_F = 1.
+!     If JOBZ == 'F', then the above descriptions hold for
+!     the columns of X(:,1:K)*W(1:K,1:K), where the columns
+!     of W(1:k,1:K) are the computed eigenvectors of the
+!     K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
+!     are similarly structured: If IMEIG(i) == 0 then
+!     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
+!     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
+!          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
+!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!     See the descriptions of REIG, IMEIG, X and W.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) N-by-1 array
+!     RES(1:K) contains the residuals for the K computed
+!     Ritz pairs.
+!     If LAMBDA(i) is real, then
+!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
+!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!     then
+!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!     It holds that
+!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!     See the description of REIG, IMEIG and Z.
+!.....
+!     B (output) REAL(KIND=WP)  M-by-N array.
+!     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further
+!     details in the provided references.
+!     If JOBF == 'E', B(1:M,1;K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.
+!     If JOBF =='N', then B is not referenced.
+!     See the descriptions of X, W, K.
+!.....
+!     LDB (input) INTEGER, LDB >= M
+!     The leading dimension of the array B.
+!.....
+!     W (workspace/output) REAL(KIND=WP) N-by-N array
+!     On exit, W(1:K,1:K) contains the K computed
+!     eigenvectors of the matrix Rayleigh quotient (real and
+!     imaginary parts for each complex conjugate pair of the
+!     eigenvalues). The Ritz vectors (returned in Z) are the
+!     product of X (containing a POD basis for the input
+!     matrix X) and W. See the descriptions of K, S, X and Z.
+!     W is also used as a workspace to temporarily store the
+!     left singular vectors of X.
+!.....
+!     LDW (input) INTEGER, LDW >= N
+!     The leading dimension of the array W.
+!.....
+!     S (workspace/output) REAL(KIND=WP) N-by-N array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by SGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N
+!     The leading dimension of the array S.
+!.....
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit, WORK(1:N) contains the singular values of
+!     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!     If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!     scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!     and Y to avoid overflow in the SVD of X.
+!     This may be of interest if the scaling option is off
+!     and as many as possible smallest eigenvalues are
+!     desired to the highest feasible accuracy.
+!     If the call to SGEDMD is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the workspace vector WORK.
+!     LWORK is calculated as follows:
+!     If WHTSVD == 1 ::
+!        If JOBZ == 'V', then
+!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
+!        If JOBZ == 'N'  then
+!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
+!        Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
+!        workspace length of SGESVD.
+!     If WHTSVD == 2 ::
+!        If JOBZ == 'V', then
+!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
+!        If JOBZ == 'N', then
+!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
+!        Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
+!        minimal workspace length of SGESDD.
+!     If WHTSVD == 3 ::
+!        If JOBZ == 'V', then
+!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!        If JOBZ == 'N', then
+!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!        Here LWORK_SVD = N+M+MAX(3*N+1,
+!                        MAX(1,3*N+M,5*N),MAX(1,N))
+!        is the minimal workspace length of SGESVDQ.
+!     If WHTSVD == 4 ::
+!        If JOBZ == 'V', then
+!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!        If JOBZ == 'N', then
+!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!        Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
+!        minimal workspace length of SGEJSV.
+!     The above expressions are not simplified in order to
+!     make the usage of WORK more transparent, and for
+!     easier checking. In any case, LWORK >= 2.
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!.....
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,   &
+                       SSUM,  XSCL1,  XSCL2
+      INTEGER       ::  i,  j, IMINWR,  INFO1, INFO2,  &
+                       LWRKEV, LWRSDD, LWRSVD, &
+                       LWRSVQ, MLWORK, MWRKEV, MWRSDD, &
+                       MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, &
+                       OLWORK
+      LOGICAL       ::  BADXY, LQUERY, SCCOLX, SCCOLY, &
+                        WNTEX, WNTREF, WNTRES, WNTVEC
+      CHARACTER     ::  JOBZL, T_OR_N
+      CHARACTER     ::  JSVOPT
+
+!     Local arrays
+!     ~~~~~~~~~~~~
+      REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2)
+
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      REAL(KIND=WP) SLANGE, SLAMCH, SNRM2
+      EXTERNAL      SLANGE, SLAMCH, SNRM2, ISAMAX
+      INTEGER       ISAMAX
+      LOGICAL       SISNAN, LSAME
+      EXTERNAL      SISNAN, LSAME
+
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      SAXPY,  SGEMM,  SSCAL
+      EXTERNAL      SGEEV,  SGEJSV, SGESDD, SGESVD, SGESVDQ, &
+                    SLACPY, SLASCL, SLASSQ, XERBLA
+
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC     INT, FLOAT, MAX, SQRT
+!............................................................
+!
+!    Test the input arguments
+!
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C')
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      INFO   = 0
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. &
+                                  LSAME(JOBS,'N')) )   THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N')        &
+                              .OR. LSAME(JOBZ,'F')) )  THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.  &
+                ( WNTRES .AND. (.NOT.WNTVEC) ) )       THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -4
+      ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR.  &
+                      (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN
+          INFO = -5
+      ELSE IF ( M < 0 )   THEN
+          INFO = -6
+      ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN
+          INFO = -7
+      ELSE IF ( LDX < M ) THEN
+          INFO = -9
+      ELSE IF ( LDY < M ) THEN
+          INFO = -11
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. &
+                ((NRNK >= 1).AND.(NRNK <=N ))) )      THEN
+          INFO = -12
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) )  THEN
+          INFO = -13
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -18
+      ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN
+          INFO = -21
+      ELSE IF ( LDW < N ) THEN
+          INFO = -23
+      ELSE IF ( LDS < N ) THEN
+          INFO = -25
+      END IF
+!
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( N == 0 ) THEN
+             ! Quick return. All output except K is void.
+             ! INFO=1 signals the void input.
+             ! In case of a workspace query, the default
+             ! minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+                IWORK(1) = 1
+                WORK(1)  = 2
+                WORK(2)  = 2
+            ELSE
+               K = 0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+         MLWORK = MAX(2,N)
+         OLWORK = MAX(2,N)
+         IMINWR = 1
+         SELECT CASE ( WHTSVD )
+         CASE (1)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of SGESVD:
+             ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             MLWORK = MAX(MLWORK,N + MWRSVD)
+             IF ( LQUERY ) THEN
+                CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, &
+                           B, LDB, W, LDW, RDUMMY, -1, INFO1 )
+                LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) )
+                OLWORK = MAX(OLWORK,N + LWRSVD)
+             END IF
+         CASE (2)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of SGESDD:
+             ! MWRSDD = 3*MIN(M,N)*MIN(M,N) +
+             ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) )
+             ! IMINWR = 8*MIN(M,N)
+             MWRSDD = 3*MIN(M,N)*MIN(M,N) +                &
+              MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) )
+             MLWORK = MAX(MLWORK,N + MWRSDD)
+             IMINWR = 8*MIN(M,N)
+             IF ( LQUERY ) THEN
+                CALL SGESDD( 'O', M, N, X, LDX, WORK, B,     &
+                     LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 )
+                LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) )
+                OLWORK = MAX(OLWORK,N + LWRSDD)
+             END IF
+         CASE (3)
+             !LWQP3 = 3*N+1
+             !LWORQ = MAX(N, 1)
+             !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2)
+             !MLWORK = N + MWRSVQ
+             !IMINWR = M+N-1
+             CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                          X, LDX, WORK, Z, LDZ, W, LDW,   &
+                             NUMRNK, IWORK, -1, RDUMMY,   &
+                             -1, RDUMMY2, -1, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVQ = INT(RDUMMY(2))
+             MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1)))
+             IF ( LQUERY ) THEN
+                LWRSVQ = INT(RDUMMY(1))
+                OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1)))
+             END IF
+         CASE (4)
+             JSVOPT = 'J'
+             !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V'
+             MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 )
+             MLWORK = MAX(MLWORK,N+MWRSVJ)
+             IMINWR = MAX( 3, M+3*N )
+             IF ( LQUERY ) THEN
+                OLWORK = MAX(OLWORK,N+MWRSVJ)
+             END IF
+         END SELECT
+         IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN
+             JOBZL = 'V'
+         ELSE
+             JOBZL = 'N'
+         END IF
+         ! Workspace calculation to the SGEEV call
+         IF ( LSAME(JOBZL,'V') ) THEN
+             MWRKEV = MAX( 1, 4*N )
+         ELSE
+             MWRKEV = MAX( 1, 3*N )
+         END IF
+         MLWORK = MAX(MLWORK,N+MWRKEV)
+         IF ( LQUERY ) THEN
+                CALL SGEEV( 'N', JOBZL, N, S, LDS, REIG, &
+                    IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 )
+                LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) )
+                OLWORK = MAX( OLWORK, N+LWRKEV )
+         END IF
+!
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29
+         IF (  LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27
+      END IF
+!
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'SGEDMD', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          WORK(1)  = MLWORK
+          WORK(2)  = OLWORK
+          RETURN
+      END IF
+!............................................................
+!
+      OFL   = SLAMCH('O')
+      SMALL = SLAMCH('S')
+      BADXY = .FALSE.
+!
+!     <1> Optional scaling of the snapshots (columns of X, Y)
+!     ==========================================================
+      IF ( SCCOLX ) THEN
+          ! The columns of X will be normalized.
+          ! To prevent overflows, the column norms of X are
+          ! carefully computed using SLASSQ.
+          K = 0
+          DO i = 1, N
+            !WORK(i) = DNRM2( M, X(1,i), 1 )
+            SCALE  = ZERO
+            CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM )
+            IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -8
+                CALL XERBLA('SGEDMD',-INFO)
+            END IF
+            IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of X(:,i) overflows. First, X(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2.
+!                 Next, the norm of X(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of X(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, X(1,i), M, INFO2 )
+                  WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  WORK(i) =   SCALE * ROOTSC
+                  CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                               X(1,i), M, INFO2 )              ! LAPACK CALL
+!                 X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               WORK(i) = ZERO
+               K = K + 1
+            END IF
+          END DO
+          IF ( K == N ) THEN
+          ! All columns of X are zero. Return error code -8.
+          ! (the 8th input variable had an illegal value)
+          K = 0
+          INFO = -8
+          CALL XERBLA('SGEDMD',-INFO)
+          RETURN
+          END IF
+          DO i = 1, N
+!           Now, apply the same scaling to the columns of Y.
+            IF ( WORK(i) >  ZERO ) THEN
+                CALL SSCAL( M, ONE/WORK(i), Y(1,i), 1 )  ! BLAS CALL
+!               Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC
+            ELSE IF ( WORK(i) < ZERO ) THEN
+                CALL SLASCL( 'G', 0, 0, -WORK(i),          &
+                     ONE/FLOAT(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL
+            ELSE IF ( Y(ISAMAX(M, Y(1,i),1),i )  &
+                                            /= ZERO ) THEN
+!               X(:,i) is zero vector. For consistency,
+!               Y(:,i) should also be zero. If Y(:,i) is not
+!               zero, then the data might be inconsistent or
+!               corrupted. If JOBS == 'C', Y(:,i) is set to
+!               zero and a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+                IF ( LSAME(JOBS,'C')) &
+                CALL SSCAL( M, ZERO, Y(1,i), 1 )  ! BLAS CALL
+            END IF
+          END DO
+      END IF
+  !
+      IF ( SCCOLY ) THEN
+          ! The columns of Y will be normalized.
+          ! To prevent overflows, the column norms of Y are
+          ! carefully computed using SLASSQ.
+          DO i = 1, N
+            !WORK(i) = DNRM2( M, Y(1,i), 1 )
+            SCALE  = ZERO
+            CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+            IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -10
+                CALL XERBLA('SGEDMD',-INFO)
+            END IF
+            IF ( SCALE /= ZERO  .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of Y(:,i) overflows. First, Y(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!                 Next, the norm of Y(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of Y(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, Y(1,i), M, INFO2 )
+                  WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  WORK(i) =   SCALE * ROOTSC
+                  CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                               Y(1,i), M, INFO2 )              ! LAPACK CALL
+!                 Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               WORK(i) = ZERO
+            END IF
+         END DO
+         DO i = 1, N
+!           Now, apply the same scaling to the columns of X.
+            IF ( WORK(i) >  ZERO ) THEN
+                CALL SSCAL( M, ONE/WORK(i), X(1,i), 1 )  ! BLAS CALL
+!               X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC
+            ELSE IF ( WORK(i) < ZERO ) THEN
+                CALL SLASCL( 'G', 0, 0, -WORK(i),          &
+                     ONE/FLOAT(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL
+            ELSE IF ( X(ISAMAX(M, X(1,i),1),i )  &
+                                           /= ZERO ) THEN
+!               Y(:,i) is zero vector.  If X(:,i) is not
+!               zero, then a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+            END IF
+         END DO
+       END IF
+!
+!     <2> SVD of the data snapshot matrix X.
+!     =====================================
+!     The left singular vectors are stored in the array X.
+!     The right singular vectors are in the array W.
+!     The array W will later on contain the eigenvectors
+!     of a Rayleigh quotient.
+      NUMRNK = N
+      SELECT CASE ( WHTSVD )
+         CASE (1)
+             CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, B, &
+                  LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL
+             T_OR_N = 'T'
+         CASE (2)
+            CALL SGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, &
+                 LDW, WORK(N+1), LWORK-N, IWORK, INFO1 )   ! LAPACK CALL
+            T_OR_N = 'T'
+         CASE (3)
+              CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                   X, LDX, WORK, Z, LDZ, W, LDW, &
+                   NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),&
+                   LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1)     ! LAPACK CALL
+              CALL SLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX )   ! LAPACK CALL
+         T_OR_N = 'T'
+         CASE (4)
+              CALL SGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, &
+                   N, X, LDX, WORK, Z, LDZ, W, LDW, &
+                   WORK(N+1), LWORK-N, IWORK, INFO1 )    ! LAPACK CALL
+              CALL SLACPY( 'A', M, N, Z, LDZ, X, LDX )   ! LAPACK CALL
+              T_OR_N = 'N'
+              XSCL1 = WORK(N+1)
+              XSCL2 = WORK(N+2)
+              IF ( XSCL1 /=  XSCL2 ) THEN
+                 ! This is an exceptional situation. If the
+                 ! data matrices are not scaled and the
+                 ! largest singular value of X overflows.
+                 ! In that case SGEJSV can return the SVD
+                 ! in scaled form. The scaling factor can be used
+                 ! to rescale the data (X and Y).
+                 CALL SLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2  )
+              END IF
+      END SELECT
+!
+      IF ( INFO1 > 0 ) THEN
+         ! The SVD selected subroutine did not converge.
+         ! Return with an error code.
+         INFO = 2
+         RETURN
+      END IF
+!
+      IF ( WORK(1) == ZERO ) THEN
+          ! The largest computed singular value of (scaled)
+          ! X is zero. Return error code -8
+          ! (the 8th input variable had an illegal value).
+          K = 0
+          INFO = -8
+          CALL XERBLA('SGEDMD',-INFO)
+          RETURN
+      END IF
+!
+      !<3> Determine the numerical rank of the data
+      !    snapshots matrix X. This depends on the
+      !    parameters NRNK and TOL.
+
+      SELECT CASE ( NRNK )
+          CASE ( -1 )
+               K = 1
+               DO i = 2, NUMRNK
+                 IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. &
+                      ( WORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE ( -2 )
+               K = 1
+               DO i = 1, NUMRNK-1
+                 IF ( ( WORK(i+1) <= WORK(i)*TOL  ) .OR. &
+                      ( WORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE DEFAULT
+               K = 1
+               DO i = 2, NRNK
+                  IF ( WORK(i) <= SMALL ) EXIT
+                  K = K + 1
+               END DO
+          END SELECT
+      !   Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !   snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^T * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^T is in
+      !    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+           CALL SSCAL( N, ONE/WORK(i), W(1,i), 1 )    ! BLAS CALL
+           ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i)      ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that SGESVD, SGESVDQ and SGESDD return the
+          ! transposed matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW )    ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N)      ! INTRINSIC
+          !END DO
+          DO i = 1, K
+              WORK(N+i) = ONE/WORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                 W(i,j) = (WORK(N+i))*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside SGEDMD).
+          CALL SGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, &
+                      LDW, ZERO, Z, LDZ )                        ! BLAS CALL
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRINSIC, for T_OR_N=='T'
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRINSIC, for T_OR_N=='N'
+          !
+          ! At this point Z contains
+          ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+          ! this is needed for computing the residuals.
+          ! This matrix is  returned in the array B and
+          ! it can be used to compute refined Ritz vectors.
+          CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+          ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+          CALL SGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, &
+                      LDZ, ZERO, S, LDS )                        ! BLAS CALL
+          ! S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC
+          ! At this point S = U^T * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL SGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, &
+                   ZERO, Z, LDZ )                                   ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! INTRINSIC
+        ! In the two SGEMM calls here, can use K for LDZ
+        CALL SGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, &
+                    LDW, ZERO, S, LDS )                         ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^T * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^T is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+          IF ( LSAME(T_OR_N, 'N') ) THEN
+              CALL SLACPY( 'A', N, K, W, LDW, Z, LDZ )
+          ELSE
+              CALL SLACPY( 'A', K, N, W, LDW, Z, LDZ )
+          END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !   right eigenvectors of the Rayleigh quotient.
+      !
+      CALL SGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, &
+                  LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 )   ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient. Even in the case of complex spectrum, all
+      ! computation is done in real arithmetic. REIG and
+      ! IMEIG are the real and the imaginary parts of the
+      ! eigenvalues, so that the spectrum is given as
+      ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs
+      ! are listed at consecutive positions. For such a
+      ! complex conjugate pair of the eigenvalues, the
+      ! corresponding eigenvectors are also a complex
+      ! conjugate pair with the real and imaginary parts
+      ! stored column-wise in W at the corresponding
+      ! consecutive column indices. See the description of Z.
+      ! Also, see the description of SGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! SGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and,
+      ! the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+      IF ( WNTRES ) THEN
+          IF ( WNTREF ) THEN
+            ! Here, if the refinement is requested, we have
+            ! A*U(:,1:K) already computed and stored in Z.
+            ! For the residuals, need Y = A * U(:,1;K) * W.
+            CALL SGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, &
+                       LDW, ZERO, Y, LDY )               ! BLAS CALL
+            ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC
+            ! This frees Z; Y contains A * U(:,1:K) * W.
+          ELSE
+            ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+                       LDS, ZERO, Z, LDZ )
+            ! Save a copy of Z into Y and free Z for holding
+            ! the Ritz vectors.
+            CALL SLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+            IF ( WNTEX ) CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB )
+          END IF
+      ELSE IF ( WNTEX ) THEN
+          ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+                       LDS, ZERO, B, LDB )
+            ! The above call replaces the following two calls
+            ! that were used in the developing-testing phase.
+            ! CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+            !           LDS, ZERO, Z, LDZ)
+            ! Save a copy of Z into B and free Z for holding
+            ! the Ritz vectors.
+            ! CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB )
+      END IF
+!
+      ! Compute the real form of the Ritz vectors
+      IF ( WNTVEC ) CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, &
+                   ZERO, Z, LDZ )                           ! BLAS CALL
+      ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC
+!
+      IF ( WNTRES ) THEN
+         i = 1
+         DO WHILE ( i <= K )
+            IF ( IMEIG(i) == ZERO ) THEN
+                ! have a real eigenvalue with real eigenvector
+                CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 )       ! BLAS CALL
+                ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i)            ! INTRINSIC
+                RES(i) = SNRM2( M, Y(1,i), 1 )                         ! BLAS CALL
+                i = i + 1
+            ELSE
+               ! Have a complex conjugate pair
+               ! REIG(i) +- sqrt(-1)*IMEIG(i).
+               ! Since all computation is done in real
+               ! arithmetic, the formula for the residual
+               ! is recast for real representation of the
+               ! complex conjugate eigenpair. See the
+               ! description of RES.
+               AB(1,1) =  REIG(i)
+               AB(2,1) = -IMEIG(i)
+               AB(1,2) =  IMEIG(i)
+               AB(2,2) =  REIG(i)
+               CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                           LDZ, AB, 2, ONE, Y(1,i), LDY )          ! BLAS CALL
+               ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+               RES(i)   = SLANGE( 'F', M, 2, Y(1,i), LDY, &
+                                  WORK(N+1) )                      ! LAPACK CALL
+               RES(i+1) = RES(i)
+               i = i + 2
+            END IF
+         END DO
+      END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          WORK(N+1) = XSCL1
+          WORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+!     ......
+      END SUBROUTINE SGEDMD
+
diff --git a/lapack-netlib/SRC/sgedmdq.f90 b/lapack-netlib/SRC/sgedmdq.f90
new file mode 100644
index 000000000..acd5d56c6
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmdq.f90
@@ -0,0 +1,703 @@
+SUBROUTINE SGEDMDQ( JOBS,  JOBZ, JOBR, JOBQ, JOBT, JOBF,   &
+                    WHTSVD,   M, N, F, LDF,  X, LDX,  Y,   &
+                    LDY,   NRNK,  TOL,   K,  REIG, IMEIG,  &
+                    Z, LDZ, RES,  B,     LDB,   V, LDV,    & 
+                    S, LDS, WORK, LWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env 
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32     
+!.....      
+!     Scalar arguments       
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,    &
+                                JOBT, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N,   LDF, LDX,  &
+                                LDY, NRNK, LDZ, LDB, LDV,  &
+                                LDS, LWORK,  LIWORK
+      INTEGER,   INTENT(OUT) :: INFO,   K      
+      REAL(KIND=WP), INTENT(IN)    ::   TOL     
+!     Array arguments      
+      REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      REAL(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*),  &
+                                      Z(LDZ,*), B(LDB,*),  &
+                                      V(LDV,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*),  &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)  
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!.....      
+!     Purpose  
+!     =======
+!     SGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, SGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular 
+!     vectors of X. Optionally, SGEDMDQ returns the residuals 
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].      
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.      
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. 
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations 
+!     expressed in this material are those of the author and 
+!     do not necessarily reflect the views of the DARPA SBIR 
+!     Program Office.      
+!============================================================
+!     Distribution Statement A: 
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================      
+!......................................................................      
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y. 
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)    
+!     'N' :: No data scaling.   
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretized operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z. 
+!            Q is from the initial QR factorization.  
+!     'N' :: The eigenvectors are not computed.  
+!.....      
+!     JOBR (input) CHARACTER*1 
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1 
+!     Specifies whether to explicitly compute and return the
+!     orthogonal matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.       
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1 
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data 
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.       
+!     'N' :: The matrix R is not returned.    
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are 
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.   
+!     To be useful on exit, this option needs JOBQ='Q'.      
+!.....
+!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: SGESVD (the QR SVD algorithm)
+!     2 :: SGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger 
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0 
+!     The state space dimension (the number of rows of F)
+!.....      
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the 
+!     number of columns of F.
+!.....
+!     F (input/output) REAL(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots 
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are 
+!     in the range of the normalized floating point numbers. 
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal 
+!     matrix/factor of the QR factorization of the initial 
+!     data snapshots matrix F. See the description of JOBQ. 
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the 
+!     Householder vectors, as returned by SGEQRF. The 
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in WORK(1:N). 
+!     See the description of WORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M 
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K)of the input data, pre-multiply with the 
+!     Q factor from the initial QR factorization. 
+!     See the descriptions of F, K, V  and Z.
+!.....      
+!     LDX (input) INTEGER, LDX >= N  
+!     The leading dimension of the array X 
+!.....
+!     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, 
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....      
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y   
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.   
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.     
+!     The numerical rank can be enforced by using positive 
+!     value of NRNK as follows: 
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.  
+!.....
+!     K (output) INTEGER,  0 <= K <= N 
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the 
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and 
+!     TOL. See the descriptions of NRNK and TOL. 
+!.....
+!     REIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N) entries of REIG contain
+!     the real parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     See the descriptions of K, IMEIG, Z.
+!.....
+!     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<N) entries of REIG contain
+!     the imaginary parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     The eigenvalues are determined as follows:
+!     If IMEIG(i) == 0, then the corresponding eigenvalue is
+!     real, LAMBDA(i) = REIG(i).
+!     If IMEIG(i)>0, then the corresponding complex
+!     conjugate pair of eigenvalues reads
+!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i)
+!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i)
+!     That is, complex conjugate pairs have consecutive
+!     indices (i,i+1), with the positive imaginary part
+!     listed first.
+!     See the descriptions of K, REIG, Z.     
+!.....      
+!     Z (workspace/output) REAL(KIND=WP)  M-by-(N-1) array
+!     If JOBZ =='V' then
+!        Z contains real Ritz vectors as follows:
+!        If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!        the i-th Ritz value.
+!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!        the Ritz values extracted from this subspace are
+!        REIG(i) + sqrt(-1)*IMEIG(i) and
+!        REIG(i) - sqrt(-1)*IMEIG(i).
+!        The corresponding eigenvectors are
+!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!     If JOBZ == 'F', then the above descriptions hold for
+!     the columns of Z*V, where the columns of V are the
+!     eigenvectors of the K-by-K Rayleigh quotient, and Z is
+!     orthonormal. The columns of V are similarly structured:
+!     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if 
+!     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and
+!                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1)
+!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!     See the descriptions of REIG, IMEIG, X and V.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed 
+!     Ritz pairs.       
+!     If LAMBDA(i) is real, then
+!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
+!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!     then
+!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!     It holds that
+!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!     See the description of Z.
+!.....
+!     B (output) REAL(KIND=WP)  MIN(M,N)-by-(N-1) array.
+!     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further 
+!     details in the provided references. 
+!     If JOBF == 'E', B(1:N,1;K) contains 
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.   
+!     In both cases, the content of B can be lifted to the 
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization.     
+!     Here A denotes a compression of the underlying operator.      
+!     See the descriptions of F and X.
+!     If JOBF =='N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....      
+!     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) contains the K eigenvectors of
+!     the Rayleigh quotient. The eigenvectors of a complex
+!     conjugate pair of eigenvalues are returned in real form
+!     as explained in the description of Z. The Ritz vectors
+!     (returned in Z) are the product of X and V; see
+!     the descriptions of X and Z.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....      
+!     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by SGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1        
+!     The leading dimension of the array S.
+!.....
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit, 
+!     WORK(1:MIN(M,N)) contains the scalar factors of the 
+!     elementary reflectors as returned by SGEQRF of the 
+!     M-by-N input matrix F.
+!     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of 
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to SGEDMDQ is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the  workspace vector WORK.
+!     LWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for SGEQRF[M,N])
+!         MLWDMD = minimal workspace for SGEDMD (see the
+!                  description of LWORK in SGEDMD) for 
+!                  snapshots of dimensions MIN(M,N)-by-(N-1)
+!         MLWMQR = N (minimal workspace for 
+!                    SORMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for SORGQR[M,N,N])
+!     Then
+!     LWORK = MAX(N+MLWQR, N+MLWDMD)
+!     is updated as follows:
+!        if   JOBZ == 'V' or JOBZ == 'F' THEN 
+!             LWORK = MAX( LWORK,MIN(M,N)+N-1 +MLWMQR )
+!        if   JOBQ == 'Q' THEN
+!             LWORK = MAX( LWORK,MIN(M,N)+N-1+MLWGQR)
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.          
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     Let M1=MIN(M,N), N1=N-1. Then
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1)
+!     If on entry LIWORK = -1, then a worskpace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!..... 
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.  
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~      
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!      
+!     Local scalars      
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1,  MLWDMD, MLWGQR, &
+                           MLWMQR, MLWORK, MLWQR,  MINMN,  & 
+                           OLWDMD, OLWGQR, OLWMQR, OLWORK, &
+                           OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!      
+!     Local array      
+!     ~~~~~~~~~~~      
+      REAL(KIND=WP) :: RDUMMY(2)
+!      
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME 
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      SGEMM 
+      EXTERNAL      SGEQRF, SLACPY, SLASET, SORGQR, & 
+                    SORMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      SGEDMD 
+      
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC      MAX, MIN, INT         
+ !..........................................................  
+ !
+ !    Test the input arguments    
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' )
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V') 
+      WNTVCF = LSAME(JOBZ,'F')
+      WNTVCQ = LSAME(JOBZ,'Q')
+      WNTREF = LSAME(JOBF,'R') 
+      WNTEX  = LSAME(JOBF,'E')
+      WANTQ  = LSAME(JOBQ,'Q')
+      WNTTRF = LSAME(JOBT,'R')     
+      MINMN  = MIN(M,N)
+      INFO = 0 
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!       
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. LSAME(JOBS,'N')) )  THEN 
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ       &
+                              .OR. LSAME(JOBZ,'N')) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.    & 
+          ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN
+          INFO = -4                 
+      ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) )  THEN
+          INFO = -5
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             & 
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -6    
+      ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR.   &
+                       (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN
+          INFO = -7
+      ELSE IF ( M < 0 ) THEN
+          INFO = -8
+      ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN
+          INFO = -9
+      ELSE IF ( LDF < M ) THEN
+          INFO = -11
+      ELSE IF ( LDX < MINMN ) THEN
+          INFO = -13
+      ELSE IF ( LDY < MINMN ) THEN
+          INFO = -15
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR.    & 
+                       ((NRNK >= 1).AND.(NRNK <=N ))) )  THEN
+          INFO = -16
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN
+          INFO = -17
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -22
+      ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN
+          INFO = -25
+      ELSE IF ( LDV < N-1 ) THEN
+          INFO = -27
+      ELSE IF ( LDS < N-1 ) THEN
+          INFO = -29
+      END IF
+!      
+      IF ( WNTVEC .OR. WNTVCF ) THEN
+          JOBVL = 'V'
+      ELSE
+          JOBVL = 'N'
+      END IF     
+      IF ( INFO == 0 ) THEN  
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and 
+          ! determine minimal and optimal sizes of the 
+          ! workspace at any moment of the run.         
+         IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN
+             ! All output except K is void. INFO=1 signals
+             ! the void input. In case of a workspace query,
+             ! the minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN  
+               IWORK(1) = 1
+                WORK(1) = 2
+                WORK(2) = 2
+            ELSE                
+               K = 0
+            END IF             
+            INFO = 1  
+            RETURN
+         END IF     
+         MLWQR  = MAX(1,N)  ! Minimal workspace length for SGEQRF.
+         MLWORK = MIN(M,N) + MLWQR 
+         IF ( LQUERY ) THEN 
+             CALL SGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, &
+                          INFO1 )
+             OLWQR = INT(RDUMMY(1))
+             OLWORK = MIN(M,N) + OLWQR           
+         END IF
+         CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& 
+                      N-1, X, LDX, Y, LDY, NRNK, TOL, K,     & 
+                      REIG, IMEIG, Z, LDZ, RES,  B, LDB,     & 
+                      V, LDV, S, LDS, WORK, -1, IWORK,       &
+                      LIWORK, INFO1 )
+         MLWDMD = INT(WORK(1))
+         MLWORK = MAX(MLWORK, MINMN + MLWDMD)
+         IMINWR = IWORK(1)
+         IF ( LQUERY ) THEN 
+             OLWDMD = INT(WORK(2))
+             OLWORK = MAX(OLWORK, MINMN+OLWDMD)
+         END IF
+         IF ( WNTVEC .OR. WNTVCF ) THEN
+            MLWMQR = MAX(1,N) 
+            MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR)
+            IF ( LQUERY ) THEN
+               CALL SORMQR( 'L','N', M, N, MINMN, F, LDF,  & 
+                            WORK, Z, LDZ, WORK, -1, INFO1 )
+               OLWMQR = INT(WORK(1))
+               OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR)
+            END IF
+         END IF  
+         IF ( WANTQ ) THEN
+            MLWGQR = N
+            MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR)
+            IF ( LQUERY ) THEN 
+                CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+                             WORK, -1, INFO1 )        
+                OLWGQR = INT(WORK(1))
+                OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR)
+            END IF            
+         END IF   
+         IMINWR = MAX( 1, IMINWR )
+         MLWORK = MAX( 2, MLWORK )      
+         IF (  LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -31
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33
+      END IF  
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'SGEDMDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          WORK(1)  = MLWORK
+          WORK(2)  = OLWORK
+          RETURN
+      END IF   
+!.....	  
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of lower dimensional subspace.
+!     For large scale computation with M >>N , at this place 
+!     one can use an out of core QRF.
+!   
+      CALL SGEQRF( M, N, F, LDF, WORK,         & 
+                   WORK(MINMN+1), LWORK-MINMN, INFO1 )
+!      
+!     Define X and Y as the snapshots representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL SLASET( 'L', MINMN, N-1, ZERO,  ZERO, X, LDX )
+      CALL SLACPY( 'U', MINMN, N-1, F,      LDF, X, LDX )
+      CALL SLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( M >= 3 ) THEN
+          CALL SLASET( 'L', MINMN-2, N-2, ZERO,  ZERO, &
+                       Y(3,1), LDY )  
+      END IF
+!
+!     Compute the DMD of the projected snapshot pairs (X,Y)   
+      CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,  &
+                   N-1, X, LDX, Y, LDY, NRNK,   TOL, K,     &
+                   REIG, IMEIG, Z, LDZ, RES, B, LDB, V,     &
+                   LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, IWORK,  & 
+                   LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF    
+!      
+!     The Ritz vectors (Koopman modes) can be explicitly 
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.  
+        IF ( M > MINMN ) CALL SLASET( 'A', M-MINMN, K, ZERO, &
+                                     ZERO, Z(MINMN+1,1), LDZ )
+        CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z,  &
+                     LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      ELSE IF ( WNTVCF ) THEN   
+        !   Return the Ritz vectors (eigenvectors) in factored
+        !   form Z*V, where Z contains orthonormal matrix (the
+        !   product of Q from the initial QR factorization and 
+        !   the SVD/POD_basis returned by SGEDMD in X) and the 
+        !   second factor (the eigenvectors of the Rayleigh 
+        !   quotient) is in the array V, as returned by SGEDMD.
+        CALL SLACPY( 'A', N, K, X, LDX, Z, LDZ )
+        IF ( M > N ) CALL SLASET( 'A', M-N, K, ZERO, ZERO,   & 
+                                  Z(N+1,1), LDZ )
+        CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z,  &
+             LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      END IF
+!     
+!     Some optional output variables:
+!
+!     The upper triangular factor in the initial QR 
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to SGEDMDQ is to be 
+!     followed by a streaming DMD that is implemented in a 
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y 
+         CALL SLASET( 'A', MINMN, N, ZERO,  ZERO, Y, LDY )
+         CALL SLACPY( 'U', MINMN, N, F, LDF,      Y, LDY )
+      END IF    
+!
+!     The orthonormal/orthogonal factor in the initial QR 
+!     factorization is optionally returned in the array F. 
+!     Same as with the triangular factor above, this is 
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN  ! Q overwrites F 
+         CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+              WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )  
+      END IF
+!      
+      RETURN
+!      
+      END SUBROUTINE SGEDMDQ
+    
\ No newline at end of file
diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90
new file mode 100644
index 000000000..090641ad8
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmd.f90
@@ -0,0 +1,996 @@
+      SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+                         K, EIGS, Z, LDZ, RES, B,    LDB,   &
+                         W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+                         RWORK, LRWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64
+
+!.....
+!     Scalar arguments
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LIWORK, LRWORK, LZWORK
+      INTEGER,       INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN)   ::    TOL
+!     Array arguments
+      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                         W(LDW,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+      INTEGER,          INTENT(OUT)   :: IWORK(*)
+!............................................................
+!     Purpose
+!     =======
+!     ZGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, ZGEDMD computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular
+!     vectors of X. Optionally, ZGEDMD returns the residuals
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!......................................................................
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations
+!     expressed in this material are those of the author and
+!     do not necessarily reflect the views of the DARPA SBIR
+!     Program Office
+!============================================================
+!     Distribution Statement A:
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================
+!............................................................
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix.
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'N' :: No data scaling.
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product X(:,1:K)*W, where X
+!            contains a POD basis (leading left singular vectors
+!            of the data matrix X) and W contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of K, X, W, Z.
+!     'N' :: The eigenvectors are not computed.
+!.....
+!     JOBR (input) CHARACTER*1
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will be
+!            computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.
+!.....
+!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: ZGESVD (the QR SVD algorithm)
+!     2 :: ZGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M>= 0
+!     The state space dimension (the row dimension of X, Y).
+!.....
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshot pairs
+!     (the number of columns of X and Y).
+!.....
+!     X (input/output) COMPLEX(KIND=WP) M-by-N array
+!   > On entry, X contains the data snapshot matrix X. It is
+!     assumed that the column norms of X are in the range of
+!     the normalized floating point numbers.
+!   < On exit, the leading K columns of X contain a POD basis,
+!     i.e. the leading K left singular vectors of the input
+!     data matrix X, U(:,1:K). All N columns of X contain all
+!     left singular vectors of the input matrix X.
+!     See the descriptions of K, Z and W.
+!.....
+!     LDX (input) INTEGER, LDX >= M
+!     The leading dimension of the array X.
+!.....
+!     Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array
+!   > On entry, Y contains the data snapshot matrix Y
+!   < On exit,
+!     If JOBR == 'R', the leading K columns of Y  contain
+!     the residual vectors for the computed Ritz pairs.
+!     See the description of RES.
+!     If JOBR == 'N', Y contains the original input data,
+!                     scaled according to the value of JOBS.
+!.....
+!     LDY (input) INTEGER , LDY >= M
+!     The leading dimension of the array Y.
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.
+!     The numerical rank can be enforced by using positive
+!     value of NRNK as follows:
+!     0 < NRNK <= N :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the descriptions of TOL and  K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.
+!.....
+!     K (output) INTEGER,  0 <= K <= N
+!     The dimension of the POD basis for the data snapshot
+!     matrix X and the number of the computed Ritz pairs.
+!     The value of K is determined according to the rule set
+!     by the parameters NRNK and TOL.
+!     See the descriptions of NRNK and TOL.
+!.....
+!     EIGS (output) COMPLEX(KIND=WP) N-by-1 array
+!     The leading K (K<=N) entries of EIGS contain
+!     the computed eigenvalues (Ritz values).
+!     See the descriptions of K, and Z.
+!.....
+!     Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array
+!     If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i)
+!     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!     the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
+!     is an eigenvector corresponding to EIGS(i). The columns
+!     of W(1:k,1:K) are the computed eigenvectors of the
+!     K-by-K Rayleigh quotient.
+!     See the descriptions of EIGS, X and W.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) N-by-1 array
+!     RES(1:K) contains the residuals for the K computed
+!     Ritz pairs,
+!     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
+!     See the description of EIGS and Z.
+!.....
+!     B (output) COMPLEX(KIND=WP)  M-by-N array.
+!     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further
+!     details in the provided references.
+!     If JOBF == 'E', B(1:M,1:K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.
+!     If JOBF =='N', then B is not referenced.
+!     See the descriptions of X, W, K.
+!.....
+!     LDB (input) INTEGER, LDB >= M
+!     The leading dimension of the array B.
+!.....
+!     W (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!     On exit, W(1:K,1:K) contains the K computed
+!     eigenvectors of the matrix Rayleigh quotient.
+!     The Ritz vectors (returned in Z) are the
+!     product of X (containing a POD basis for the input
+!     matrix X) and W. See the descriptions of K, S, X and Z.
+!     W is also used as a workspace to temporarily store the
+!     right singular vectors of X.
+!.....
+!     LDW (input) INTEGER, LDW >= N
+!     The leading dimension of the array W.
+!.....
+!     S (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by ZGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N
+!     The leading dimension of the array S.
+!.....
+!     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+!     ZWORK is used as complex workspace in the complex SVD, as
+!     specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing
+!     the eigenvalues of a Rayleigh quotient.
+!     If the call to ZGEDMD is only workspace query, then
+!     ZWORK(1) contains the minimal complex workspace length and
+!     ZWORK(2) is the optimal complex workspace length.
+!     Hence, the length of work is at least 2.
+!     See the description of LZWORK.
+!.....
+!     LZWORK (input) INTEGER
+!     The minimal length of the workspace vector ZWORK.
+!     LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV),
+!     where LZWORK_ZGEEV = MAX( 1, 2*N )  and the minimal
+!     LZWORK_SVD is calculated as follows
+!     If WHTSVD == 1 :: ZGESVD ::
+!        LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+!     If WHTSVD == 2 :: ZGESDD ::
+!        LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+!     If WHTSVD == 3 :: ZGESVDQ ::
+!        LZWORK_SVD = obtainable by a query
+!     If WHTSVD == 4 :: ZGEJSV ::
+!        LZWORK_SVD = obtainable by a query
+!     If on entry LZWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths and returns them in
+!     LZWORK(1) and LZWORK(2), respectively.
+!.....
+!     RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
+!     On exit, RWORK(1:N) contains the singular values of
+!     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!     If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain
+!     scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
+!     and Y to avoid overflow in the SVD of X.
+!     This may be of interest if the scaling option is off
+!     and as many as possible smallest eigenvalues are
+!     desired to the highest feasible accuracy.
+!     If the call to ZGEDMD is only workspace query, then
+!     RWORK(1) contains the minimal workspace length.
+!     See the description of LRWORK.
+!.....
+!     LRWORK (input) INTEGER
+!     The minimal length of the workspace vector RWORK.
+!     LRWORK is calculated as follows:
+!     LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where
+!     LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace
+!     for the SVD subroutine determined by the input parameter
+!     WHTSVD.
+!     If WHTSVD == 1 :: ZGESVD ::
+!        LRWORK_SVD = 5*MIN(M,N)
+!     If WHTSVD == 2 :: ZGESDD ::
+!        LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
+!        2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
+!     If WHTSVD == 3 :: ZGESVDQ ::
+!        LRWORK_SVD = obtainable by a query
+!     If WHTSVD == 4 :: ZGEJSV ::
+!        LRWORK_SVD = obtainable by a query
+!     If on entry LRWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     real workspace length and returns it in RWORK(1).
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for  ZWORK, RWORK and
+!     IWORK. See the descriptions of ZWORK, RWORK and IWORK.
+!.....
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP),    PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP),    PARAMETER :: ZERO = 0.0_WP
+      COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,    &
+                       SSUM,  XSCL1,  XSCL2
+      INTEGER       ::  i,  j,  IMINWR,  INFO1, INFO2,  &
+                        LWRKEV, LWRSDD, LWRSVD, LWRSVJ, &
+                        LWRSVQ, MLWORK, MWRKEV, MWRSDD, &
+                        MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, &
+                        OLWORK, MLRWRK
+      LOGICAL       ::  BADXY, LQUERY, SCCOLX, SCCOLY,  &
+                        WNTEX, WNTREF, WNTRES, WNTVEC
+      CHARACTER     ::  JOBZL, T_OR_N
+      CHARACTER     ::  JSVOPT
+!
+!     Local arrays
+!     ~~~~~~~~~~~~
+      REAL(KIND=WP) :: RDUMMY(2)
+
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2
+      EXTERNAL      ZLANGE, DLAMCH, DZNRM2, IZAMAX
+      INTEGER                               IZAMAX
+      LOGICAL       DISNAN, LSAME
+      EXTERNAL      DISNAN, LSAME
+
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      ZAXPY,  ZGEMM,  ZDSCAL
+      EXTERNAL      ZGEEV,  ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, &
+                    ZLACPY, ZLASCL, ZLASSQ, XERBLA
+
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC     DBLE, INT, MAX, SQRT
+!............................................................
+!
+!    Test the input arguments
+!
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C')
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      INFO   = 0
+      LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) &
+                                  .OR. ( LRWORK == -1 ) )
+!
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. &
+                                  LSAME(JOBS,'N')) )   THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N')        &
+                              .OR. LSAME(JOBZ,'F')) )  THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.  &
+                ( WNTRES .AND. (.NOT.WNTVEC) ) )       THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -4
+      ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR.  &
+                      (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN
+          INFO = -5
+      ELSE IF ( M < 0 )   THEN
+          INFO = -6
+      ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN
+          INFO = -7
+      ELSE IF ( LDX < M ) THEN
+          INFO = -9
+      ELSE IF ( LDY < M ) THEN
+          INFO = -11
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. &
+                ((NRNK >= 1).AND.(NRNK <=N ))) )      THEN
+          INFO = -12
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) )  THEN
+          INFO = -13
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -17
+      ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN
+          INFO = -20
+      ELSE IF ( LDW < N ) THEN
+          INFO = -22
+      ELSE IF ( LDS < N ) THEN
+          INFO = -24
+      END IF
+!
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( N == 0 ) THEN
+             ! Quick return. All output except K is void.
+             ! INFO=1 signals the void input.
+             ! In case of a workspace query, the default
+             ! minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+                IWORK(1) = 1
+                RWORK(1) = 1
+                ZWORK(1) = 2
+                ZWORK(2) = 2
+            ELSE
+               K   =  0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+
+         IMINWR = 1
+         MLRWRK = MAX(1,N)
+         MLWORK = 2
+         OLWORK = 2
+         SELECT CASE ( WHTSVD )
+         CASE (1)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of ZGESVD:
+             ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+             MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+             MLWORK = MAX(MLWORK,MWRSVD)
+             MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N))
+             IF ( LQUERY ) THEN
+                CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, &
+                     B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 )
+                LWRSVD = INT( ZWORK(1) )
+                OLWORK = MAX(OLWORK,LWRSVD)
+             END IF
+         CASE (2)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of ZGESDD:
+             ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N).
+             ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N)
+             ! In LAPACK 3.10.1 RWORK is defined differently.
+             ! Below we take max over the two versions.
+             ! IMINWR = 8*MIN(M,N)
+             MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+             MLWORK = MAX(MLWORK,MWRSDD)
+             IMINWR = 8*MIN(M,N)
+             MLRWRK = MAX( MLRWRK,  N +                    &
+                      MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), &
+                           5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), &
+                           2*MAX(M,N)*MIN(M,N)+            &
+                           2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
+             IF ( LQUERY ) THEN
+                CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B,LDB,&
+                     W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 )
+                LWRSDD = MAX( MWRSDD,INT( ZWORK(1) ))
+                ! Possible bug in ZGESDD optimal workspace size.
+                OLWORK = MAX(OLWORK,LWRSDD)
+             END IF
+         CASE (3)
+             CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                  X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK,  &
+                  IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVQ = INT(ZWORK(2))
+             MLWORK = MAX(MLWORK,MWRSVQ)
+             MLRWRK  = MAX(MLRWRK,N + INT(RDUMMY(1)))
+             IF ( LQUERY ) THEN
+                LWRSVQ = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,LWRSVQ)
+             END IF
+         CASE (4)
+             JSVOPT = 'J'
+             CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, &
+                   N, X, LDX, RWORK, Z, LDZ, W, LDW,       &
+                   ZWORK, -1, RDUMMY, -1, IWORK, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVJ = INT(ZWORK(2))
+             MLWORK = MAX(MLWORK,MWRSVJ)
+             MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1))))
+             IF ( LQUERY ) THEN
+                LWRSVJ = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,LWRSVJ)
+             END IF
+         END SELECT
+         IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN
+             JOBZL = 'V'
+         ELSE
+             JOBZL = 'N'
+         END IF
+         ! Workspace calculation to the ZGEEV call
+         MWRKEV = MAX( 1, 2*N )
+         MLWORK = MAX(MLWORK,MWRKEV)
+         MLRWRK = MAX(MLRWRK,N+2*N)
+         IF ( LQUERY ) THEN
+             CALL ZGEEV( 'N', JOBZL, N, S, LDS, EIGS, &
+              W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 )
+                LWRKEV = INT(ZWORK(1))
+                OLWORK = MAX( OLWORK, LWRKEV )
+         END IF
+!
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30
+         IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28
+         IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26
+
+      END IF
+!
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'ZGEDMD', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          RWORK(1) = MLRWRK
+          ZWORK(1) = MLWORK
+          ZWORK(2) = OLWORK
+          RETURN
+      END IF
+!............................................................
+!
+      OFL   = DLAMCH('O')
+      SMALL = DLAMCH('S')
+      BADXY = .FALSE.
+!
+!     <1> Optional scaling of the snapshots (columns of X, Y)
+!     ==========================================================
+      IF ( SCCOLX ) THEN
+          ! The columns of X will be normalized.
+          ! To prevent overflows, the column norms of X are
+          ! carefully computed using ZLASSQ.
+          K = 0
+          DO i = 1, N
+            !WORK(i) = DZNRM2( M, X(1,i), 1 )
+            SCALE  = ZERO
+            CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM )
+            IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -8
+                CALL XERBLA('ZGEDMD',-INFO)
+            END IF
+            IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of X(:,i) overflows. First, X(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2.
+!                 Next, the norm of X(:,i) is stored without
+!                 overflow as RWORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of X(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, X(1,i), LDX, INFO2 )
+                  RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  RWORK(i) =   SCALE * ROOTSC
+                  CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, &
+                               X(1,i), LDX, INFO2 )      ! LAPACK CALL
+!                 X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)   ! INTRINSIC
+               END IF
+            ELSE
+               RWORK(i) = ZERO
+               K = K + 1
+            END IF
+          END DO
+          IF ( K == N ) THEN
+             ! All columns of X are zero. Return error code -8.
+             ! (the 8th input variable had an illegal value)
+             K = 0
+             INFO = -8
+             CALL XERBLA('ZGEDMD',-INFO)
+             RETURN
+          END IF
+          DO i = 1, N
+!           Now, apply the same scaling to the columns of Y.
+            IF ( RWORK(i) >  ZERO ) THEN
+                CALL ZDSCAL( M, ONE/RWORK(i), Y(1,i), 1 )  ! BLAS CALL
+!               Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)       ! INTRINSIC
+            ELSE IF ( RWORK(i) < ZERO ) THEN
+                CALL ZLASCL( 'G', 0, 0, -RWORK(i),          &
+                     ONE/DBLE(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL
+            ELSE IF ( ABS(Y(IZAMAX(M, Y(1,i),1),i ))  &
+                                            /= ZERO ) THEN
+!               X(:,i) is zero vector. For consistency,
+!               Y(:,i) should also be zero. If Y(:,i) is not
+!               zero, then the data might be inconsistent or
+!               corrupted. If JOBS == 'C', Y(:,i) is set to
+!               zero and a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+                IF ( LSAME(JOBS,'C')) &
+                CALL ZDSCAL( M, ZERO, Y(1,i), 1 )  ! BLAS CALL
+            END IF
+          END DO
+      END IF
+  !
+      IF ( SCCOLY ) THEN
+          ! The columns of Y will be normalized.
+          ! To prevent overflows, the column norms of Y are
+          ! carefully computed using ZLASSQ.
+          DO i = 1, N
+            !RWORK(i) = DZNRM2( M, Y(1,i), 1 )
+            SCALE  = ZERO
+            CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+            IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -10
+                CALL XERBLA('ZGEDMD',-INFO)
+            END IF
+            IF ( SCALE /= ZERO  .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of Y(:,i) overflows. First, Y(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!                 Next, the norm of Y(:,i) is stored without
+!                 overflow as RWORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of Y(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, Y(1,i), LDY, INFO2 )
+                  RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+               ELSE
+!                 Y(:,i) will be scaled to unit 2-norm
+                  RWORK(i) =   SCALE * ROOTSC
+                  CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, &
+                               Y(1,i), LDY, INFO2 )             ! LAPACK CALL
+!                 Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               RWORK(i) = ZERO
+            END IF
+         END DO
+         DO i = 1, N
+!           Now, apply the same scaling to the columns of X.
+            IF ( RWORK(i) >  ZERO ) THEN
+                CALL ZDSCAL( M, ONE/RWORK(i), X(1,i), 1 ) ! BLAS CALL
+!               X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)      ! INTRINSIC
+            ELSE IF ( RWORK(i) < ZERO ) THEN
+                CALL ZLASCL( 'G', 0, 0, -RWORK(i),          &
+                     ONE/DBLE(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL
+            ELSE IF ( ABS(X(IZAMAX(M, X(1,i),1),i ))  &
+                                           /= ZERO ) THEN
+!               Y(:,i) is zero vector.  If X(:,i) is not
+!               zero, then a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+            END IF
+         END DO
+       END IF
+!
+!     <2> SVD of the data snapshot matrix X.
+!     =====================================
+!     The left singular vectors are stored in the array X.
+!     The right singular vectors are in the array W.
+!     The array W will later on contain the eigenvectors
+!     of a Rayleigh quotient.
+      NUMRNK = N
+      SELECT CASE ( WHTSVD )
+         CASE (1)
+             CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, &
+                  LDB, W, LDW, ZWORK, LZWORK,  RWORK(N+1), INFO1 ) ! LAPACK CALL
+             T_OR_N = 'C'
+         CASE (2)
+            CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, &
+                 LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 )   ! LAPACK CALL
+            T_OR_N = 'C'
+         CASE (3)
+              CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                   X, LDX, RWORK, Z, LDZ, W, LDW, &
+                   NUMRNK, IWORK, LIWORK, ZWORK,     &
+                   LZWORK, RWORK(N+1), LRWORK-N, INFO1)     ! LAPACK CALL
+              CALL ZLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX )   ! LAPACK CALL
+         T_OR_N = 'C'
+         CASE (4)
+              CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, &
+                   N, X, LDX, RWORK, Z, LDZ, W, LDW, &
+                   ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 )    ! LAPACK CALL
+              CALL ZLACPY( 'A', M, N, Z, LDZ, X, LDX )   ! LAPACK CALL
+              T_OR_N = 'N'
+              XSCL1 = RWORK(N+1)
+              XSCL2 = RWORK(N+2)
+              IF ( XSCL1 /=  XSCL2 ) THEN
+                 ! This is an exceptional situation. If the
+                 ! data matrices are not scaled and the
+                 ! largest singular value of X overflows.
+                 ! In that case ZGEJSV can return the SVD
+                 ! in scaled form. The scaling factor can be used
+                 ! to rescale the data (X and Y).
+                 CALL ZLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2  )
+              END IF
+      END SELECT
+!
+      IF ( INFO1 > 0 ) THEN
+         ! The SVD selected subroutine did not converge.
+         ! Return with an error code.
+         INFO = 2
+         RETURN
+      END IF
+!
+      IF ( RWORK(1) == ZERO ) THEN
+          ! The largest computed singular value of (scaled)
+          ! X is zero. Return error code -8
+          ! (the 8th input variable had an illegal value).
+          K = 0
+          INFO = -8
+          CALL XERBLA('ZGEDMD',-INFO)
+          RETURN
+      END IF
+!
+      !<3> Determine the numerical rank of the data
+      !    snapshots matrix X. This depends on the
+      !    parameters NRNK and TOL.
+
+      SELECT CASE ( NRNK )
+          CASE ( -1 )
+               K = 1
+               DO i = 2, NUMRNK
+                 IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. &
+                      ( RWORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE ( -2 )
+               K = 1
+               DO i = 1, NUMRNK-1
+                 IF ( ( RWORK(i+1) <= RWORK(i)*TOL  ) .OR. &
+                      ( RWORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE DEFAULT
+               K = 1
+               DO i = 2, NRNK
+                  IF ( RWORK(i) <= SMALL ) EXIT
+                  K = K + 1
+               END DO
+          END SELECT
+      !   Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !   snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^H * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^H is in
+      !    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+           CALL ZDSCAL( N, ONE/RWORK(i), W(1,i), 1 )    ! BLAS CALL
+           ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)      ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that ZGESVD, ZGESVDQ and ZGESDD return the
+          ! adjoint matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW )    ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC
+          !END DO
+          DO i = 1, K
+              RWORK(N+i) = ONE/RWORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                 W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside ZGEDMD).
+          CALL ZGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, &
+                      LDW, ZZERO, Z, LDZ )                       ! BLAS CALL
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='C'
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))                   ! INTRINSIC, for T_OR_N=='N'
+          !
+          ! At this point Z contains
+          ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+          ! this is needed for computing the residuals.
+          ! This matrix is  returned in the array B and
+          ! it can be used to compute refined Ritz vectors.
+          CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+          ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+          CALL ZGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, &
+                      LDZ, ZZERO, S, LDS )                        ! BLAS CALL
+          ! S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) ! INTRINSIC
+          ! At this point S = U^H * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL ZGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, &
+                   ZZERO, Z, LDZ )                                         ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) )  ! INTRINSIC
+        !
+        CALL ZGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, &
+                    LDW, ZZERO, S, LDS )                         ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='T'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))                 ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^H * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^H is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+          IF ( LSAME(T_OR_N, 'N') ) THEN
+              CALL ZLACPY( 'A', N, K, W, LDW, Z, LDZ )
+          ELSE
+              CALL ZLACPY( 'A', K, N, W, LDW, Z, LDZ )
+          END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !   right eigenvectors of the Rayleigh quotient.
+      !
+      CALL ZGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, LDW, &
+            W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 )  ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient.  See the description of Z.
+      ! Also, see the description of ZGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! ZGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and,
+      ! the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+      IF ( WNTRES ) THEN
+          IF ( WNTREF ) THEN
+            ! Here, if the refinement is requested, we have
+            ! A*U(:,1:K) already computed and stored in Z.
+            ! For the residuals, need Y = A * U(:,1;K) * W.
+            CALL ZGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, &
+                       LDW, ZZERO, Y, LDY )               ! BLAS CALL
+            ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)        ! INTRINSIC
+            ! This frees Z; Y contains A * U(:,1:K) * W.
+          ELSE
+            ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z
+            CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                       W, LDW, ZZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                       LDS, ZZERO, Z, LDZ )
+            ! Save a copy of Z into Y and free Z for holding
+            ! the Ritz vectors.
+            CALL ZLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+            IF ( WNTEX ) CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB )
+          END IF
+      ELSE IF ( WNTEX ) THEN
+          ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                       W, LDW, ZZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                       LDS, ZZERO, B, LDB )
+            ! The above call replaces the following two calls
+            ! that were used in the developing-testing phase.
+            ! CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+            !           LDS, ZZERO, Z, LDZ)
+            ! Save a copy of Z into B and free Z for holding
+            ! the Ritz vectors.
+            ! CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB )
+      END IF
+!
+      ! Compute the Ritz vectors
+      IF ( WNTVEC ) CALL ZGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, &
+                   ZZERO, Z, LDZ )                          ! BLAS CALL
+      ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC
+!
+      IF ( WNTRES ) THEN
+         DO i = 1, K
+            CALL ZAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 )       ! BLAS CALL
+            ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)            ! INTRINSIC
+            RES(i) = DZNRM2( M, Y(1,i), 1 )                       ! BLAS CALL
+         END DO
+      END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          RWORK(N+1) = XSCL1
+          RWORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+!     ......
+      END SUBROUTINE ZGEDMD
+
diff --git a/lapack-netlib/SRC/zgedmdq.f90 b/lapack-netlib/SRC/zgedmdq.f90
new file mode 100644
index 000000000..51be72a32
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmdq.f90
@@ -0,0 +1,689 @@
+SUBROUTINE ZGEDMDQ( JOBS,  JOBZ, JOBR, JOBQ, JOBT, JOBF,   &
+                    WHTSVD,   M, N, F, LDF,  X, LDX,  Y,   &
+                    LDY,   NRNK,  TOL,   K,  EIGS,         &
+                    Z, LDZ, RES,  B,     LDB,   V, LDV,    & 
+                    S, LDS, ZWORK, LZWORK, WORK,  LWORK,   &
+                    IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64
+!.....      
+!     Scalar arguments       
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,    &
+                                JOBT, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N,   LDF, LDX,  &
+                                LDY, NRNK, LDZ, LDB, LDV,  &
+                                LDS, LZWORK,  LWORK, LIWORK
+      INTEGER,   INTENT(OUT) :: INFO,   K      
+      REAL(KIND=WP), INTENT(IN)    ::   TOL     
+!     Array arguments      
+      COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*), &
+                                         Z(LDZ,*), B(LDB,*), &
+                                         V(LDV,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP), INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)  
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!.....      
+!     Purpose  
+!     =======
+!     ZGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, ZGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular 
+!     vectors of X. Optionally, ZGEDMDQ returns the residuals 
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].      
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.      
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. 
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations 
+!     expressed in this material are those of the author and 
+!     do not necessarily reflect the views of the DARPA SBIR 
+!     Program Office.      
+!============================================================
+!     Distribution Statement A: 
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================      
+!......................................................................      
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y. 
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)    
+!     'N' :: No data scaling.   
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretized operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z.
+!            Q is from the initial QR factorization.  
+!     'N' :: The eigenvectors are not computed.  
+!.....      
+!     JOBR (input) CHARACTER*1 
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1 
+!     Specifies whether to explicitly compute and return the
+!     unitary matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.       
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1 
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data 
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.       
+!     'N' :: The matrix R is not returned. 
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are 
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.   
+!     To be useful on exit, this option needs JOBQ='Q'.    
+!.....
+!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: ZGESVD (the QR SVD algorithm)
+!     2 :: ZGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger 
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0 
+!     The state space dimension (the number of rows of F).
+!.....      
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the 
+!     number of columns of F.
+!.....
+!     F (input/output) COMPLEX(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots 
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are 
+!     in the range of the normalized floating point numbers. 
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal 
+!     matrix/factor of the QR factorization of the initial 
+!     data snapshots matrix F. See the description of JOBQ. 
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the 
+!     Householder vectors, as returned by ZGEQRF. The 
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). 
+!     See the description of ZWORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M 
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K) of the input data, pre-multiply with the 
+!     Q factor from the initial QR factorization. 
+!     See the descriptions of F, K, V  and Z.
+!.....      
+!     LDX (input) INTEGER, LDX >= N  
+!     The leading dimension of the array X. 
+!.....
+!     Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, 
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....      
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y.   
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.  
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.      
+!     The numerical rank can be enforced by using positive 
+!     value of NRNK as follows: 
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.  
+!.....
+!     K (output) INTEGER,  0 <= K <= N 
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the 
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and 
+!     TOL. See the descriptions of NRNK and TOL. 
+!.....
+!     EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N-1) entries of EIGS contain
+!     the computed eigenvalues (Ritz values).
+!     See the descriptions of K, and Z.
+!.....
+!     Z (workspace/output) COMPLEX(KIND=WP)  M-by-(N-1) array
+!     If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i)
+!     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!     Z*V, where Z contains orthonormal matrix (the product of
+!     Q from the initial QR factorization and the SVD/POD_basis
+!     returned by ZGEDMD in X) and the second factor (the 
+!     eigenvectors of the Rayleigh quotient) is in the array V, 
+!     as returned by ZGEDMD. That is,  X(:,1:K)*V(:,i)
+!     is an eigenvector corresponding to EIGS(i). The columns 
+!     of V(1:K,1:K) are the computed eigenvectors of the 
+!     K-by-K Rayleigh quotient.  
+!     See the descriptions of EIGS, X and V.      
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed 
+!     Ritz pairs, 
+!     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
+!     See the description of EIGS and Z.      
+!.....
+!     B (output) COMPLEX(KIND=WP)  MIN(M,N)-by-(N-1) array.
+!     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further 
+!     details in the provided references. 
+!     If JOBF == 'E', B(1:N,1;K) contains 
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.   
+!     In both cases, the content of B can be lifted to the 
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization.   
+!     Here A denotes a compression of the underlying operator.      
+!     See the descriptions of F and X.
+!     If JOBF =='N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....
+!     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) V contains the K eigenvectors of
+!     the Rayleigh quotient. The Ritz vectors
+!     (returned in Z) are the product of Q from the initial QR
+!     factorization (see the description of F) X (see the 
+!     description of X) and V.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....      
+!     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by ZGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1        
+!     The leading dimension of the array S.
+!.....
+!     ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array
+!     On exit, 
+!     ZWORK(1:MIN(M,N)) contains the scalar factors of the 
+!     elementary reflectors as returned by ZGEQRF of the 
+!     M-by-N input matrix F.   
+!     If the call to ZGEDMDQ is only workspace query, then
+!     ZWORK(1) contains the minimal complex workspace length and
+!     ZWORK(2) is the optimal complex workspace length. 
+!     Hence, the length of work is at least 2.
+!     See the description of LZWORK.      
+!.....      
+!     LZWORK (input) INTEGER
+!     The minimal length of the  workspace vector ZWORK.
+!     LZWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for ZGEQRF[M,N])
+!         MLWDMD = minimal workspace for ZGEDMD (see the
+!                  description of LWORK in ZGEDMD)
+!         MLWMQR = N (minimal workspace for 
+!                    ZUNMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for ZUNGQR[M,N,N])
+!         MINMN  = MIN(M,N)      
+!     Then
+!     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD)
+!     is further updated as follows:
+!        if   JOBZ == 'V' or JOBZ == 'F' THEN 
+!             LZWORK = MAX(LZWORK, MINMN+MLWMQR)
+!        if   JOBQ == 'Q' THEN
+!             LZWORK = MAX(ZLWORK, MINMN+MLWGQR)      
+!
+!.....      
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit,
+!     WORK(1:N-1) contains the singular values of 
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to ZGEDMDQ is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the  workspace vector WORK.
+!     LWORK is the same as in ZGEDMD, because in ZGEDMDQ
+!     only ZGEDMD requires real workspace for snapshots
+!     of dimensions MIN(M,N)-by-(N-1). 
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace length for WORK.          
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     Let M1=MIN(M,N), N1=N-1. Then
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!..... 
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.  
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~      
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!     COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+!      
+!     Local scalars      
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1,  MINMN, MLRWRK,   &
+                           MLWDMD, MLWGQR, MLWMQR, MLWORK,  & 
+                           MLWQR,  OLWDMD, OLWGQR, OLWMQR,  &
+                           OLWORK, OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!      
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME 
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      ZGEQRF, ZLACPY, ZLASET, ZUNGQR, & 
+                    ZUNMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      ZGEDMD 
+      
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC      MAX, MIN, INT         
+ !..........................................................  
+ !
+ !    Test the input arguments    
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' )
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTVCF = LSAME(JOBZ,'F')
+      WNTVCQ = LSAME(JOBZ,'Q') 
+      WNTREF = LSAME(JOBF,'R') 
+      WNTEX  = LSAME(JOBF,'E')
+      WANTQ  = LSAME(JOBQ,'Q')
+      WNTTRF = LSAME(JOBT,'R')     
+      MINMN  = MIN(M,N)
+      INFO = 0 
+      LQUERY = ( (LZWORK == -1) .OR. (LWORK == -1) .OR. (LIWORK == -1) )
+!       
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR.                &
+                                  LSAME(JOBS,'N')) )  THEN 
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ    &
+                              .OR. LSAME(JOBZ,'N')) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.    & 
+          ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN
+           INFO = -4                 
+      ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) )  THEN
+          INFO = -5
+       ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             & 
+                LSAME(JOBF,'N') ) )                     THEN
+          INFO = -6    
+      ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR.   &
+                       (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN
+          INFO = -7
+      ELSE IF ( M < 0 ) THEN
+          INFO = -8
+      ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN
+          INFO = -9
+      ELSE IF ( LDF < M ) THEN
+          INFO = -11
+      ELSE IF ( LDX < MINMN ) THEN
+          INFO = -13
+      ELSE IF ( LDY < MINMN ) THEN
+          INFO = -15
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR.    & 
+                       ((NRNK >= 1).AND.(NRNK <=N ))) )  THEN
+          INFO = -16
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN
+          INFO = -17
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -21
+      ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN
+          INFO = -24
+      ELSE IF ( LDV < N-1 ) THEN
+          INFO = -26
+      ELSE IF ( LDS < N-1 ) THEN
+          INFO = -28
+      END IF
+!      
+      IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN
+          JOBVL = 'V'
+      ELSE
+          JOBVL = 'N'
+      END IF     
+      IF ( INFO == 0 ) THEN  
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and 
+          ! determine minimal and optimal sizes of the 
+          ! workspace at any moment of the run.         
+         IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN
+             ! All output except K is void. INFO=1 signals
+             ! the void input. In case of a workspace query,
+             ! the minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN  
+               IWORK(1) = 1
+               ZWORK(1) = 2
+               ZWORK(2) = 2
+               WORK(1)  = 2
+               WORK(2)  = 2
+            ELSE                
+               K = 0
+            END IF             
+            INFO = 1  
+            RETURN
+         END IF   
+         
+         MLRWRK = 2
+         MLWORK = 2
+         OLWORK = 2 
+         IMINWR = 1
+         MLWQR  = MAX(1,N)  ! Minimal workspace length for ZGEQRF.
+         MLWORK = MAX(MLWORK,MINMN + MLWQR) 
+         
+         IF ( LQUERY ) THEN 
+             CALL ZGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, &
+                          INFO1 )
+             OLWQR  = INT(ZWORK(1))
+             OLWORK = MAX(OLWORK,MINMN + OLWQR)         
+         END IF
+         CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& 
+                      N-1, X, LDX, Y, LDY, NRNK, TOL, K,     & 
+                      EIGS, Z, LDZ, RES,  B, LDB, V, LDV,    & 
+                      S, LDS, ZWORK, -1, WORK, -1, IWORK,&
+                      -1, INFO1 )
+         MLWDMD = INT(ZWORK(1))
+         MLWORK = MAX(MLWORK, MINMN + MLWDMD)
+         MLRWRK = MAX(MLRWRK, INT(WORK(1)))
+         IMINWR = MAX(IMINWR, IWORK(1))
+         IF ( LQUERY ) THEN 
+             OLWDMD = INT(ZWORK(2))
+             OLWORK = MAX(OLWORK, MINMN+OLWDMD)
+         END IF
+         IF ( WNTVEC .OR. WNTVCF ) THEN
+            MLWMQR = MAX(1,N) 
+            MLWORK = MAX(MLWORK,MINMN+MLWMQR)
+            IF ( LQUERY ) THEN
+               CALL ZUNMQR( 'L','N', M, N, MINMN, F, LDF,  & 
+                            ZWORK, Z, LDZ, ZWORK, -1, INFO1 )
+               OLWMQR = INT(ZWORK(1))
+               OLWORK = MAX(OLWORK,MINMN+OLWMQR)
+            END IF
+         END IF  
+         IF ( WANTQ ) THEN
+            MLWGQR = MAX(1,N)
+            MLWORK = MAX(MLWORK,MINMN+MLWGQR)
+            IF ( LQUERY ) THEN 
+                CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, &
+                             ZWORK, -1, INFO1 )               
+                OLWGQR = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,MINMN+OLWGQR)
+            END IF            
+         END IF         
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34
+         IF ( LWORK  < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32
+         IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30
+      END IF  
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'ZGEDMDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          ZWORK(1) = MLWORK
+          ZWORK(2) = OLWORK
+          WORK(1)  = MLRWRK
+          WORK(2)  = MLRWRK
+          RETURN
+      END IF   
+!.....	  
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of lower dimensional subspace.
+!     For large scale computation with M >> N, at this place 
+!     one can use an out of core QRF.
+!   
+      CALL ZGEQRF( M, N, F, LDF, ZWORK,                & 
+                   ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+!      
+!     Define X and Y as the snapshots representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL ZLASET( 'L', MINMN, N-1, ZZERO,  ZZERO, X, LDX )
+      CALL ZLACPY( 'U', MINMN, N-1, F,      LDF, X, LDX )
+      CALL ZLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( M >= 3 ) THEN
+          CALL ZLASET( 'L', MINMN-2, N-2, ZZERO,  ZZERO, &
+                       Y(3,1), LDY )  
+      END IF
+!
+!     Compute the DMD of the projected snapshot pairs (X,Y)   
+      CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, &
+                  N-1,  X, LDX, Y, LDY, NRNK,   TOL, K,    &
+                  EIGS, Z, LDZ, RES, B,  LDB,   V, LDV,    &
+                  S, LDS, ZWORK(MINMN+1), LZWORK-MINMN, & 
+                  WORK,   LWORK, IWORK, LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code. See ZGEDMD for details.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF    
+!      
+!     The Ritz vectors (Koopman modes) can be explicitly 
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.  
+        IF ( M > MINMN ) CALL ZLASET( 'A', M-MINMN, K, ZZERO, &
+                                     ZZERO, Z(MINMN+1,1), LDZ )
+        CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z,  &
+             LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+      ELSE IF ( WNTVCF ) THEN   
+        !   Return the Ritz vectors (eigenvectors) in factored
+        !   form Z*V, where Z contains orthonormal matrix (the
+        !   product of Q from the initial QR factorization and 
+        !   the SVD/POD_basis returned by ZGEDMD in X) and the 
+        !   second factor (the eigenvectors of the Rayleigh 
+        !   quotient) is in the array V, as returned by ZGEDMD.
+        CALL ZLACPY( 'A', N, K, X, LDX, Z, LDZ )
+        IF ( M > N ) CALL ZLASET( 'A', M-N, K, ZZERO, ZZERO, & 
+                                 Z(N+1,1), LDZ )
+        CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, &
+                    LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+      END IF
+!     
+!     Some optional output variables:
+!
+!     The upper triangular factor R in the initial QR 
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to ZGEDMDQ is to be 
+!     followed by a streaming DMD that is implemented in a 
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y 
+         CALL ZLASET( 'A', MINMN, N, ZZERO,  ZZERO, Y, LDY )
+         CALL ZLACPY( 'U', MINMN, N, F, LDF,        Y, LDY )
+      END IF    
+!
+!     The orthonormal/unitary factor Q in the initial QR 
+!     factorization is optionally returned in the array F. 
+!     Same as with the triangular factor above, this is 
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN                   ! Q overwrites F 
+         CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK,     &
+                      ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )  
+      END IF
+!      
+      RETURN
+!      
+      END SUBROUTINE ZGEDMDQ
+    
\ No newline at end of file

From c0865ab0fe8579a2716b4f12fae4e3f2f6b1bede Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 10:45:29 +0200
Subject: [PATCH 134/718] Add LAPACKE interfaces for Dynamic Mode Decomposition
 (Reference-LAPACK PR 736)

---
 lapack-netlib/LAPACKE/src/lapacke_cgedmd.c    | 115 ++++++++++
 .../LAPACKE/src/lapacke_cgedmd_work.c         | 180 +++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c   | 123 +++++++++++
 .../LAPACKE/src/lapacke_cgedmdq_work.c        | 205 ++++++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_dgedmd.c    | 112 ++++++++++
 .../LAPACKE/src/lapacke_dgedmd_work.c         | 179 +++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c   | 119 ++++++++++
 .../LAPACKE/src/lapacke_dgedmdq_work.c        | 200 +++++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_sgedmd.c    | 112 ++++++++++
 .../LAPACKE/src/lapacke_sgedmd_work.c         | 179 +++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c   | 119 ++++++++++
 .../LAPACKE/src/lapacke_sgedmdq_work.c        | 200 +++++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_zgedmd.c    | 116 ++++++++++
 .../LAPACKE/src/lapacke_zgedmd_work.c         | 182 ++++++++++++++++
 lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c   | 123 +++++++++++
 .../LAPACKE/src/lapacke_zgedmdq_work.c        | 205 ++++++++++++++++++
 16 files changed, 2469 insertions(+)
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgedmd.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgedmd.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c

diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c
new file mode 100644
index 000000000..a269b0daf
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c
@@ -0,0 +1,115 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobf,
+                           lapack_int whtsvd, lapack_int m, lapack_int n,
+                           lapack_complex_float* x, lapack_int ldx,
+                           lapack_complex_float* y, lapack_int ldy, lapack_int k,
+                           lapack_complex_float* reig, lapack_complex_float* imeig,
+                           lapack_complex_float* z, lapack_int ldz,
+                           lapack_complex_float* res, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* w,
+                           lapack_int ldw, lapack_complex_float* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    lapack_complex_float* work = NULL;
+    lapack_int* iwork = NULL;
+    lapack_complex_float work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmd", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -8;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -10;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -15;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -18;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, w, ldw ) ) {
+            return -20;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -22;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
+                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
+                                b, ldb, w, ldw, s, lds, &work_query, lwork,
+                                &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = LAPACK_C2INT( work_query );
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
+                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
+                                b, ldb, w, ldw, s, lds, work, lwork, iwork,
+                                liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmd", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c
new file mode 100644
index 000000000..534934efb
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c
@@ -0,0 +1,180 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function cgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, lapack_complex_float* x, lapack_int ldx,
+                                lapack_complex_float* y, lapack_int ldy, lapack_int k,
+                                lapack_complex_float* reig, lapack_complex_float* imeig,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* res, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* w,
+                                lapack_int ldw, lapack_complex_float* s, lapack_int lds,
+                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
+                       &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
+                       work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldw_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        lapack_complex_float* x_t = NULL;
+        lapack_complex_float* y_t = NULL;
+        lapack_complex_float* z_t = NULL;
+        lapack_complex_float* b_t = NULL;
+        lapack_complex_float* w_t = NULL;
+        lapack_complex_float* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldx < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -11;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -19;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( ldw < n ) {
+            info = -21;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
+                           &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
+                           work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        w_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldw_t * MAX(1,n) );
+        if( w_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t,
+                       &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t,
+                       w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_5:
+        LAPACKE_free( w_t );
+exit_level_4:
+        LAPACKE_free( b_t );
+exit_level_3:
+        LAPACKE_free( z_t );
+exit_level_2:
+        LAPACKE_free( y_t );
+exit_level_1:
+        LAPACKE_free( x_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c
new file mode 100644
index 000000000..60e83729b
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c
@@ -0,0 +1,123 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
+                            char jobq, char jobt, char jobf, lapack_int whtsvd,
+                            lapack_int m, lapack_int n, lapack_complex_float* f,
+                            lapack_int ldf, lapack_complex_float* x,
+                            lapack_int ldx, lapack_complex_float* y,
+                            lapack_int ldy, lapack_int nrnk, float tol,
+                            lapack_int k, lapack_complex_float* reig,
+                            lapack_complex_float* imeig,
+                            lapack_complex_float* z, lapack_int ldz,
+                            lapack_complex_float* res, lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* v,
+                            lapack_int ldv, lapack_complex_float* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    lapack_complex_float* work = NULL;
+    lapack_int* iwork = NULL;
+    lapack_complex_float work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, f, ldf ) ) {
+            return -11;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -13;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -15;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -22;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -25;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, v, ldv ) ) {
+            return -27;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -29;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &work_query, lwork,
+                                 &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = LAPACK_C2INT( work_query );
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, work, lwork, iwork,
+                                 liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
new file mode 100644
index 000000000..5bdbd3f56
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
@@ -0,0 +1,205 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function cgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 lapack_complex_float* f, lapack_int ldf,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 lapack_complex_float* y, lapack_int ldy,
+                                 lapack_int nrnk, float tol, lapack_int k,
+                                 lapack_complex_float* reig,
+                                 lapack_complex_float* imeig,
+                                 lapack_complex_float* z,
+                                 lapack_int ldz, lapack_complex_float* res,
+                                 lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* v,
+                                 lapack_int ldv, lapack_complex_float* s,
+                                 lapack_int lds, lapack_complex_float* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldf_t = MAX(1,m);
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldv_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        lapack_complex_float* f_t = NULL;
+        lapack_complex_float* x_t = NULL;
+        lapack_complex_float* y_t = NULL;
+        lapack_complex_float* z_t = NULL;
+        lapack_complex_float* b_t = NULL;
+        lapack_complex_float* v_t = NULL;
+        lapack_complex_float* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldf < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldx < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -28;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -30;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                            imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
new file mode 100644
index 000000000..246d7f649
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
@@ -0,0 +1,112 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function dgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf,
+                           lapack_int whtsvd, lapack_int m, lapack_int n,
+                           double* x, lapack_int ldx, double* y, lapack_int ldy,
+                           lapack_int k, double* reig, double* imeig, double* z,
+                           lapack_int ldz, double* res, double* b, lapack_int ldb,
+                           double* w, lapack_int ldw, double* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    double* work = NULL;
+    lapack_int* iwork = NULL;
+    double work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_dgedmd", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -8;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -10;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -15;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -18;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -20;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, w, ldw ) ) {
+            return -22;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
+                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
+                                b, ldb, w, ldw, s, lds, &work_query, lwork,
+                                &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = (lapack_int) work_query;
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
+                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
+                                b, ldb, w, ldw, s, lds, work, lwork, iwork,
+                                liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_dgedmd", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c
new file mode 100644
index 000000000..4d1169de9
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c
@@ -0,0 +1,179 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function dgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, double* x, lapack_int ldx,
+                                double* y, lapack_int ldy, lapack_int k,
+                                double* reig, double* imeig, double* z,
+                                lapack_int ldz, double* res, double* b,
+                                lapack_int ldb, double* w, lapack_int ldw,
+                                double* s, lapack_int lds, double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
+                       &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
+                       work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldw_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        double* x_t = NULL;
+        double* y_t = NULL;
+        double* z_t = NULL;
+        double* b_t = NULL;
+        double* w_t = NULL;
+        double* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldx < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -11;
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -19;
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( ldw < n ) {
+            info = -21;
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
+                           &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
+                           work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        w_t = (double*)LAPACKE_malloc( sizeof(double) * ldw_t * MAX(1,n) );
+        if( w_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t,
+                       &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t,
+                       w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_5:
+        LAPACKE_free( w_t );
+exit_level_4:
+        LAPACKE_free( b_t );
+exit_level_3:
+        LAPACKE_free( z_t );
+exit_level_2:
+        LAPACKE_free( y_t );
+exit_level_1:
+        LAPACKE_free( x_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c
new file mode 100644
index 000000000..f3d621ba9
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c
@@ -0,0 +1,119 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function dgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
+                            char jobq, char jobt, char jobf, lapack_int whtsvd,
+                            lapack_int m, lapack_int n, double* f, lapack_int ldf,
+                            double* x, lapack_int ldx, double* y, lapack_int ldy,
+                            lapack_int nrnk, double tol, lapack_int k,
+                            double* reig, double* imeig, double* z,
+                            lapack_int ldz, double* res, double* b, lapack_int ldb,
+                            double* v, lapack_int ldv, double* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    double* work = NULL;
+    lapack_int* iwork = NULL;
+    double work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_dgedmdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, f, ldf ) ) {
+            return -11;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -13;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -15;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -22;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -25;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, v, ldv ) ) {
+            return -27;
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -29;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &work_query, lwork,
+                                 &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = (lapack_int) work_query;
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, work, lwork, iwork,
+                                 liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_dgedmdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c
new file mode 100644
index 000000000..51b2a66d8
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c
@@ -0,0 +1,200 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function dgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 double* f, lapack_int ldf, double* x,
+                                 lapack_int ldx, double* y, lapack_int ldy,
+                                 lapack_int nrnk, double tol, lapack_int k,
+                                 double* reig, double* imeig, double* z,
+                                 lapack_int ldz, double* res, double* b,
+                                 lapack_int ldb, double* v, lapack_int ldv,
+                                 double* s, lapack_int lds, double* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldf_t = MAX(1,m);
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldv_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        double* f_t = NULL;
+        double* x_t = NULL;
+        double* y_t = NULL;
+        double* z_t = NULL;
+        double* b_t = NULL;
+        double* v_t = NULL;
+        double* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldf < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldx < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -28;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -30;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                            imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (double*)LAPACKE_malloc( sizeof(double) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (double*)LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
new file mode 100644
index 000000000..879631b1d
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
@@ -0,0 +1,112 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function sgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf,
+                           lapack_int whtsvd, lapack_int m, lapack_int n,
+                           float* x, lapack_int ldx, float* y, lapack_int ldy,
+                           lapack_int k, float* reig, float* imeig, float* z,
+                           lapack_int ldz, float* res, float* b, lapack_int ldb,
+                           float* w, lapack_int ldw, float* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    float* work = NULL;
+    lapack_int* iwork = NULL;
+    float work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_sgedmd", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -8;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -10;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -15;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -18;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -20;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, w, ldw ) ) {
+            return -22;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
+                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
+                                b, ldb, w, ldw, s, lds, &work_query, lwork,
+                                &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = (lapack_int) work_query;
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
+                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
+                                b, ldb, w, ldw, s, lds, work, lwork, iwork,
+                                liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_sgedmd", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c
new file mode 100644
index 000000000..762a9b271
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c
@@ -0,0 +1,179 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function sgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, float* x, lapack_int ldx,
+                                float* y, lapack_int ldy, lapack_int k,
+                                float* reig, float* imeig, float* z,
+                                lapack_int ldz, float* res, float* b,
+                                lapack_int ldb, float* w, lapack_int ldw,
+                                float* s, lapack_int lds, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
+                       &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
+                       work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldw_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        float* x_t = NULL;
+        float* y_t = NULL;
+        float* z_t = NULL;
+        float* b_t = NULL;
+        float* w_t = NULL;
+        float* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldx < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -11;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -19;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( ldw < n ) {
+            info = -21;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
+                           &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
+                           work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        w_t = (float*)LAPACKE_malloc( sizeof(float) * ldw_t * MAX(1,n) );
+        if( w_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        /* Transpose input matrices */
+        LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t,
+                       &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t,
+                       w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_5:
+        LAPACKE_free( w_t );
+exit_level_4:
+        LAPACKE_free( b_t );
+exit_level_3:
+        LAPACKE_free( z_t );
+exit_level_2:
+        LAPACKE_free( y_t );
+exit_level_1:
+        LAPACKE_free( x_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c
new file mode 100644
index 000000000..e202d7fbd
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c
@@ -0,0 +1,119 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function sgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_sgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
+                            char jobq, char jobt, char jobf, lapack_int whtsvd,
+                            lapack_int m, lapack_int n, float* f, lapack_int ldf,
+                            float* x, lapack_int ldx, float* y, lapack_int ldy,
+                            lapack_int nrnk, float tol, lapack_int k,
+                            float* reig, float* imeig, float* z,
+                            lapack_int ldz, float* res, float* b, lapack_int ldb,
+                            float* v, lapack_int ldv, float* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    float* work = NULL;
+    lapack_int* iwork = NULL;
+    float work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_sgedmdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, f, ldf ) ) {
+            return -11;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -13;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -15;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -22;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -25;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, v, ldv ) ) {
+            return -27;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -29;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &work_query, lwork,
+                                 &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = (lapack_int) work_query;
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, work, lwork, iwork,
+                                 liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_sgedmdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c
new file mode 100644
index 000000000..9039898d2
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c
@@ -0,0 +1,200 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function sgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 float* f, lapack_int ldf, float* x,
+                                 lapack_int ldx, float* y, lapack_int ldy,
+                                 lapack_int nrnk, float tol, lapack_int k,
+                                 float* reig, float* imeig, float* z,
+                                 lapack_int ldz, float* res, float* b,
+                                 lapack_int ldb, float* v, lapack_int ldv,
+                                 float* s, lapack_int lds, float* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldf_t = MAX(1,m);
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldv_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        float* f_t = NULL;
+        float* x_t = NULL;
+        float* y_t = NULL;
+        float* z_t = NULL;
+        float* b_t = NULL;
+        float* v_t = NULL;
+        float* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldf < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldx < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -28;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -30;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                            imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (float*)LAPACKE_malloc( sizeof(float) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices */
+        LAPACKE_sge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c
new file mode 100644
index 000000000..f3f421c54
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c
@@ -0,0 +1,116 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function zgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobf,
+                           lapack_int whtsvd, lapack_int m, lapack_int n,
+                           lapack_complex_double* x, lapack_int ldx,
+                           lapack_complex_double* y, lapack_int ldy,
+                           lapack_int k, lapack_complex_double* reig,
+                           lapack_complex_double* imeig, lapack_complex_double* z,
+                           lapack_int ldz, lapack_complex_double* res,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* w, lapack_int ldw,
+                           lapack_complex_double* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    lapack_complex_double* work = NULL;
+    lapack_int* iwork = NULL;
+    lapack_complex_double work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmd", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -8;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -10;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -15;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -18;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -20;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, w, ldw ) ) {
+            return -22;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
+                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
+                                b, ldb, w, ldw, s, lds, &work_query, lwork,
+                                &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = LAPACK_Z2INT( work_query );
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
+                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
+                                b, ldb, w, ldw, s, lds, work, lwork, iwork,
+                                liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmd", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c
new file mode 100644
index 000000000..2554411ec
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c
@@ -0,0 +1,182 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function zgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* y,
+                                lapack_int ldy, lapack_int k,
+                                lapack_complex_double* reig,
+                                lapack_complex_double* imeig, lapack_complex_double* z,
+                                lapack_int ldz, lapack_complex_double* res,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* w, lapack_int ldw,
+                                lapack_complex_double* s, lapack_int lds,
+                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
+                       &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
+                       work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldw_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        lapack_complex_double* x_t = NULL;
+        lapack_complex_double* y_t = NULL;
+        lapack_complex_double* z_t = NULL;
+        lapack_complex_double* b_t = NULL;
+        lapack_complex_double* w_t = NULL;
+        lapack_complex_double* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldx < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -11;
+            LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -19;
+            LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+            return info;
+        }
+        if( ldw < n ) {
+            info = -21;
+            LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
+                           &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
+                           work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        w_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldw_t * MAX(1,n) );
+        if( w_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        /* Transpose input matrices */
+        LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t,
+                       &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t,
+                       w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_5:
+        LAPACKE_free( w_t );
+exit_level_4:
+        LAPACKE_free( b_t );
+exit_level_3:
+        LAPACKE_free( z_t );
+exit_level_2:
+        LAPACKE_free( y_t );
+exit_level_1:
+        LAPACKE_free( x_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c
new file mode 100644
index 000000000..3648ffdf2
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c
@@ -0,0 +1,123 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function zgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
+                            char jobq, char jobt, char jobf, lapack_int whtsvd,
+                            lapack_int m, lapack_int n, lapack_complex_double* f,
+                            lapack_int ldf, lapack_complex_double* x,
+                            lapack_int ldx, lapack_complex_double* y,
+                            lapack_int ldy, lapack_int nrnk, double tol,
+                            lapack_int k, lapack_complex_double* reig,
+                            lapack_complex_double* imeig,
+                            lapack_complex_double* z, lapack_int ldz,
+                            lapack_complex_double* res, lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* v,
+                            lapack_int ldv, lapack_complex_double* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    lapack_complex_double* work = NULL;
+    lapack_int* iwork = NULL;
+    lapack_complex_double work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, f, ldf ) ) {
+            return -11;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -13;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -15;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -22;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -25;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, v, ldv ) ) {
+            return -27;
+        }
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -29;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &work_query, lwork,
+                                 &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = LAPACK_Z2INT( work_query );
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, work, lwork, iwork,
+                                 liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c
new file mode 100644
index 000000000..9afceba07
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c
@@ -0,0 +1,205 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function zgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 lapack_complex_double* f, lapack_int ldf,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 lapack_complex_double* y, lapack_int ldy,
+                                 lapack_int nrnk, double tol, lapack_int k,
+                                 lapack_complex_double* reig,
+                                 lapack_complex_double* imeig,
+                                 lapack_complex_double* z,
+                                 lapack_int ldz, lapack_complex_double* res,
+                                 lapack_complex_double* b,
+                                 lapack_int ldb, lapack_complex_double* v,
+                                 lapack_int ldv, lapack_complex_double* s,
+                                 lapack_int lds, lapack_complex_double* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldf_t = MAX(1,m);
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldv_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        lapack_complex_double* f_t = NULL;
+        lapack_complex_double* x_t = NULL;
+        lapack_complex_double* y_t = NULL;
+        lapack_complex_double* z_t = NULL;
+        lapack_complex_double* b_t = NULL;
+        lapack_complex_double* v_t = NULL;
+        lapack_complex_double* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldf < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldx < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -28;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -30;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                            imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices */
+        LAPACKE_zge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
+                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+    }
+    return info;
+}

From be5e1ecee3e517ac34dc12c74b829ac5543430a9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 11:27:11 +0200
Subject: [PATCH 135/718] Add LAPACKE interfaces for ?TRSYL3 (Reference-LAPACK
 PR 651)

---
 lapack-netlib/LAPACKE/src/Makefile | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile
index d35e2a2d7..9d81e2416 100644
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@@ -28,7 +28,6 @@
 ##############################################################################
 # Contents: Native C interface to LAPACK
 # Author: Intel Corporation
-# September, 2010
 ##############################################################################
 # makefile for LAPACKE, used to build lapacke binary.
 #
@@ -608,6 +607,8 @@ lapacke_ctrsna.o \
 lapacke_ctrsna_work.o \
 lapacke_ctrsyl.o \
 lapacke_ctrsyl_work.o \
+lapacke_ctrsyl3.o \
+lapacke_ctrsyl3_work.o \
 lapacke_ctrtri.o \
 lapacke_ctrtri_work.o \
 lapacke_ctrtrs.o \
@@ -1228,6 +1229,8 @@ lapacke_dtrsna.o \
 lapacke_dtrsna_work.o \
 lapacke_dtrsyl.o \
 lapacke_dtrsyl_work.o \
+lapacke_dtrsyl3.o \
+lapacke_dtrsyl3_work.o \
 lapacke_dtrtri.o \
 lapacke_dtrtri_work.o \
 lapacke_dtrtrs.o \
@@ -1800,6 +1803,8 @@ lapacke_strsna.o \
 lapacke_strsna_work.o \
 lapacke_strsyl.o \
 lapacke_strsyl_work.o \
+lapacke_strsyl3.o \
+lapacke_strsyl3_work.o \
 lapacke_strtri.o \
 lapacke_strtri_work.o \
 lapacke_strtrs.o \
@@ -2378,6 +2383,8 @@ lapacke_ztrsna.o \
 lapacke_ztrsna_work.o \
 lapacke_ztrsyl.o \
 lapacke_ztrsyl_work.o \
+lapacke_ztrsyl3.o \
+lapacke_ztrsyl3_work.o \
 lapacke_ztrtri.o \
 lapacke_ztrtri_work.o \
 lapacke_ztrtrs.o \
@@ -2552,7 +2559,7 @@ $(LAPACKELIB): $(OBJ) $(OBJ_S) $(OBJ_C) $(OBJ_D) $(OBJ_Z) $(DEPRECATED) $(EXTEND
 ifdef BUILD_DEPRECATED
 	$(AR) $(ARFLAGS) $@ $(DEPRECATED)
 endif
-ifdef (USEXBLAS)
+ifdef USEXBLAS
 	$(AR) $(ARFLAGS) $@ $(EXTENDED)
 endif
 ifdef LAPACKE_WITH_TMG

From b6a28adeaf73a3206848edbdc646d139e6aa8b63 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 11:33:03 +0200
Subject: [PATCH 136/718] Add ?TRSYL3 (Reference-LAPACK PR 651)

---
 cmake/lapacke.cmake | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake
index c8cfa7778..6713beefc 100644
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@@ -558,6 +558,8 @@ set(CSRC
   lapacke_ctrsna_work.c
   lapacke_ctrsyl.c
   lapacke_ctrsyl_work.c
+  lapacke_ctrsyl3.c
+  lapacke_ctrsyl3_work.c
   lapacke_ctrtri.c
   lapacke_ctrtri_work.c
   lapacke_ctrtrs.c
@@ -1178,6 +1180,8 @@ set(DSRC
   lapacke_dtrsna_work.c
   lapacke_dtrsyl.c
   lapacke_dtrsyl_work.c
+  lapacke_dtrsyl3.c
+  lapacke_dtrsyl3_work.c
   lapacke_dtrtri.c
   lapacke_dtrtri_work.c
   lapacke_dtrtrs.c
@@ -1750,6 +1754,8 @@ set(SSRC
   lapacke_strsna_work.c
   lapacke_strsyl.c
   lapacke_strsyl_work.c
+  lapacke_ctrsyl3.c
+  lapacke_ctrsyl3_work.c
   lapacke_strtri.c
   lapacke_strtri_work.c
   lapacke_strtrs.c
@@ -2325,6 +2331,8 @@ set(ZSRC
   lapacke_ztrsna_work.c
   lapacke_ztrsyl.c
   lapacke_ztrsyl_work.c
+  lapacke_ztrsyl3.c
+  lapacke_ztrsyl3_work.c
   lapacke_ztrtri.c
   lapacke_ztrtri_work.c
   lapacke_ztrtrs.c

From e28fdf71b78613073701b3fc2f05f60ad46bb6ed Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 14:50:54 +0200
Subject: [PATCH 137/718] Add LAPACKE interfaces for Dynamic Mode Decomposition
 (Reference-LAPACK PR 736)

---
 lapack-netlib/LAPACKE/src/Makefile | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile
index 9d81e2416..969288f42 100644
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@@ -137,6 +137,10 @@ lapacke_cgerqf.o \
 lapacke_cgerqf_work.o \
 lapacke_cgesdd.o \
 lapacke_cgesdd_work.o \
+lapacke_cgedmd.o \
+lapacke_cgedmd_work.o \
+lapacke_cgedmdq.o \
+lapacke_cgedmdq_work.o \
 lapacke_cgesv.o \
 lapacke_cgesv_work.o \
 lapacke_cgesvd.o \
@@ -763,6 +767,10 @@ lapacke_dgerqf.o \
 lapacke_dgerqf_work.o \
 lapacke_dgesdd.o \
 lapacke_dgesdd_work.o \
+lapacke_dgedmd.o \
+lapacke_dgedmd_work.o \
+lapacke_dgedmdq.o \
+lapacke_dgedmdq_work.o \
 lapacke_dgesv.o \
 lapacke_dgesv_work.o \
 lapacke_dgesvd.o \
@@ -1343,6 +1351,10 @@ lapacke_sgerqf.o \
 lapacke_sgerqf_work.o \
 lapacke_sgesdd.o \
 lapacke_sgesdd_work.o \
+lapacke_sgedmd.o \
+lapacke_sgedmd_work.o \
+lapacke_sgedmdq.o \
+lapacke_sgedmdq_work.o \
 lapacke_sgesv.o \
 lapacke_sgesv_work.o \
 lapacke_sgesvd.o \
@@ -1913,6 +1925,10 @@ lapacke_zgerqf.o \
 lapacke_zgerqf_work.o \
 lapacke_zgesdd.o \
 lapacke_zgesdd_work.o \
+lapacke_zgedmd.o \
+lapacke_zgedmd_work.o \
+lapacke_zgedmdq.o \
+lapacke_zgedmdq_work.o \
 lapacke_zgesv.o \
 lapacke_zgesv_work.o \
 lapacke_zgesvd.o \

From 83d6ce12893e182bfb212ea5f6d7f3d2f489cda3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 14:55:26 +0200
Subject: [PATCH 138/718] Add interfaces for Dynamic Mode Decomposition
 (Reference-LAPACK PR 736)

---
 cmake/lapacke.cmake | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake
index 6713beefc..f43bf10d0 100644
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@@ -90,6 +90,10 @@ set(CSRC
   lapacke_cgerqf_work.c
   lapacke_cgesdd.c
   lapacke_cgesdd_work.c
+  lapacke_cgedmd.c
+  lapacke_cgedmd_work.c
+  lapacke_cgedmdq.c
+  lapacke_cgedmdq_work.c
   lapacke_cgesv.c
   lapacke_cgesv_work.c
   lapacke_cgesvd.c
@@ -713,6 +717,10 @@ set(DSRC
   lapacke_dgerqf_work.c
   lapacke_dgesdd.c
   lapacke_dgesdd_work.c
+  lapacke_dgedmd.c
+  lapacke_dgedmd_work.c
+  lapacke_dgedmdq.c
+  lapacke_dgedmdq_work.c
   lapacke_dgesv.c
   lapacke_dgesv_work.c
   lapacke_dgesvd.c
@@ -1291,6 +1299,10 @@ set(SSRC
   lapacke_sgerqf_work.c
   lapacke_sgesdd.c
   lapacke_sgesdd_work.c
+  lapacke_sgedmd.c
+  lapacke_sgedmd_work.c
+  lapacke_sgedmdq.c
+  lapacke_sgedmdq_work.c
   lapacke_sgesv.c
   lapacke_sgesv_work.c
   lapacke_sgesvd.c
@@ -1863,6 +1875,10 @@ set(ZSRC
   lapacke_zgerqf_work.c
   lapacke_zgesdd.c
   lapacke_zgesdd_work.c
+  lapacke_zgedmd.c
+  lapacke_zgedmd_work.c
+  lapacke_zgedmdq.c
+  lapacke_zgedmdq_work.c
   lapacke_zgesv.c
   lapacke_zgesv_work.c
   lapacke_zgesvd.c

From defafd1353a13e20452f94a3320593ecf6ffe0b4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 15:07:53 +0200
Subject: [PATCH 139/718] Add functions for Dynamic Mode Decomposition
 (Reference-LAPACK PR 736)

---
 cmake/lapack.cmake | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 544e226ab..077390d90 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -124,7 +124,7 @@ set(SLASRC
    ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
    sgesvdq.f slaorhr_col_getrfnp.f
    slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f 
-   slatrs3.f strsyl3.f sgelst.f)
+   slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90)
 
 set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
    sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@@ -223,7 +223,7 @@ set(CLASRC
    chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
    cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f 
    cungtsqr.f cungtsqr_row.f cunhr_col.f 
-   clatrs3.f ctrsyl3.f cgelst.f)
+   clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90)
 
 set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
    cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@@ -316,7 +316,7 @@ set(DLASRC
    dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
    dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
    dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f 
-   dlatrs3.f dtrsyl3.f dgelst.f)
+   dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90)
 
 set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
    dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@@ -419,7 +419,7 @@ set(ZLASRC
    zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
    zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
    zungtsqr.f zungtsqr_row.f zunhr_col.f
-   zlatrs3.f ztrsyl3.f zgelst.f)
+   zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90)
 
 set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
    zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f

From 9f0ef475b4fe9e287cd253aa9c285155887ee99c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 15:12:02 +0200
Subject: [PATCH 140/718] Add Dynamic Mode Decomposition functions
 (Reference-LAPACK PR 736)

---
 lapack-netlib/SRC/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 8cac42330..74db14e46 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -207,7 +207,7 @@ SLASRC_O = \
    ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \
    ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \
    ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \
-   sgesvdq.o slatrs3.o strsyl3.o sgelst.o
+   sgesvdq.o slatrs3.o strsyl3.o sgelst.o sgedmd.o sgedmdq.o
    
 endif
 
@@ -316,7 +316,7 @@ CLASRC_O = \
    chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \
    cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \
    chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \
-   cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o
+   cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o cgedmd.o cgedmdq.o
 endif
 
 ifdef USEXBLAS
@@ -417,7 +417,7 @@ DLASRC_O = \
    dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \
    dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \
    dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \
-   dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o
+   dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o dgedmd.o dgedmdq.o
 endif
 
 ifdef USEXBLAS
@@ -526,7 +526,7 @@ ZLASRC_O = \
    zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \
    zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \
    zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \
-   zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o
+   zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o zgedmd.o zgedmdq.o
 endif
 
 ifdef USEXBLAS

From c7a05458cd6d315ac3cc377c3760f59784d9e9e8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 15:52:44 +0200
Subject: [PATCH 141/718] Add Dynamic Mode Decomposition functions
 (Reference-LAPACK PR 736)

---
 lapack-netlib/LAPACKE/include/lapack.h | 132 +++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index a0fcaa259..a5d02b42e 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -3323,6 +3323,138 @@ void LAPACK_zgesdd_base(
     #define LAPACK_zgesdd(...) LAPACK_zgesdd_base(__VA_ARGS__)
 #endif
 
+#define LAPACK_cgedmd LAPACK_GLOBAL(cgedmd,CGEDMD)
+void LAPACK_cgedmd(
+    char const* jobs, char const* jobz, char const* jobf,
+    lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* x, lapack_int const* ldx,
+    lapack_complex_float* y, lapack_int const* ldy, lapack_int const* k,
+    lapack_complex_float* reig, lapack_complex_float* imeig,
+    lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res,
+    lapack_complex_float* b, lapack_int const* ldb,
+    lapack_complex_float* w, lapack_int const* ldw,
+    lapack_complex_float* s, lapack_int const* lds,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dgedmd LAPACK_GLOBAL(dgedmd,DGEDMD)
+void LAPACK_dgedmd(
+    char const* jobs, char const* jobz, char const* jobf,
+    lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
+    double* x, lapack_int const* ldx,
+    double* y, lapack_int const* ldy, lapack_int const* k,
+    double* reig, double* imeig,
+    double* z, lapack_int const* ldz, double* res,
+    double* b, lapack_int const* ldb,
+    double* w, lapack_int const* ldw,
+    double* s, lapack_int const* lds,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sgedmd LAPACK_GLOBAL(sgedmd,SGEDMD)
+void LAPACK_sgedmd(
+    char const* jobs, char const* jobz, char const* jobf,
+    lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
+    float* x, lapack_int const* ldx,
+    float* y, lapack_int const* ldy, lapack_int const* k,
+    float* reig, float* imeig,
+    float* z, lapack_int const* ldz, float* res,
+    float* b, lapack_int const* ldb,
+    float* w, lapack_int const* ldw,
+    float* s, lapack_int const* lds,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zgedmd LAPACK_GLOBAL(zgedmd,ZGEDMD)
+void LAPACK_zgedmd(
+    char const* jobs, char const* jobz, char const* jobf,
+    lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* x, lapack_int const* ldx,
+    lapack_complex_double* y, lapack_int const* ldy, lapack_int const* k,
+    lapack_complex_double* reig, lapack_complex_double* imeig,
+    lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res,
+    lapack_complex_double* b, lapack_int const* ldb,
+    lapack_complex_double* w, lapack_int const* ldw,
+    lapack_complex_double* s, lapack_int const* lds,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_cgedmdq LAPACK_GLOBAL(cgedmdq,CGEDMDQ)
+void LAPACK_cgedmdq(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobq,
+    char const* jobt, char const* jobf, lapack_int const* whtsvd,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* f, lapack_int const* ldf,
+    lapack_complex_float* x, lapack_int const* ldx,
+    lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk,
+    float const* tol, lapack_int const* k,
+    lapack_complex_float* reig, lapack_complex_float* imeig,
+    lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res,
+    lapack_complex_float* b, lapack_int const* ldb,
+    lapack_complex_float* v, lapack_int const* ldv,
+    lapack_complex_float* s, lapack_int const* lds,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dgedmdq LAPACK_GLOBAL(dgedmdq,DGEDMDQ)
+void LAPACK_dgedmdq(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobq,
+    char const* jobt, char const* jobf, lapack_int const* whtsvd,
+    lapack_int const* m, lapack_int const* n,
+    double* f, lapack_int const* ldf,
+    double* x, lapack_int const* ldx,
+    double* y, lapack_int const* ldy, lapack_int const* nrnk,
+    double const* tol, lapack_int const* k,
+    double* reig, double* imeig,
+    double* z, lapack_int const* ldz, double* res,
+    double* b, lapack_int const* ldb,
+    double* v, lapack_int const* ldv,
+    double* s, lapack_int const* lds,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sgedmdq LAPACK_GLOBAL(sgedmdq,SGEDMDQ)
+void LAPACK_sgedmdq(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobq,
+    char const* jobt, char const* jobf, lapack_int const* whtsvd,
+    lapack_int const* m, lapack_int const* n,
+    float* f, lapack_int const* ldf,
+    float* x, lapack_int const* ldx,
+    float* y, lapack_int const* ldy, lapack_int const* nrnk,
+    float const* tol, lapack_int const* k,
+    float* reig, float* imeig,
+    float* z, lapack_int const* ldz, float* res,
+    float* b, lapack_int const* ldb,
+    float* v, lapack_int const* ldv,
+    float* s, lapack_int const* lds,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zgedmdq LAPACK_GLOBAL(zgedmdq,ZGEDMDQ)
+void LAPACK_zgedmdq(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobq,
+    char const* jobt, char const* jobf, lapack_int const* whtsvd,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* f, lapack_int const* ldf,
+    lapack_complex_double* x, lapack_int const* ldx,
+    lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk,
+    double const* tol, lapack_int const* k,
+    lapack_complex_double* reig, lapack_complex_double* imeig,
+    lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res,
+    lapack_complex_double* b, lapack_int const* ldb,
+    lapack_complex_double* v, lapack_int const* ldv,
+    lapack_complex_double* s, lapack_int const* lds,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info )
+
 #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
 lapack_int LAPACK_cgesv(
     lapack_int const* n, lapack_int const* nrhs,

From de88063aa2795fd25c00efeb92791b9c1d38c5be Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 15:56:10 +0200
Subject: [PATCH 142/718] Add interfaces for Dynamic Mode Decomposition
 functions (Reference-LAPACK PR 736)

---
 lapack-netlib/LAPACKE/include/lapacke.h | 131 ++++++++++++++++++++++--
 1 file changed, 122 insertions(+), 9 deletions(-)

diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 9bd228064..9a9ab4753 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -956,7 +956,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, char jobr,
                            lapack_int lda, double* s, lapack_complex_double* u,
                            lapack_int ldu, lapack_complex_double* v,
                            lapack_int ldv, lapack_int* numrank );
-                           
+
 lapack_int LAPACKE_sgesvj( int matrix_layout, char joba, char jobu, char jobv,
                            lapack_int m, lapack_int n, float* a, lapack_int lda,
                            float* sva, lapack_int mv, float* v, lapack_int ldv,
@@ -5712,6 +5712,120 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m,
                                 lapack_complex_double* work, lapack_int lwork,
                                 double* rwork, lapack_int* iwork );
 
+lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, float* x, lapack_int ldx,
+                                float* y, lapack_int ldy, lapack_int k,
+                                float* reig, float* imeig, float* z,
+                                lapack_int ldz, float* res, float* b,
+                                lapack_int ldb, float* w, lapack_int ldw,
+                                float* s, lapack_int lds, float* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, double* x, lapack_int ldx,
+                                double* y, lapack_int ldy, lapack_int k,
+                                double* reig, double* imeig, double* z,
+                                lapack_int ldz, double* res, double* b,
+                                lapack_int ldb, double* w, lapack_int ldw,
+                                double* s, lapack_int lds, double* work,
+                                lapack_int lwork, lapack_int* iwork,
+                                lapack_int liwork );
+
+lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, lapack_complex_float* x,
+                                lapack_int ldx, lapack_complex_float* y,
+                                lapack_int ldy, lapack_int k,
+                                lapack_complex_float* reig,
+                                lapack_complex_float* imeig,
+                                lapack_complex_float* z, lapack_int ldz,
+                                lapack_complex_float* res,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* w, lapack_int ldw,
+                                lapack_complex_float* s, lapack_int lds,
+                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* y,
+                                lapack_int ldy, lapack_int k,
+                                lapack_complex_double* reig,
+                                lapack_complex_double* imeig,
+                                lapack_complex_double* z, lapack_int ldz,
+                                lapack_complex_double* res,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* w, lapack_int ldw,
+                                lapack_complex_double* s, lapack_int lds,
+                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 float* f, lapack_int ldf, float* x,
+                                 lapack_int ldx, float* y, lapack_int ldy,
+                                 lapack_int nrnk, float tol, lapack_int k,
+                                 float* reig, float* imeig, float* z,
+                                 lapack_int ldz, float* res, float* b,
+                                 lapack_int ldb, float* v, lapack_int ldv,
+                                 float* s, lapack_int lds, float* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork );
+
+lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 double* f, lapack_int ldf, double* x,
+                                 lapack_int ldx, double* y, lapack_int ldy,
+                                 lapack_int nrnk, double tol, lapack_int k,
+                                 double* reig, double* imeig, double* z,
+                                 lapack_int ldz, double* res, double* b,
+                                 lapack_int ldb, double* v, lapack_int ldv,
+                                 double* s, lapack_int lds, double* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork );
+
+lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 lapack_complex_float* f, lapack_int ldf,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 lapack_complex_float* y, lapack_int ldy,
+                                 lapack_int nrnk, float tol, lapack_int k,
+                                 lapack_complex_float* reig,
+                                 lapack_complex_float* imeig,
+                                 lapack_complex_float* z, lapack_int ldz,
+                                 lapack_complex_float* res,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* v, lapack_int ldv,
+                                 lapack_complex_float* s, lapack_int lds,
+                                 lapack_complex_float* work, lapack_int lwork,
+                                 lapack_int* iwork,
+                                 lapack_int liwork );
+
+lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 lapack_complex_double* f, lapack_int ldf,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 lapack_complex_double* y, lapack_int ldy,
+                                 lapack_int nrnk, double tol, lapack_int k,
+                                 lapack_complex_double* reig,
+                                 lapack_complex_double* imeig,
+                                 lapack_complex_double* z, lapack_int ldz,
+                                 lapack_complex_double* res,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* v, lapack_int ldv,
+                                 lapack_complex_double* s, lapack_int lds,
+                                 lapack_complex_double* work, lapack_int lwork,
+                                 lapack_int* iwork,
+                                 lapack_int liwork );
+
 lapack_int LAPACKE_sgesv_work( int matrix_layout, lapack_int n, lapack_int nrhs,
                                float* a, lapack_int lda, lapack_int* ipiv,
                                float* b, lapack_int ldb );
@@ -5833,7 +5947,7 @@ lapack_int LAPACKE_zgesvdq_work( int matrix_layout, char joba, char jobp,
                                 lapack_int* iwork, lapack_int liwork,
                                 lapack_complex_double* cwork, lapack_int lcwork,
                                 double* rwork, lapack_int lrwork);
-                                
+
 lapack_int LAPACKE_sgesvj_work( int matrix_layout, char joba, char jobu,
                                 char jobv, lapack_int m, lapack_int n, float* a,
                                 lapack_int lda, float* sva, lapack_int mv,
@@ -12550,7 +12664,7 @@ lapack_int LAPACKE_zhegv_2stage_work( int matrix_layout, lapack_int itype, char
 //LAPACK 3.8.0
 lapack_int LAPACKE_ssysv_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, float* a, lapack_int lda,
-                          float* tb, lapack_int ltb, lapack_int* ipiv, 
+                          float* tb, lapack_int ltb, lapack_int* ipiv,
                           lapack_int* ipiv2, float* b, lapack_int ldb );
 lapack_int LAPACKE_ssysv_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                lapack_int nrhs, float* a, lapack_int lda,
@@ -12560,7 +12674,7 @@ lapack_int LAPACKE_ssysv_aa_2stage_work( int matrix_layout, char uplo, lapack_in
 lapack_int LAPACKE_dsysv_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, double* a, lapack_int lda,
                           double* tb, lapack_int ltb,
-                          lapack_int* ipiv, lapack_int* ipiv2, 
+                          lapack_int* ipiv, lapack_int* ipiv2,
                           double* b, lapack_int ldb );
 lapack_int LAPACKE_dsysv_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                lapack_int nrhs, double* a, lapack_int lda,
@@ -12612,10 +12726,10 @@ lapack_int LAPACKE_zhesv_aa_2stage_work( int matrix_layout, char uplo, lapack_in
                                lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2,
                                lapack_complex_double* b, lapack_int ldb,
                                lapack_complex_double* work, lapack_int lwork );
-                               
+
 lapack_int LAPACKE_ssytrf_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           float* a, lapack_int lda,
-                          float* tb, lapack_int ltb, lapack_int* ipiv, 
+                          float* tb, lapack_int ltb, lapack_int* ipiv,
                           lapack_int* ipiv2 );
 lapack_int LAPACKE_ssytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                float* a, lapack_int lda,
@@ -12671,7 +12785,7 @@ lapack_int LAPACKE_zhetrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i
 
 lapack_int LAPACKE_ssytrs_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, float* a, lapack_int lda,
-                          float* tb, lapack_int ltb, lapack_int* ipiv, 
+                          float* tb, lapack_int ltb, lapack_int* ipiv,
                           lapack_int* ipiv2, float* b, lapack_int ldb );
 lapack_int LAPACKE_ssytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                lapack_int nrhs, float* a, lapack_int lda,
@@ -12680,7 +12794,7 @@ lapack_int LAPACKE_ssytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i
 lapack_int LAPACKE_dsytrs_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, double* a, lapack_int lda,
                           double* tb, lapack_int ltb,
-                          lapack_int* ipiv, lapack_int* ipiv2, 
+                          lapack_int* ipiv, lapack_int* ipiv2,
                           double* b, lapack_int ldb );
 lapack_int LAPACKE_dsytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                lapack_int nrhs, double* a, lapack_int lda,
@@ -12727,7 +12841,6 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i
                                lapack_int lda, lapack_complex_double* tb,
                                lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2,
                                lapack_complex_double* b, lapack_int ldb );
-
 //LAPACK 3.10.0
 lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n,
                               lapack_int nb, float* a,

From 5eef0793bac9637b80f9ce02363d80abcda24840 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 18:56:33 +0200
Subject: [PATCH 143/718] Fix missing semicolon

---
 lapack-netlib/LAPACKE/include/lapack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index a5d02b42e..f510c8c80 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -3453,7 +3453,7 @@ void LAPACK_zgedmdq(
     lapack_complex_double* s, lapack_int const* lds,
     lapack_complex_double* work, lapack_int const* lwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info )
+    lapack_int* info );
 
 #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
 lapack_int LAPACK_cgesv(

From 649ab6481adec7e4bb2a411a8d2956458523733f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 20:03:44 +0200
Subject: [PATCH 144/718] Add dummy C files for ?GEMDQ?

---
 lapack-netlib/SRC/cgedmd.c  | 511 ++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/cgedmdq.c | 511 ++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/dgedmd.c  | 511 ++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/dgedmdq.c | 511 ++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/sgedmd.c  | 511 ++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/sgedmdq.c | 511 ++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zgedmd.c  | 511 ++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zgedmdq.c | 511 ++++++++++++++++++++++++++++++++++++
 8 files changed, 4088 insertions(+)
 create mode 100644 lapack-netlib/SRC/cgedmd.c
 create mode 100644 lapack-netlib/SRC/cgedmdq.c
 create mode 100644 lapack-netlib/SRC/dgedmd.c
 create mode 100644 lapack-netlib/SRC/dgedmdq.c
 create mode 100644 lapack-netlib/SRC/sgedmd.c
 create mode 100644 lapack-netlib/SRC/sgedmdq.c
 create mode 100644 lapack-netlib/SRC/zgedmd.c
 create mode 100644 lapack-netlib/SRC/zgedmdq.c

diff --git a/lapack-netlib/SRC/cgedmd.c b/lapack-netlib/SRC/cgedmd.c
new file mode 100644
index 000000000..447b23014
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmd.c
@@ -0,0 +1,511 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
diff --git a/lapack-netlib/SRC/cgedmdq.c b/lapack-netlib/SRC/cgedmdq.c
new file mode 100644
index 000000000..447b23014
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmdq.c
@@ -0,0 +1,511 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
diff --git a/lapack-netlib/SRC/dgedmd.c b/lapack-netlib/SRC/dgedmd.c
new file mode 100644
index 000000000..447b23014
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmd.c
@@ -0,0 +1,511 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
diff --git a/lapack-netlib/SRC/dgedmdq.c b/lapack-netlib/SRC/dgedmdq.c
new file mode 100644
index 000000000..447b23014
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmdq.c
@@ -0,0 +1,511 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
diff --git a/lapack-netlib/SRC/sgedmd.c b/lapack-netlib/SRC/sgedmd.c
new file mode 100644
index 000000000..447b23014
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmd.c
@@ -0,0 +1,511 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
diff --git a/lapack-netlib/SRC/sgedmdq.c b/lapack-netlib/SRC/sgedmdq.c
new file mode 100644
index 000000000..447b23014
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmdq.c
@@ -0,0 +1,511 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
diff --git a/lapack-netlib/SRC/zgedmd.c b/lapack-netlib/SRC/zgedmd.c
new file mode 100644
index 000000000..447b23014
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmd.c
@@ -0,0 +1,511 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
diff --git a/lapack-netlib/SRC/zgedmdq.c b/lapack-netlib/SRC/zgedmdq.c
new file mode 100644
index 000000000..447b23014
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmdq.c
@@ -0,0 +1,511 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+

From 2bfd7ff126934704eb83edae5f3c118cabfb203e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 21:39:29 +0200
Subject: [PATCH 145/718] Add dummy C sources for ?GEDMD

---
 cmake/lapack.cmake | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 077390d90..12127531d 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -624,7 +624,7 @@ set(SLASRC
    ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
    sgesvdq.c slaorhr_col_getrfnp.c
    slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c 
-   slatrs3.c strsyl3.c sgelst.c)
+   slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c)
 
 set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
    sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
@@ -722,7 +722,7 @@ set(CLASRC
    chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
    cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c 
    cungtsqr.c cungtsqr_row.c cunhr_col.c 
-   clatrs3.c ctrsyl3.c cgelst.c)
+   clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c)
 
 set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
    cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
@@ -814,7 +814,7 @@ set(DLASRC
    dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
    dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
    dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c 
-   dlatrs3.c dtrsyl3.c dgelst.c)
+   dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c)
 
 set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
    dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
@@ -925,7 +925,7 @@ set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
    zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c
    zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c
    zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c
-   zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c)
+   zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c zgedmd.c zgedmdq.c)
 
 
 if(USE_XBLAS)

From 174f4e65e320241c7b57136db95d367cac7583d7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 20 Jun 2023 23:14:35 +0200
Subject: [PATCH 146/718] Add LAPACK/LAPACKE functions for Dynamic Mode
 Decomposition

---
 exports/gensymbol | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/exports/gensymbol b/exports/gensymbol
index b584167a4..704eab06f 100755
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -844,6 +844,23 @@ lapackobjs2z="$lapackobjs2z
     zungtsqr_row
     "
 
+#functions added for lapack-3.11
+lapackobjs2c="$lapackobjs2c
+    cgedmd
+    cgedmdq
+    "
+lapackobjs2d="$lapackobjs2d
+    dgedmd
+    dgedmdq
+    "
+lapackobjs2s="$lapackobjs2s
+    sgedmd
+    sgedmdq
+    "
+lapackobjs2z="$lapackobjs2z
+    zgedmd
+    zgedmdq
+    "
 lapack_extendedprecision_objs="
     zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
     dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
@@ -1013,6 +1030,10 @@ lapackeobjsc="
     LAPACKE_cgebrd_work
     LAPACKE_cgecon
     LAPACKE_cgecon_work
+    LAPACKE_cgedmd
+    LAPACKE_cgedmd_work
+    LAPACKE_cgedmdq
+    LAPACKE_cgedmdq_work
     LAPACKE_cgeequ
     LAPACKE_cgeequ_work
     LAPACKE_cgeequb
@@ -1672,6 +1693,10 @@ lapackeobjsd="
     LAPACKE_dgebrd_work
     LAPACKE_dgecon
     LAPACKE_dgecon_work
+    LAPACKE_dgedmd
+    LAPACKE_dgedmd_work
+    LAPACKE_dgedmdq
+    LAPACKE_dgedmdq_work
     LAPACKE_dgeequ
     LAPACKE_dgeequ_work
     LAPACKE_dgeequb
@@ -2285,6 +2310,10 @@ lapackeobjss="
     LAPACKE_sgebrd_work
     LAPACKE_sgecon
     LAPACKE_sgecon_work
+    LAPACKE_sgedmd
+    LAPACKE_sgedmd_work
+    LAPACKE_sgedmdq
+    LAPACKE_sgedmdq_work
     LAPACKE_sgeequ
     LAPACKE_sgeequ_work
     LAPACKE_sgeequb
@@ -2894,6 +2923,10 @@ lapackeobjsz="
     LAPACKE_zgebrd_work
     LAPACKE_zgecon
     LAPACKE_zgecon_work
+    LAPACKE_zgedmd
+    LAPACKE_zgedmd_work
+    LAPACKE_zgedmdq
+    LAPACKE_zgedmdq_work
     LAPACKE_zgeequ
     LAPACKE_zgeequ_work
     LAPACKE_zgeequb

From 5cc1f7a0bab55bd8c10c9fceaa88d67a7246dede Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 21 Jun 2023 15:43:20 +0200
Subject: [PATCH 147/718] Add functional C replacements for ?GEDMD?

---
 lapack-netlib/SRC/cgedmd.c  | 1159 ++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/cgedmdq.c |  778 ++++++++++++++++++++++
 lapack-netlib/SRC/dgedmd.c  | 1242 +++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/dgedmdq.c |  789 ++++++++++++++++++++++
 lapack-netlib/SRC/sgedmd.c  | 1235 ++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/sgedmdq.c |  785 ++++++++++++++++++++++
 lapack-netlib/SRC/zgedmd.c  | 1165 ++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zgedmdq.c |  782 ++++++++++++++++++++++
 8 files changed, 7935 insertions(+)

diff --git a/lapack-netlib/SRC/cgedmd.c b/lapack-netlib/SRC/cgedmd.c
index 447b23014..570395c7b 100644
--- a/lapack-netlib/SRC/cgedmd.c
+++ b/lapack-netlib/SRC/cgedmd.c
@@ -509,3 +509,1162 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+static integer c__1 = 1;
+static integer c__0 = 0;
+
+/* Subroutine */ int cgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, 
+	integer *whtsvd, integer *m, integer *n, complex *x, integer *ldx, 
+	complex *y, integer *ldy, integer *nrnk, real *tol, integer *k, 
+	complex *eigs, complex *z__, integer *ldz, real *res, complex *b, 
+	integer *ldb, complex *w, integer *ldw, complex *s, integer *lds, 
+	complex *zwork, integer *lzwork, real *rwork, integer *lrwork, 
+	integer *iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, 
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2, i__3, 
+	    i__4, i__5;
+    real r__1, r__2;
+    complex q__1, q__2;
+
+    /* Local variables */
+    complex zone;
+    real zero, ssum;
+    integer info1, info2;
+    real xscl1, xscl2;
+    integer i__, j;
+    real scale;
+    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *), cgeev_(char *, 
+	    char *, integer *, complex *, integer *, complex *, complex *, 
+	    integer *, complex *, integer *, complex *, integer *, real *, 
+	    integer *);
+    extern logical lsame_(char *, char *);
+    logical badxy;
+    real small;
+    char jobzl[1];
+    extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, 
+	    integer *, complex *, integer *);
+    logical wntex;
+    complex zzero;
+    extern real scnrm2_(integer *, complex *, integer *);
+    extern /* Subroutine */ int cgesdd_(char *, integer *, integer *, complex 
+	    *, integer *, real *, complex *, integer *, complex *, integer *, 
+	    complex *, integer *, real *, integer *, integer *), 
+	    clascl_(char *, integer *, integer *, real *, real *, integer *, 
+	    integer *, complex *, integer *, integer *);
+    extern integer icamax_(integer *, complex *, integer *);
+    extern real slamch_(char *);
+    extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer 
+	    *), cgesvd_(char *, char *, integer *, integer *, complex *, 
+	    integer *, real *, complex *, integer *, complex *, integer *, 
+	    complex *, integer *, real *, integer *), clacpy_(
+	    char *, integer *, integer *, complex *, integer *, complex *, 
+	    integer *), xerbla_(char *, integer *);
+    char t_or_n__[1];
+    extern /* Subroutine */ int cgejsv_(char *, char *, char *, char *, char *
+	    , char *, integer *, integer *, complex *, integer *, real *, 
+	    complex *, integer *, complex *, integer *, complex *, integer *, 
+	    real *, integer *, integer *, integer *), classq_(integer *, complex *, integer *, 
+	    real *, real *);
+    logical sccolx, sccoly;
+    extern logical sisnan_(real *);
+    integer lwrsdd, mwrsdd, iminwr;
+    logical wntref, wntvec;
+    real rootsc;
+    integer lwrkev, mlwork, mwrkev, numrnk, olwork, lwrsvd, mwrsvd, mlrwrk;
+    logical lquery, wntres;
+    char jsvopt[1];
+    integer lwrsvj, mwrsvj;
+    real rdummy[2];
+    extern /* Subroutine */ int mecago_();
+    integer lwrsvq, mwrsvq;
+    real ofl, one;
+    extern /* Subroutine */ int cgesvdq_(char *, char *, char *, char *, char 
+	    *, integer *, integer *, complex *, integer *, real *, complex *, 
+	    integer *, complex *, integer *, integer *, integer *, integer *, 
+	    complex *, integer *, real *, integer *, integer *);
+
+/*   March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real32 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ............................................................ */
+/*     Purpose */
+/*     ======= */
+/*     CGEDMD computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, CGEDMD computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, CGEDMD returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/* ...................................................................... */
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product X(:,1:K)*W, where X */
+/*            contains a POD basis (leading left singular vectors */
+/*            of the data matrix X) and W contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of K, X, W, Z. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will be */
+/*            computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: CGESVD (the QR SVD algorithm) */
+/*     2 :: CGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M>= 0 */
+/*     The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshot pairs */
+/*     (the number of columns of X and Y). */
+/* ..... */
+/*     X (input/output) COMPLEX(KIND=WP) M-by-N array */
+/*   > On entry, X contains the data snapshot matrix X. It is */
+/*     assumed that the column norms of X are in the range of */
+/*     the normalized floating point numbers. */
+/*   < On exit, the leading K columns of X contain a POD basis, */
+/*     i.e. the leading K left singular vectors of the input */
+/*     data matrix X, U(:,1:K). All N columns of X contain all */
+/*     left singular vectors of the input matrix X. */
+/*     See the descriptions of K, Z and W. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= M */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */
+/*   > On entry, Y contains the data snapshot matrix Y */
+/*   < On exit, */
+/*     If JOBR == 'R', the leading K columns of Y  contain */
+/*     the residual vectors for the computed Ritz pairs. */
+/*     See the description of RES. */
+/*     If JOBR == 'N', Y contains the original input data, */
+/*                     scaled according to the value of JOBS. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= M */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the descriptions of TOL and  K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the POD basis for the data snapshot */
+/*     matrix X and the number of the computed Ritz pairs. */
+/*     The value of K is determined according to the rule set */
+/*     by the parameters NRNK and TOL. */
+/*     See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     EIGS (output) COMPLEX(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of EIGS contain */
+/*     the computed eigenvalues (Ritz values). */
+/*     See the descriptions of K, and Z. */
+/* ..... */
+/*     Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array */
+/*     If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i) */
+/*     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/*     the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */
+/*     is an eigenvector corresponding to EIGS(i). The columns */
+/*     of W(1:k,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. */
+/*     See the descriptions of EIGS, X and W. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) N-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs, */
+/*     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */
+/*     See the description of EIGS and Z. */
+/* ..... */
+/*     B (output) COMPLEX(KIND=WP)  M-by-N array. */
+/*     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:M,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     If JOBF =='N', then B is not referenced. */
+/*     See the descriptions of X, W, K. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= M */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     W (workspace/output) COMPLEX(KIND=WP) N-by-N array */
+/*     On exit, W(1:K,1:K) contains the K computed */
+/*     eigenvectors of the matrix Rayleigh quotient. */
+/*     The Ritz vectors (returned in Z) are the */
+/*     product of X (containing a POD basis for the input */
+/*     matrix X) and W. See the descriptions of K, S, X and Z. */
+/*     W is also used as a workspace to temporarily store the */
+/*     right singular vectors of X. */
+/* ..... */
+/*     LDW (input) INTEGER, LDW >= N */
+/*     The leading dimension of the array W. */
+/* ..... */
+/*     S (workspace/output) COMPLEX(KIND=WP) N-by-N array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by CGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */
+/*     ZWORK is used as complex workspace in the complex SVD, as */
+/*     specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing */
+/*     the eigenvalues of a Rayleigh quotient. */
+/*     If the call to CGEDMD is only workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of work is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the workspace vector ZWORK. */
+/*     LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), */
+/*     where LZWORK_CGEEV = MAX( 1, 2*N )  and the minimal */
+/*     LZWORK_SVD is calculated as follows */
+/*     If WHTSVD == 1 :: CGESVD :: */
+/*        LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */
+/*     If WHTSVD == 2 :: CGESDD :: */
+/*        LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */
+/*     If WHTSVD == 3 :: CGESVDQ :: */
+/*        LZWORK_SVD = obtainable by a query */
+/*     If WHTSVD == 4 :: CGEJSV :: */
+/*        LZWORK_SVD = obtainable by a query */
+/*     If on entry LZWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths and returns them in */
+/*     LZWORK(1) and LZWORK(2), respectively. */
+/* ..... */
+/*     RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */
+/*     On exit, RWORK(1:N) contains the singular values of */
+/*     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/*     If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */
+/*     scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */
+/*     and Y to avoid overflow in the SVD of X. */
+/*     This may be of interest if the scaling option is off */
+/*     and as many as possible smallest eigenvalues are */
+/*     desired to the highest feasible accuracy. */
+/*     If the call to CGEDMD is only workspace query, then */
+/*     RWORK(1) contains the minimal workspace length. */
+/*     See the description of LRWORK. */
+/* ..... */
+/*     LRWORK (input) INTEGER */
+/*     The minimal length of the workspace vector RWORK. */
+/*     LRWORK is calculated as follows: */
+/*     LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where */
+/*     LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace */
+/*     for the SVD subroutine determined by the input parameter */
+/*     WHTSVD. */
+/*     If WHTSVD == 1 :: CGESVD :: */
+/*        LRWORK_SVD = 5*MIN(M,N) */
+/*     If WHTSVD == 2 :: CGESDD :: */
+/*        LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */
+/*        2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */
+/*     If WHTSVD == 3 :: CGESVDQ :: */
+/*        LRWORK_SVD = obtainable by a query */
+/*     If WHTSVD == 4 :: CGEJSV :: */
+/*        LRWORK_SVD = obtainable by a query */
+/*     If on entry LRWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     real workspace length and returns it in RWORK(1). */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for  ZWORK, RWORK and */
+/*     IWORK. See the descriptions of ZWORK, RWORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     Local arrays */
+/*     ~~~~~~~~~~~~ */
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* ............................................................ */
+    /* Parameter adjustments */
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --eigs;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1 * 1;
+    w -= w_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --zwork;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    zero = 0.f;
+    one = 1.f;
+    zzero.r = 0.f, zzero.i = 0.f;
+    zone.r = 1.f, zone.i = 0.f;
+
+/*    Test the input arguments */
+
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    *info = 0;
+    lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || lsame_(jobz, "N") || lsame_(
+	    jobz, "F"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && ! wntvec) {
+	*info = -3;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -4;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || *n > *m) {
+	*info = -7;
+    } else if (*ldx < *m) {
+	*info = -9;
+    } else if (*ldy < *m) {
+	*info = -11;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -12;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -13;
+    } else if (*ldz < *m) {
+	*info = -17;
+    } else if ((wntref || wntex) && *ldb < *m) {
+	*info = -20;
+    } else if (*ldw < *n) {
+	*info = -22;
+    } else if (*lds < *n) {
+	*info = -24;
+    }
+
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0) {
+/* Quick return. All output except K is void. */
+/* INFO=1 signals the void input. */
+/* In case of a workspace query, the default */
+/* minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		rwork[1] = 1.f;
+		zwork[1].r = 2.f, zwork[1].i = 0.f;
+		zwork[2].r = 2.f, zwork[2].i = 0.f;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	iminwr = 1;
+	mlrwrk = f2cmax(1,*n);
+	mlwork = 2;
+	olwork = 2;
+/*         SELECT CASE ( WHTSVD ) */
+	if (*whtsvd == 1) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of CGESVD: */
+/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */
+/* Computing MAX */
+	    i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n);
+	    mwrsvd = f2cmax(i__1,i__2);
+	    mlwork = f2cmax(mlwork,mwrsvd);
+/* Computing MAX */
+	    i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5;
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[
+			b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, 
+			rdummy, &info1);
+		lwrsvd = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvd);
+	    }
+	} else if (*whtsvd == 2) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of CGESDD: */
+/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */
+/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */
+/* In LAPACK 3.10.1 RWORK is defined differently. */
+/* Below we take f2cmax over the two versions. */
+/* IMINWR = 8*MIN(M,N) */
+	    mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax(
+		    *m,*n);
+	    mlwork = f2cmax(mlwork,mwrsdd);
+	    iminwr = f2cmin(*m,*n) << 3;
+/* Computing MAX */
+/* Computing MAX */
+	    i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(*
+		    m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3,
+		    i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n)
+		     << 1) * f2cmin(*m,*n) + f2cmin(*m,*n);
+	    i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4);
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset],
+			 ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, &
+			iwork[1], &info1);
+/* Computing MAX */
+		i__1 = mwrsdd, i__2 = (integer) zwork[1].r;
+		lwrsdd = f2cmax(i__1,i__2);
+		olwork = f2cmax(olwork,lwrsdd);
+	    }
+	} else if (*whtsvd == 3) {
+	    cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[
+		    1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &
+		    iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1);
+	    iminwr = iwork[1];
+	    mwrsvq = (integer) zwork[2].r;
+	    mlwork = f2cmax(mlwork,mwrsvq);
+/* Computing MAX */
+	    i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0];
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvq = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvq);
+	    }
+	} else if (*whtsvd == 4) {
+	    *(unsigned char *)jsvopt = 'J';
+	    cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, 
+		    &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[
+		    1], &c_n1, rdummy, &c_n1, &iwork[1], &info1);
+	    iminwr = iwork[1];
+	    mwrsvj = (integer) zwork[2].r;
+	    mlwork = f2cmax(mlwork,mwrsvj);
+/* Computing MAX */
+/* Computing MAX */
+	    i__3 = 7, i__4 = (integer) rdummy[0];
+	    i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4);
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvj = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvj);
+	    }
+/*         END SELECT */
+	}
+	if (wntvec || wntex || lsame_(jobz, "F")) {
+	    *(unsigned char *)jobzl = 'V';
+	} else {
+	    *(unsigned char *)jobzl = 'N';
+	}
+/* Workspace calculation to the CGEEV call */
+/* Computing MAX */
+	i__1 = 1, i__2 = *n << 1;
+	mwrkev = f2cmax(i__1,i__2);
+	mlwork = f2cmax(mlwork,mwrkev);
+/* Computing MAX */
+	i__1 = mlrwrk, i__2 = *n + (*n << 1);
+	mlrwrk = f2cmax(i__1,i__2);
+	if (lquery) {
+	    cgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], 
+		    ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], &
+		    info1);
+/* LAPACK CALL */
+	    lwrkev = (integer) zwork[1].r;
+	    olwork = f2cmax(olwork,lwrkev);
+	    olwork = f2cmax(2,olwork);
+	}
+
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -30;
+	}
+	if (*lrwork < mlrwrk && ! lquery) {
+	    *info = -28;
+	}
+	if (*lzwork < mlwork && ! lquery) {
+	    *info = -26;
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGEDMD", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	rwork[1] = (real) mlrwrk;
+	zwork[1].r = (real) mlwork, zwork[1].i = 0.f;
+	zwork[2].r = (real) olwork, zwork[2].i = 0.f;
+	return 0;
+    }
+/* ............................................................ */
+
+    ofl = slamch_("O") * slamch_("P");
+    small = slamch_("S");
+    badxy = FALSE_;
+
+/*     <1> Optional scaling of the snapshots (columns of X, Y) */
+/*     ========================================================== */
+    if (sccolx) {
+/* The columns of X will be normalized. */
+/* To prevent overflows, the column norms of X are */
+/* carefully computed using CLASSQ. */
+	*k = 0;
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = SCNRM2( M, X(1,i), 1 ) */
+	    scale = zero;
+	    classq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum);
+	    if (sisnan_(&scale) || sisnan_(&ssum)) {
+		*k = 0;
+		*info = -8;
+		i__2 = -(*info);
+		xerbla_("CGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of X(:,i) overflows. First, X(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */
+/*                 Next, the norm of X(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of X(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    r__1 = one / rootsc;
+		    clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[
+			    i__ * x_dim1 + 1], ldx, &info2);
+		    rwork[i__] = -scale * (rootsc / (real) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    rwork[i__] = scale * rootsc;
+		    clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, &
+			    x[i__ * x_dim1 + 1], ldx, &info2);
+/*                 X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)          ! INTRINSIC */
+/* LAPAC */
+		}
+	    } else {
+		rwork[i__] = zero;
+		++(*k);
+	    }
+	}
+	if (*k == *n) {
+/* All columns of X are zero. Return error code -8. */
+/* (the 8th input variable had an illegal value) */
+	    *k = 0;
+	    *info = -8;
+	    i__1 = -(*info);
+	    xerbla_("CGEDMD", &i__1);
+	    return 0;
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of Y. */
+	    if (rwork[i__] > zero) {
+		r__1 = one / rwork[i__];
+		csscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1);
+/*               Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (rwork[i__] < zero) {
+		r__1 = -rwork[i__];
+		r__2 = one / (real) (*m);
+		clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * 
+			y_dim1 + 1], ldy, &info2);
+/* LAPACK C */
+	    } else if (c_abs(&y[icamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ 
+		    * y_dim1]) != zero) {
+/*               X(:,i) is zero vector. For consistency, */
+/*               Y(:,i) should also be zero. If Y(:,i) is not */
+/*               zero, then the data might be inconsistent or */
+/*               corrupted. If JOBS == 'C', Y(:,i) is set to */
+/*               zero and a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+		if (lsame_(jobs, "C")) {
+		    csscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1);
+		}
+/* BLAS CALL */
+	    }
+	}
+    }
+
+    if (sccoly) {
+/* The columns of Y will be normalized. */
+/* To prevent overflows, the column norms of Y are */
+/* carefully computed using CLASSQ. */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* RWORK(i) = SCNRM2( M, Y(1,i), 1 ) */
+	    scale = zero;
+	    classq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum);
+	    if (sisnan_(&scale) || sisnan_(&ssum)) {
+		*k = 0;
+		*info = -10;
+		i__2 = -(*info);
+		xerbla_("CGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of Y(:,i) overflows. First, Y(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */
+/*                 Next, the norm of Y(:,i) is stored without */
+/*                 overflow as RWORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of Y(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    r__1 = one / rootsc;
+		    clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[
+			    i__ * y_dim1 + 1], ldy, &info2);
+		    rwork[i__] = -scale * (rootsc / (real) (*m));
+		} else {
+/*                 Y(:,i) will be scaled to unit 2-norm */
+		    rwork[i__] = scale * rootsc;
+		    clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, &
+			    y[i__ * y_dim1 + 1], ldy, &info2);
+/*                 Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)          ! INTRINSIC */
+/* LAPA */
+		}
+	    } else {
+		rwork[i__] = zero;
+	    }
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of X. */
+	    if (rwork[i__] > zero) {
+		r__1 = one / rwork[i__];
+		csscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1);
+/*               X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (rwork[i__] < zero) {
+		r__1 = -rwork[i__];
+		r__2 = one / (real) (*m);
+		clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * 
+			x_dim1 + 1], ldx, &info2);
+/* LAPACK */
+	    } else if (c_abs(&x[icamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ 
+		    * x_dim1]) != zero) {
+/*               Y(:,i) is zero vector.  If X(:,i) is not */
+/*               zero, then a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+	    }
+	}
+    }
+
+/*     <2> SVD of the data snapshot matrix X. */
+/*     ===================================== */
+/*     The left singular vectors are stored in the array X. */
+/*     The right singular vectors are in the array W. */
+/*     The array W will later on contain the eigenvectors */
+/*     of a Rayleigh quotient. */
+    numrnk = *n;
+/*      SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], 
+		ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &
+		info1);
+/* LA */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 2) {
+	cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, &
+		w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1]
+		, &info1);
+/* LAP */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 3) {
+	i__1 = *lrwork - *n;
+	cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], 
+		&z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], 
+		liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1);
+/* LAPACK CA */
+	clacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK C */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 4) {
+	i__1 = *lrwork - *n;
+	cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, &
+		rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], 
+		lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1);
+	clacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = rwork[*n + 1];
+	xscl2 = rwork[*n + 2];
+	if (xscl1 != xscl2) {
+/* This is an exceptional situation. If the */
+/* data matrices are not scaled and the */
+/* largest singular value of X overflows. */
+/* In that case CGEJSV can return the SVD */
+/* in scaled form. The scaling factor can be used */
+/* to rescale the data (X and Y). */
+	    clascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], 
+		    ldy, &info2);
+	}
+/*      END SELECT */
+    }
+
+    if (info1 > 0) {
+/* The SVD selected subroutine did not converge. */
+/* Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (rwork[1] == zero) {
+/* The largest computed singular value of (scaled) */
+/* X is zero. Return error code -8 */
+/* (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("CGEDMD", &i__1);
+	return 0;
+    }
+
+/* <3> Determine the numerical rank of the data */
+/*    snapshots matrix X. This depends on the */
+/*    parameters NRNK and TOL. */
+/*      SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*          END SELECT */
+    }
+/*   Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*   snapshot data in the input matrix X. */
+/* <4> Compute the Rayleigh quotient S = U^H * A * U. */
+/*    Depending on the requested outputs, the computation */
+/*    is organized to compute additional auxiliary */
+/*    matrices (for the residuals and refinements). */
+
+/*    In all formulas below, we need V_k*Sigma_k^(-1) */
+/*    where either V_k is in W(1:N,1:K), or V_k^H is in */
+/*    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    r__1 = one / rwork[i__];
+	    csscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1);
+/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)      ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/* This non-unit stride access is due to the fact */
+/* that CGESVD, CGESVDQ and CGESDD return the */
+/* adjoint matrix of the right singular vectors. */
+/* DO i = 1, K */
+/* CALL DSCAL( N, ONE/RWORK(i), W(i,1), LDW )  ! BLAS CALL */
+/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC */
+/* END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    rwork[*n + i__] = one / rwork[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * w_dim1;
+		i__4 = *n + i__;
+		q__2.r = rwork[i__4], q__2.i = zero;
+		i__5 = i__ + j * w_dim1;
+		q__1.r = q__2.r * w[i__5].r - q__2.i * w[i__5].i, q__1.i = 
+			q__2.r * w[i__5].i + q__2.i * w[i__5].r;
+		w[i__3].r = q__1.r, w[i__3].i = q__1.i;
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */
+/* for computing the refined Ritz vectors */
+/* (optionally, outside CGEDMD). */
+	cgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset],
+		 ldw, &zzero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRI */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRI */
+
+/* At this point Z contains */
+/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/* this is needed for computing the residuals. */
+/* This matrix is  returned in the array B and */
+/* it can be used to compute refined Ritz vectors. */
+/* BLAS */
+	clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC */
+/* BLAS CALL */
+	cgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], 
+		ldz, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */
+/* At this point S = U^H * A * U is the Rayleigh quotient. */
+/* BLAS */
+    } else {
+/* A * U(:,1:K) is not explicitly needed and the */
+/* computation is organized differently. The Rayleigh */
+/* quotient is computed more efficiently. */
+	cgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy,
+		 &zzero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! IN */
+
+/* B */
+	cgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[
+		w_offset], ldw, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRIN */
+/* At this point S = U^H * A * U is the Rayleigh quotient. */
+/* If the residuals are requested, save scaled V_k into Z. */
+/* Recall that V_k or V_k^H is stored in W. */
+/* BLAS */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		clacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		clacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/* <5> Compute the Ritz values and (if requested) the */
+/*   right eigenvectors of the Rayleigh quotient. */
+
+    cgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[
+	    w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1);
+
+/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/* quotient.  See the description of Z. */
+/* Also, see the description of CGEEV. */
+/* LAPACK CA */
+    if (info1 > 0) {
+/* CGEEV failed to compute the eigenvalues and */
+/* eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/* <6> Compute the eigenvectors (if requested) and, */
+/* the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/* Here, if the refinement is requested, we have */
+/* A*U(:,1:K) already computed and stored in Z. */
+/* For the residuals, need Y = A * U(:,1;K) * W. */
+		cgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC */
+/* This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */
+		cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+		cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[
+			s_offset], lds, &zzero, &z__[z_offset], ldz);
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+		clacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+	    cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zzero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+	    cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], 
+		    lds, &zzero, &b[b_offset], ldb);
+/* The above call replaces the following two calls */
+/* that were used in the developing-testing phase. */
+/* CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */
+/*           LDS, ZZERO, Z, LDZ) */
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+/* CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/* Compute the Ritz vectors */
+	if (wntvec) {
+	    cgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], 
+		    ldw, &zzero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRIN */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__1 = *k;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		i__2 = i__;
+		q__1.r = -eigs[i__2].r, q__1.i = -eigs[i__2].i;
+		caxpy_(m, &q__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * 
+			y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)            ! */
+
+		res[i__] = scnrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	rwork[*n + 1] = xscl1;
+	rwork[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/* A warning on possible data inconsistency. */
+/* This should be a rare event. */
+	*info = 4;
+    }
+/* ............................................................ */
+    return 0;
+/*     ...... */
+} /* cgedmd_ */
+
diff --git a/lapack-netlib/SRC/cgedmdq.c b/lapack-netlib/SRC/cgedmdq.c
index 447b23014..6e3a1faca 100644
--- a/lapack-netlib/SRC/cgedmdq.c
+++ b/lapack-netlib/SRC/cgedmdq.c
@@ -509,3 +509,781 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+
+/* Subroutine */ int cgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, 
+	char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, 
+	complex *f, integer *ldf, complex *x, integer *ldx, complex *y, 
+	integer *ldy, integer *nrnk, real *tol, integer *k, complex *eigs, 
+	complex *z__, integer *ldz, real *res, complex *b, integer *ldb, 
+	complex *v, integer *ldv, complex *s, integer *lds, complex *zwork, 
+	integer *lzwork, real *work, integer *lwork, integer *iwork, integer *
+	liwork, integer *info)
+{
+    /* System generated locals */
+    integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, 
+	    z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, 
+	    i__1, i__2;
+
+    /* Local variables */
+    real zero;
+    integer info1;
+    extern logical lsame_(char *, char *);
+    char jobvl[1];
+    integer minmn;
+    logical wantq;
+    integer mlwqr, olwqr;
+    logical wntex;
+    complex zzero;
+    extern /* Subroutine */ int cgedmd_(char *, char *, char *, char *, 
+	    integer *, integer *, integer *, complex *, integer *, complex *, 
+	    integer *, integer *, real *, integer *, complex *, complex *, 
+	    integer *, real *, complex *, integer *, complex *, integer *, 
+	    complex *, integer *, complex *, integer *, real *, integer *, 
+	    integer *, integer *, integer *), 
+	    cgeqrf_(integer *, integer *, complex *, integer *, complex *, 
+	    complex *, integer *, integer *), clacpy_(char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *), 
+	    claset_(char *, integer *, integer *, complex *, complex *, 
+	    complex *, integer *), xerbla_(char *, integer *);
+    integer mlwdmd, olwdmd;
+    logical sccolx, sccoly;
+    extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, 
+	    complex *, integer *, complex *, complex *, integer *, integer *);
+    integer iminwr;
+    logical wntvec, wntvcf;
+    integer mlwgqr;
+    logical wntref;
+    integer mlwork, olwgqr, olwork, mlrwrk, mlwmqr, olwmqr;
+    logical lquery, wntres, wnttrf, wntvcq;
+    extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, 
+	    integer *, complex *, integer *, complex *, complex *, integer *, 
+	    complex *, integer *, integer *);
+    real one;
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real32 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ..... */
+/*     Purpose */
+/*     ======= */
+/*     CGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices, using a QR factorization */
+/*     based compression of the data. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, CGEDMDQ computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, CGEDMDQ returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office. */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. The data snapshots are the columns */
+/*     of F. The leading N-1 columns of F are denoted X and the */
+/*     trailing N-1 columns are denoted Y. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Z*V, where Z */
+/*            is orthonormal and V contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of F, V, Z. */
+/*     'Q' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Q*Z, where Z */
+/*            contains the eigenvectors of the compression of the */
+/*            underlying discretised operator onto the span of */
+/*            the data snapshots. See the descriptions of F, V, Z. */
+/*            Q is from the inital QR facorization. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will */
+/*            be computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBQ (input) CHARACTER*1 */
+/*     Specifies whether to explicitly compute and return the */
+/*     unitary matrix from the QR factorization. */
+/*     'Q' :: The matrix Q of the QR factorization of the data */
+/*            snapshot matrix is computed and stored in the */
+/*            array F. See the description of F. */
+/*     'N' :: The matrix Q is not explicitly computed. */
+/* ..... */
+/*     JOBT (input) CHARACTER*1 */
+/*     Specifies whether to return the upper triangular factor */
+/*     from the QR factorization. */
+/*     'R' :: The matrix R of the QR factorization of the data */
+/*            snapshot matrix F is returned in the array Y. */
+/*            See the description of Y and Further details. */
+/*     'N' :: The matrix R is not returned. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/*     To be useful on exit, this option needs JOBQ='Q'. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: CGESVD (the QR SVD algorithm) */
+/*     2 :: CGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M >= 0 */
+/*     The state space dimension (the number of rows of F). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshots from a single trajectory, */
+/*     taken at equidistant discrete times. This is the */
+/*     number of columns of F. */
+/* ..... */
+/*     F (input/output) COMPLEX(KIND=WP) M-by-N array */
+/*     > On entry, */
+/*     the columns of F are the sequence of data snapshots */
+/*     from a single trajectory, taken at equidistant discrete */
+/*     times. It is assumed that the column norms of F are */
+/*     in the range of the normalized floating point numbers. */
+/*     < On exit, */
+/*     If JOBQ == 'Q', the array F contains the orthogonal */
+/*     matrix/factor of the QR factorization of the initial */
+/*     data snapshots matrix F. See the description of JOBQ. */
+/*     If JOBQ == 'N', the entries in F strictly below the main */
+/*     diagonal contain, column-wise, the information on the */
+/*     Householder vectors, as returned by CGEQRF. The */
+/*     remaining information to restore the orthogonal matrix */
+/*     of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */
+/*     See the description of ZWORK. */
+/* ..... */
+/*     LDF (input) INTEGER, LDF >= M */
+/*     The leading dimension of the array F. */
+/* ..... */
+/*     X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     X is used as workspace to hold representations of the */
+/*     leading N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, the leading K columns of X contain the leading */
+/*     K left singular vectors of the above described content */
+/*     of X. To lift them to the space of the left singular */
+/*     vectors U(:,1:K) of the input data, pre-multiply with the */
+/*     Q factor from the initial QR factorization. */
+/*     See the descriptions of F, K, V  and Z. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= N */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */
+/*     Y is used as workspace to hold representations of the */
+/*     trailing N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, */
+/*     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */
+/*     triangular factor from the QR factorization of the data */
+/*     snapshot matrix F. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= N */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the description of K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the SVD/POD basis for the leading N-1 */
+/*     data snapshots (columns of F) and the number of the */
+/*     computed Ritz pairs. The value of K is determined */
+/*     according to the rule set by the parameters NRNK and */
+/*     TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<=N-1) entries of EIGS contain */
+/*     the computed eigenvalues (Ritz values). */
+/*     See the descriptions of K, and Z. */
+/* ..... */
+/*     Z (workspace/output) COMPLEX(KIND=WP)  M-by-(N-1) array */
+/*     If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */
+/*     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/*     Z*V, where Z contains orthonormal matrix (the product of */
+/*     Q from the initial QR factorization and the SVD/POD_basis */
+/*     returned by CGEDMD in X) and the second factor (the */
+/*     eigenvectors of the Rayleigh quotient) is in the array V, */
+/*     as returned by CGEDMD. That is,  X(:,1:K)*V(:,i) */
+/*     is an eigenvector corresponding to EIGS(i). The columns */
+/*     of V(1:K,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. */
+/*     See the descriptions of EIGS, X and V. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs, */
+/*     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */
+/*     See the description of EIGS and Z. */
+/* ..... */
+/*     B (output) COMPLEX(KIND=WP)  MIN(M,N)-by-(N-1) array. */
+/*     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:N,1;K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     In both cases, the content of B can be lifted to the */
+/*     original dimension of the input data by pre-multiplying */
+/*     with the Q factor from the initial QR factorization. */
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) V contains the K eigenvectors of */
+/*     the Rayleigh quotient. The Ritz vectors */
+/*     (returned in Z) are the product of Q from the initial QR */
+/*     factorization (see the description of F) X (see the */
+/*     description of X) and V. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by CGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     ZWORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by CGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     If the call to CGEDMDQ is only workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of work is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector ZWORK. */
+/*     LZWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for CGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for CGEDMD (see the */
+/*                  description of LWORK in CGEDMD) */
+/*         MLWMQR = N (minimal workspace for */
+/*                    ZUNMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) */
+/*         MINMN  = MIN(M,N) */
+/*     Then */
+/*     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */
+/*     is further updated as follows: */
+/*        if   JOBZ == 'V' or JOBZ == 'F' THEN */
+/*             LZWORK = MAX( LZWORK, MINMN+MLWMQR ) */
+/*        if   JOBQ == 'Q' THEN */
+/*             LZWORK = MAX( ZLWORK, MINMN+MLWGQR) */
+
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to CGEDMDQ is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. hence, the */
+/*     length of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector WORK. */
+/*     LWORK is the same as in CGEDMD, because in CGEDMDQ */
+/*     only CGEDMD requires real workspace for snapshots */
+/*     of dimensions MIN(M,N)-by-(N-1). */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     Let M1=MIN(M,N), N1=N-1. Then */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP ) */
+
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     External subroutines */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* .......................................................... */
+    /* Parameter adjustments */
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --eigs;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1 * 1;
+    v -= v_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --zwork;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+    zzero.r = 0.f, zzero.i = 0.f;
+
+/*    Test the input arguments */
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntvcf = lsame_(jobz, "F");
+    wntvcq = lsame_(jobz, "Q");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    wantq = lsame_(jobq, "Q");
+    wnttrf = lsame_(jobt, "R");
+    minmn = f2cmin(*m,*n);
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && lsame_(jobz, "N")) {
+	*info = -3;
+    } else if (! (wantq || lsame_(jobq, "N"))) {
+	*info = -4;
+    } else if (! (wnttrf || lsame_(jobt, "N"))) {
+	*info = -5;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -6;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -7;
+    } else if (*m < 0) {
+	*info = -8;
+    } else if (*n < 0 || *n > *m + 1) {
+	*info = -9;
+    } else if (*ldf < *m) {
+	*info = -11;
+    } else if (*ldx < minmn) {
+	*info = -13;
+    } else if (*ldy < minmn) {
+	*info = -15;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -16;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -17;
+    } else if (*ldz < *m) {
+	*info = -21;
+    } else if ((wntref || wntex) && *ldb < minmn) {
+	*info = -24;
+    } else if (*ldv < *n - 1) {
+	*info = -26;
+    } else if (*lds < *n - 1) {
+	*info = -28;
+    }
+
+    if (wntvec || wntvcf || wntvcq) {
+	*(unsigned char *)jobvl = 'V';
+    } else {
+	*(unsigned char *)jobvl = 'N';
+    }
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0 || *n == 1) {
+/* All output except K is void. INFO=1 signals */
+/* the void input. In case of a workspace query, */
+/* the minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.f;
+		work[2] = 2.f;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlrwrk = 2;
+	mlwork = 2;
+	olwork = 2;
+	iminwr = 1;
+	mlwqr = f2cmax(1,*n);
+/* Minimal workspace length for CGEQRF. */
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwqr;
+	mlwork = f2cmax(i__1,i__2);
+	if (lquery) {
+	    cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, &
+		    info1);
+	    olwqr = (integer) zwork[1].r;
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwqr;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	i__1 = *n - 1;
+	cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], 
+		ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset]
+		, ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+		s_offset], lds, &zwork[1], lzwork, &work[1], &c_n1, &iwork[1],
+		 liwork, &info1);
+	mlwdmd = (integer) zwork[1].r;
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwdmd;
+	mlwork = f2cmax(i__1,i__2);
+/* Computing MAX */
+	i__1 = mlrwrk, i__2 = (integer) work[1];
+	mlrwrk = f2cmax(i__1,i__2);
+	iminwr = f2cmax(iminwr,iwork[1]);
+	if (lquery) {
+	    olwdmd = (integer) zwork[2].r;
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwdmd;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	if (wntvec || wntvcf) {
+	    mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + mlwmqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		cunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], 
+			&z__[z_offset], ldz, &zwork[1], &c_n1, &info1);
+		olwmqr = (integer) zwork[1].r;
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + olwmqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (wantq) {
+	    mlwgqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + mlwgqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &
+			zwork[1], &c_n1, &info1);
+		olwgqr = (integer) zwork[1].r;
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + olwgqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -34;
+	}
+	if (*lwork < mlrwrk && ! lquery) {
+	    *info = -32;
+	}
+	if (*lzwork < mlwork && ! lquery) {
+	    *info = -30;
+	}
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGEDMDQ", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	zwork[1].r = (real) mlwork, zwork[1].i = 0.f;
+	zwork[2].r = (real) olwork, zwork[2].i = 0.f;
+	work[1] = (real) mlrwrk;
+	work[2] = (real) mlrwrk;
+	return 0;
+    }
+/* ..... */
+/*     Initial QR factorization that is used to represent the */
+/*     snapshots as elements of lower dimensional subspace. */
+/*     For large scale computation with M >>N , at this place */
+/*     one can use an out of core QRF. */
+
+    i__1 = *lzwork - minmn;
+    cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, &
+	    info1);
+
+/*     Define X and Y as the snapshots representations in the */
+/*     orthogonal basis computed in the QR factorization. */
+/*     X corresponds to the leading N-1 and Y to the trailing */
+/*     N-1 snapshots. */
+    i__1 = *n - 1;
+    claset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    clacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    clacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+    if (*m >= 3) {
+	i__1 = minmn - 2;
+	i__2 = *n - 2;
+	claset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy);
+    }
+
+/*     Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lzwork - minmn;
+    cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+	     &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, &
+	    res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, &
+	    zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, &
+	    info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. See CGEDMD for details. */
+	*info = info1;
+	return 0;
+    } else {
+	*info = info1;
+    }
+
+/*     The Ritz vectors (Koopman modes) can be explicitly */
+/*     formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    claset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lzwork - minmn;
+	cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[
+		z_offset], ldz, &zwork[minmn + 1], &i__1, &info1);
+    } else if (wntvcf) {
+/*   Return the Ritz vectors (eigenvectors) in factored */
+/*   form Z*V, where Z contains orthonormal matrix (the */
+/*   product of Q from the initial QR factorization and */
+/*   the SVD/POD_basis returned by CGEDMD in X) and the */
+/*   second factor (the eigenvectors of the Rayleigh */
+/*   quotient) is in the array V, as returned by CGEDMD. */
+	clacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+	if (*m > *n) {
+	    i__1 = *m - *n;
+	    claset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz);
+	}
+	i__1 = *lzwork - minmn;
+	cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[
+		z_offset], ldz, &zwork[minmn + 1], &i__1, &info1);
+    }
+
+/*     Some optional output variables: */
+
+/*     The upper triangular factor R in the initial QR */
+/*     factorization is optionally returned in the array Y. */
+/*     This is useful if this call to CGEDMDQ is to be */
+/*     followed by a streaming DMD that is implemented in a */
+/*     QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	claset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy);
+	clacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/*     The orthonormal/unitary factor Q in the initial QR */
+/*     factorization is optionally returned in the array F. */
+/*     Same as with the triangular factor above, this is */
+/*     useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lzwork - minmn;
+	cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn 
+		+ 1], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* cgedmdq_ */
+
diff --git a/lapack-netlib/SRC/dgedmd.c b/lapack-netlib/SRC/dgedmd.c
index 447b23014..66b4d5da6 100644
--- a/lapack-netlib/SRC/dgedmd.c
+++ b/lapack-netlib/SRC/dgedmd.c
@@ -509,3 +509,1245 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+static integer c__1 = 1;
+static integer c__0 = 0;
+static integer c__2 = 2;
+
+/* Subroutine */ int dgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, 
+	integer *whtsvd, integer *m, integer *n, doublereal *x, integer *ldx, 
+	doublereal *y, integer *ldy, integer *nrnk, doublereal *tol, integer *
+	k, doublereal *reig, doublereal *imeig, doublereal *z__, integer *ldz,
+	 doublereal *res, doublereal *b, integer *ldb, doublereal *w, integer 
+	*ldw, doublereal *s, integer *lds, doublereal *work, integer *lwork, 
+	integer *iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, 
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2;
+    doublereal d__1, d__2;
+
+    /* Local variables */
+    doublereal zero, ssum;
+    integer info1, info2;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    doublereal xscl1, xscl2;
+    integer i__, j;
+    extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, 
+	    integer *);
+    doublereal scale;
+    extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *),
+	     dgeev_(char *, char *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+	     integer *, doublereal *, integer *, integer *);
+    extern logical lsame_(char *, char *);
+    logical badxy;
+    doublereal small;
+    char jobzl[1];
+    extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, integer *);
+    logical wntex;
+    doublereal ab[4]	/* was [2][2] */;
+    extern doublereal dlamch_(char *), dlange_(char *, integer *, 
+	    integer *, doublereal *, integer *, doublereal *);
+    extern /* Subroutine */ int dgesdd_(char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *, integer *, 
+	    integer *), dlascl_(char *, integer *, integer *, 
+	    doublereal *, doublereal *, integer *, integer *, doublereal *, 
+	    integer *, integer *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+    extern /* Subroutine */ int dgesvd_(char *, char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal 
+	    *, integer *, doublereal *, integer *), xerbla_(char *, 
+	    integer *);
+    char t_or_n__[1];
+    extern /* Subroutine */ int dgejsv_(char *, char *, char *, char *, char *
+	    , char *, integer *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+	    doublereal *, integer *, integer *, integer *), dlassq_(integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *);
+    logical sccolx, sccoly;
+    integer lwrsdd, mwrsdd, iminwr;
+    logical wntref, wntvec;
+    doublereal rootsc;
+    integer lwrkev, mlwork, mwrkev, numrnk, olwork;
+    doublereal rdummy[2];
+    integer lwrsvd, mwrsvd;
+    logical lquery, wntres;
+    char jsvopt[1];
+    extern /* Subroutine */ int mecago_();
+    integer mwrsvj, lwrsvq, mwrsvq;
+    doublereal rdummy2[2], ofl, one;
+    extern /* Subroutine */ int dgesvdq_(char *, char *, char *, char *, char 
+	    *, integer *, integer *, doublereal *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, integer *, integer *, 
+	    integer *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *, integer *);
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real64 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ............................................................ */
+/*     Purpose */
+/*     ======= */
+/*     DGEDMD computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, DGEDMD computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, DGEDMD returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/* ...................................................................... */
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ............................................................ */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product X(:,1:K)*W, where X */
+/*            contains a POD basis (leading left singular vectors */
+/*            of the data matrix X) and W contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of K, X, W, Z. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will be */
+/*            computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: DGESVD (the QR SVD algorithm) */
+/*     2 :: DGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M>= 0 */
+/*     The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshot pairs */
+/*     (the number of columns of X and Y). */
+/* ..... */
+/*     X (input/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, X contains the data snapshot matrix X. It is */
+/*     assumed that the column norms of X are in the range of */
+/*     the normalized floating point numbers. */
+/*     < On exit, the leading K columns of X contain a POD basis, */
+/*     i.e. the leading K left singular vectors of the input */
+/*     data matrix X, U(:,1:K). All N columns of X contain all */
+/*     left singular vectors of the input matrix X. */
+/*     See the descriptions of K, Z and W. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= M */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (input/workspace/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, Y contains the data snapshot matrix Y */
+/*     < On exit, */
+/*     If JOBR == 'R', the leading K columns of Y  contain */
+/*     the residual vectors for the computed Ritz pairs. */
+/*     See the description of RES. */
+/*     If JOBR == 'N', Y contains the original input data, */
+/*                     scaled according to the value of JOBS. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= M */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1). */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the descriptions of TOL and  K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the POD basis for the data snapshot */
+/*     matrix X and the number of the computed Ritz pairs. */
+/*     The value of K is determined according to the rule set */
+/*     by the parameters NRNK and TOL. */
+/*     See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     REIG (output) REAL(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of REIG contain */
+/*     the real parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     See the descriptions of K, IMEIG, and Z. */
+/* ..... */
+/*     IMEIG (output) REAL(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of IMEIG contain */
+/*     the imaginary parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     The eigenvalues are determined as follows: */
+/*     If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/*     real, LAMBDA(i) = REIG(i). */
+/*     If IMEIG(i)>0, then the corresponding complex */
+/*     conjugate pair of eigenvalues reads */
+/*     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i) */
+/*     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */
+/*     That is, complex conjugate pairs have consecutive */
+/*     indices (i,i+1), with the positive imaginary part */
+/*     listed first. */
+/*     See the descriptions of K, REIG, and Z. */
+/* ..... */
+/*     Z (workspace/output) REAL(KIND=WP)  M-by-N array */
+/*     If JOBZ =='V' then */
+/*        Z contains real Ritz vectors as follows: */
+/*        If IMEIG(i)=0, then Z(:,i) is an eigenvector of */
+/*        the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/*        [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/*        the Ritz values extracted from this subspace are */
+/*        REIG(i) + sqrt(-1)*IMEIG(i) and */
+/*        REIG(i) - sqrt(-1)*IMEIG(i). */
+/*        The corresponding eigenvectors are */
+/*        Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/*        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/*        || Z(:,i:i+1)||_F = 1. */
+/*     If JOBZ == 'F', then the above descriptions hold for */
+/*     the columns of X(:,1:K)*W(1:K,1:K), where the columns */
+/*     of W(1:k,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */
+/*     are similarly structured: If IMEIG(i) == 0 then */
+/*     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */
+/*     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */
+/*          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */
+/*     are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/*     See the descriptions of REIG, IMEIG, X and W. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) N-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs. */
+/*     If LAMBDA(i) is real, then */
+/*        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */
+/*     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */
+/*     then */
+/*     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */
+/*     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */
+/*               [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */
+/*     It holds that */
+/*     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2 */
+/*     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */
+/*     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1) */
+/*           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1) */
+/*     See the description of REIG, IMEIG and Z. */
+/* ..... */
+/*     B (output) REAL(KIND=WP)  M-by-N array. */
+/*     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:M,1;K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     If JOBF =='N', then B is not referenced. */
+/*     See the descriptions of X, W, K. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= M */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     W (workspace/output) REAL(KIND=WP) N-by-N array */
+/*     On exit, W(1:K,1:K) contains the K computed */
+/*     eigenvectors of the matrix Rayleigh quotient (real and */
+/*     imaginary parts for each complex conjugate pair of the */
+/*     eigenvalues). The Ritz vectors (returned in Z) are the */
+/*     product of X (containing a POD basis for the input */
+/*     matrix X) and W. See the descriptions of K, S, X and Z. */
+/*     W is also used as a workspace to temporarily store the */
+/*     right singular vectors of X. */
+/* ..... */
+/*     LDW (input) INTEGER, LDW >= N */
+/*     The leading dimension of the array W. */
+/* ..... */
+/*     S (workspace/output) REAL(KIND=WP) N-by-N array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by DGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, WORK(1:N) contains the singular values of */
+/*     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/*     If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */
+/*     scaling factor WORK(N+2)/WORK(N+1) used to scale X */
+/*     and Y to avoid overflow in the SVD of X. */
+/*     This may be of interest if the scaling option is off */
+/*     and as many as possible smallest eigenvalues are */
+/*     desired to the highest feasible accuracy. */
+/*     If the call to DGEDMD is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     leng of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the workspace vector WORK. */
+/*     LWORK is calculated as follows: */
+/*     If WHTSVD == 1 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */
+/*        If JOBZ == 'N'  then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */
+/*        Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */
+/*        workspace length of DGESVD. */
+/*     If WHTSVD == 2 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */
+/*        minimal workspace length of DGESDD. */
+/*     If WHTSVD == 3 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = N+M+MAX(3*N+1, */
+/*                        MAX(1,3*N+M,5*N),MAX(1,N)) */
+/*        is the minimal workspace length of DGESVDQ. */
+/*     If WHTSVD == 4 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */
+/*        minimal workspace length of DGEJSV. */
+/*     The above expressions are not simplified in order to */
+/*     make the usage of WORK more transparent, and for */
+/*     easier checking. In any case, LWORK >= 2. */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+/*     Local arrays */
+/*     ~~~~~~~~~~~~ */
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* ............................................................ */
+    /* Parameter adjustments */
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --reig;
+    --imeig;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1 * 1;
+    w -= w_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+
+/*    Test the input arguments */
+
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || lsame_(jobz, "N") || lsame_(
+	    jobz, "F"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && ! wntvec) {
+	*info = -3;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -4;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || *n > *m) {
+	*info = -7;
+    } else if (*ldx < *m) {
+	*info = -9;
+    } else if (*ldy < *m) {
+	*info = -11;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -12;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -13;
+    } else if (*ldz < *m) {
+	*info = -18;
+    } else if ((wntref || wntex) && *ldb < *m) {
+	*info = -21;
+    } else if (*ldw < *n) {
+	*info = -23;
+    } else if (*lds < *n) {
+	*info = -25;
+    }
+
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0) {
+/* Quick return. All output except K is void. */
+/* INFO=1 signals the void input. */
+/* In case of a workspace query, the default */
+/* minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.;
+		work[2] = 2.;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlwork = f2cmax(2,*n);
+	olwork = f2cmax(2,*n);
+	iminwr = 1;
+/*         SELECT CASE ( WHTSVD ) */
+	if (*whtsvd == 1) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of DGESVD: */
+/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */
+/* Computing MAX */
+	    i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1,
+		    i__2), i__2 = f2cmin(*m,*n) * 5;
+	    mwrsvd = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvd;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[
+			b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, &
+			info1);
+/* Computing MAX */
+		i__1 = mwrsvd, i__2 = (integer) rdummy[0];
+		lwrsvd = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsvd;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 2) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of DGESDD: */
+/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */
+/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */
+/* IMINWR = 8*MIN(M,N) */
+/* Computing MAX */
+	    i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,*
+		    n) << 2);
+	    mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsdd;
+	    mlwork = f2cmax(i__1,i__2);
+	    iminwr = f2cmin(*m,*n) << 3;
+	    if (lquery) {
+		dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], 
+			ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], &
+			info1);
+/* Computing MAX */
+		i__1 = mwrsdd, i__2 = (integer) rdummy[0];
+		lwrsdd = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsdd;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 3) {
+/* LWQP3 = 3*N+1 */
+/* LWORQ = MAX(N, 1) */
+/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */
+/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2) */
+/* MLWORK = N +  MWRSVQ */
+/* IMINWR = M+N-1 */
+	    dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[
+		    1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &
+		    iwork[1], liwork, rdummy, &c_n1, rdummy2, &c_n1, &info1);
+	    iminwr = iwork[1];
+	    mwrsvq = (integer) rdummy[1];
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0];
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+/* Computing MAX */
+		i__1 = mwrsvq, i__2 = (integer) rdummy[0];
+		lwrsvq = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0];
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 4) {
+	    *(unsigned char *)jsvopt = 'J';
+/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! for JSVOPT='V' */
+/* Computing MAX */
+	    i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (*
+		    n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1)
+		     + *n * *n + 6;
+	    mwrsvj = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvj;
+	    mlwork = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = 3, i__2 = *m + *n * 3;
+	    iminwr = f2cmax(i__1,i__2);
+	    if (lquery) {
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + mwrsvj;
+		olwork = f2cmax(i__1,i__2);
+	    }
+/*         END SELECT */
+	}
+	if (wntvec || wntex || lsame_(jobz, "F")) {
+	    *(unsigned char *)jobzl = 'V';
+	} else {
+	    *(unsigned char *)jobzl = 'N';
+	}
+/* Workspace calculation to the DGEEV call */
+	if (lsame_(jobzl, "V")) {
+/* Computing MAX */
+	    i__1 = 1, i__2 = *n << 2;
+	    mwrkev = f2cmax(i__1,i__2);
+	} else {
+/* Computing MAX */
+	    i__1 = 1, i__2 = *n * 3;
+	    mwrkev = f2cmax(i__1,i__2);
+	}
+/* Computing MAX */
+	i__1 = mlwork, i__2 = *n + mwrkev;
+	mlwork = f2cmax(i__1,i__2);
+	if (lquery) {
+	    dgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[
+		    w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1);
+/* Computing MAX */
+	    i__1 = mwrkev, i__2 = (integer) rdummy[0];
+	    lwrkev = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = olwork, i__2 = *n + lwrkev;
+	    olwork = f2cmax(i__1,i__2);
+	}
+
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -29;
+	}
+	if (*lwork < mlwork && ! lquery) {
+	    *info = -27;
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEDMD", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	work[1] = (doublereal) mlwork;
+	work[2] = (doublereal) olwork;
+	return 0;
+    }
+/* ............................................................ */
+
+    ofl = dlamch_("O");
+    small = dlamch_("S");
+    badxy = FALSE_;
+
+/*     <1> Optional scaling of the snapshots (columns of X, Y) */
+/*     ========================================================== */
+    if (sccolx) {
+/* The columns of X will be normalized. */
+/* To prevent overflows, the column norms of X are */
+/* carefully computed using DLASSQ. */
+	*k = 0;
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = DNRM2( M, X(1,i), 1 ) */
+	    scale = zero;
+	    dlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum);
+	    if (disnan_(&scale) || disnan_(&ssum)) {
+		*k = 0;
+		*info = -8;
+		i__2 = -(*info);
+		xerbla_("DGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of X(:,i) overflows. First, X(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */
+/*                 Next, the norm of X(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of X(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    d__1 = one / rootsc;
+		    dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[
+			    i__ * x_dim1 + 1], m, &info2);
+		    work[i__] = -scale * (rootsc / (doublereal) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    work[i__] = scale * rootsc;
+		    dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[
+			    i__ * x_dim1 + 1], m, &info2);
+/*                 X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)          ! INTRINSIC */
+/* LAPACK */
+		}
+	    } else {
+		work[i__] = zero;
+		++(*k);
+	    }
+	}
+	if (*k == *n) {
+/* All columns of X are zero. Return error code -8. */
+/* (the 8th input variable had an illegal value) */
+	    *k = 0;
+	    *info = -8;
+	    i__1 = -(*info);
+	    xerbla_("DGEDMD", &i__1);
+	    return 0;
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of Y. */
+	    if (work[i__] > zero) {
+		d__1 = one / work[i__];
+		dscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1);
+/*               Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (work[i__] < zero) {
+		d__1 = -work[i__];
+		d__2 = one / (doublereal) (*m);
+		dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * 
+			y_dim1 + 1], m, &info2);
+/* LAPACK CAL */
+	    } else if (y[idamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * 
+		    y_dim1] != zero) {
+/*               X(:,i) is zero vector. For consistency, */
+/*               Y(:,i) should also be zero. If Y(:,i) is not */
+/*               zero, then the data might be inconsistent or */
+/*               corrupted. If JOBS == 'C', Y(:,i) is set to */
+/*               zero and a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+		if (lsame_(jobs, "C")) {
+		    dscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1);
+		}
+/* BLAS CALL */
+	    }
+	}
+    }
+
+    if (sccoly) {
+/* The columns of Y will be normalized. */
+/* To prevent overflows, the column norms of Y are */
+/* carefully computed using DLASSQ. */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */
+	    scale = zero;
+	    dlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum);
+	    if (disnan_(&scale) || disnan_(&ssum)) {
+		*k = 0;
+		*info = -10;
+		i__2 = -(*info);
+		xerbla_("DGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of Y(:,i) overflows. First, Y(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */
+/*                 Next, the norm of Y(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of Y(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    d__1 = one / rootsc;
+		    dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[
+			    i__ * y_dim1 + 1], m, &info2);
+		    work[i__] = -scale * (rootsc / (doublereal) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    work[i__] = scale * rootsc;
+		    dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[
+			    i__ * y_dim1 + 1], m, &info2);
+/*                 Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)          ! INTRINSIC */
+/* LAPACK */
+		}
+	    } else {
+		work[i__] = zero;
+	    }
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of X. */
+	    if (work[i__] > zero) {
+		d__1 = one / work[i__];
+		dscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1);
+/*               X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (work[i__] < zero) {
+		d__1 = -work[i__];
+		d__2 = one / (doublereal) (*m);
+		dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * 
+			x_dim1 + 1], m, &info2);
+/* LAPACK CAL */
+	    } else if (x[idamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * 
+		    x_dim1] != zero) {
+/*               Y(:,i) is zero vector.  If X(:,i) is not */
+/*               zero, then a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+	    }
+	}
+    }
+
+/*     <2> SVD of the data snapshot matrix X. */
+/*     ===================================== */
+/*     The left singular vectors are stored in the array X. */
+/*     The right singular vectors are in the array W. */
+/*     The array W will later on contain the eigenvectors */
+/*     of a Rayleigh quotient. */
+    numrnk = *n;
+/*      SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	i__1 = *lwork - *n;
+	dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], 
+		ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1);
+/* LAPACK CAL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 2) {
+	i__1 = *lwork - *n;
+	dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[
+		w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1);
+/* LAPACK CAL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 3) {
+	i__1 = *lwork - *n - f2cmax(2,*m);
+	i__2 = f2cmax(2,*m);
+	dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], &
+		z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], 
+		liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], &
+		i__2, &info1);
+/* L */
+	dlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK C */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 4) {
+	i__1 = *lwork - *n;
+	dgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, &
+		work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1]
+		, &i__1, &iwork[1], &info1);
+/* LAPACK CALL */
+	dlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = work[*n + 1];
+	xscl2 = work[*n + 2];
+	if (xscl1 != xscl2) {
+/* This is an exceptional situation. If the */
+/* data matrices are not scaled and the */
+/* largest singular value of X overflows. */
+/* In that case DGEJSV can return the SVD */
+/* in scaled form. The scaling factor can be used */
+/* to rescale the data (X and Y). */
+	    dlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], 
+		    ldy, &info2);
+	}
+/*      END SELECT */
+    }
+
+    if (info1 > 0) {
+/* The SVD selected subroutine did not converge. */
+/* Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (work[1] == zero) {
+/* The largest computed singular value of (scaled) */
+/* X is zero. Return error code -8 */
+/* (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("DGEDMD", &i__1);
+	return 0;
+    }
+
+/* <3> Determine the numerical rank of the data */
+/*    snapshots matrix X. This depends on the */
+/*    parameters NRNK and TOL. */
+/*      SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (work[i__] <= work[1] * *tol || work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*          END SELECT */
+    }
+/*   Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*   snapshot data in the input matrix X. */
+/* <4> Compute the Rayleigh quotient S = U^T * A * U. */
+/*    Depending on the requested outputs, the computation */
+/*    is organized to compute additional auxiliary */
+/*    matrices (for the residuals and refinements). */
+
+/*    In all formulas below, we need V_k*Sigma_k^(-1) */
+/*    where either V_k is in W(1:N,1:K), or V_k^T is in */
+/*    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    d__1 = one / work[i__];
+	    dscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1);
+/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i)      ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/* This non-unit stride access is due to the fact */
+/* that DGESVD, DGESVDQ and DGESDD return the */
+/* transposed matrix of the right singular vectors. */
+/* DO i = 1, K */
+/* CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW )    ! BLAS CALL */
+/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N)      ! INTRINSIC */
+/* END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    work[*n + i__] = one / work[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1];
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */
+/* for computing the refined Ritz vectors */
+/* (optionally, outside DGEDMD). */
+	dgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], 
+		ldw, &zero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRI */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRI */
+
+/* At this point Z contains */
+/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/* this is needed for computing the residuals. */
+/* This matrix is  returned in the array B and */
+/* it can be used to compute refined Ritz vectors. */
+/* BLAS */
+	dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC */
+/* BLAS CALL */
+	dgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], 
+		ldz, &zero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */
+/* At this point S = U^T * A * U is the Rayleigh quotient. */
+/* BLAS */
+    } else {
+/* A * U(:,1:K) is not explicitly needed and the */
+/* computation is organized differently. The Rayleigh */
+/* quotient is computed more efficiently. */
+	dgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, 
+		&zero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! IN */
+/* In the two DGEMM calls here, can use K for LDZ. */
+/* B */
+	dgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset]
+		, ldw, &zero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRIN */
+/* At this point S = U^T * A * U is the Rayleigh quotient. */
+/* If the residuals are requested, save scaled V_k into Z. */
+/* Recall that V_k or V_k^T is stored in W. */
+/* BLAS */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		dlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		dlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/* <5> Compute the Ritz values and (if requested) the */
+/*   right eigenvectors of the Rayleigh quotient. */
+
+    i__1 = *lwork - *n;
+    dgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset]
+	    , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1);
+
+/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/* quotient. Even in the case of complex spectrum, all */
+/* computation is done in real arithmetic. REIG and */
+/* IMEIG are the real and the imaginary parts of the */
+/* eigenvalues, so that the spectrum is given as */
+/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */
+/* are listed at consecutive positions. For such a */
+/* complex conjugate pair of the eigenvalues, the */
+/* corresponding eigenvectors are also a complex */
+/* conjugate pair with the real and imaginary parts */
+/* stored column-wise in W at the corresponding */
+/* consecutive column indices. See the description of Z. */
+/* Also, see the description of DGEEV. */
+/* LAPACK C */
+    if (info1 > 0) {
+/* DGEEV failed to compute the eigenvalues and */
+/* eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/* <6> Compute the eigenvectors (if requested) and, */
+/* the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/* Here, if the refinement is requested, we have */
+/* A*U(:,1:K) already computed and stored in Z. */
+/* For the residuals, need Y = A * U(:,1;K) * W. */
+		dgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC */
+/* This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+		dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+		dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[
+			s_offset], lds, &zero, &z__[z_offset], ldz);
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+		dlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+	    dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+	    dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], 
+		    lds, &zero, &b[b_offset], ldb);
+/* The above call replaces the following two calls */
+/* that were used in the developing-testing phase. */
+/* CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */
+/*           LDS, ZERO, Z, LDZ) */
+/* Save a copy of Z into B and free Z for holding */
+/* the Ritz vectors. */
+/* CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/* Compute the real form of the Ritz vectors */
+	if (wntvec) {
+	    dgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], 
+		    ldw, &zero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__ = 1;
+	    while(i__ <= *k) {
+		if (imeig[i__] == zero) {
+/* have a real eigenvalue with real eigenvector */
+		    d__1 = -reig[i__];
+		    daxpy_(m, &d__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * 
+			    y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i)            ! */
+
+		    res[i__] = dnrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+
+		    ++i__;
+		} else {
+/* Have a complex conjugate pair */
+/* REIG(i) +- sqrt(-1)*IMEIG(i). */
+/* Since all computation is done in real */
+/* arithmetic, the formula for the residual */
+/* is recast for real representation of the */
+/* complex conjugate eigenpair. See the */
+/* description of RES. */
+		    ab[0] = reig[i__];
+		    ab[1] = -imeig[i__];
+		    ab[2] = imeig[i__];
+		    ab[3] = reig[i__];
+		    d__1 = -one;
+		    dgemm_("N", "N", m, &c__2, &c__2, &d__1, &z__[i__ * 
+			    z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * 
+			    y_dim1 + 1], ldy);
+/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INT */
+/* BL */
+		    res[i__] = dlange_("F", m, &c__2, &y[i__ * y_dim1 + 1], 
+			    ldy, &work[*n + 1]);
+/* LA */
+		    res[i__ + 1] = res[i__];
+		    i__ += 2;
+		}
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	work[*n + 1] = xscl1;
+	work[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/* A warning on possible data inconsistency. */
+/* This should be a rare event. */
+	*info = 4;
+    }
+/* ............................................................ */
+    return 0;
+/*     ...... */
+} /* dgedmd_ */
+
diff --git a/lapack-netlib/SRC/dgedmdq.c b/lapack-netlib/SRC/dgedmdq.c
index 447b23014..a743a3156 100644
--- a/lapack-netlib/SRC/dgedmdq.c
+++ b/lapack-netlib/SRC/dgedmdq.c
@@ -509,3 +509,792 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+
+/* Subroutine */ int dgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, 
+	char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, 
+	doublereal *f, integer *ldf, doublereal *x, integer *ldx, doublereal *
+	y, integer *ldy, integer *nrnk, doublereal *tol, integer *k, 
+	doublereal *reig, doublereal *imeig, doublereal *z__, integer *ldz, 
+	doublereal *res, doublereal *b, integer *ldb, doublereal *v, integer *
+	ldv, doublereal *s, integer *lds, doublereal *work, integer *lwork, 
+	integer *iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, 
+	    z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, 
+	    i__1, i__2;
+
+    /* Local variables */
+    doublereal zero;
+    integer info1;
+    extern logical lsame_(char *, char *);
+    char jobvl[1];
+    integer minmn;
+    logical wantq;
+    integer mlwqr, olwqr;
+    logical wntex;
+    extern /* Subroutine */ int dgedmd_(char *, char *, char *, char *, 
+	    integer *, integer *, integer *, doublereal *, integer *, 
+	    doublereal *, integer *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+	     doublereal *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, integer *, integer *, integer *, integer 
+	    *), dgeqrf_(integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    integer *), dlacpy_(char *, integer *, integer *, doublereal *, 
+	    integer *, doublereal *, integer *), dlaset_(char *, 
+	    integer *, integer *, doublereal *, doublereal *, doublereal *, 
+	    integer *), xerbla_(char *, integer *);
+    integer mlwdmd, olwdmd;
+    logical sccolx, sccoly;
+    extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    integer *), dormqr_(char *, char *, integer *, integer *, integer 
+	    *, doublereal *, integer *, doublereal *, doublereal *, integer *,
+	     doublereal *, integer *, integer *);
+    integer iminwr;
+    logical wntvec, wntvcf;
+    integer mlwgqr;
+    logical wntref;
+    integer mlwork, olwgqr, olwork;
+    doublereal rdummy[2];
+    integer mlwmqr, olwmqr;
+    logical lquery, wntres, wnttrf, wntvcq;
+    doublereal one;
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real64 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ..... */
+/*     Purpose */
+/*     ======= */
+/*     DGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices, using a QR factorization */
+/*     based compression of the data. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, DGEDMDQ computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, DGEDMDQ returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office. */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. The data snapshots are the columns */
+/*     of F. The leading N-1 columns of F are denoted X and the */
+/*     trailing N-1 columns are denoted Y. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Z*V, where Z */
+/*            is orthonormal and V contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of F, V, Z. */
+/*     'Q' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Q*Z, where Z */
+/*            contains the eigenvectors of the compression of the */
+/*            underlying discretized operator onto the span of */
+/*            the data snapshots. See the descriptions of F, V, Z. */
+/*            Q is from the initial QR factorization. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will */
+/*            be computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBQ (input) CHARACTER*1 */
+/*     Specifies whether to explicitly compute and return the */
+/*     orthogonal matrix from the QR factorization. */
+/*     'Q' :: The matrix Q of the QR factorization of the data */
+/*            snapshot matrix is computed and stored in the */
+/*            array F. See the description of F. */
+/*     'N' :: The matrix Q is not explicitly computed. */
+/* ..... */
+/*     JOBT (input) CHARACTER*1 */
+/*     Specifies whether to return the upper triangular factor */
+/*     from the QR factorization. */
+/*     'R' :: The matrix R of the QR factorization of the data */
+/*            snapshot matrix F is returned in the array Y. */
+/*            See the description of Y and Further details. */
+/*     'N' :: The matrix R is not returned. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/*     To be useful on exit, this option needs JOBQ='Q'. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: DGESVD (the QR SVD algorithm) */
+/*     2 :: DGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M >= 0 */
+/*     The state space dimension (the number of rows of F). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshots from a single trajectory, */
+/*     taken at equidistant discrete times. This is the */
+/*     number of columns of F. */
+/* ..... */
+/*     F (input/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, */
+/*     the columns of F are the sequence of data snapshots */
+/*     from a single trajectory, taken at equidistant discrete */
+/*     times. It is assumed that the column norms of F are */
+/*     in the range of the normalized floating point numbers. */
+/*     < On exit, */
+/*     If JOBQ == 'Q', the array F contains the orthogonal */
+/*     matrix/factor of the QR factorization of the initial */
+/*     data snapshots matrix F. See the description of JOBQ. */
+/*     If JOBQ == 'N', the entries in F strictly below the main */
+/*     diagonal contain, column-wise, the information on the */
+/*     Householder vectors, as returned by DGEQRF. The */
+/*     remaining information to restore the orthogonal matrix */
+/*     of the initial QR factorization is stored in WORK(1:N). */
+/*     See the description of WORK. */
+/* ..... */
+/*     LDF (input) INTEGER, LDF >= M */
+/*     The leading dimension of the array F. */
+/* ..... */
+/*     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     X is used as workspace to hold representations of the */
+/*     leading N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, the leading K columns of X contain the leading */
+/*     K left singular vectors of the above described content */
+/*     of X. To lift them to the space of the left singular */
+/*     vectors U(:,1:K)of the input data, pre-multiply with the */
+/*     Q factor from the initial QR factorization. */
+/*     See the descriptions of F, K, V  and Z. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= N */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     Y is used as workspace to hold representations of the */
+/*     trailing N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, */
+/*     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */
+/*     triangular factor from the QR factorization of the data */
+/*     snapshot matrix F. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= N */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the description of K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the SVD/POD basis for the leading N-1 */
+/*     data snapshots (columns of F) and the number of the */
+/*     computed Ritz pairs. The value of K is determined */
+/*     according to the rule set by the parameters NRNK and */
+/*     TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     REIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<=N) entries of REIG contain */
+/*     the real parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     See the descriptions of K, IMEIG, Z. */
+/* ..... */
+/*     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<N) entries of REIG contain */
+/*     the imaginary parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     The eigenvalues are determined as follows: */
+/*     If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/*     real, LAMBDA(i) = REIG(i). */
+/*     If IMEIG(i)>0, then the corresponding complex */
+/*     conjugate pair of eigenvalues reads */
+/*     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i) */
+/*     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */
+/*     That is, complex conjugate pairs have consequtive */
+/*     indices (i,i+1), with the positive imaginary part */
+/*     listed first. */
+/*     See the descriptions of K, REIG, Z. */
+/* ..... */
+/*     Z (workspace/output) REAL(KIND=WP)  M-by-(N-1) array */
+/*     If JOBZ =='V' then */
+/*        Z contains real Ritz vectors as follows: */
+/*        If IMEIG(i)=0, then Z(:,i) is an eigenvector of */
+/*        the i-th Ritz value. */
+/*        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/*        [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/*        the Ritz values extracted from this subspace are */
+/*        REIG(i) + sqrt(-1)*IMEIG(i) and */
+/*        REIG(i) - sqrt(-1)*IMEIG(i). */
+/*        The corresponding eigenvectors are */
+/*        Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/*        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/*     If JOBZ == 'F', then the above descriptions hold for */
+/*     the columns of Z*V, where the columns of V are the */
+/*     eigenvectors of the K-by-K Rayleigh quotient, and Z is */
+/*     orthonormal. The columns of V are similarly structured: */
+/*     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */
+/*     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */
+/*                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */
+/*     are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/*     See the descriptions of REIG, IMEIG, X and V. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs. */
+/*     If LAMBDA(i) is real, then */
+/*        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */
+/*     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */
+/*     then */
+/*     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */
+/*     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */
+/*               [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */
+/*     It holds that */
+/*     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2 */
+/*     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */
+/*     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1) */
+/*           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1) */
+/*     See the description of Z. */
+/* ..... */
+/*     B (output) REAL(KIND=WP)  MIN(M,N)-by-(N-1) array. */
+/*     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:N,1;K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     In both cases, the content of B can be lifted to the */
+/*     original dimension of the input data by pre-multiplying */
+/*     with the Q factor from the initial QR factorization. */
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) contains the K eigenvectors of */
+/*     the Rayleigh quotient. The eigenvectors of a complex */
+/*     conjugate pair of eigenvalues are returned in real form */
+/*     as explained in the description of Z. The Ritz vectors */
+/*     (returned in Z) are the product of X and V; see */
+/*     the descriptions of X and Z. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by DGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by DGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to DGEDMDQ is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector WORK. */
+/*     LWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for DGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for DGEDMD (see the */
+/*                  description of LWORK in DGEDMD) for */
+/*                  snapshots of dimensions MIN(M,N)-by-(N-1) */
+/*         MLWMQR = N (minimal workspace for */
+/*                    DORMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for DORGQR[M,N,N]) */
+/*     Then */
+/*     LWORK = MAX(N+MLWQR, N+MLWDMD) */
+/*     is updated as follows: */
+/*        if   JOBZ == 'V' or JOBZ == 'F' THEN */
+/*             LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) */
+/*        if   JOBQ == 'Q' THEN */
+/*             LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR) */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     Let M1=MIN(M,N), N1=N-1. Then */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     Local array */
+/*     ~~~~~~~~~~~ */
+
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     External subroutines */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* .......................................................... */
+    /* Parameter adjustments */
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --reig;
+    --imeig;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1 * 1;
+    v -= v_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    zero = 0.f;
+    one = 1.f;
+
+/*    Test the input arguments */
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntvcf = lsame_(jobz, "F");
+    wntvcq = lsame_(jobz, "Q");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    wantq = lsame_(jobq, "Q");
+    wnttrf = lsame_(jobt, "R");
+    minmn = f2cmin(*m,*n);
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && lsame_(jobz, "N")) {
+	*info = -3;
+    } else if (! (wantq || lsame_(jobq, "N"))) {
+	*info = -4;
+    } else if (! (wnttrf || lsame_(jobt, "N"))) {
+	*info = -5;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -6;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -7;
+    } else if (*m < 0) {
+	*info = -8;
+    } else if (*n < 0 || *n > *m + 1) {
+	*info = -9;
+    } else if (*ldf < *m) {
+	*info = -11;
+    } else if (*ldx < minmn) {
+	*info = -13;
+    } else if (*ldy < minmn) {
+	*info = -15;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -16;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -17;
+    } else if (*ldz < *m) {
+	*info = -22;
+    } else if ((wntref || wntex) && *ldb < minmn) {
+	*info = -25;
+    } else if (*ldv < *n - 1) {
+	*info = -27;
+    } else if (*lds < *n - 1) {
+	*info = -29;
+    }
+
+    if (wntvec || wntvcf || wntvcq) {
+	*(unsigned char *)jobvl = 'V';
+    } else {
+	*(unsigned char *)jobvl = 'N';
+    }
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0 || *n == 1) {
+/* All output except K is void. INFO=1 signals */
+/* the void input. In case of a workspace query, */
+/* the minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.;
+		work[2] = 2.;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlwqr = f2cmax(1,*n);
+/* Minimal workspace length for DGEQRF. */
+	mlwork = minmn + mlwqr;
+	if (lquery) {
+	    dgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1);
+	    olwqr = (integer) rdummy[0];
+	    olwork = f2cmin(*m,*n) + olwqr;
+	}
+	i__1 = *n - 1;
+	dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], 
+		ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &
+		z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], 
+		ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, &
+		info1);
+	mlwdmd = (integer) work[1];
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwdmd;
+	mlwork = f2cmax(i__1,i__2);
+	iminwr = iwork[1];
+	if (lquery) {
+	    olwdmd = (integer) work[2];
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwdmd;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	if (wntvec || wntvcf) {
+	    mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		dormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], &
+			z__[z_offset], ldz, &work[1], &c_n1, &info1);
+		olwmqr = (integer) work[1];
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (wantq) {
+	    mlwgqr = *n;
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[
+			1], &c_n1, &info1);
+		olwgqr = (integer) work[1];
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	iminwr = f2cmax(1,iminwr);
+	mlwork = f2cmax(2,mlwork);
+	if (*lwork < mlwork && ! lquery) {
+	    *info = -31;
+	}
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -33;
+	}
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEDMDQ", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	work[1] = (doublereal) mlwork;
+	work[2] = (doublereal) olwork;
+	return 0;
+    }
+/* ..... */
+/*     Initial QR factorization that is used to represent the */
+/*     snapshots as elements of lower dimensional subspace. */
+/*     For large scale computation with M >>N , at this place */
+/*     one can use an out of core QRF. */
+
+    i__1 = *lwork - minmn;
+    dgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, &
+	    info1);
+
+/*     Define X and Y as the snapshots representations in the */
+/*     orthogonal basis computed in the QR factorization. */
+/*     X corresponds to the leading N-1 and Y to the trailing */
+/*     N-1 snapshots. */
+    i__1 = *n - 1;
+    dlaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    dlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    dlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+    if (*m >= 3) {
+	i__1 = minmn - 2;
+	i__2 = *n - 2;
+	dlaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy);
+    }
+
+/*     Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lwork - minmn;
+    dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+	     &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[
+	    z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+	    s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, &
+	    info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. See DGEDMD for details. */
+	*info = info1;
+	return 0;
+    } else {
+	*info = info1;
+    }
+
+/*     The Ritz vectors (Koopman modes) can be explicitly */
+/*     formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    dlaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lwork - (minmn + *n - 1);
+	dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+		z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    } else if (wntvcf) {
+/*   Return the Ritz vectors (eigenvectors) in factored */
+/*   form Z*V, where Z contains orthonormal matrix (the */
+/*   product of Q from the initial QR factorization and */
+/*   the SVD/POD_basis returned by DGEDMD in X) and the */
+/*   second factor (the eigenvectors of the Rayleigh */
+/*   quotient) is in the array V, as returned by DGEDMD. */
+	dlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+	if (*m > *n) {
+	    i__1 = *m - *n;
+	    dlaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz);
+	}
+	i__1 = *lwork - (minmn + *n - 1);
+	dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+		z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    }
+
+/*     Some optional output variables: */
+
+/*     The upper triangular factor R in the initial QR */
+/*     factorization is optionally returned in the array Y. */
+/*     This is useful if this call to DGEDMDQ is to be */
+/*     followed by a streaming DMD that is implemented in a */
+/*     QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	dlaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy);
+	dlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/*     The orthonormal/orthogonal factor Q in the initial QR */
+/*     factorization is optionally returned in the array F. */
+/*     Same as with the triangular factor above, this is */
+/*     useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lwork - (minmn + *n - 1);
+	dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + 
+		*n], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* dgedmdq_ */
+
diff --git a/lapack-netlib/SRC/sgedmd.c b/lapack-netlib/SRC/sgedmd.c
index 447b23014..c8f3a5964 100644
--- a/lapack-netlib/SRC/sgedmd.c
+++ b/lapack-netlib/SRC/sgedmd.c
@@ -509,3 +509,1238 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+static integer c__1 = 1;
+static integer c__0 = 0;
+static integer c__2 = 2;
+
+/* Subroutine */ int sgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, 
+	integer *whtsvd, integer *m, integer *n, real *x, integer *ldx, real *
+	y, integer *ldy, integer *nrnk, real *tol, integer *k, real *reig, 
+	real *imeig, real *z__, integer *ldz, real *res, real *b, integer *
+	ldb, real *w, integer *ldw, real *s, integer *lds, real *work, 
+	integer *lwork, integer *iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, 
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2;
+    real r__1, r__2;
+
+    /* Local variables */
+    real zero, ssum;
+    integer info1, info2;
+    real xscl1, xscl2;
+    extern real snrm2_(integer *, real *, integer *);
+    integer i__, j;
+    real scale;
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+    logical badxy;
+    real small;
+    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, 
+	    integer *, real *, real *, integer *, real *, integer *, real *, 
+	    real *, integer *), sgeev_(char *, char *, 
+	    integer *, real *, integer *, real *, real *, real *, integer *, 
+	    real *, integer *, real *, integer *, integer *);
+    char jobzl[1];
+    extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, 
+	    real *, integer *);
+    logical wntex;
+    real ab[4]	/* was [2][2] */;
+    extern real slamch_(char *), slange_(char *, integer *, integer *,
+	     real *, integer *, real *);
+    extern /* Subroutine */ int sgesdd_(char *, integer *, integer *, real *, 
+	    integer *, real *, real *, integer *, real *, integer *, real *, 
+	    integer *, integer *, integer *), xerbla_(char *, integer 
+	    *);
+    char t_or_n__[1];
+    extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, 
+	    real *, integer *, integer *, real *, integer *, integer *);
+    extern integer isamax_(integer *, real *, integer *);
+    logical sccolx, sccoly;
+    extern logical sisnan_(real *);
+    extern /* Subroutine */ int sgesvd_(char *, char *, integer *, integer *, 
+	    real *, integer *, real *, real *, integer *, real *, integer *, 
+	    real *, integer *, integer *);
+    integer lwrsdd, mwrsdd;
+    extern /* Subroutine */ int sgejsv_(char *, char *, char *, char *, char *
+	    , char *, integer *, integer *, real *, integer *, real *, real *,
+	     integer *, real *, integer *, real *, integer *, integer *, 
+	    integer *), 
+	    slacpy_(char *, integer *, integer *, real *, integer *, real *, 
+	    integer *);
+    integer iminwr;
+    logical wntref, wntvec;
+    real rootsc;
+    integer lwrkev, mlwork, mwrkev, numrnk, olwork;
+    real rdummy[2];
+    integer lwrsvd, mwrsvd;
+    logical lquery, wntres;
+    char jsvopt[1];
+    extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, 
+	    real *), mecago_();
+    integer mwrsvj, lwrsvq, mwrsvq;
+    real rdummy2[2], ofl, one;
+    extern /* Subroutine */ int sgesvdq_(char *, char *, char *, char *, char 
+	    *, integer *, integer *, real *, integer *, real *, real *, 
+	    integer *, real *, integer *, integer *, integer *, integer *, 
+	    real *, integer *, real *, integer *, integer *);
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real32 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ............................................................ */
+/*     Purpose */
+/*     ======= */
+/*     SGEDMD computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, SGEDMD computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, SGEDMD returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/* ...................................................................... */
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product X(:,1:K)*W, where X */
+/*            contains a POD basis (leading left singular vectors */
+/*            of the data matrix X) and W contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of K, X, W, Z. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will be */
+/*            computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: SGESVD (the QR SVD algorithm) */
+/*     2 :: SGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M>= 0 */
+/*     The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshot pairs */
+/*     (the number of columns of X and Y). */
+/* ..... */
+/*     X (input/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, X contains the data snapshot matrix X. It is */
+/*     assumed that the column norms of X are in the range of */
+/*     the normalized floating point numbers. */
+/*     < On exit, the leading K columns of X contain a POD basis, */
+/*     i.e. the leading K left singular vectors of the input */
+/*     data matrix X, U(:,1:K). All N columns of X contain all */
+/*     left singular vectors of the input matrix X. */
+/*     See the descriptions of K, Z and W. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= M */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (input/workspace/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, Y contains the data snapshot matrix Y */
+/*     < On exit, */
+/*     If JOBR == 'R', the leading K columns of Y  contain */
+/*     the residual vectors for the computed Ritz pairs. */
+/*     See the description of RES. */
+/*     If JOBR == 'N', Y contains the original input data, */
+/*                     scaled according to the value of JOBS. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= M */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the descriptions of TOL and  K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the POD basis for the data snapshot */
+/*     matrix X and the number of the computed Ritz pairs. */
+/*     The value of K is determined according to the rule set */
+/*     by the parameters NRNK and TOL. */
+/*     See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     REIG (output) REAL(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of REIG contain */
+/*     the real parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     See the descriptions of K, IMEIG, and Z. */
+/* ..... */
+/*     IMEIG (output) REAL(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of IMEIG contain */
+/*     the imaginary parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     The eigenvalues are determined as follows: */
+/*     If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/*     real, LAMBDA(i) = REIG(i). */
+/*     If IMEIG(i)>0, then the corresponding complex */
+/*     conjugate pair of eigenvalues reads */
+/*     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i) */
+/*     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */
+/*     That is, complex conjugate pairs have consecutive */
+/*     indices (i,i+1), with the positive imaginary part */
+/*     listed first. */
+/*     See the descriptions of K, REIG, and Z. */
+/* ..... */
+/*     Z (workspace/output) REAL(KIND=WP)  M-by-N array */
+/*     If JOBZ =='V' then */
+/*        Z contains real Ritz vectors as follows: */
+/*        If IMEIG(i)=0, then Z(:,i) is an eigenvector of */
+/*        the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/*        [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/*        the Ritz values extracted from this subspace are */
+/*        REIG(i) + sqrt(-1)*IMEIG(i) and */
+/*        REIG(i) - sqrt(-1)*IMEIG(i). */
+/*        The corresponding eigenvectors are */
+/*        Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/*        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/*        || Z(:,i:i+1)||_F = 1. */
+/*     If JOBZ == 'F', then the above descriptions hold for */
+/*     the columns of X(:,1:K)*W(1:K,1:K), where the columns */
+/*     of W(1:k,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */
+/*     are similarly structured: If IMEIG(i) == 0 then */
+/*     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */
+/*     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */
+/*          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */
+/*     are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/*     See the descriptions of REIG, IMEIG, X and W. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) N-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs. */
+/*     If LAMBDA(i) is real, then */
+/*        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */
+/*     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */
+/*     then */
+/*     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */
+/*     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */
+/*               [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */
+/*     It holds that */
+/*     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2 */
+/*     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */
+/*     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1) */
+/*           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1) */
+/*     See the description of REIG, IMEIG and Z. */
+/* ..... */
+/*     B (output) REAL(KIND=WP)  M-by-N array. */
+/*     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:M,1;K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     If JOBF =='N', then B is not referenced. */
+/*     See the descriptions of X, W, K. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= M */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     W (workspace/output) REAL(KIND=WP) N-by-N array */
+/*     On exit, W(1:K,1:K) contains the K computed */
+/*     eigenvectors of the matrix Rayleigh quotient (real and */
+/*     imaginary parts for each complex conjugate pair of the */
+/*     eigenvalues). The Ritz vectors (returned in Z) are the */
+/*     product of X (containing a POD basis for the input */
+/*     matrix X) and W. See the descriptions of K, S, X and Z. */
+/*     W is also used as a workspace to temporarily store the */
+/*     left singular vectors of X. */
+/* ..... */
+/*     LDW (input) INTEGER, LDW >= N */
+/*     The leading dimension of the array W. */
+/* ..... */
+/*     S (workspace/output) REAL(KIND=WP) N-by-N array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by SGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, WORK(1:N) contains the singular values of */
+/*     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/*     If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */
+/*     scaling factor WORK(N+2)/WORK(N+1) used to scale X */
+/*     and Y to avoid overflow in the SVD of X. */
+/*     This may be of interest if the scaling option is off */
+/*     and as many as possible smallest eigenvalues are */
+/*     desired to the highest feasible accuracy. */
+/*     If the call to SGEDMD is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the workspace vector WORK. */
+/*     LWORK is calculated as follows: */
+/*     If WHTSVD == 1 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */
+/*        If JOBZ == 'N'  then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */
+/*        Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */
+/*        workspace length of SGESVD. */
+/*     If WHTSVD == 2 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */
+/*        minimal workspace length of SGESDD. */
+/*     If WHTSVD == 3 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = N+M+MAX(3*N+1, */
+/*                        MAX(1,3*N+M,5*N),MAX(1,N)) */
+/*        is the minimal workspace length of SGESVDQ. */
+/*     If WHTSVD == 4 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */
+/*        minimal workspace length of SGEJSV. */
+/*     The above expressions are not simplified in order to */
+/*     make the usage of WORK more transparent, and for */
+/*     easier checking. In any case, LWORK >= 2. */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+/*     Local arrays */
+/*     ~~~~~~~~~~~~ */
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* ............................................................ */
+    /* Parameter adjustments */
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --reig;
+    --imeig;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1 * 1;
+    w -= w_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+
+/*    Test the input arguments */
+
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || lsame_(jobz, "N") || lsame_(
+	    jobz, "F"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && ! wntvec) {
+	*info = -3;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -4;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || *n > *m) {
+	*info = -7;
+    } else if (*ldx < *m) {
+	*info = -9;
+    } else if (*ldy < *m) {
+	*info = -11;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -12;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -13;
+    } else if (*ldz < *m) {
+	*info = -18;
+    } else if ((wntref || wntex) && *ldb < *m) {
+	*info = -21;
+    } else if (*ldw < *n) {
+	*info = -23;
+    } else if (*lds < *n) {
+	*info = -25;
+    }
+
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0) {
+/* Quick return. All output except K is void. */
+/* INFO=1 signals the void input. */
+/* In case of a workspace query, the default */
+/* minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.f;
+		work[2] = 2.f;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlwork = f2cmax(2,*n);
+	olwork = f2cmax(2,*n);
+	iminwr = 1;
+/*         SELECT CASE ( WHTSVD ) */
+	if (*whtsvd == 1) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of SGESVD: */
+/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */
+/* Computing MAX */
+	    i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1,
+		    i__2), i__2 = f2cmin(*m,*n) * 5;
+	    mwrsvd = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvd;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[
+			b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, &
+			info1);
+/* Computing MAX */
+		i__1 = mwrsvd, i__2 = (integer) rdummy[0];
+		lwrsvd = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsvd;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 2) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of SGESDD: */
+/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */
+/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */
+/* IMINWR = 8*MIN(M,N) */
+/* Computing MAX */
+	    i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,*
+		    n) << 2);
+	    mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsdd;
+	    mlwork = f2cmax(i__1,i__2);
+	    iminwr = f2cmin(*m,*n) << 3;
+	    if (lquery) {
+		sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], 
+			ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], &
+			info1);
+/* Computing MAX */
+		i__1 = mwrsdd, i__2 = (integer) rdummy[0];
+		lwrsdd = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsdd;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 3) {
+/* LWQP3 = 3*N+1 */
+/* LWORQ = MAX(N, 1) */
+/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */
+/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2) */
+/* MLWORK = N + MWRSVQ */
+/* IMINWR = M+N-1 */
+	    sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[
+		    1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &
+		    iwork[1], &c_n1, rdummy, &c_n1, rdummy2, &c_n1, &info1);
+	    iminwr = iwork[1];
+	    mwrsvq = (integer) rdummy[1];
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0];
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvq = (integer) rdummy[0];
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0];
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 4) {
+	    *(unsigned char *)jsvopt = 'J';
+/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V' */
+/* Computing MAX */
+	    i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (*
+		    n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1)
+		     + *n * *n + 6;
+	    mwrsvj = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvj;
+	    mlwork = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = 3, i__2 = *m + *n * 3;
+	    iminwr = f2cmax(i__1,i__2);
+	    if (lquery) {
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + mwrsvj;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+/*         END SELECT */
+	if (wntvec || wntex || lsame_(jobz, "F")) {
+	    *(unsigned char *)jobzl = 'V';
+	} else {
+	    *(unsigned char *)jobzl = 'N';
+	}
+/* Workspace calculation to the SGEEV call */
+	if (lsame_(jobzl, "V")) {
+/* Computing MAX */
+	    i__1 = 1, i__2 = *n << 2;
+	    mwrkev = f2cmax(i__1,i__2);
+	} else {
+/* Computing MAX */
+	    i__1 = 1, i__2 = *n * 3;
+	    mwrkev = f2cmax(i__1,i__2);
+	}
+/* Computing MAX */
+	i__1 = mlwork, i__2 = *n + mwrkev;
+	mlwork = f2cmax(i__1,i__2);
+	if (lquery) {
+	    sgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[
+		    w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1);
+/* Computing MAX */
+	    i__1 = mwrkev, i__2 = (integer) rdummy[0];
+	    lwrkev = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = olwork, i__2 = *n + lwrkev;
+	    olwork = f2cmax(i__1,i__2);
+	}
+
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -29;
+	}
+	if (*lwork < mlwork && ! lquery) {
+	    *info = -27;
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGEDMD", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	work[1] = (real) mlwork;
+	work[2] = (real) olwork;
+	return 0;
+    }
+/* ............................................................ */
+
+    ofl = slamch_("O");
+    small = slamch_("S");
+    badxy = FALSE_;
+
+/*     <1> Optional scaling of the snapshots (columns of X, Y) */
+/*     ========================================================== */
+    if (sccolx) {
+/* The columns of X will be normalized. */
+/* To prevent overflows, the column norms of X are */
+/* carefully computed using SLASSQ. */
+	*k = 0;
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = DNRM2( M, X(1,i), 1 ) */
+	    scale = zero;
+	    slassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum);
+	    if (sisnan_(&scale) || sisnan_(&ssum)) {
+		*k = 0;
+		*info = -8;
+		i__2 = -(*info);
+		xerbla_("SGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of X(:,i) overflows. First, X(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */
+/*                 Next, the norm of X(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of X(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    r__1 = one / rootsc;
+		    slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[
+			    i__ * x_dim1 + 1], m, &info2);
+		    work[i__] = -scale * (rootsc / (real) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    work[i__] = scale * rootsc;
+		    slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[
+			    i__ * x_dim1 + 1], m, &info2);
+/*                 X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)          ! INTRINSIC */
+/* LAPACK */
+		}
+	    } else {
+		work[i__] = zero;
+		++(*k);
+	    }
+	}
+	if (*k == *n) {
+/* All columns of X are zero. Return error code -8. */
+/* (the 8th input variable had an illegal value) */
+	    *k = 0;
+	    *info = -8;
+	    i__1 = -(*info);
+	    xerbla_("SGEDMD", &i__1);
+	    return 0;
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of Y. */
+	    if (work[i__] > zero) {
+		r__1 = one / work[i__];
+		sscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1);
+/*               Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (work[i__] < zero) {
+		r__1 = -work[i__];
+		r__2 = one / (real) (*m);
+		slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * 
+			y_dim1 + 1], m, &info2);
+/* LAPACK CA */
+	    } else if (y[isamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * 
+		    y_dim1] != zero) {
+/*               X(:,i) is zero vector. For consistency, */
+/*               Y(:,i) should also be zero. If Y(:,i) is not */
+/*               zero, then the data might be inconsistent or */
+/*               corrupted. If JOBS == 'C', Y(:,i) is set to */
+/*               zero and a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+		if (lsame_(jobs, "C")) {
+		    sscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1);
+		}
+/* BLAS CALL */
+	    }
+	}
+    }
+
+    if (sccoly) {
+/* The columns of Y will be normalized. */
+/* To prevent overflows, the column norms of Y are */
+/* carefully computed using SLASSQ. */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */
+	    scale = zero;
+	    slassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum);
+	    if (sisnan_(&scale) || sisnan_(&ssum)) {
+		*k = 0;
+		*info = -10;
+		i__2 = -(*info);
+		xerbla_("SGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of Y(:,i) overflows. First, Y(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */
+/*                 Next, the norm of Y(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of Y(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    r__1 = one / rootsc;
+		    slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[
+			    i__ * y_dim1 + 1], m, &info2);
+		    work[i__] = -scale * (rootsc / (real) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    work[i__] = scale * rootsc;
+		    slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[
+			    i__ * y_dim1 + 1], m, &info2);
+/*                 Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)          ! INTRINSIC */
+/* LAPACK */
+		}
+	    } else {
+		work[i__] = zero;
+	    }
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of X. */
+	    if (work[i__] > zero) {
+		r__1 = one / work[i__];
+		sscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1);
+/*               X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (work[i__] < zero) {
+		r__1 = -work[i__];
+		r__2 = one / (real) (*m);
+		slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * 
+			x_dim1 + 1], m, &info2);
+/* LAPACK CA */
+	    } else if (x[isamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * 
+		    x_dim1] != zero) {
+/*               Y(:,i) is zero vector.  If X(:,i) is not */
+/*               zero, then a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+	    }
+	}
+    }
+
+/*     <2> SVD of the data snapshot matrix X. */
+/*     ===================================== */
+/*     The left singular vectors are stored in the array X. */
+/*     The right singular vectors are in the array W. */
+/*     The array W will later on contain the eigenvectors */
+/*     of a Rayleigh quotient. */
+    numrnk = *n;
+/*      SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	i__1 = *lwork - *n;
+	sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], 
+		ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1);
+/* LAPACK CAL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 2) {
+	i__1 = *lwork - *n;
+	sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[
+		w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1);
+/* LAPACK CAL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 3) {
+	i__1 = *lwork - *n - f2cmax(2,*m);
+	i__2 = f2cmax(2,*m);
+	sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], &
+		z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], 
+		liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], &
+		i__2, &info1);
+
+	slacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK C */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 4) {
+	i__1 = *lwork - *n;
+	sgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, &
+		work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1]
+		, &i__1, &iwork[1], &info1);
+/* LAPACK CALL */
+	slacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = work[*n + 1];
+	xscl2 = work[*n + 2];
+	if (xscl1 != xscl2) {
+/* This is an exceptional situation. If the */
+/* data matrices are not scaled and the */
+/* largest singular value of X overflows. */
+/* In that case SGEJSV can return the SVD */
+/* in scaled form. The scaling factor can be used */
+/* to rescale the data (X and Y). */
+	    slascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], 
+		    ldy, &info2);
+	}
+/*      END SELECT */
+    }
+
+    if (info1 > 0) {
+/* The SVD selected subroutine did not converge. */
+/* Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (work[1] == zero) {
+/* The largest computed singular value of (scaled) */
+/* X is zero. Return error code -8 */
+/* (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("SGEDMD", &i__1);
+	return 0;
+    }
+
+/* <3> Determine the numerical rank of the data */
+/*    snapshots matrix X. This depends on the */
+/*    parameters NRNK and TOL. */
+/*      SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (work[i__] <= work[1] * *tol || work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*          END SELECT */
+    }
+/*   Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*   snapshot data in the input matrix X. */
+/* <4> Compute the Rayleigh quotient S = U^T * A * U. */
+/*    Depending on the requested outputs, the computation */
+/*    is organized to compute additional auxiliary */
+/*    matrices (for the residuals and refinements). */
+
+/*    In all formulas below, we need V_k*Sigma_k^(-1) */
+/*    where either V_k is in W(1:N,1:K), or V_k^T is in */
+/*    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    r__1 = one / work[i__];
+	    sscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1);
+/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i)      ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/* This non-unit stride access is due to the fact */
+/* that SGESVD, SGESVDQ and SGESDD return the */
+/* transposed matrix of the right singular vectors. */
+/* DO i = 1, K */
+/* CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW )    ! BLAS CALL */
+/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N)      ! INTRINSIC */
+/* END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    work[*n + i__] = one / work[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1];
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */
+/* for computing the refined Ritz vectors */
+/* (optionally, outside SGEDMD). */
+	sgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], 
+		ldw, &zero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRI */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRI */
+
+/* At this point Z contains */
+/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/* this is needed for computing the residuals. */
+/* This matrix is  returned in the array B and */
+/* it can be used to compute refined Ritz vectors. */
+/* BLAS */
+	slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC */
+/* BLAS CALL */
+	sgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], 
+		ldz, &zero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */
+/* At this point S = U^T * A * U is the Rayleigh quotient. */
+/* BLAS */
+    } else {
+/* A * U(:,1:K) is not explicitly needed and the */
+/* computation is organized differently. The Rayleigh */
+/* quotient is computed more efficiently. */
+	sgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, 
+		&zero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! IN */
+/* In the two SGEMM calls here, can use K for LDZ */
+/* B */
+	sgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset]
+		, ldw, &zero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRIN */
+/* At this point S = U^T * A * U is the Rayleigh quotient. */
+/* If the residuals are requested, save scaled V_k into Z. */
+/* Recall that V_k or V_k^T is stored in W. */
+/* BLAS */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		slacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		slacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/* <5> Compute the Ritz values and (if requested) the */
+/*   right eigenvectors of the Rayleigh quotient. */
+
+    i__1 = *lwork - *n;
+    sgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset]
+	    , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1);
+
+/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/* quotient. Even in the case of complex spectrum, all */
+/* computation is done in real arithmetic. REIG and */
+/* IMEIG are the real and the imaginary parts of the */
+/* eigenvalues, so that the spectrum is given as */
+/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */
+/* are listed at consecutive positions. For such a */
+/* complex conjugate pair of the eigenvalues, the */
+/* corresponding eigenvectors are also a complex */
+/* conjugate pair with the real and imaginary parts */
+/* stored column-wise in W at the corresponding */
+/* consecutive column indices. See the description of Z. */
+/* Also, see the description of SGEEV. */
+/* LAPACK C */
+    if (info1 > 0) {
+/* SGEEV failed to compute the eigenvalues and */
+/* eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/* <6> Compute the eigenvectors (if requested) and, */
+/* the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/* Here, if the refinement is requested, we have */
+/* A*U(:,1:K) already computed and stored in Z. */
+/* For the residuals, need Y = A * U(:,1;K) * W. */
+		sgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC */
+/* This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+		sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+		sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[
+			s_offset], lds, &zero, &z__[z_offset], ldz);
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+		slacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+	    sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+	    sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], 
+		    lds, &zero, &b[b_offset], ldb);
+/* The above call replaces the following two calls */
+/* that were used in the developing-testing phase. */
+/* CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */
+/*           LDS, ZERO, Z, LDZ) */
+/* Save a copy of Z into B and free Z for holding */
+/* the Ritz vectors. */
+/* CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/* Compute the real form of the Ritz vectors */
+	if (wntvec) {
+	    sgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], 
+		    ldw, &zero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__ = 1;
+	    while(i__ <= *k) {
+		if (imeig[i__] == zero) {
+/* have a real eigenvalue with real eigenvector */
+		    r__1 = -reig[i__];
+		    saxpy_(m, &r__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * 
+			    y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i)            ! */
+
+		    res[i__] = snrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+		    ++i__;
+		} else {
+/* Have a complex conjugate pair */
+/* REIG(i) +- sqrt(-1)*IMEIG(i). */
+/* Since all computation is done in real */
+/* arithmetic, the formula for the residual */
+/* is recast for real representation of the */
+/* complex conjugate eigenpair. See the */
+/* description of RES. */
+		    ab[0] = reig[i__];
+		    ab[1] = -imeig[i__];
+		    ab[2] = imeig[i__];
+		    ab[3] = reig[i__];
+		    r__1 = -one;
+		    sgemm_("N", "N", m, &c__2, &c__2, &r__1, &z__[i__ * 
+			    z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * 
+			    y_dim1 + 1], ldy);
+/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INT */
+/* BL */
+		    res[i__] = slange_("F", m, &c__2, &y[i__ * y_dim1 + 1], 
+			    ldy, &work[*n + 1]);
+/* LA */
+		    res[i__ + 1] = res[i__];
+		    i__ += 2;
+		}
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	work[*n + 1] = xscl1;
+	work[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/* A warning on possible data inconsistency. */
+/* This should be a rare event. */
+	*info = 4;
+    }
+/* ............................................................ */
+    return 0;
+/*     ...... */
+} /* sgedmd_ */
+
diff --git a/lapack-netlib/SRC/sgedmdq.c b/lapack-netlib/SRC/sgedmdq.c
index 447b23014..0adf3bda3 100644
--- a/lapack-netlib/SRC/sgedmdq.c
+++ b/lapack-netlib/SRC/sgedmdq.c
@@ -509,3 +509,788 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+
+/* Subroutine */ int sgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, 
+	char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, real 
+	*f, integer *ldf, real *x, integer *ldx, real *y, integer *ldy, 
+	integer *nrnk, real *tol, integer *k, real *reig, real *imeig, real *
+	z__, integer *ldz, real *res, real *b, integer *ldb, real *v, integer 
+	*ldv, real *s, integer *lds, real *work, integer *lwork, integer *
+	iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, 
+	    z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, 
+	    i__1, i__2;
+
+    /* Local variables */
+    real zero;
+    integer info1;
+    extern logical lsame_(char *, char *);
+    char jobvl[1];
+    integer minmn;
+    logical wantq;
+    integer mlwqr, olwqr;
+    logical wntex;
+    extern /* Subroutine */ int sgedmd_(char *, char *, char *, char *, 
+	    integer *, integer *, integer *, real *, integer *, real *, 
+	    integer *, integer *, real *, integer *, real *, real *, real *, 
+	    integer *, real *, real *, integer *, real *, integer *, real *, 
+	    integer *, real *, integer *, integer *, integer *, integer *), xerbla_(char *, integer *);
+    integer mlwdmd, olwdmd;
+    extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer 
+	    *, real *, real *, integer *, integer *);
+    logical sccolx, sccoly;
+    extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, 
+	    integer *, real *, integer *), slaset_(char *, integer *, 
+	    integer *, real *, real *, real *, integer *);
+    integer iminwr;
+    logical wntvec, wntvcf;
+    integer mlwgqr;
+    logical wntref;
+    integer mlwork, olwgqr, olwork;
+    real rdummy[2];
+    integer mlwmqr, olwmqr;
+    logical lquery, wntres, wnttrf, wntvcq;
+    extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real 
+	    *, integer *, real *, real *, integer *, integer *), sormqr_(char 
+	    *, char *, integer *, integer *, integer *, real *, integer *, 
+	    real *, real *, integer *, real *, integer *, integer *);
+    real one;
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real32 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ..... */
+/*     Purpose */
+/*     ======= */
+/*     SGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices, using a QR factorization */
+/*     based compression of the data. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, SGEDMDQ computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, SGEDMDQ returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office. */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. The data snapshots are the columns */
+/*     of F. The leading N-1 columns of F are denoted X and the */
+/*     trailing N-1 columns are denoted Y. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Z*V, where Z */
+/*            is orthonormal and V contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of F, V, Z. */
+/*     'Q' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Q*Z, where Z */
+/*            contains the eigenvectors of the compression of the */
+/*            underlying discretized operator onto the span of */
+/*            the data snapshots. See the descriptions of F, V, Z. */
+/*            Q is from the initial QR factorization. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will */
+/*            be computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBQ (input) CHARACTER*1 */
+/*     Specifies whether to explicitly compute and return the */
+/*     orthogonal matrix from the QR factorization. */
+/*     'Q' :: The matrix Q of the QR factorization of the data */
+/*            snapshot matrix is computed and stored in the */
+/*            array F. See the description of F. */
+/*     'N' :: The matrix Q is not explicitly computed. */
+/* ..... */
+/*     JOBT (input) CHARACTER*1 */
+/*     Specifies whether to return the upper triangular factor */
+/*     from the QR factorization. */
+/*     'R' :: The matrix R of the QR factorization of the data */
+/*            snapshot matrix F is returned in the array Y. */
+/*            See the description of Y and Further details. */
+/*     'N' :: The matrix R is not returned. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/*     To be useful on exit, this option needs JOBQ='Q'. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: SGESVD (the QR SVD algorithm) */
+/*     2 :: SGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M >= 0 */
+/*     The state space dimension (the number of rows of F) */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshots from a single trajectory, */
+/*     taken at equidistant discrete times. This is the */
+/*     number of columns of F. */
+/* ..... */
+/*     F (input/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, */
+/*     the columns of F are the sequence of data snapshots */
+/*     from a single trajectory, taken at equidistant discrete */
+/*     times. It is assumed that the column norms of F are */
+/*     in the range of the normalized floating point numbers. */
+/*     < On exit, */
+/*     If JOBQ == 'Q', the array F contains the orthogonal */
+/*     matrix/factor of the QR factorization of the initial */
+/*     data snapshots matrix F. See the description of JOBQ. */
+/*     If JOBQ == 'N', the entries in F strictly below the main */
+/*     diagonal contain, column-wise, the information on the */
+/*     Householder vectors, as returned by SGEQRF. The */
+/*     remaining information to restore the orthogonal matrix */
+/*     of the initial QR factorization is stored in WORK(1:N). */
+/*     See the description of WORK. */
+/* ..... */
+/*     LDF (input) INTEGER, LDF >= M */
+/*     The leading dimension of the array F. */
+/* ..... */
+/*     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     X is used as workspace to hold representations of the */
+/*     leading N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, the leading K columns of X contain the leading */
+/*     K left singular vectors of the above described content */
+/*     of X. To lift them to the space of the left singular */
+/*     vectors U(:,1:K)of the input data, pre-multiply with the */
+/*     Q factor from the initial QR factorization. */
+/*     See the descriptions of F, K, V  and Z. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= N */
+/*     The leading dimension of the array X */
+/* ..... */
+/*     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     Y is used as workspace to hold representations of the */
+/*     trailing N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, */
+/*     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */
+/*     triangular factor from the QR factorization of the data */
+/*     snapshot matrix F. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= N */
+/*     The leading dimension of the array Y */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the description of K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the SVD/POD basis for the leading N-1 */
+/*     data snapshots (columns of F) and the number of the */
+/*     computed Ritz pairs. The value of K is determined */
+/*     according to the rule set by the parameters NRNK and */
+/*     TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     REIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<=N) entries of REIG contain */
+/*     the real parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     See the descriptions of K, IMEIG, Z. */
+/* ..... */
+/*     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<N) entries of REIG contain */
+/*     the imaginary parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     The eigenvalues are determined as follows: */
+/*     If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/*     real, LAMBDA(i) = REIG(i). */
+/*     If IMEIG(i)>0, then the corresponding complex */
+/*     conjugate pair of eigenvalues reads */
+/*     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i) */
+/*     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i) */
+/*     That is, complex conjugate pairs have consecutive */
+/*     indices (i,i+1), with the positive imaginary part */
+/*     listed first. */
+/*     See the descriptions of K, REIG, Z. */
+/* ..... */
+/*     Z (workspace/output) REAL(KIND=WP)  M-by-(N-1) array */
+/*     If JOBZ =='V' then */
+/*        Z contains real Ritz vectors as follows: */
+/*        If IMEIG(i)=0, then Z(:,i) is an eigenvector of */
+/*        the i-th Ritz value. */
+/*        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/*        [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/*        the Ritz values extracted from this subspace are */
+/*        REIG(i) + sqrt(-1)*IMEIG(i) and */
+/*        REIG(i) - sqrt(-1)*IMEIG(i). */
+/*        The corresponding eigenvectors are */
+/*        Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/*        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/*     If JOBZ == 'F', then the above descriptions hold for */
+/*     the columns of Z*V, where the columns of V are the */
+/*     eigenvectors of the K-by-K Rayleigh quotient, and Z is */
+/*     orthonormal. The columns of V are similarly structured: */
+/*     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */
+/*     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */
+/*                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */
+/*     are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/*     See the descriptions of REIG, IMEIG, X and V. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs. */
+/*     If LAMBDA(i) is real, then */
+/*        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2. */
+/*     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */
+/*     then */
+/*     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */
+/*     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */
+/*               [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */
+/*     It holds that */
+/*     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2 */
+/*     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */
+/*     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1) */
+/*           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1) */
+/*     See the description of Z. */
+/* ..... */
+/*     B (output) REAL(KIND=WP)  MIN(M,N)-by-(N-1) array. */
+/*     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:N,1;K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     In both cases, the content of B can be lifted to the */
+/*     original dimension of the input data by pre-multiplying */
+/*     with the Q factor from the initial QR factorization. */
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) contains the K eigenvectors of */
+/*     the Rayleigh quotient. The eigenvectors of a complex */
+/*     conjugate pair of eigenvalues are returned in real form */
+/*     as explained in the description of Z. The Ritz vectors */
+/*     (returned in Z) are the product of X and V; see */
+/*     the descriptions of X and Z. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by SGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by SGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to SGEDMDQ is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector WORK. */
+/*     LWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for SGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for SGEDMD (see the */
+/*                  description of LWORK in SGEDMD) for */
+/*                  snapshots of dimensions MIN(M,N)-by-(N-1) */
+/*         MLWMQR = N (minimal workspace for */
+/*                    SORMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for SORGQR[M,N,N]) */
+/*     Then */
+/*     LWORK = MAX(N+MLWQR, N+MLWDMD) */
+/*     is updated as follows: */
+/*        if   JOBZ == 'V' or JOBZ == 'F' THEN */
+/*             LWORK = MAX( LWORK,MIN(M,N)+N-1 +MLWMQR ) */
+/*        if   JOBQ == 'Q' THEN */
+/*             LWORK = MAX( LWORK,MIN(M,N)+N-1+MLWGQR) */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     Let M1=MIN(M,N), N1=N-1. Then */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */
+/*     If on entry LIWORK = -1, then a worskpace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     Local array */
+/*     ~~~~~~~~~~~ */
+
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     External subroutines */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+    /* Parameter adjustments */
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --reig;
+    --imeig;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1 * 1;
+    v -= v_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+/* .......................................................... */
+
+/*    Test the input arguments */
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntvcf = lsame_(jobz, "F");
+    wntvcq = lsame_(jobz, "Q");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    wantq = lsame_(jobq, "Q");
+    wnttrf = lsame_(jobt, "R");
+    minmn = f2cmin(*m,*n);
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && lsame_(jobz, "N")) {
+	*info = -3;
+    } else if (! (wantq || lsame_(jobq, "N"))) {
+	*info = -4;
+    } else if (! (wnttrf || lsame_(jobt, "N"))) {
+	*info = -5;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -6;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -7;
+    } else if (*m < 0) {
+	*info = -8;
+    } else if (*n < 0 || *n > *m + 1) {
+	*info = -9;
+    } else if (*ldf < *m) {
+	*info = -11;
+    } else if (*ldx < minmn) {
+	*info = -13;
+    } else if (*ldy < minmn) {
+	*info = -15;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -16;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -17;
+    } else if (*ldz < *m) {
+	*info = -22;
+    } else if ((wntref || wntex) && *ldb < minmn) {
+	*info = -25;
+    } else if (*ldv < *n - 1) {
+	*info = -27;
+    } else if (*lds < *n - 1) {
+	*info = -29;
+    }
+
+    if (wntvec || wntvcf) {
+	*(unsigned char *)jobvl = 'V';
+    } else {
+	*(unsigned char *)jobvl = 'N';
+    }
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0 || *n == 1) {
+/* All output except K is void. INFO=1 signals */
+/* the void input. In case of a workspace query, */
+/* the minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.f;
+		work[2] = 2.f;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlwqr = f2cmax(1,*n);
+/* Minimal workspace length for SGEQRF. */
+	mlwork = f2cmin(*m,*n) + mlwqr;
+	if (lquery) {
+	    sgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1);
+	    olwqr = (integer) rdummy[0];
+	    olwork = f2cmin(*m,*n) + olwqr;
+	}
+	i__1 = *n - 1;
+	sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], 
+		ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &
+		z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], 
+		ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, &
+		info1);
+	mlwdmd = (integer) work[1];
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwdmd;
+	mlwork = f2cmax(i__1,i__2);
+	iminwr = iwork[1];
+	if (lquery) {
+	    olwdmd = (integer) work[2];
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwdmd;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	if (wntvec || wntvcf) {
+	    mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		sormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], &
+			z__[z_offset], ldz, &work[1], &c_n1, &info1);
+		olwmqr = (integer) work[1];
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (wantq) {
+	    mlwgqr = *n;
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[
+			1], &c_n1, &info1);
+		olwgqr = (integer) work[1];
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	iminwr = f2cmax(1,iminwr);
+	mlwork = f2cmax(2,mlwork);
+	if (*lwork < mlwork && ! lquery) {
+	    *info = -31;
+	}
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -33;
+	}
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGEDMDQ", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	work[1] = (real) mlwork;
+	work[2] = (real) olwork;
+	return 0;
+    }
+/* ..... */
+/*     Initial QR factorization that is used to represent the */
+/*     snapshots as elements of lower dimensional subspace. */
+/*     For large scale computation with M >>N , at this place */
+/*     one can use an out of core QRF. */
+
+    i__1 = *lwork - minmn;
+    sgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, &
+	    info1);
+
+/*     Define X and Y as the snapshots representations in the */
+/*     orthogonal basis computed in the QR factorization. */
+/*     X corresponds to the leading N-1 and Y to the trailing */
+/*     N-1 snapshots. */
+    i__1 = *n - 1;
+    slaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    slacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    slacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+    if (*m >= 3) {
+	i__1 = minmn - 2;
+	i__2 = *n - 2;
+	slaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy);
+    }
+
+/*     Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lwork - minmn;
+    sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+	     &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[
+	    z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+	    s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, &
+	    info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. */
+	*info = info1;
+	return 0;
+    } else {
+	*info = info1;
+    }
+
+/*     The Ritz vectors (Koopman modes) can be explicitly */
+/*     formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    slaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lwork - (minmn + *n - 1);
+	sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+		z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    } else if (wntvcf) {
+/*   Return the Ritz vectors (eigenvectors) in factored */
+/*   form Z*V, where Z contains orthonormal matrix (the */
+/*   product of Q from the initial QR factorization and */
+/*   the SVD/POD_basis returned by SGEDMD in X) and the */
+/*   second factor (the eigenvectors of the Rayleigh */
+/*   quotient) is in the array V, as returned by SGEDMD. */
+	slacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+	if (*m > *n) {
+	    i__1 = *m - *n;
+	    slaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz);
+	}
+	i__1 = *lwork - (minmn + *n - 1);
+	sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+		z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    }
+
+/*     Some optional output variables: */
+
+/*     The upper triangular factor in the initial QR */
+/*     factorization is optionally returned in the array Y. */
+/*     This is useful if this call to SGEDMDQ is to be */
+/*     followed by a streaming DMD that is implemented in a */
+/*     QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	slaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy);
+	slacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/*     The orthonormal/orthogonal factor in the initial QR */
+/*     factorization is optionally returned in the array F. */
+/*     Same as with the triangular factor above, this is */
+/*     useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lwork - (minmn + *n - 1);
+	sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + 
+		*n], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* sgedmdq_ */
+
diff --git a/lapack-netlib/SRC/zgedmd.c b/lapack-netlib/SRC/zgedmd.c
index 447b23014..c1b39ba3e 100644
--- a/lapack-netlib/SRC/zgedmd.c
+++ b/lapack-netlib/SRC/zgedmd.c
@@ -509,3 +509,1168 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+static integer c__1 = 1;
+static integer c__0 = 0;
+
+/* Subroutine */ int zgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, 
+	integer *whtsvd, integer *m, integer *n, doublecomplex *x, integer *
+	ldx, doublecomplex *y, integer *ldy, integer *nrnk, doublereal *tol, 
+	integer *k, doublecomplex *eigs, doublecomplex *z__, integer *ldz, 
+	doublereal *res, doublecomplex *b, integer *ldb, doublecomplex *w, 
+	integer *ldw, doublecomplex *s, integer *lds, doublecomplex *zwork, 
+	integer *lzwork, doublereal *rwork, integer *lrwork, integer *iwork, 
+	integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, 
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2, i__3, 
+	    i__4, i__5;
+    doublereal d__1, d__2;
+    doublecomplex z__1, z__2;
+
+    /* Local variables */
+    doublecomplex zone;
+    doublereal zero, ssum;
+    integer info1, info2;
+    doublereal xscl1, xscl2;
+    integer i__, j;
+    doublereal scale;
+    extern logical lsame_(char *, char *);
+    logical badxy;
+    doublereal small;
+    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *);
+    char jobzl[1];
+    extern /* Subroutine */ int zgeev_(char *, char *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     doublereal *, integer *);
+    logical wntex;
+    doublecomplex zzero;
+    extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *);
+    extern doublereal dznrm2_(integer *, doublecomplex *, integer *), dlamch_(
+	    char *);
+    extern logical disnan_(doublereal *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    char t_or_n__[1];
+    extern /* Subroutine */ int zdscal_(integer *, doublereal *, 
+	    doublecomplex *, integer *), zgesdd_(char *, integer *, integer *,
+	     doublecomplex *, integer *, doublereal *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     doublereal *, integer *, integer *), zlascl_(char *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    integer *, doublecomplex *, integer *, integer *);
+    extern integer izamax_(integer *, doublecomplex *, integer *);
+    logical sccolx, sccoly;
+    integer lwrsdd, mwrsdd;
+    extern /* Subroutine */ int zgesvd_(char *, char *, integer *, integer *, 
+	    doublecomplex *, integer *, doublereal *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     doublereal *, integer *), zlacpy_(char *, 
+	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+	     integer *);
+    integer iminwr;
+    logical wntref, wntvec;
+    doublereal rootsc;
+    integer lwrkev, mlwork, mwrkev, numrnk, olwork, lwrsvd, mwrsvd, mlrwrk;
+    logical lquery, wntres;
+    char jsvopt[1];
+    integer lwrsvj, mwrsvj;
+    doublereal rdummy[2];
+    extern /* Subroutine */ int zgejsv_(char *, char *, char *, char *, char *
+	    , char *, integer *, integer *, doublecomplex *, integer *, 
+	    doublereal *, doublecomplex *, integer *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *, doublereal *, integer *, 
+	    integer *, integer *), zlassq_(integer *, doublecomplex *, integer *, 
+	    doublereal *, doublereal *), mecago_();
+    integer lwrsvq, mwrsvq;
+    doublereal ofl, one;
+    extern /* Subroutine */ int zgesvdq_(char *, char *, char *, char *, char 
+	    *, integer *, integer *, doublecomplex *, integer *, doublereal *,
+	     doublecomplex *, integer *, doublecomplex *, integer *, integer *
+	    , integer *, integer *, doublecomplex *, integer *, doublereal *, 
+	    integer *, integer *);
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real64 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ............................................................ */
+/*     Purpose */
+/*     ======= */
+/*     ZGEDMD computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, ZGEDMD computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, ZGEDMD returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/* ...................................................................... */
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ............................................................ */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product X(:,1:K)*W, where X */
+/*            contains a POD basis (leading left singular vectors */
+/*            of the data matrix X) and W contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of K, X, W, Z. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will be */
+/*            computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: ZGESVD (the QR SVD algorithm) */
+/*     2 :: ZGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M>= 0 */
+/*     The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshot pairs */
+/*     (the number of columns of X and Y). */
+/* ..... */
+/*     X (input/output) COMPLEX(KIND=WP) M-by-N array */
+/*   > On entry, X contains the data snapshot matrix X. It is */
+/*     assumed that the column norms of X are in the range of */
+/*     the normalized floating point numbers. */
+/*   < On exit, the leading K columns of X contain a POD basis, */
+/*     i.e. the leading K left singular vectors of the input */
+/*     data matrix X, U(:,1:K). All N columns of X contain all */
+/*     left singular vectors of the input matrix X. */
+/*     See the descriptions of K, Z and W. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= M */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */
+/*   > On entry, Y contains the data snapshot matrix Y */
+/*   < On exit, */
+/*     If JOBR == 'R', the leading K columns of Y  contain */
+/*     the residual vectors for the computed Ritz pairs. */
+/*     See the description of RES. */
+/*     If JOBR == 'N', Y contains the original input data, */
+/*                     scaled according to the value of JOBS. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= M */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the descriptions of TOL and  K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the POD basis for the data snapshot */
+/*     matrix X and the number of the computed Ritz pairs. */
+/*     The value of K is determined according to the rule set */
+/*     by the parameters NRNK and TOL. */
+/*     See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     EIGS (output) COMPLEX(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of EIGS contain */
+/*     the computed eigenvalues (Ritz values). */
+/*     See the descriptions of K, and Z. */
+/* ..... */
+/*     Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array */
+/*     If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i) */
+/*     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/*     the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */
+/*     is an eigenvector corresponding to EIGS(i). The columns */
+/*     of W(1:k,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. */
+/*     See the descriptions of EIGS, X and W. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) N-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs, */
+/*     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */
+/*     See the description of EIGS and Z. */
+/* ..... */
+/*     B (output) COMPLEX(KIND=WP)  M-by-N array. */
+/*     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:M,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     If JOBF =='N', then B is not referenced. */
+/*     See the descriptions of X, W, K. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= M */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     W (workspace/output) COMPLEX(KIND=WP) N-by-N array */
+/*     On exit, W(1:K,1:K) contains the K computed */
+/*     eigenvectors of the matrix Rayleigh quotient. */
+/*     The Ritz vectors (returned in Z) are the */
+/*     product of X (containing a POD basis for the input */
+/*     matrix X) and W. See the descriptions of K, S, X and Z. */
+/*     W is also used as a workspace to temporarily store the */
+/*     right singular vectors of X. */
+/* ..... */
+/*     LDW (input) INTEGER, LDW >= N */
+/*     The leading dimension of the array W. */
+/* ..... */
+/*     S (workspace/output) COMPLEX(KIND=WP) N-by-N array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by ZGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */
+/*     ZWORK is used as complex workspace in the complex SVD, as */
+/*     specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing */
+/*     the eigenvalues of a Rayleigh quotient. */
+/*     If the call to ZGEDMD is only workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of work is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the workspace vector ZWORK. */
+/*     LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), */
+/*     where LZWORK_ZGEEV = MAX( 1, 2*N )  and the minimal */
+/*     LZWORK_SVD is calculated as follows */
+/*     If WHTSVD == 1 :: ZGESVD :: */
+/*        LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */
+/*     If WHTSVD == 2 :: ZGESDD :: */
+/*        LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */
+/*     If WHTSVD == 3 :: ZGESVDQ :: */
+/*        LZWORK_SVD = obtainable by a query */
+/*     If WHTSVD == 4 :: ZGEJSV :: */
+/*        LZWORK_SVD = obtainable by a query */
+/*     If on entry LZWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths and returns them in */
+/*     LZWORK(1) and LZWORK(2), respectively. */
+/* ..... */
+/*     RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */
+/*     On exit, RWORK(1:N) contains the singular values of */
+/*     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/*     If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */
+/*     scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */
+/*     and Y to avoid overflow in the SVD of X. */
+/*     This may be of interest if the scaling option is off */
+/*     and as many as possible smallest eigenvalues are */
+/*     desired to the highest feasible accuracy. */
+/*     If the call to ZGEDMD is only workspace query, then */
+/*     RWORK(1) contains the minimal workspace length. */
+/*     See the description of LRWORK. */
+/* ..... */
+/*     LRWORK (input) INTEGER */
+/*     The minimal length of the workspace vector RWORK. */
+/*     LRWORK is calculated as follows: */
+/*     LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where */
+/*     LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace */
+/*     for the SVD subroutine determined by the input parameter */
+/*     WHTSVD. */
+/*     If WHTSVD == 1 :: ZGESVD :: */
+/*        LRWORK_SVD = 5*MIN(M,N) */
+/*     If WHTSVD == 2 :: ZGESDD :: */
+/*        LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */
+/*        2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */
+/*     If WHTSVD == 3 :: ZGESVDQ :: */
+/*        LRWORK_SVD = obtainable by a query */
+/*     If WHTSVD == 4 :: ZGEJSV :: */
+/*        LRWORK_SVD = obtainable by a query */
+/*     If on entry LRWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     real workspace length and returns it in RWORK(1). */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for  ZWORK, RWORK and */
+/*     IWORK. See the descriptions of ZWORK, RWORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     Local arrays */
+/*     ~~~~~~~~~~~~ */
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* ............................................................ */
+    /* Parameter adjustments */
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --eigs;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1 * 1;
+    w -= w_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --zwork;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    zero = 0.f;
+    one = 1.f;
+    zzero.r = 0.f, zzero.i = 0.f;
+    zone.r = 1.f, zone.i = 0.f;
+
+/*    Test the input arguments */
+
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    *info = 0;
+    lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || lsame_(jobz, "N") || lsame_(
+	    jobz, "F"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && ! wntvec) {
+	*info = -3;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -4;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || *n > *m) {
+	*info = -7;
+    } else if (*ldx < *m) {
+	*info = -9;
+    } else if (*ldy < *m) {
+	*info = -11;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -12;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -13;
+    } else if (*ldz < *m) {
+	*info = -17;
+    } else if ((wntref || wntex) && *ldb < *m) {
+	*info = -20;
+    } else if (*ldw < *n) {
+	*info = -22;
+    } else if (*lds < *n) {
+	*info = -24;
+    }
+
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0) {
+/* Quick return. All output except K is void. */
+/* INFO=1 signals the void input. */
+/* In case of a workspace query, the default */
+/* minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		rwork[1] = 1.;
+		zwork[1].r = 2., zwork[1].i = 0.;
+		zwork[2].r = 2., zwork[2].i = 0.;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	iminwr = 1;
+	mlrwrk = f2cmax(1,*n);
+	mlwork = 2;
+	olwork = 2;
+/*         SELECT CASE ( WHTSVD ) */
+	if (*whtsvd == 1) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of ZGESVD: */
+/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */
+/* Computing MAX */
+	    i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n);
+	    mwrsvd = f2cmax(i__1,i__2);
+	    mlwork = f2cmax(mlwork,mwrsvd);
+/* Computing MAX */
+	    i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5;
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[
+			b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, 
+			rdummy, &info1);
+		lwrsvd = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvd);
+	    }
+	} else if (*whtsvd == 2) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of ZGESDD: */
+/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */
+/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */
+/* In LAPACK 3.10.1 RWORK is defined differently. */
+/* Below we take f2cmax over the two versions. */
+/* IMINWR = 8*MIN(M,N) */
+	    mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax(
+		    *m,*n);
+	    mlwork = f2cmax(mlwork,mwrsdd);
+	    iminwr = f2cmin(*m,*n) << 3;
+/* Computing MAX */
+/* Computing MAX */
+	    i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(*
+		    m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3,
+		    i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n)
+		     << 1) * f2cmin(*m,*n) + f2cmin(*m,*n);
+	    i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4);
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset],
+			 ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, &
+			iwork[1], &info1);
+/* Computing MAX */
+		i__1 = mwrsdd, i__2 = (integer) zwork[1].r;
+		lwrsdd = f2cmax(i__1,i__2);
+/* Possible bug in ZGESDD optimal workspace size. */
+		olwork = f2cmax(olwork,lwrsdd);
+	    }
+	} else if (*whtsvd == 3) {
+	    zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[
+		    1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &
+		    iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1);
+	    iminwr = iwork[1];
+	    mwrsvq = (integer) zwork[2].r;
+	    mlwork = f2cmax(mlwork,mwrsvq);
+/* Computing MAX */
+	    i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0];
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvq = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvq);
+	    }
+	} else if (*whtsvd == 4) {
+	    *(unsigned char *)jsvopt = 'J';
+	    zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, 
+		    &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[
+		    1], &c_n1, rdummy, &c_n1, &iwork[1], &info1);
+	    iminwr = iwork[1];
+	    mwrsvj = (integer) zwork[2].r;
+	    mlwork = f2cmax(mlwork,mwrsvj);
+/* Computing MAX */
+/* Computing MAX */
+	    i__3 = 7, i__4 = (integer) rdummy[0];
+	    i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4);
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvj = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvj);
+	    }
+/*         END SELECT */
+	}
+	if (wntvec || wntex || lsame_(jobz, "F")) {
+	    *(unsigned char *)jobzl = 'V';
+	} else {
+	    *(unsigned char *)jobzl = 'N';
+	}
+/* Workspace calculation to the ZGEEV call */
+/* Computing MAX */
+	i__1 = 1, i__2 = *n << 1;
+	mwrkev = f2cmax(i__1,i__2);
+	mlwork = f2cmax(mlwork,mwrkev);
+/* Computing MAX */
+	i__1 = mlrwrk, i__2 = *n + (*n << 1);
+	mlrwrk = f2cmax(i__1,i__2);
+	if (lquery) {
+	    zgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], 
+		    ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], &
+		    info1);
+	    lwrkev = (integer) zwork[1].r;
+	    olwork = f2cmax(olwork,lwrkev);
+	}
+
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -30;
+	}
+	if (*lrwork < mlrwrk && ! lquery) {
+	    *info = -28;
+	}
+	if (*lzwork < mlwork && ! lquery) {
+	    *info = -26;
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("ZGEDMD", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	rwork[1] = (doublereal) mlrwrk;
+	zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.;
+	zwork[2].r = (doublereal) olwork, zwork[2].i = 0.;
+	return 0;
+    }
+/* ............................................................ */
+
+    ofl = dlamch_("O");
+    small = dlamch_("S");
+    badxy = FALSE_;
+
+/*     <1> Optional scaling of the snapshots (columns of X, Y) */
+/*     ========================================================== */
+    if (sccolx) {
+/* The columns of X will be normalized. */
+/* To prevent overflows, the column norms of X are */
+/* carefully computed using ZLASSQ. */
+	*k = 0;
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = DZNRM2( M, X(1,i), 1 ) */
+	    scale = zero;
+	    zlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum);
+	    if (disnan_(&scale) || disnan_(&ssum)) {
+		*k = 0;
+		*info = -8;
+		i__2 = -(*info);
+		xerbla_("ZGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of X(:,i) overflows. First, X(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */
+/*                 Next, the norm of X(:,i) is stored without */
+/*                 overflow as RWORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of X(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    d__1 = one / rootsc;
+		    zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[
+			    i__ * x_dim1 + 1], ldx, &info2);
+		    rwork[i__] = -scale * (rootsc / (doublereal) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    rwork[i__] = scale * rootsc;
+		    zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, &
+			    x[i__ * x_dim1 + 1], ldx, &info2);
+/*                 X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)   ! INTRINSIC */
+/* LAPACK CALL */
+		}
+	    } else {
+		rwork[i__] = zero;
+		++(*k);
+	    }
+	}
+	if (*k == *n) {
+/* All columns of X are zero. Return error code -8. */
+/* (the 8th input variable had an illegal value) */
+	    *k = 0;
+	    *info = -8;
+	    i__1 = -(*info);
+	    xerbla_("ZGEDMD", &i__1);
+	    return 0;
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of Y. */
+	    if (rwork[i__] > zero) {
+		d__1 = one / rwork[i__];
+		zdscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1);
+/*               Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)       ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (rwork[i__] < zero) {
+		d__1 = -rwork[i__];
+		d__2 = one / (doublereal) (*m);
+		zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * 
+			y_dim1 + 1], ldy, &info2);
+/* LAPACK C */
+	    } else if (z_abs(&y[izamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ 
+		    * y_dim1]) != zero) {
+/*               X(:,i) is zero vector. For consistency, */
+/*               Y(:,i) should also be zero. If Y(:,i) is not */
+/*               zero, then the data might be inconsistent or */
+/*               corrupted. If JOBS == 'C', Y(:,i) is set to */
+/*               zero and a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+		if (lsame_(jobs, "C")) {
+		    zdscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1);
+		}
+/* BLAS CALL */
+	    }
+	}
+    }
+
+    if (sccoly) {
+/* The columns of Y will be normalized. */
+/* To prevent overflows, the column norms of Y are */
+/* carefully computed using ZLASSQ. */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* RWORK(i) = DZNRM2( M, Y(1,i), 1 ) */
+	    scale = zero;
+	    zlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum);
+	    if (disnan_(&scale) || disnan_(&ssum)) {
+		*k = 0;
+		*info = -10;
+		i__2 = -(*info);
+		xerbla_("ZGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of Y(:,i) overflows. First, Y(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */
+/*                 Next, the norm of Y(:,i) is stored without */
+/*                 overflow as RWORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of Y(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    d__1 = one / rootsc;
+		    zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[
+			    i__ * y_dim1 + 1], ldy, &info2);
+		    rwork[i__] = -scale * (rootsc / (doublereal) (*m));
+		} else {
+/*                 Y(:,i) will be scaled to unit 2-norm */
+		    rwork[i__] = scale * rootsc;
+		    zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, &
+			    y[i__ * y_dim1 + 1], ldy, &info2);
+/*                 Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)          ! INTRINSIC */
+/* LAPAC */
+		}
+	    } else {
+		rwork[i__] = zero;
+	    }
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of X. */
+	    if (rwork[i__] > zero) {
+		d__1 = one / rwork[i__];
+		zdscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1);
+/*               X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (rwork[i__] < zero) {
+		d__1 = -rwork[i__];
+		d__2 = one / (doublereal) (*m);
+		zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * 
+			x_dim1 + 1], ldx, &info2);
+/* LAPACK C */
+	    } else if (z_abs(&x[izamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ 
+		    * x_dim1]) != zero) {
+/*               Y(:,i) is zero vector.  If X(:,i) is not */
+/*               zero, then a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+	    }
+	}
+    }
+
+/*     <2> SVD of the data snapshot matrix X. */
+/*     ===================================== */
+/*     The left singular vectors are stored in the array X. */
+/*     The right singular vectors are in the array W. */
+/*     The array W will later on contain the eigenvectors */
+/*     of a Rayleigh quotient. */
+    numrnk = *n;
+/*      SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], 
+		ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &
+		info1);
+/* LA */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 2) {
+	zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, &
+		w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1]
+		, &info1);
+/* LAP */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 3) {
+	i__1 = *lrwork - *n;
+	zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], 
+		&z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], 
+		liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1);
+/* LAPACK CA */
+	zlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK C */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 4) {
+	i__1 = *lrwork - *n;
+	zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, &
+		rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], 
+		lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1);
+	zlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = rwork[*n + 1];
+	xscl2 = rwork[*n + 2];
+	if (xscl1 != xscl2) {
+/* This is an exceptional situation. If the */
+/* data matrices are not scaled and the */
+/* largest singular value of X overflows. */
+/* In that case ZGEJSV can return the SVD */
+/* in scaled form. The scaling factor can be used */
+/* to rescale the data (X and Y). */
+	    zlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], 
+		    ldy, &info2);
+	}
+/*      END SELECT */
+    }
+
+    if (info1 > 0) {
+/* The SVD selected subroutine did not converge. */
+/* Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (rwork[1] == zero) {
+/* The largest computed singular value of (scaled) */
+/* X is zero. Return error code -8 */
+/* (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("ZGEDMD", &i__1);
+	return 0;
+    }
+
+/* <3> Determine the numerical rank of the data */
+/*    snapshots matrix X. This depends on the */
+/*    parameters NRNK and TOL. */
+/*      SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*          END SELECT */
+    }
+/*   Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*   snapshot data in the input matrix X. */
+/* <4> Compute the Rayleigh quotient S = U^H * A * U. */
+/*    Depending on the requested outputs, the computation */
+/*    is organized to compute additional auxiliary */
+/*    matrices (for the residuals and refinements). */
+
+/*    In all formulas below, we need V_k*Sigma_k^(-1) */
+/*    where either V_k is in W(1:N,1:K), or V_k^H is in */
+/*    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    d__1 = one / rwork[i__];
+	    zdscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1);
+/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)      ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/* This non-unit stride access is due to the fact */
+/* that ZGESVD, ZGESVDQ and ZGESDD return the */
+/* adjoint matrix of the right singular vectors. */
+/* DO i = 1, K */
+/* CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW )    ! BLAS CALL */
+/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC */
+/* END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    rwork[*n + i__] = one / rwork[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * w_dim1;
+		i__4 = *n + i__;
+		z__2.r = rwork[i__4], z__2.i = zero;
+		i__5 = i__ + j * w_dim1;
+		z__1.r = z__2.r * w[i__5].r - z__2.i * w[i__5].i, z__1.i = 
+			z__2.r * w[i__5].i + z__2.i * w[i__5].r;
+		w[i__3].r = z__1.r, w[i__3].i = z__1.i;
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */
+/* for computing the refined Ritz vectors */
+/* (optionally, outside ZGEDMD). */
+	zgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset],
+		 ldw, &zzero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))                   ! */
+
+/* At this point Z contains */
+/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/* this is needed for computing the residuals. */
+/* This matrix is  returned in the array B and */
+/* it can be used to compute refined Ritz vectors. */
+/* BLA */
+	zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC */
+/* BLAS CALL */
+	zgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], 
+		ldz, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) */
+/* At this point S = U^H * A * U is the Rayleigh quotient. */
+/* BLA */
+    } else {
+/* A * U(:,1:K) is not explicitly needed and the */
+/* computation is organized differently. The Rayleigh */
+/* quotient is computed more efficiently. */
+	zgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy,
+		 &zzero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) */
+
+	zgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[
+		w_offset], ldw, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))                 ! */
+/* At this point S = U^H * A * U is the Rayleigh quotient. */
+/* If the residuals are requested, save scaled V_k into Z. */
+/* Recall that V_k or V_k^H is stored in W. */
+/* BLAS */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		zlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		zlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/* <5> Compute the Ritz values and (if requested) the */
+/*   right eigenvectors of the Rayleigh quotient. */
+
+    zgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[
+	    w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1);
+
+/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/* quotient.  See the description of Z. */
+/* Also, see the description of ZGEEV. */
+/* LAPACK CALL */
+    if (info1 > 0) {
+/* ZGEEV failed to compute the eigenvalues and */
+/* eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/* <6> Compute the eigenvectors (if requested) and, */
+/* the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/* Here, if the refinement is requested, we have */
+/* A*U(:,1:K) already computed and stored in Z. */
+/* For the residuals, need Y = A * U(:,1;K) * W. */
+		zgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)        ! INTRINSIC */
+/* This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */
+		zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+		zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[
+			s_offset], lds, &zzero, &z__[z_offset], ldz);
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+		zlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+	    zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zzero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+	    zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], 
+		    lds, &zzero, &b[b_offset], ldb);
+/* The above call replaces the following two calls */
+/* that were used in the developing-testing phase. */
+/* CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */
+/*           LDS, ZZERO, Z, LDZ) */
+/* Save a copy of Z into B and free Z for holding */
+/* the Ritz vectors. */
+/* CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/* Compute the Ritz vectors */
+	if (wntvec) {
+	    zgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], 
+		    ldw, &zzero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__1 = *k;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		i__2 = i__;
+		z__1.r = -eigs[i__2].r, z__1.i = -eigs[i__2].i;
+		zaxpy_(m, &z__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * 
+			y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)            ! INTR */
+/* BLAS */
+		res[i__] = dznrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+/* BLAS */
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	rwork[*n + 1] = xscl1;
+	rwork[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/* A warning on possible data inconsistency. */
+/* This should be a rare event. */
+	*info = 4;
+    }
+/* ............................................................ */
+    return 0;
+/*     ...... */
+} /* zgedmd_ */
+
diff --git a/lapack-netlib/SRC/zgedmdq.c b/lapack-netlib/SRC/zgedmdq.c
index 447b23014..1815f0814 100644
--- a/lapack-netlib/SRC/zgedmdq.c
+++ b/lapack-netlib/SRC/zgedmdq.c
@@ -509,3 +509,785 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+
+/* Subroutine */ int zgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, 
+	char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, 
+	doublecomplex *f, integer *ldf, doublecomplex *x, integer *ldx, 
+	doublecomplex *y, integer *ldy, integer *nrnk, doublereal *tol, 
+	integer *k, doublecomplex *eigs, doublecomplex *z__, integer *ldz, 
+	doublereal *res, doublecomplex *b, integer *ldb, doublecomplex *v, 
+	integer *ldv, doublecomplex *s, integer *lds, doublecomplex *zwork, 
+	integer *lzwork, doublereal *work, integer *lwork, integer *iwork, 
+	integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, 
+	    z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, 
+	    i__1, i__2;
+
+    /* Local variables */
+    doublereal zero;
+    integer info1;
+    extern logical lsame_(char *, char *);
+    char jobvl[1];
+    integer minmn;
+    logical wantq;
+    integer mlwqr, olwqr;
+    logical wntex;
+    doublecomplex zzero;
+    extern /* Subroutine */ int zgedmd_(char *, char *, char *, char *, 
+	    integer *, integer *, integer *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, integer *, doublereal *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *, doublereal *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *, 
+	    doublereal *, integer *, integer *, integer *, integer *), xerbla_(char *, integer *);
+    integer mlwdmd, olwdmd;
+    logical sccolx, sccoly;
+    extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *,
+	     integer *, doublecomplex *, doublecomplex *, integer *, integer *
+	    ), zlacpy_(char *, integer *, integer *, doublecomplex *, integer 
+	    *, doublecomplex *, integer *), zlaset_(char *, integer *,
+	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	    integer *);
+    integer iminwr;
+    logical wntvec, wntvcf;
+    integer mlwgqr;
+    logical wntref;
+    integer mlwork, olwgqr, olwork, mlrwrk, mlwmqr, olwmqr;
+    logical lquery, wntres, wnttrf, wntvcq;
+    extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, integer *), zunmqr_(char *, char *, integer *, integer 
+	    *, integer *, doublecomplex *, integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+    doublereal one;
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real64 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ..... */
+/*     Purpose */
+/*     ======= */
+/*     ZGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices, using a QR factorization */
+/*     based compression of the data. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, ZGEDMDQ computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, ZGEDMDQ returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office. */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. The data snapshots are the columns */
+/*     of F. The leading N-1 columns of F are denoted X and the */
+/*     trailing N-1 columns are denoted Y. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Z*V, where Z */
+/*            is orthonormal and V contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of F, V, Z. */
+/*     'Q' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Q*Z, where Z */
+/*            contains the eigenvectors of the compression of the */
+/*            underlying discretized operator onto the span of */
+/*            the data snapshots. See the descriptions of F, V, Z. */
+/*            Q is from the initial QR factorization. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will */
+/*            be computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBQ (input) CHARACTER*1 */
+/*     Specifies whether to explicitly compute and return the */
+/*     unitary matrix from the QR factorization. */
+/*     'Q' :: The matrix Q of the QR factorization of the data */
+/*            snapshot matrix is computed and stored in the */
+/*            array F. See the description of F. */
+/*     'N' :: The matrix Q is not explicitly computed. */
+/* ..... */
+/*     JOBT (input) CHARACTER*1 */
+/*     Specifies whether to return the upper triangular factor */
+/*     from the QR factorization. */
+/*     'R' :: The matrix R of the QR factorization of the data */
+/*            snapshot matrix F is returned in the array Y. */
+/*            See the description of Y and Further details. */
+/*     'N' :: The matrix R is not returned. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/*     To be useful on exit, this option needs JOBQ='Q'. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: ZGESVD (the QR SVD algorithm) */
+/*     2 :: ZGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M >= 0 */
+/*     The state space dimension (the number of rows of F). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshots from a single trajectory, */
+/*     taken at equidistant discrete times. This is the */
+/*     number of columns of F. */
+/* ..... */
+/*     F (input/output) COMPLEX(KIND=WP) M-by-N array */
+/*     > On entry, */
+/*     the columns of F are the sequence of data snapshots */
+/*     from a single trajectory, taken at equidistant discrete */
+/*     times. It is assumed that the column norms of F are */
+/*     in the range of the normalized floating point numbers. */
+/*     < On exit, */
+/*     If JOBQ == 'Q', the array F contains the orthogonal */
+/*     matrix/factor of the QR factorization of the initial */
+/*     data snapshots matrix F. See the description of JOBQ. */
+/*     If JOBQ == 'N', the entries in F strictly below the main */
+/*     diagonal contain, column-wise, the information on the */
+/*     Householder vectors, as returned by ZGEQRF. The */
+/*     remaining information to restore the orthogonal matrix */
+/*     of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */
+/*     See the description of ZWORK. */
+/* ..... */
+/*     LDF (input) INTEGER, LDF >= M */
+/*     The leading dimension of the array F. */
+/* ..... */
+/*     X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     X is used as workspace to hold representations of the */
+/*     leading N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, the leading K columns of X contain the leading */
+/*     K left singular vectors of the above described content */
+/*     of X. To lift them to the space of the left singular */
+/*     vectors U(:,1:K) of the input data, pre-multiply with the */
+/*     Q factor from the initial QR factorization. */
+/*     See the descriptions of F, K, V  and Z. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= N */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */
+/*     Y is used as workspace to hold representations of the */
+/*     trailing N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, */
+/*     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */
+/*     triangular factor from the QR factorization of the data */
+/*     snapshot matrix F. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= N */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the description of K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the SVD/POD basis for the leading N-1 */
+/*     data snapshots (columns of F) and the number of the */
+/*     computed Ritz pairs. The value of K is determined */
+/*     according to the rule set by the parameters NRNK and */
+/*     TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<=N-1) entries of EIGS contain */
+/*     the computed eigenvalues (Ritz values). */
+/*     See the descriptions of K, and Z. */
+/* ..... */
+/*     Z (workspace/output) COMPLEX(KIND=WP)  M-by-(N-1) array */
+/*     If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */
+/*     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/*     Z*V, where Z contains orthonormal matrix (the product of */
+/*     Q from the initial QR factorization and the SVD/POD_basis */
+/*     returned by ZGEDMD in X) and the second factor (the */
+/*     eigenvectors of the Rayleigh quotient) is in the array V, */
+/*     as returned by ZGEDMD. That is,  X(:,1:K)*V(:,i) */
+/*     is an eigenvector corresponding to EIGS(i). The columns */
+/*     of V(1:K,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. */
+/*     See the descriptions of EIGS, X and V. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs, */
+/*     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */
+/*     See the description of EIGS and Z. */
+/* ..... */
+/*     B (output) COMPLEX(KIND=WP)  MIN(M,N)-by-(N-1) array. */
+/*     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:N,1;K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     In both cases, the content of B can be lifted to the */
+/*     original dimension of the input data by pre-multiplying */
+/*     with the Q factor from the initial QR factorization. */
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) V contains the K eigenvectors of */
+/*     the Rayleigh quotient. The Ritz vectors */
+/*     (returned in Z) are the product of Q from the initial QR */
+/*     factorization (see the description of F) X (see the */
+/*     description of X) and V. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by ZGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     ZWORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by ZGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     If the call to ZGEDMDQ is only workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of work is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector ZWORK. */
+/*     LZWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for ZGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for ZGEDMD (see the */
+/*                  description of LWORK in ZGEDMD) */
+/*         MLWMQR = N (minimal workspace for */
+/*                    ZUNMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) */
+/*         MINMN  = MIN(M,N) */
+/*     Then */
+/*     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */
+/*     is further updated as follows: */
+/*        if   JOBZ == 'V' or JOBZ == 'F' THEN */
+/*             LZWORK = MAX(LZWORK, MINMN+MLWMQR) */
+/*        if   JOBQ == 'Q' THEN */
+/*             LZWORK = MAX(ZLWORK, MINMN+MLWGQR) */
+
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to ZGEDMDQ is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. hence, the */
+/*     length of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector WORK. */
+/*     LWORK is the same as in ZGEDMD, because in ZGEDMDQ */
+/*     only ZGEDMD requires real workspace for snapshots */
+/*     of dimensions MIN(M,N)-by-(N-1). */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace length for WORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     Let M1=MIN(M,N), N1=N-1. Then */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP ) */
+
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     External subroutines */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* .......................................................... */
+    /* Parameter adjustments */
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --eigs;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1 * 1;
+    v -= v_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --zwork;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+    zzero.r = 0.f, zzero.i = 0.f;
+
+/*    Test the input arguments */
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntvcf = lsame_(jobz, "F");
+    wntvcq = lsame_(jobz, "Q");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    wantq = lsame_(jobq, "Q");
+    wnttrf = lsame_(jobt, "R");
+    minmn = f2cmin(*m,*n);
+    *info = 0;
+    lquery = *lzwork == -1 || *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && lsame_(jobz, "N")) {
+	*info = -3;
+    } else if (! (wantq || lsame_(jobq, "N"))) {
+	*info = -4;
+    } else if (! (wnttrf || lsame_(jobt, "N"))) {
+	*info = -5;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -6;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -7;
+    } else if (*m < 0) {
+	*info = -8;
+    } else if (*n < 0 || *n > *m + 1) {
+	*info = -9;
+    } else if (*ldf < *m) {
+	*info = -11;
+    } else if (*ldx < minmn) {
+	*info = -13;
+    } else if (*ldy < minmn) {
+	*info = -15;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -16;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -17;
+    } else if (*ldz < *m) {
+	*info = -21;
+    } else if ((wntref || wntex) && *ldb < minmn) {
+	*info = -24;
+    } else if (*ldv < *n - 1) {
+	*info = -26;
+    } else if (*lds < *n - 1) {
+	*info = -28;
+    }
+
+    if (wntvec || wntvcf || wntvcq) {
+	*(unsigned char *)jobvl = 'V';
+    } else {
+	*(unsigned char *)jobvl = 'N';
+    }
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0 || *n == 1) {
+/* All output except K is void. INFO=1 signals */
+/* the void input. In case of a workspace query, */
+/* the minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		zwork[1].r = 2., zwork[1].i = 0.;
+		zwork[2].r = 2., zwork[2].i = 0.;
+		work[1] = 2.;
+		work[2] = 2.;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlrwrk = 2;
+	mlwork = 2;
+	olwork = 2;
+	iminwr = 1;
+	mlwqr = f2cmax(1,*n);
+/* Minimal workspace length for ZGEQRF. */
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwqr;
+	mlwork = f2cmax(i__1,i__2);
+	if (lquery) {
+	    zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, &
+		    info1);
+	    olwqr = (integer) zwork[1].r;
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwqr;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	i__1 = *n - 1;
+	zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], 
+		ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset]
+		, ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+		s_offset], lds, &zwork[1], &c_n1, &work[1], &c_n1, &iwork[1], 
+		&c_n1, &info1);
+	mlwdmd = (integer) zwork[1].r;
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwdmd;
+	mlwork = f2cmax(i__1,i__2);
+/* Computing MAX */
+	i__1 = mlrwrk, i__2 = (integer) work[1];
+	mlrwrk = f2cmax(i__1,i__2);
+	iminwr = f2cmax(iminwr,iwork[1]);
+	if (lquery) {
+	    olwdmd = (integer) zwork[2].r;
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwdmd;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	if (wntvec || wntvcf) {
+	    mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + mlwmqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		zunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], 
+			&z__[z_offset], ldz, &zwork[1], &c_n1, &info1);
+		olwmqr = (integer) zwork[1].r;
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + olwmqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (wantq) {
+	    mlwgqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + mlwgqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &
+			zwork[1], &c_n1, &info1);
+		olwgqr = (integer) zwork[1].r;
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + olwgqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -34;
+	}
+	if (*lwork < mlrwrk && ! lquery) {
+	    *info = -32;
+	}
+	if (*lzwork < mlwork && ! lquery) {
+	    *info = -30;
+	}
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("ZGEDMDQ", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.;
+	zwork[2].r = (doublereal) olwork, zwork[2].i = 0.;
+	work[1] = (doublereal) mlrwrk;
+	work[2] = (doublereal) mlrwrk;
+	return 0;
+    }
+/* ..... */
+/*     Initial QR factorization that is used to represent the */
+/*     snapshots as elements of lower dimensional subspace. */
+/*     For large scale computation with M >> N, at this place */
+/*     one can use an out of core QRF. */
+
+    i__1 = *lzwork - minmn;
+    zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, &
+	    info1);
+
+/*     Define X and Y as the snapshots representations in the */
+/*     orthogonal basis computed in the QR factorization. */
+/*     X corresponds to the leading N-1 and Y to the trailing */
+/*     N-1 snapshots. */
+    i__1 = *n - 1;
+    zlaset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    zlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    zlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+    if (*m >= 3) {
+	i__1 = minmn - 2;
+	i__2 = *n - 2;
+	zlaset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy);
+    }
+
+/*     Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lzwork - minmn;
+    zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+	     &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, &
+	    res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, &
+	    zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, &
+	    info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. See ZGEDMD for details. */
+	*info = info1;
+	return 0;
+    } else {
+	*info = info1;
+    }
+
+/*     The Ritz vectors (Koopman modes) can be explicitly */
+/*     formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    zlaset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lzwork - minmn;
+	zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[
+		z_offset], ldz, &zwork[minmn + 1], &i__1, &info1);
+    } else if (wntvcf) {
+/*   Return the Ritz vectors (eigenvectors) in factored */
+/*   form Z*V, where Z contains orthonormal matrix (the */
+/*   product of Q from the initial QR factorization and */
+/*   the SVD/POD_basis returned by ZGEDMD in X) and the */
+/*   second factor (the eigenvectors of the Rayleigh */
+/*   quotient) is in the array V, as returned by ZGEDMD. */
+	zlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+	if (*m > *n) {
+	    i__1 = *m - *n;
+	    zlaset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz);
+	}
+	i__1 = *lzwork - minmn;
+	zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[
+		z_offset], ldz, &zwork[minmn + 1], &i__1, &info1);
+    }
+
+/*     Some optional output variables: */
+
+/*     The upper triangular factor R in the initial QR */
+/*     factorization is optionally returned in the array Y. */
+/*     This is useful if this call to ZGEDMDQ is to be */
+/*     followed by a streaming DMD that is implemented in a */
+/*     QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	zlaset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy);
+	zlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/*     The orthonormal/unitary factor Q in the initial QR */
+/*     factorization is optionally returned in the array F. */
+/*     Same as with the triangular factor above, this is */
+/*     useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lzwork - minmn;
+	zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn 
+		+ 1], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* zgedmdq_ */
+

From fa6d06359a28ddba5996d9e25120d9907d83a1dd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 21 Jun 2023 17:17:31 +0200
Subject: [PATCH 148/718] correct list placement of zgedmd/zgedmdq

---
 cmake/lapack.cmake | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 12127531d..d339f0ce9 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -915,7 +915,8 @@ set(ZLASRC
    zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
    zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
    zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
-   zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
+   zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c
+   zgedmd.c zgedmdq.c)
 
 set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
    zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
@@ -925,7 +926,7 @@ set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
    zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c
    zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c
    zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c
-   zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c zgedmd.c zgedmdq.c)
+   zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c)
 
 
 if(USE_XBLAS)

From 42fd3f4ec7d091519eafa60e4e08849aa19af98d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 21 Jun 2023 22:52:31 +0200
Subject: [PATCH 149/718] Add standard module path for Windows flang

---
 azure-pipelines.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 16b9da4f5..65ef538e9 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -115,7 +115,7 @@ jobs:
       mkdir build 
       cd build
       call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-      cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
+      cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
       cmake --build . --config Release
       ctest
 

From 9d3154701ca0391d1e3dedd7b79739bef7dbe65d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Jun 2023 16:09:14 +0200
Subject: [PATCH 150/718] Fix confusing use of "minor" in inline documentation
 (Reference-LAPACK PR849)

---
 lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f | 4 ++--
 lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f | 4 ++--
 lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f | 4 ++--
 lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
index f9384b416..1b8e53cc2 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f
index 1575bd95c..e0a621b2e 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f
index 67ebae335..f3b66a9e3 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
index bef27afa7..dda42faf9 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>

From f12b1c7b08c3f6aa00a0133bcf056be91adc7bfd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Jun 2023 16:10:27 +0200
Subject: [PATCH 151/718] Fix confusing use of "minor" in inline documentation
 (Reference-LAPACK PR849)

---
 lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f | 4 ++--
 lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f | 4 ++--
 lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f | 4 ++--
 lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
index c810a1533..b9dffa4ec 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f
index e49200ea5..e68a559ba 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f
index 65895502b..7401f8844 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
index 449c7ac95..80b7c7f43 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>

From 219a73a394d8bf4cd31a7cd1c534f83f205c5c70 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Jun 2023 16:18:58 +0200
Subject: [PATCH 152/718] Fix confusing use of "minor" in inline documentation
 (Reference-LAPACK PR849)

---
 lapack-netlib/SRC/zcposv.f       |  8 ++++----
 lapack-netlib/SRC/zhegv.f        |  2 +-
 lapack-netlib/SRC/zhegv_2stage.f |  2 +-
 lapack-netlib/SRC/zhegvd.f       |  2 +-
 lapack-netlib/SRC/zhegvx.f       |  2 +-
 lapack-netlib/SRC/zhpgv.f        |  2 +-
 lapack-netlib/SRC/zhpgvd.f       |  2 +-
 lapack-netlib/SRC/zhpgvx.f       |  2 +-
 lapack-netlib/SRC/zla_porpvgrw.f |  6 +++---
 lapack-netlib/SRC/zpbsv.f        |  6 +++---
 lapack-netlib/SRC/zpbsvx.f       | 10 +++++-----
 lapack-netlib/SRC/zpbtf2.f       |  4 ++--
 lapack-netlib/SRC/zpbtrf.f       |  4 ++--
 lapack-netlib/SRC/zpftrf.f       |  4 ++--
 lapack-netlib/SRC/zposv.f        |  6 +++---
 lapack-netlib/SRC/zposvx.f       | 10 +++++-----
 lapack-netlib/SRC/zposvxx.f      |  2 +-
 lapack-netlib/SRC/zpotf2.f       |  4 ++--
 lapack-netlib/SRC/zpotrf.f       |  4 ++--
 lapack-netlib/SRC/zpotrf2.f      |  4 ++--
 lapack-netlib/SRC/zppsv.f        |  6 +++---
 lapack-netlib/SRC/zppsvx.f       | 10 +++++-----
 lapack-netlib/SRC/zpptrf.f       |  4 ++--
 lapack-netlib/SRC/zpteqr.f       |  4 ++--
 lapack-netlib/SRC/zptsv.f        |  4 ++--
 lapack-netlib/SRC/zptsvx.f       | 10 +++++-----
 lapack-netlib/SRC/zpttrf.f       |  4 ++--
 27 files changed, 64 insertions(+), 64 deletions(-)

diff --git a/lapack-netlib/SRC/zcposv.f b/lapack-netlib/SRC/zcposv.f
index 9b1940d60..649d2c049 100644
--- a/lapack-netlib/SRC/zcposv.f
+++ b/lapack-netlib/SRC/zcposv.f
@@ -187,10 +187,10 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of
-*>                (COMPLEX*16) A is not positive definite, so the
-*>                factorization could not be completed, and the solution
-*>                has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of (COMPLEX*16) A is not positive, so the factorization
+*>                could not be completed, and the solution has not been
+*>                computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zhegv.f b/lapack-netlib/SRC/zhegv.f
index 41657e3be..c973bd0fc 100644
--- a/lapack-netlib/SRC/zhegv.f
+++ b/lapack-netlib/SRC/zhegv.f
@@ -160,7 +160,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhegv_2stage.f b/lapack-netlib/SRC/zhegv_2stage.f
index fda651e5e..91ac09311 100644
--- a/lapack-netlib/SRC/zhegv_2stage.f
+++ b/lapack-netlib/SRC/zhegv_2stage.f
@@ -179,7 +179,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f
index eeda656ad..2c3586517 100644
--- a/lapack-netlib/SRC/zhegvd.f
+++ b/lapack-netlib/SRC/zhegvd.f
@@ -212,7 +212,7 @@
 *>                    the submatrix lying in rows and columns INFO/(N+1)
 *>                    through mod(INFO,N+1);
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhegvx.f b/lapack-netlib/SRC/zhegvx.f
index ac9f9ef1a..71ed1c4ca 100644
--- a/lapack-netlib/SRC/zhegvx.f
+++ b/lapack-netlib/SRC/zhegvx.f
@@ -280,7 +280,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhpgv.f b/lapack-netlib/SRC/zhpgv.f
index 72876b6e4..b92168555 100644
--- a/lapack-netlib/SRC/zhpgv.f
+++ b/lapack-netlib/SRC/zhpgv.f
@@ -144,7 +144,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not convergeto zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f
index e96e39738..e9688f0c7 100644
--- a/lapack-netlib/SRC/zhpgvd.f
+++ b/lapack-netlib/SRC/zhpgvd.f
@@ -205,7 +205,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not convergeto zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhpgvx.f b/lapack-netlib/SRC/zhpgvx.f
index 94d7f7733..de75b486b 100644
--- a/lapack-netlib/SRC/zhpgvx.f
+++ b/lapack-netlib/SRC/zhpgvx.f
@@ -250,7 +250,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zla_porpvgrw.f b/lapack-netlib/SRC/zla_porpvgrw.f
index 9b381a072..897589aa0 100644
--- a/lapack-netlib/SRC/zla_porpvgrw.f
+++ b/lapack-netlib/SRC/zla_porpvgrw.f
@@ -142,9 +142,9 @@
 *     .. Executable Statements ..
       UPPER = LSAME( 'Upper', UPLO )
 *
-*     DPOTRF will have factored only the NCOLSxNCOLS leading minor, so
-*     we restrict the growth search to that minor and use only the first
-*     2*NCOLS workspace entries.
+*     DPOTRF will have factored only the NCOLSxNCOLS leading submatrix,
+*     so we restrict the growth search to that submatrix and use only
+*     the first 2*NCOLS workspace entries.
 *
       RPVGRW = 1.0D+0
       DO I = 1, 2*NCOLS
diff --git a/lapack-netlib/SRC/zpbsv.f b/lapack-netlib/SRC/zpbsv.f
index fe6baf8b5..ef212bbc3 100644
--- a/lapack-netlib/SRC/zpbsv.f
+++ b/lapack-netlib/SRC/zpbsv.f
@@ -119,9 +119,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zpbsvx.f b/lapack-netlib/SRC/zpbsvx.f
index 1efd2fd1e..724102376 100644
--- a/lapack-netlib/SRC/zpbsvx.f
+++ b/lapack-netlib/SRC/zpbsvx.f
@@ -70,7 +70,7 @@
 *>    where U is an upper triangular band matrix, and L is a lower
 *>    triangular band matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -280,10 +280,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/zpbtf2.f b/lapack-netlib/SRC/zpbtf2.f
index 7b90af47e..39bf6c3fa 100644
--- a/lapack-netlib/SRC/zpbtf2.f
+++ b/lapack-netlib/SRC/zpbtf2.f
@@ -97,8 +97,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpbtrf.f b/lapack-netlib/SRC/zpbtrf.f
index 61cd1d434..80afb0836 100644
--- a/lapack-netlib/SRC/zpbtrf.f
+++ b/lapack-netlib/SRC/zpbtrf.f
@@ -92,8 +92,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpftrf.f b/lapack-netlib/SRC/zpftrf.f
index 6d18f2cb3..4d4a5116e 100644
--- a/lapack-netlib/SRC/zpftrf.f
+++ b/lapack-netlib/SRC/zpftrf.f
@@ -91,8 +91,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *>
 *>  Further Notes on RFP Format:
diff --git a/lapack-netlib/SRC/zposv.f b/lapack-netlib/SRC/zposv.f
index 3bb625876..0e91cde2f 100644
--- a/lapack-netlib/SRC/zposv.f
+++ b/lapack-netlib/SRC/zposv.f
@@ -110,9 +110,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zposvx.f b/lapack-netlib/SRC/zposvx.f
index f9e9b1d5f..6c06dbd57 100644
--- a/lapack-netlib/SRC/zposvx.f
+++ b/lapack-netlib/SRC/zposvx.f
@@ -70,7 +70,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -276,10 +276,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/zposvxx.f b/lapack-netlib/SRC/zposvxx.f
index 6735fab71..3886c66f0 100644
--- a/lapack-netlib/SRC/zposvxx.f
+++ b/lapack-netlib/SRC/zposvxx.f
@@ -87,7 +87,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*>    3. If the leading i-by-i principal minor is not positive definite,
+*>    3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A (see argument RCOND).  If the reciprocal of the condition number
diff --git a/lapack-netlib/SRC/zpotf2.f b/lapack-netlib/SRC/zpotf2.f
index eb88d617c..b48f3d654 100644
--- a/lapack-netlib/SRC/zpotf2.f
+++ b/lapack-netlib/SRC/zpotf2.f
@@ -89,8 +89,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpotrf.f b/lapack-netlib/SRC/zpotrf.f
index 3b7018276..3edae84fb 100644
--- a/lapack-netlib/SRC/zpotrf.f
+++ b/lapack-netlib/SRC/zpotrf.f
@@ -87,8 +87,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpotrf2.f b/lapack-netlib/SRC/zpotrf2.f
index 859ddc75f..67430f231 100644
--- a/lapack-netlib/SRC/zpotrf2.f
+++ b/lapack-netlib/SRC/zpotrf2.f
@@ -86,8 +86,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zppsv.f b/lapack-netlib/SRC/zppsv.f
index 19536e204..f466266e4 100644
--- a/lapack-netlib/SRC/zppsv.f
+++ b/lapack-netlib/SRC/zppsv.f
@@ -104,9 +104,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zppsvx.f b/lapack-netlib/SRC/zppsvx.f
index f94badf78..60d07cbc7 100644
--- a/lapack-netlib/SRC/zppsvx.f
+++ b/lapack-netlib/SRC/zppsvx.f
@@ -69,7 +69,7 @@
 *>    where U is an upper triangular matrix, L is a lower triangular
 *>    matrix, and **H indicates conjugate transpose.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -262,10 +262,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/zpptrf.f b/lapack-netlib/SRC/zpptrf.f
index a34d63913..a103e5eca 100644
--- a/lapack-netlib/SRC/zpptrf.f
+++ b/lapack-netlib/SRC/zpptrf.f
@@ -79,8 +79,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpteqr.f b/lapack-netlib/SRC/zpteqr.f
index a81a6ad94..897136c76 100644
--- a/lapack-netlib/SRC/zpteqr.f
+++ b/lapack-netlib/SRC/zpteqr.f
@@ -123,8 +123,8 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, and i is:
 *>                <= N  the Cholesky factorization of the matrix could
-*>                      not be performed because the i-th principal minor
-*>                      was not positive definite.
+*>                      not be performed because the leading principal
+*>                      minor of order i was not positive.
 *>                > N   the SVD algorithm failed to converge;
 *>                      if INFO = N+i, i off-diagonal elements of the
 *>                      bidiagonal factor did not converge to zero.
diff --git a/lapack-netlib/SRC/zptsv.f b/lapack-netlib/SRC/zptsv.f
index d53e17a34..191adaff6 100644
--- a/lapack-netlib/SRC/zptsv.f
+++ b/lapack-netlib/SRC/zptsv.f
@@ -94,8 +94,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the solution has not been
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the solution has not been
 *>                computed.  The factorization has not been completed
 *>                unless i = N.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zptsvx.f b/lapack-netlib/SRC/zptsvx.f
index 4a40768f1..94409a7a1 100644
--- a/lapack-netlib/SRC/zptsvx.f
+++ b/lapack-netlib/SRC/zptsvx.f
@@ -60,7 +60,7 @@
 *>    factorization can also be regarded as having the form
 *>    A = U**H*D*U.
 *>
-*> 2. If the leading i-by-i principal minor is not positive definite,
+*> 2. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -205,10 +205,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/zpttrf.f b/lapack-netlib/SRC/zpttrf.f
index a106ec419..75ef847c4 100644
--- a/lapack-netlib/SRC/zpttrf.f
+++ b/lapack-netlib/SRC/zpttrf.f
@@ -71,8 +71,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite; if k < N, the factorization could not
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive; if k < N, the factorization could not
 *>               be completed, while if k = N, the factorization was
 *>               completed, but D(N) <= 0.
 *> \endverbatim

From d9a6cacef737593c5f6e7c91396da7e3769afede Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Jun 2023 16:20:49 +0200
Subject: [PATCH 153/718] Fix confusing use of "minor" in inline documentation
 (Reference-LAPACK PR849)

---
 lapack-netlib/SRC/sla_porpvgrw.f |  6 +++---
 lapack-netlib/SRC/spbsv.f        |  6 +++---
 lapack-netlib/SRC/spbsvx.f       | 10 +++++-----
 lapack-netlib/SRC/spbtf2.f       |  4 ++--
 lapack-netlib/SRC/spbtrf.f       |  4 ++--
 lapack-netlib/SRC/spftrf.f       |  4 ++--
 lapack-netlib/SRC/sposv.f        |  6 +++---
 lapack-netlib/SRC/sposvx.f       | 10 +++++-----
 lapack-netlib/SRC/sposvxx.f      |  2 +-
 lapack-netlib/SRC/spotf2.f       |  4 ++--
 lapack-netlib/SRC/spotrf.f       |  4 ++--
 lapack-netlib/SRC/spotrf2.f      |  4 ++--
 lapack-netlib/SRC/sppsv.f        |  6 +++---
 lapack-netlib/SRC/sppsvx.f       | 10 +++++-----
 lapack-netlib/SRC/spptrf.f       |  4 ++--
 lapack-netlib/SRC/spteqr.f       |  4 ++--
 lapack-netlib/SRC/sptsv.f        |  4 ++--
 lapack-netlib/SRC/sptsvx.f       | 10 +++++-----
 lapack-netlib/SRC/spttrf.f       |  4 ++--
 lapack-netlib/SRC/ssbgvx.f       |  2 +-
 lapack-netlib/SRC/sspgv.f        |  2 +-
 lapack-netlib/SRC/sspgvd.f       |  2 +-
 lapack-netlib/SRC/sspgvx.f       |  2 +-
 lapack-netlib/SRC/ssygv.f        |  2 +-
 lapack-netlib/SRC/ssygv_2stage.f |  2 +-
 lapack-netlib/SRC/ssygvd.f       |  2 +-
 lapack-netlib/SRC/ssygvx.f       |  2 +-
 27 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/lapack-netlib/SRC/sla_porpvgrw.f b/lapack-netlib/SRC/sla_porpvgrw.f
index a97e7f72a..8064bf7fa 100644
--- a/lapack-netlib/SRC/sla_porpvgrw.f
+++ b/lapack-netlib/SRC/sla_porpvgrw.f
@@ -132,9 +132,9 @@
 *
       UPPER = LSAME( 'Upper', UPLO )
 *
-*     SPOTRF will have factored only the NCOLSxNCOLS leading minor, so
-*     we restrict the growth search to that minor and use only the first
-*     2*NCOLS workspace entries.
+*     SPOTRF will have factored only the NCOLSxNCOLS leading submatrix,
+*     so we restrict the growth search to that submatrix and use only
+*     the first 2*NCOLS workspace entries.
 *
       RPVGRW = 1.0
       DO I = 1, 2*NCOLS
diff --git a/lapack-netlib/SRC/spbsv.f b/lapack-netlib/SRC/spbsv.f
index 2d084424e..8929321ea 100644
--- a/lapack-netlib/SRC/spbsv.f
+++ b/lapack-netlib/SRC/spbsv.f
@@ -119,9 +119,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/spbsvx.f b/lapack-netlib/SRC/spbsvx.f
index 27907eb85..6abb6caae 100644
--- a/lapack-netlib/SRC/spbsvx.f
+++ b/lapack-netlib/SRC/spbsvx.f
@@ -71,7 +71,7 @@
 *>    where U is an upper triangular band matrix, and L is a lower
 *>    triangular band matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -281,10 +281,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/spbtf2.f b/lapack-netlib/SRC/spbtf2.f
index ff08bc327..b5aac1ccd 100644
--- a/lapack-netlib/SRC/spbtf2.f
+++ b/lapack-netlib/SRC/spbtf2.f
@@ -97,8 +97,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spbtrf.f b/lapack-netlib/SRC/spbtrf.f
index ef5dcbb98..d3ae216e5 100644
--- a/lapack-netlib/SRC/spbtrf.f
+++ b/lapack-netlib/SRC/spbtrf.f
@@ -92,8 +92,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spftrf.f b/lapack-netlib/SRC/spftrf.f
index 9e2c11eea..0ed0e3abd 100644
--- a/lapack-netlib/SRC/spftrf.f
+++ b/lapack-netlib/SRC/spftrf.f
@@ -91,8 +91,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/sposv.f b/lapack-netlib/SRC/sposv.f
index fe0a35a56..336332d2d 100644
--- a/lapack-netlib/SRC/sposv.f
+++ b/lapack-netlib/SRC/sposv.f
@@ -110,9 +110,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/sposvx.f b/lapack-netlib/SRC/sposvx.f
index bcf38c7ea..0770897d2 100644
--- a/lapack-netlib/SRC/sposvx.f
+++ b/lapack-netlib/SRC/sposvx.f
@@ -71,7 +71,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -277,10 +277,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/sposvxx.f b/lapack-netlib/SRC/sposvxx.f
index 08adf1d58..19e599f64 100644
--- a/lapack-netlib/SRC/sposvxx.f
+++ b/lapack-netlib/SRC/sposvxx.f
@@ -88,7 +88,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*>    3. If the leading i-by-i principal minor is not positive definite,
+*>    3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A (see argument RCOND).  If the reciprocal of the condition number
diff --git a/lapack-netlib/SRC/spotf2.f b/lapack-netlib/SRC/spotf2.f
index 5b3504834..773b768b2 100644
--- a/lapack-netlib/SRC/spotf2.f
+++ b/lapack-netlib/SRC/spotf2.f
@@ -89,8 +89,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spotrf.f b/lapack-netlib/SRC/spotrf.f
index 5d5771c86..12ef58a40 100644
--- a/lapack-netlib/SRC/spotrf.f
+++ b/lapack-netlib/SRC/spotrf.f
@@ -87,8 +87,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spotrf2.f b/lapack-netlib/SRC/spotrf2.f
index ae0484ce1..ef731ffa2 100644
--- a/lapack-netlib/SRC/spotrf2.f
+++ b/lapack-netlib/SRC/spotrf2.f
@@ -86,8 +86,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/sppsv.f b/lapack-netlib/SRC/sppsv.f
index 2d3fb3d91..1f48dd4db 100644
--- a/lapack-netlib/SRC/sppsv.f
+++ b/lapack-netlib/SRC/sppsv.f
@@ -104,9 +104,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/sppsvx.f b/lapack-netlib/SRC/sppsvx.f
index 7d71efcd5..bd2da20ee 100644
--- a/lapack-netlib/SRC/sppsvx.f
+++ b/lapack-netlib/SRC/sppsvx.f
@@ -69,7 +69,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -262,10 +262,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/spptrf.f b/lapack-netlib/SRC/spptrf.f
index 0f90399cc..be51c4824 100644
--- a/lapack-netlib/SRC/spptrf.f
+++ b/lapack-netlib/SRC/spptrf.f
@@ -79,8 +79,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spteqr.f b/lapack-netlib/SRC/spteqr.f
index 0d6bf911b..6e6e9aa45 100644
--- a/lapack-netlib/SRC/spteqr.f
+++ b/lapack-netlib/SRC/spteqr.f
@@ -123,8 +123,8 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, and i is:
 *>                <= N  the Cholesky factorization of the matrix could
-*>                      not be performed because the i-th principal minor
-*>                      was not positive definite.
+*>                      not be performed because the leading principal
+*>                      minor of order i was not positive.
 *>                > N   the SVD algorithm failed to converge;
 *>                      if INFO = N+i, i off-diagonal elements of the
 *>                      bidiagonal factor did not converge to zero.
diff --git a/lapack-netlib/SRC/sptsv.f b/lapack-netlib/SRC/sptsv.f
index 46aadf4c3..f11c22594 100644
--- a/lapack-netlib/SRC/sptsv.f
+++ b/lapack-netlib/SRC/sptsv.f
@@ -93,8 +93,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the solution has not been
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the solution has not been
 *>                computed.  The factorization has not been completed
 *>                unless i = N.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sptsvx.f b/lapack-netlib/SRC/sptsvx.f
index 8dd04b054..eaa691cef 100644
--- a/lapack-netlib/SRC/sptsvx.f
+++ b/lapack-netlib/SRC/sptsvx.f
@@ -59,7 +59,7 @@
 *>    factorization can also be regarded as having the form
 *>    A = U**T*D*U.
 *>
-*> 2. If the leading i-by-i principal minor is not positive definite,
+*> 2. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -199,10 +199,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/spttrf.f b/lapack-netlib/SRC/spttrf.f
index 2217a7338..e083b7456 100644
--- a/lapack-netlib/SRC/spttrf.f
+++ b/lapack-netlib/SRC/spttrf.f
@@ -70,8 +70,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite; if k < N, the factorization could not
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive; if k < N, the factorization could not
 *>               be completed, while if k = N, the factorization was
 *>               completed, but D(N) <= 0.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ssbgvx.f b/lapack-netlib/SRC/ssbgvx.f
index 271f35964..3607fae82 100644
--- a/lapack-netlib/SRC/ssbgvx.f
+++ b/lapack-netlib/SRC/ssbgvx.f
@@ -267,7 +267,7 @@
 *>                  Their indices are stored in IFAIL.
 *>          > N:  SPBSTF returned an error code; i.e.,
 *>                if INFO = N + i, for 1 <= i <= N, then the leading
-*>                minor of order i of B is not positive definite.
+*>                principal minor of order i of B is not positive.
 *>                The factorization of B could not be completed and
 *>                no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sspgv.f b/lapack-netlib/SRC/sspgv.f
index c73e94e60..e8bc66e5d 100644
--- a/lapack-netlib/SRC/sspgv.f
+++ b/lapack-netlib/SRC/sspgv.f
@@ -139,7 +139,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero.
 *>             > N:   if INFO = n + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f
index 73862ed1b..8ce2311fa 100644
--- a/lapack-netlib/SRC/sspgvd.f
+++ b/lapack-netlib/SRC/sspgvd.f
@@ -184,7 +184,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sspgvx.f b/lapack-netlib/SRC/sspgvx.f
index de581543a..6d5b4ed3d 100644
--- a/lapack-netlib/SRC/sspgvx.f
+++ b/lapack-netlib/SRC/sspgvx.f
@@ -245,7 +245,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ssygv.f b/lapack-netlib/SRC/ssygv.f
index 270957fce..f39947d92 100644
--- a/lapack-netlib/SRC/ssygv.f
+++ b/lapack-netlib/SRC/ssygv.f
@@ -154,7 +154,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ssygv_2stage.f b/lapack-netlib/SRC/ssygv_2stage.f
index 49f357d90..3d9a44b5e 100644
--- a/lapack-netlib/SRC/ssygv_2stage.f
+++ b/lapack-netlib/SRC/ssygv_2stage.f
@@ -173,7 +173,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f
index 7c7e0de01..79f12a6f9 100644
--- a/lapack-netlib/SRC/ssygvd.f
+++ b/lapack-netlib/SRC/ssygvd.f
@@ -190,7 +190,7 @@
 *>                    the submatrix lying in rows and columns INFO/(N+1)
 *>                    through mod(INFO,N+1);
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ssygvx.f b/lapack-netlib/SRC/ssygvx.f
index e93da60cc..344075c9f 100644
--- a/lapack-netlib/SRC/ssygvx.f
+++ b/lapack-netlib/SRC/ssygvx.f
@@ -270,7 +270,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim

From a20f533b868a23c96374d7c6f897cf1c76772e8a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Jun 2023 16:23:26 +0200
Subject: [PATCH 154/718] Fix confusing use of "minor" in inline documentation
 (Reference-LAPACK PR849)

---
 lapack-netlib/SRC/dla_porpvgrw.f |  6 +++---
 lapack-netlib/SRC/dpbsv.f        |  6 +++---
 lapack-netlib/SRC/dpbsvx.f       | 10 +++++-----
 lapack-netlib/SRC/dpbtf2.f       |  4 ++--
 lapack-netlib/SRC/dpbtrf.f       |  4 ++--
 lapack-netlib/SRC/dpftrf.f       |  4 ++--
 lapack-netlib/SRC/dposv.f        |  6 +++---
 lapack-netlib/SRC/dposvx.f       | 10 +++++-----
 lapack-netlib/SRC/dposvxx.f      |  2 +-
 lapack-netlib/SRC/dpotf2.f       |  4 ++--
 lapack-netlib/SRC/dpotrf.f       |  4 ++--
 lapack-netlib/SRC/dpotrf2.f      |  4 ++--
 lapack-netlib/SRC/dppsv.f        |  6 +++---
 lapack-netlib/SRC/dppsvx.f       | 10 +++++-----
 lapack-netlib/SRC/dpptrf.f       |  4 ++--
 lapack-netlib/SRC/dpteqr.f       |  4 ++--
 lapack-netlib/SRC/dptsv.f        |  4 ++--
 lapack-netlib/SRC/dptsvx.f       | 10 +++++-----
 lapack-netlib/SRC/dpttrf.f       |  4 ++--
 lapack-netlib/SRC/dsbgvx.f       |  2 +-
 lapack-netlib/SRC/dspgv.f        |  2 +-
 lapack-netlib/SRC/dspgvd.f       |  2 +-
 lapack-netlib/SRC/dspgvx.f       |  2 +-
 lapack-netlib/SRC/dsposv.f       |  4 ++--
 lapack-netlib/SRC/dsygv.f        |  2 +-
 lapack-netlib/SRC/dsygv_2stage.f |  2 +-
 lapack-netlib/SRC/dsygvd.f       |  2 +-
 lapack-netlib/SRC/dsygvx.f       |  2 +-
 28 files changed, 63 insertions(+), 63 deletions(-)

diff --git a/lapack-netlib/SRC/dla_porpvgrw.f b/lapack-netlib/SRC/dla_porpvgrw.f
index 93ad3eb6a..00fdd7ae1 100644
--- a/lapack-netlib/SRC/dla_porpvgrw.f
+++ b/lapack-netlib/SRC/dla_porpvgrw.f
@@ -134,9 +134,9 @@
 *
       UPPER = LSAME( 'Upper', UPLO )
 *
-*     DPOTRF will have factored only the NCOLSxNCOLS leading minor, so
-*     we restrict the growth search to that minor and use only the first
-*     2*NCOLS workspace entries.
+*     DPOTRF will have factored only the NCOLSxNCOLS leading submatrix,
+*     so we restrict the growth search to that submatrix and use only
+*     the first 2*NCOLS workspace entries.
 *
       RPVGRW = 1.0D+0
       DO I = 1, 2*NCOLS
diff --git a/lapack-netlib/SRC/dpbsv.f b/lapack-netlib/SRC/dpbsv.f
index a52e78309..2d8f06440 100644
--- a/lapack-netlib/SRC/dpbsv.f
+++ b/lapack-netlib/SRC/dpbsv.f
@@ -119,9 +119,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dpbsvx.f b/lapack-netlib/SRC/dpbsvx.f
index 1bf526fc7..142dda5da 100644
--- a/lapack-netlib/SRC/dpbsvx.f
+++ b/lapack-netlib/SRC/dpbsvx.f
@@ -71,7 +71,7 @@
 *>    where U is an upper triangular band matrix, and L is a lower
 *>    triangular band matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -281,10 +281,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/dpbtf2.f b/lapack-netlib/SRC/dpbtf2.f
index 534629294..1284c9ec8 100644
--- a/lapack-netlib/SRC/dpbtf2.f
+++ b/lapack-netlib/SRC/dpbtf2.f
@@ -97,8 +97,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpbtrf.f b/lapack-netlib/SRC/dpbtrf.f
index 8256f8938..29e9aaecd 100644
--- a/lapack-netlib/SRC/dpbtrf.f
+++ b/lapack-netlib/SRC/dpbtrf.f
@@ -92,8 +92,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpftrf.f b/lapack-netlib/SRC/dpftrf.f
index 980debaf2..312dcdf84 100644
--- a/lapack-netlib/SRC/dpftrf.f
+++ b/lapack-netlib/SRC/dpftrf.f
@@ -91,8 +91,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dposv.f b/lapack-netlib/SRC/dposv.f
index ee2988e6f..cb76e9977 100644
--- a/lapack-netlib/SRC/dposv.f
+++ b/lapack-netlib/SRC/dposv.f
@@ -110,9 +110,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dposvx.f b/lapack-netlib/SRC/dposvx.f
index 4a0b9d605..faffff803 100644
--- a/lapack-netlib/SRC/dposvx.f
+++ b/lapack-netlib/SRC/dposvx.f
@@ -71,7 +71,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -277,10 +277,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/dposvxx.f b/lapack-netlib/SRC/dposvxx.f
index e74b23c80..b5336cf63 100644
--- a/lapack-netlib/SRC/dposvxx.f
+++ b/lapack-netlib/SRC/dposvxx.f
@@ -88,7 +88,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*>    3. If the leading i-by-i principal minor is not positive definite,
+*>    3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A (see argument RCOND).  If the reciprocal of the condition number
diff --git a/lapack-netlib/SRC/dpotf2.f b/lapack-netlib/SRC/dpotf2.f
index 08fa4957f..30da5c3f3 100644
--- a/lapack-netlib/SRC/dpotf2.f
+++ b/lapack-netlib/SRC/dpotf2.f
@@ -89,8 +89,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpotrf.f b/lapack-netlib/SRC/dpotrf.f
index 1679fc3cd..65509feb8 100644
--- a/lapack-netlib/SRC/dpotrf.f
+++ b/lapack-netlib/SRC/dpotrf.f
@@ -87,8 +87,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpotrf2.f b/lapack-netlib/SRC/dpotrf2.f
index 6c28ce6d6..aaf9b9c58 100644
--- a/lapack-netlib/SRC/dpotrf2.f
+++ b/lapack-netlib/SRC/dpotrf2.f
@@ -86,8 +86,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dppsv.f b/lapack-netlib/SRC/dppsv.f
index 435703b08..1888005d9 100644
--- a/lapack-netlib/SRC/dppsv.f
+++ b/lapack-netlib/SRC/dppsv.f
@@ -104,9 +104,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dppsvx.f b/lapack-netlib/SRC/dppsvx.f
index cb41d39ee..3b08fc821 100644
--- a/lapack-netlib/SRC/dppsvx.f
+++ b/lapack-netlib/SRC/dppsvx.f
@@ -69,7 +69,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -262,10 +262,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/dpptrf.f b/lapack-netlib/SRC/dpptrf.f
index 2d8de5110..d9eed910d 100644
--- a/lapack-netlib/SRC/dpptrf.f
+++ b/lapack-netlib/SRC/dpptrf.f
@@ -79,8 +79,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpteqr.f b/lapack-netlib/SRC/dpteqr.f
index aa1f1a80c..d07b065b0 100644
--- a/lapack-netlib/SRC/dpteqr.f
+++ b/lapack-netlib/SRC/dpteqr.f
@@ -123,8 +123,8 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, and i is:
 *>                <= N  the Cholesky factorization of the matrix could
-*>                      not be performed because the i-th principal minor
-*>                      was not positive definite.
+*>                      not be performed because the leading principal
+*>                      minor of order i was not positive.
 *>                > N   the SVD algorithm failed to converge;
 *>                      if INFO = N+i, i off-diagonal elements of the
 *>                      bidiagonal factor did not converge to zero.
diff --git a/lapack-netlib/SRC/dptsv.f b/lapack-netlib/SRC/dptsv.f
index addc34b88..41d8cff15 100644
--- a/lapack-netlib/SRC/dptsv.f
+++ b/lapack-netlib/SRC/dptsv.f
@@ -93,8 +93,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the solution has not been
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the solution has not been
 *>                computed.  The factorization has not been completed
 *>                unless i = N.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dptsvx.f b/lapack-netlib/SRC/dptsvx.f
index 7fb6cf436..fcbf5aa3b 100644
--- a/lapack-netlib/SRC/dptsvx.f
+++ b/lapack-netlib/SRC/dptsvx.f
@@ -59,7 +59,7 @@
 *>    factorization can also be regarded as having the form
 *>    A = U**T*D*U.
 *>
-*> 2. If the leading i-by-i principal minor is not positive definite,
+*> 2. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -199,10 +199,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/dpttrf.f b/lapack-netlib/SRC/dpttrf.f
index e0022e3ad..156e845a3 100644
--- a/lapack-netlib/SRC/dpttrf.f
+++ b/lapack-netlib/SRC/dpttrf.f
@@ -70,8 +70,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite; if k < N, the factorization could not
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive; if k < N, the factorization could not
 *>               be completed, while if k = N, the factorization was
 *>               completed, but D(N) <= 0.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsbgvx.f b/lapack-netlib/SRC/dsbgvx.f
index 20de17931..53deae580 100644
--- a/lapack-netlib/SRC/dsbgvx.f
+++ b/lapack-netlib/SRC/dsbgvx.f
@@ -267,7 +267,7 @@
 *>                  Their indices are stored in IFAIL.
 *>          > N:  DPBSTF returned an error code; i.e.,
 *>                if INFO = N + i, for 1 <= i <= N, then the leading
-*>                minor of order i of B is not positive definite.
+*>                principal minor of order i of B is not positive.
 *>                The factorization of B could not be completed and
 *>                no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dspgv.f b/lapack-netlib/SRC/dspgv.f
index d8ec3b1a4..96041c301 100644
--- a/lapack-netlib/SRC/dspgv.f
+++ b/lapack-netlib/SRC/dspgv.f
@@ -139,7 +139,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero.
 *>             > N:   if INFO = n + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f
index df215ae1a..ec3cdc1ac 100644
--- a/lapack-netlib/SRC/dspgvd.f
+++ b/lapack-netlib/SRC/dspgvd.f
@@ -184,7 +184,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dspgvx.f b/lapack-netlib/SRC/dspgvx.f
index ec93147aa..5afd73d02 100644
--- a/lapack-netlib/SRC/dspgvx.f
+++ b/lapack-netlib/SRC/dspgvx.f
@@ -245,7 +245,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsposv.f b/lapack-netlib/SRC/dsposv.f
index c3b8de0e3..0bd75698d 100644
--- a/lapack-netlib/SRC/dsposv.f
+++ b/lapack-netlib/SRC/dsposv.f
@@ -177,8 +177,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of (DOUBLE
-*>                PRECISION) A is not positive definite, so the
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of (DOUBLE PRECISION) A is not positive, so the
 *>                factorization could not be completed, and the solution
 *>                has not been computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsygv.f b/lapack-netlib/SRC/dsygv.f
index 5208dbb1f..02a4cc3ed 100644
--- a/lapack-netlib/SRC/dsygv.f
+++ b/lapack-netlib/SRC/dsygv.f
@@ -154,7 +154,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsygv_2stage.f b/lapack-netlib/SRC/dsygv_2stage.f
index 5c71ebf94..383304267 100644
--- a/lapack-netlib/SRC/dsygv_2stage.f
+++ b/lapack-netlib/SRC/dsygv_2stage.f
@@ -173,7 +173,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f
index 3b38665a7..d6682d4e5 100644
--- a/lapack-netlib/SRC/dsygvd.f
+++ b/lapack-netlib/SRC/dsygvd.f
@@ -190,7 +190,7 @@
 *>                    the submatrix lying in rows and columns INFO/(N+1)
 *>                    through mod(INFO,N+1);
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsygvx.f b/lapack-netlib/SRC/dsygvx.f
index 3fa55b97c..2dc27e8a8 100644
--- a/lapack-netlib/SRC/dsygvx.f
+++ b/lapack-netlib/SRC/dsygvx.f
@@ -270,7 +270,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim

From 7b73666d709c7f77379a35d318fdc34bd3818efe Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Jun 2023 16:25:24 +0200
Subject: [PATCH 155/718] Fix confusing use of "minor" in inline documentation
 (Reference-LAPACK PR849)

---
 lapack-netlib/SRC/chegv.f        |  2 +-
 lapack-netlib/SRC/chegv_2stage.f |  2 +-
 lapack-netlib/SRC/chegvd.f       |  2 +-
 lapack-netlib/SRC/chegvx.f       |  2 +-
 lapack-netlib/SRC/chpgv.f        |  2 +-
 lapack-netlib/SRC/chpgvd.f       |  2 +-
 lapack-netlib/SRC/chpgvx.f       |  2 +-
 lapack-netlib/SRC/cla_porpvgrw.f |  6 +++---
 lapack-netlib/SRC/cpbsv.f        |  6 +++---
 lapack-netlib/SRC/cpbsvx.f       | 10 +++++-----
 lapack-netlib/SRC/cpbtf2.f       |  4 ++--
 lapack-netlib/SRC/cpbtrf.f       |  4 ++--
 lapack-netlib/SRC/cpftrf.f       |  4 ++--
 lapack-netlib/SRC/cposv.f        |  6 +++---
 lapack-netlib/SRC/cposvx.f       | 10 +++++-----
 lapack-netlib/SRC/cposvxx.f      |  2 +-
 lapack-netlib/SRC/cpotf2.f       |  4 ++--
 lapack-netlib/SRC/cpotrf.f       |  4 ++--
 lapack-netlib/SRC/cpotrf2.f      |  4 ++--
 lapack-netlib/SRC/cppsv.f        |  6 +++---
 lapack-netlib/SRC/cppsvx.f       | 10 +++++-----
 lapack-netlib/SRC/cpptrf.f       |  6 +++---
 lapack-netlib/SRC/cpteqr.f       |  4 ++--
 lapack-netlib/SRC/cptsv.f        |  4 ++--
 lapack-netlib/SRC/cptsvx.f       | 10 +++++-----
 lapack-netlib/SRC/cpttrf.f       |  4 ++--
 26 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/lapack-netlib/SRC/chegv.f b/lapack-netlib/SRC/chegv.f
index f7675a19f..198e5d102 100644
--- a/lapack-netlib/SRC/chegv.f
+++ b/lapack-netlib/SRC/chegv.f
@@ -160,7 +160,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chegv_2stage.f b/lapack-netlib/SRC/chegv_2stage.f
index 472581c4b..d2b8fc795 100644
--- a/lapack-netlib/SRC/chegv_2stage.f
+++ b/lapack-netlib/SRC/chegv_2stage.f
@@ -179,7 +179,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f
index 4b7f43d52..c96f011af 100644
--- a/lapack-netlib/SRC/chegvd.f
+++ b/lapack-netlib/SRC/chegvd.f
@@ -212,7 +212,7 @@
 *>                    the submatrix lying in rows and columns INFO/(N+1)
 *>                    through mod(INFO,N+1);
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chegvx.f b/lapack-netlib/SRC/chegvx.f
index 6e428242d..8e565222d 100644
--- a/lapack-netlib/SRC/chegvx.f
+++ b/lapack-netlib/SRC/chegvx.f
@@ -280,7 +280,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chpgv.f b/lapack-netlib/SRC/chpgv.f
index 417f10121..660724e05 100644
--- a/lapack-netlib/SRC/chpgv.f
+++ b/lapack-netlib/SRC/chpgv.f
@@ -144,7 +144,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not convergeto zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f
index 65d08b783..5c9e417d3 100644
--- a/lapack-netlib/SRC/chpgvd.f
+++ b/lapack-netlib/SRC/chpgvd.f
@@ -205,7 +205,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not convergeto zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chpgvx.f b/lapack-netlib/SRC/chpgvx.f
index 711daf55f..2646800cc 100644
--- a/lapack-netlib/SRC/chpgvx.f
+++ b/lapack-netlib/SRC/chpgvx.f
@@ -250,7 +250,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cla_porpvgrw.f b/lapack-netlib/SRC/cla_porpvgrw.f
index 1eb706d1a..78cd19da9 100644
--- a/lapack-netlib/SRC/cla_porpvgrw.f
+++ b/lapack-netlib/SRC/cla_porpvgrw.f
@@ -140,9 +140,9 @@
 *     .. Executable Statements ..
       UPPER = LSAME( 'Upper', UPLO )
 *
-*     SPOTRF will have factored only the NCOLSxNCOLS leading minor, so
-*     we restrict the growth search to that minor and use only the first
-*     2*NCOLS workspace entries.
+*     SPOTRF will have factored only the NCOLSxNCOLS leading submatrix,
+*     so we restrict the growth search to that submatrix and use only
+*     the first 2*NCOLS workspace entries.
 *
       RPVGRW = 1.0
       DO I = 1, 2*NCOLS
diff --git a/lapack-netlib/SRC/cpbsv.f b/lapack-netlib/SRC/cpbsv.f
index 248abbc1f..889bbde08 100644
--- a/lapack-netlib/SRC/cpbsv.f
+++ b/lapack-netlib/SRC/cpbsv.f
@@ -119,9 +119,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cpbsvx.f b/lapack-netlib/SRC/cpbsvx.f
index 652e18501..975c87768 100644
--- a/lapack-netlib/SRC/cpbsvx.f
+++ b/lapack-netlib/SRC/cpbsvx.f
@@ -70,7 +70,7 @@
 *>    where U is an upper triangular band matrix, and L is a lower
 *>    triangular band matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -280,10 +280,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/cpbtf2.f b/lapack-netlib/SRC/cpbtf2.f
index 0be2c0a7f..f5bc9b3a6 100644
--- a/lapack-netlib/SRC/cpbtf2.f
+++ b/lapack-netlib/SRC/cpbtf2.f
@@ -97,8 +97,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cpbtrf.f b/lapack-netlib/SRC/cpbtrf.f
index a4c18efb3..af60780c8 100644
--- a/lapack-netlib/SRC/cpbtrf.f
+++ b/lapack-netlib/SRC/cpbtrf.f
@@ -92,8 +92,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cpftrf.f b/lapack-netlib/SRC/cpftrf.f
index cbaab6832..12799c6f7 100644
--- a/lapack-netlib/SRC/cpftrf.f
+++ b/lapack-netlib/SRC/cpftrf.f
@@ -91,8 +91,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *>
 *>  Further Notes on RFP Format:
diff --git a/lapack-netlib/SRC/cposv.f b/lapack-netlib/SRC/cposv.f
index f37dfa3c0..ea6fc37db 100644
--- a/lapack-netlib/SRC/cposv.f
+++ b/lapack-netlib/SRC/cposv.f
@@ -110,9 +110,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cposvx.f b/lapack-netlib/SRC/cposvx.f
index 78b9f4db1..322a26447 100644
--- a/lapack-netlib/SRC/cposvx.f
+++ b/lapack-netlib/SRC/cposvx.f
@@ -70,7 +70,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -276,10 +276,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/cposvxx.f b/lapack-netlib/SRC/cposvxx.f
index 7834c75da..c40a2d856 100644
--- a/lapack-netlib/SRC/cposvxx.f
+++ b/lapack-netlib/SRC/cposvxx.f
@@ -87,7 +87,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*>    3. If the leading i-by-i principal minor is not positive definite,
+*>    3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A (see argument RCOND).  If the reciprocal of the condition number
diff --git a/lapack-netlib/SRC/cpotf2.f b/lapack-netlib/SRC/cpotf2.f
index 2f4658bae..d84988949 100644
--- a/lapack-netlib/SRC/cpotf2.f
+++ b/lapack-netlib/SRC/cpotf2.f
@@ -89,8 +89,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cpotrf.f b/lapack-netlib/SRC/cpotrf.f
index 6aba3103e..e2b120a49 100644
--- a/lapack-netlib/SRC/cpotrf.f
+++ b/lapack-netlib/SRC/cpotrf.f
@@ -87,8 +87,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cpotrf2.f b/lapack-netlib/SRC/cpotrf2.f
index e1eae3e9d..ea2e4ca98 100644
--- a/lapack-netlib/SRC/cpotrf2.f
+++ b/lapack-netlib/SRC/cpotrf2.f
@@ -86,8 +86,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cppsv.f b/lapack-netlib/SRC/cppsv.f
index 1e6f02695..a8fd660c4 100644
--- a/lapack-netlib/SRC/cppsv.f
+++ b/lapack-netlib/SRC/cppsv.f
@@ -104,9 +104,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cppsvx.f b/lapack-netlib/SRC/cppsvx.f
index f6f07538c..2ef02100f 100644
--- a/lapack-netlib/SRC/cppsvx.f
+++ b/lapack-netlib/SRC/cppsvx.f
@@ -69,7 +69,7 @@
 *>    where U is an upper triangular matrix, L is a lower triangular
 *>    matrix, and **H indicates conjugate transpose.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -262,10 +262,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/cpptrf.f b/lapack-netlib/SRC/cpptrf.f
index 4e81458cb..e36f834cb 100644
--- a/lapack-netlib/SRC/cpptrf.f
+++ b/lapack-netlib/SRC/cpptrf.f
@@ -79,9 +79,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
-*>                completed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive definite, and the factorization could
+*>                not be completed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cpteqr.f b/lapack-netlib/SRC/cpteqr.f
index e3af59041..fc9c44908 100644
--- a/lapack-netlib/SRC/cpteqr.f
+++ b/lapack-netlib/SRC/cpteqr.f
@@ -123,8 +123,8 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, and i is:
 *>                <= N  the Cholesky factorization of the matrix could
-*>                      not be performed because the i-th principal minor
-*>                      was not positive definite.
+*>                      not be performed because the leading principal
+*>                      minor of order i was not positive.
 *>                > N   the SVD algorithm failed to converge;
 *>                      if INFO = N+i, i off-diagonal elements of the
 *>                      bidiagonal factor did not converge to zero.
diff --git a/lapack-netlib/SRC/cptsv.f b/lapack-netlib/SRC/cptsv.f
index 20ee32bbe..4c16f6a0a 100644
--- a/lapack-netlib/SRC/cptsv.f
+++ b/lapack-netlib/SRC/cptsv.f
@@ -94,8 +94,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the solution has not been
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the solution has not been
 *>                computed.  The factorization has not been completed
 *>                unless i = N.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cptsvx.f b/lapack-netlib/SRC/cptsvx.f
index db63a3c36..6f7d8cf5b 100644
--- a/lapack-netlib/SRC/cptsvx.f
+++ b/lapack-netlib/SRC/cptsvx.f
@@ -60,7 +60,7 @@
 *>    factorization can also be regarded as having the form
 *>    A = U**H*D*U.
 *>
-*> 2. If the leading i-by-i principal minor is not positive definite,
+*> 2. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -205,10 +205,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/cpttrf.f b/lapack-netlib/SRC/cpttrf.f
index c6379e923..111343b78 100644
--- a/lapack-netlib/SRC/cpttrf.f
+++ b/lapack-netlib/SRC/cpttrf.f
@@ -71,8 +71,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite; if k < N, the factorization could not
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive; if k < N, the factorization could not
 *>               be completed, while if k = N, the factorization was
 *>               completed, but D(N) <= 0.
 *> \endverbatim

From d301649430be60a529d978318a1ffd62e34141f6 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Fri, 23 Jun 2023 19:42:27 -0700
Subject: [PATCH 156/718] fix #4063 threading perf issues on Windows

---
 driver/others/blas_server_win32.c | 99 +++++++++++++------------------
 1 file changed, 40 insertions(+), 59 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index afa33cccc..1d747bd9f 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -52,9 +52,7 @@
 
 /* Thread server common information */
 typedef struct{
-  CRITICAL_SECTION lock;
-  HANDLE filled;
-  HANDLE killed;
+  HANDLE taskSemaphore;
 
   blas_queue_t	*queue;    /* Parameter Pointer */
   int		shutdown;  /* server shutdown flag */
@@ -68,6 +66,7 @@ int blas_server_avail = 0;
 static BLASULONG server_lock       = 0;
 
 static blas_pool_t   pool;
+static BLASULONG pool_lock = 0;
 static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
 
@@ -198,7 +197,6 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 
 /* This is a main routine of threads. Each thread waits until job is */
 /* queued.                                                           */
-
 static DWORD WINAPI blas_thread_server(void *arg){
 
   /* Thread identifier */
@@ -207,9 +205,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #endif
 
   void *buffer, *sa, *sb;
-  blas_queue_t	*queue;
-  DWORD action;
-  HANDLE handles[] = {pool.filled, pool.killed};
+  volatile blas_queue_t	*queue;
 
   /* Each server needs each buffer */
   buffer   = blas_memory_alloc(2);
@@ -226,28 +222,32 @@ static DWORD WINAPI blas_thread_server(void *arg){
     fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
 #endif
 
-    do {
-      action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
-    } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
-
-    if (action == WAIT_OBJECT_0 + 1) break;
+	// all worker threads wait on the semaphore
+	WaitForSingleObject(pool.taskSemaphore, INFINITE);
 
+	// kill the thread if we are shutting down the server
+	if (pool.shutdown)
+		break;
+    
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
 #endif
 
-    EnterCriticalSection(&pool.lock);
+	// grab a queued task and update the list
+	volatile blas_queue_t* queue_next;
+	LONG64 prev_value;
+	do {
+		queue = (volatile blas_queue_t*)pool.queue;
+		if (!queue)
+			break;
 
-    queue = pool.queue;
-    if (queue) pool.queue = queue->next;
-
-    LeaveCriticalSection(&pool.lock);
+		queue_next = (volatile blas_queue_t*)queue->next;
+		prev_value = InterlockedCompareExchange64((PLONG64)&pool.queue, (LONG64)queue_next, (LONG64)queue);
+	} while (prev_value != queue);
 
     if (queue)  {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 
-      if (pool.queue) SetEvent(pool.filled);
-
       sa = queue -> sa;
       sb = queue -> sb;
 
@@ -332,13 +332,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
     fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
 #endif
 
-    EnterCriticalSection(&queue->lock);
-
-    queue -> status = BLAS_STATUS_FINISHED;
-
-    LeaveCriticalSection(&queue->lock);
-
-    SetEvent(queue->finish);
+	// mark our sub-task as complete
+	InterlockedDecrement(&queue->status);
   }
 
   /* Shutdown procedure */
@@ -353,7 +348,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
   }
 
 /* Initializing routine */
-int blas_thread_init(void){
+  int blas_thread_init(void){
   BLASLONG i;
 
   if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
@@ -367,9 +362,7 @@ int blas_thread_init(void){
 
   if (!blas_server_avail){
 
-    InitializeCriticalSection(&pool.lock);
-    pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-    pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
+	pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL);
 
     pool.shutdown = 0;
     pool.queue    = NULL;
@@ -391,11 +384,10 @@ int blas_thread_init(void){
 /*
    User can call one of two routines.
 
-     exec_blas_async ... immediately returns after jobs are queued.
+	 exec_blas_async ... immediately returns after jobs are queued.
 
-     exec_blas       ... returns after jobs are finished.
+	 exec_blas       ... returns after jobs are finished.
 */
-
 int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
 #if defined(SMP_SERVER)
@@ -409,8 +401,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
   current = queue;
 
   while (current) {
-    InitializeCriticalSection(&current -> lock);
-    current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
+	current->status = 1;
     current -> position = pos;
 
 #ifdef CONSISTENT_FPCSR
@@ -422,19 +413,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
     pos ++;
   }
 
-  EnterCriticalSection(&pool.lock);
+  pool.queue = queue;
 
-  if (pool.queue) {
-    current = pool.queue;
-    while (current -> next) current = current -> next;
-    current -> next = queue;
-  } else {
-    pool.queue = queue;
-  }
-
-  LeaveCriticalSection(&pool.lock);
-
-  SetEvent(pool.filled);
+  // start up worker threads
+  ReleaseSemaphore(pool.taskSemaphore, pos - 1, NULL);
 
   return 0;
 }
@@ -450,10 +432,9 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
     fprintf(STDERR, "Waiting Queue ..\n");
 #endif
 
-      WaitForSingleObject(queue->finish, INFINITE);
-
-      CloseHandle(queue->finish);
-      DeleteCriticalSection(&queue -> lock);
+	// spin-wait on each sub-task to finish
+	while (*((volatile int*)&queue->status))
+		YIELDING;
 
       queue = queue -> next;
       num --;
@@ -501,18 +482,21 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
 
 /* Shutdown procedure, but user don't have to call this routine. The */
 /* kernel automatically kill threads.                                */
-
 int BLASFUNC(blas_thread_shutdown)(void){
 
   int i;
 
+#ifdef SMP_DEBUG
+  fprintf(STDERR, "blas_thread_shutdown..\n");
+#endif
+
   if (!blas_server_avail) return 0;
 
   LOCK_COMMAND(&server_lock);
 
   if (blas_server_avail){
 
-    SetEvent(pool.killed);
+	  pool.shutdown = 1;
 
     for(i = 0; i < blas_num_threads - 1; i++){
       // Could also just use WaitForMultipleObjects
@@ -528,8 +512,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
       CloseHandle(blas_threads[i]);
     }
 
-    CloseHandle(pool.filled);
-    CloseHandle(pool.killed);
+	CloseHandle(pool.taskSemaphore);
 
     blas_server_avail = 0;
   }
@@ -559,16 +542,14 @@ void goto_set_num_threads(int num_threads)
 		//increased_threads = 1;
 	    if (!blas_server_avail){
 
-			InitializeCriticalSection(&pool.lock);
-			pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-			pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
+			pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL);
 
 			pool.shutdown = 0;
 			pool.queue    = NULL;
 			blas_server_avail = 1;
 		}
 
-		for(i = blas_num_threads - 1; i < num_threads - 1; i++){
+		for(i = blas_num_threads; i < num_threads - 1; i++){
 
 			blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,

From 8caabc5982153f7be980410e95f87749bc04c70f Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Fri, 23 Jun 2023 19:45:16 -0700
Subject: [PATCH 157/718] fix #4063 remove unused pool_lock

---
 driver/others/blas_server_win32.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 1d747bd9f..3fdb86ec3 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -66,12 +66,9 @@ int blas_server_avail = 0;
 static BLASULONG server_lock       = 0;
 
 static blas_pool_t   pool;
-static BLASULONG pool_lock = 0;
 static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
 
-
-
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 
       if (!(mode & BLAS_COMPLEX)){

From 427f9f2428eb892745591394f153a012f2d35c2f Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Fri, 23 Jun 2023 22:15:39 -0700
Subject: [PATCH 158/718] update contributors

---
 CONTRIBUTORS.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 71df13634..7efc04092 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -216,3 +216,8 @@ In chronological order:
   
 * Pablo Romero <https://github.com/pablorcum>
   * [2022-08] Fix building from sources for QNX
+
+* Mark Seminatore <https://github.com/mseminatore>
+  * [2023-06-23] Fix bounds issue in goto_set_num_threads
+  * [2023-06-23] Improve Windows threading performance scaling
+  
\ No newline at end of file

From 7783a9af020aa0cbacd1e6d315c2594a17b71961 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Sat, 24 Jun 2023 14:35:11 -0700
Subject: [PATCH 159/718] attempt to fix old mingw gcc issue

---
 driver/others/blas_server_win32.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 3fdb86ec3..25879250f 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -50,6 +50,15 @@
 
 /* This is a thread implementation for Win32 lazy implementation */
 
+#if defined (__GNUC__) && (__GNUC__ < 6)
+	#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
+#else
+	#if defined(_WIN64)
+		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
+	#else
+		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
+#endif
+
 /* Thread server common information */
 typedef struct{
   HANDLE taskSemaphore;
@@ -232,14 +241,14 @@ static DWORD WINAPI blas_thread_server(void *arg){
 
 	// grab a queued task and update the list
 	volatile blas_queue_t* queue_next;
-	LONG64 prev_value;
+	INT_PTR prev_value;
 	do {
 		queue = (volatile blas_queue_t*)pool.queue;
 		if (!queue)
 			break;
 
 		queue_next = (volatile blas_queue_t*)queue->next;
-		prev_value = InterlockedCompareExchange64((PLONG64)&pool.queue, (LONG64)queue_next, (LONG64)queue);
+		prev_value = WIN_CAS((INT_PTR*)&pool.queue, (INT_PTR)queue_next, (INT_PTR)queue);
 	} while (prev_value != queue);
 
     if (queue)  {

From d6991dd230d5ff33bd86708762de8da05ebc0636 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Sat, 24 Jun 2023 15:43:32 -0700
Subject: [PATCH 160/718] fix missing #endif

---
 driver/others/blas_server_win32.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 25879250f..0b213bf2c 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -57,6 +57,7 @@
 		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
 	#else
 		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
+	#endif
 #endif
 
 /* Thread server common information */

From 572e482b38fcb18ae25e46183aa4c956e1c7d48b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Jun 2023 12:42:05 +0200
Subject: [PATCH 161/718] Fix inline documentation of S/DLASD0
 (Reference-LAPACK PR 855)

---
 lapack-netlib/SRC/dlasd0.f | 10 ++++++----
 lapack-netlib/SRC/slasd0.f | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/SRC/dlasd0.f b/lapack-netlib/SRC/dlasd0.f
index 215dc8717..6d8d4e2db 100644
--- a/lapack-netlib/SRC/dlasd0.f
+++ b/lapack-netlib/SRC/dlasd0.f
@@ -79,10 +79,11 @@
 *>         On exit, E has been destroyed.
 *> \endverbatim
 *>
-*> \param[out] U
+*> \param[in,out] U
 *> \verbatim
 *>          U is DOUBLE PRECISION array, dimension (LDU, N)
-*>         On exit, U contains the left singular vectors.
+*>         On exit, U contains the left singular vectors, 
+*>          if U passed in as (N, N) Identity.
 *> \endverbatim
 *>
 *> \param[in] LDU
@@ -91,10 +92,11 @@
 *>         On entry, leading dimension of U.
 *> \endverbatim
 *>
-*> \param[out] VT
+*> \param[in,out] VT
 *> \verbatim
 *>          VT is DOUBLE PRECISION array, dimension (LDVT, M)
-*>         On exit, VT**T contains the right singular vectors.
+*>         On exit, VT**T contains the right singular vectors,
+*>          if VT passed in as (M, M) Identity.
 *> \endverbatim
 *>
 *> \param[in] LDVT
diff --git a/lapack-netlib/SRC/slasd0.f b/lapack-netlib/SRC/slasd0.f
index a45f741a9..c8a5c7838 100644
--- a/lapack-netlib/SRC/slasd0.f
+++ b/lapack-netlib/SRC/slasd0.f
@@ -79,10 +79,11 @@
 *>         On exit, E has been destroyed.
 *> \endverbatim
 *>
-*> \param[out] U
+*> \param[in,out] U
 *> \verbatim
 *>          U is REAL array, dimension (LDU, N)
-*>         On exit, U contains the left singular vectors.
+*>         On exit, U contains the left singular vectors,
+*>          if U passed in as (N, N) Identity.
 *> \endverbatim
 *>
 *> \param[in] LDU
@@ -91,10 +92,11 @@
 *>         On entry, leading dimension of U.
 *> \endverbatim
 *>
-*> \param[out] VT
+*> \param[in,out] VT
 *> \verbatim
 *>          VT is REAL array, dimension (LDVT, M)
-*>         On exit, VT**T contains the right singular vectors.
+*>         On exit, VT**T contains the right singular vectors,
+*>          if VT passed in as (M, M) Identity.
 *> \endverbatim
 *>
 *> \param[in] LDVT

From 1fe96f8da788fe58351c482625d55e058db61974 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Jun 2023 22:36:57 +0200
Subject: [PATCH 162/718] Fix failures to handle increments of zero

---
 kernel/riscv64/rot_vector.c   | 1 +
 kernel/riscv64/swap_vector.c  | 1 +
 kernel/riscv64/zrot_vector.c  | 1 +
 kernel/riscv64/zswap_vector.c | 1 +
 4 files changed, 4 insertions(+)

diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c
index 43a65e552..f3786e1d0 100644
--- a/kernel/riscv64/rot_vector.c
+++ b/kernel/riscv64/rot_vector.c
@@ -155,6 +155,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                 }
         }else{
                 gvl = VSETVL(n);
+		if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1);
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * gvl;
diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c
index b16592808..82fa5ce31 100644
--- a/kernel/riscv64/swap_vector.c
+++ b/kernel/riscv64/swap_vector.c
@@ -136,6 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                 }
         }else{
                 gvl = VSETVL(n);
+                if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1);
                 stride_x = inc_x * sizeof(FLOAT);
                 stride_y = inc_y * sizeof(FLOAT);
                 if(gvl <= n/2){
diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c
index 858dfd173..727d13a87 100644
--- a/kernel/riscv64/zrot_vector.c
+++ b/kernel/riscv64/zrot_vector.c
@@ -112,6 +112,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 		}
 		
 	}else{
+                if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1);
 		for(i=0,j=0; i < n/gvl; i++){
 			vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
 			vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
index c1dcaccab..09cc8992a 100644
--- a/kernel/riscv64/zswap_vector.c
+++ b/kernel/riscv64/zswap_vector.c
@@ -81,6 +81,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
                 }
         }else{
                 gvl = VSETVL(n);
+                if (inc_x == 0 && inc_y == 0) gvl = VSETVL(1);
                 stride_x = inc_x * 2 * sizeof(FLOAT);
                 stride_y = inc_y * 2 * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * gvl * 2;

From b13787971e108d4a267feae478eb253ad7781c9e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Jun 2023 22:37:56 +0200
Subject: [PATCH 163/718] Enforce -O1 to work around miscompilation by the
 vendor gcc

---
 Makefile.riscv64 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.riscv64 b/Makefile.riscv64
index ce91e03ec..0246c0f7d 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -1,4 +1,4 @@
 ifeq ($(CORE), C910V)
-CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
+CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -O1
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif

From 329bd3410b5ba48310b035fd75d1dd445413bc3b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Jun 2023 15:44:10 +0200
Subject: [PATCH 164/718] Remove legacy warning comments and rename variable
 LAMDA to LAMBDA (Reference-LAPACK PR 852)

---
 lapack-netlib/SRC/cgelsd.f        |  6 ----
 lapack-netlib/SRC/cgesdd.f        |  6 ----
 lapack-netlib/SRC/chbevd.f        |  6 ----
 lapack-netlib/SRC/chbevd_2stage.f |  6 ----
 lapack-netlib/SRC/chbgvd.f        |  6 ----
 lapack-netlib/SRC/cheevd.f        |  6 ----
 lapack-netlib/SRC/cheevd_2stage.f |  6 ----
 lapack-netlib/SRC/chegvd.f        |  6 ----
 lapack-netlib/SRC/chpevd.f        |  6 ----
 lapack-netlib/SRC/chpgvd.f        |  6 ----
 lapack-netlib/SRC/claed8.f        | 30 ++++++++---------
 lapack-netlib/SRC/clals0.f        | 10 ++++++
 lapack-netlib/SRC/clalsd.f        |  6 ----
 lapack-netlib/SRC/cstedc.f        |  6 ----
 lapack-netlib/SRC/dbdsdc.f        |  7 ----
 lapack-netlib/SRC/dgelsd.f        |  6 ----
 lapack-netlib/SRC/dgesdd.f        |  6 ----
 lapack-netlib/SRC/dlaed2.f        | 30 ++++++++---------
 lapack-netlib/SRC/dlaed3.f        | 52 +++++++-----------------------
 lapack-netlib/SRC/dlaed8.f        | 34 ++++++++++----------
 lapack-netlib/SRC/dlaed9.f        | 47 ++++++++-------------------
 lapack-netlib/SRC/dlals0.f        | 10 ++++++
 lapack-netlib/SRC/dlalsd.f        |  6 ----
 lapack-netlib/SRC/dlas2.f         |  4 +--
 lapack-netlib/SRC/dlasd3.f        | 34 ++------------------
 lapack-netlib/SRC/dlasd8.f        | 30 ++++-------------
 lapack-netlib/SRC/dlasv2.f        |  4 +--
 lapack-netlib/SRC/dsbevd.f        |  6 ----
 lapack-netlib/SRC/dsbevd_2stage.f |  6 ----
 lapack-netlib/SRC/dsbgvd.f        |  6 ----
 lapack-netlib/SRC/dspevd.f        |  6 ----
 lapack-netlib/SRC/dspgvd.f        |  6 ----
 lapack-netlib/SRC/dstedc.f        |  6 ----
 lapack-netlib/SRC/dstevd.f        |  6 ----
 lapack-netlib/SRC/dsyevd.f        |  7 ----
 lapack-netlib/SRC/dsyevd_2stage.f |  6 ----
 lapack-netlib/SRC/dsygvd.f        |  6 ----
 lapack-netlib/SRC/sbdsdc.f        |  7 ----
 lapack-netlib/SRC/sgelsd.f        |  6 ----
 lapack-netlib/SRC/sgesdd.f        |  6 ----
 lapack-netlib/SRC/slaed2.f        | 30 ++++++++---------
 lapack-netlib/SRC/slaed3.f        | 53 +++++++------------------------
 lapack-netlib/SRC/slaed8.f        | 34 ++++++++++----------
 lapack-netlib/SRC/slaed9.f        | 47 ++++++++-------------------
 lapack-netlib/SRC/slals0.f        | 10 ++++++
 lapack-netlib/SRC/slalsd.f        |  6 ----
 lapack-netlib/SRC/slas2.f         |  4 +--
 lapack-netlib/SRC/slasd3.f        | 34 ++------------------
 lapack-netlib/SRC/slasd8.f        | 30 ++++-------------
 lapack-netlib/SRC/slasv2.f        |  4 +--
 lapack-netlib/SRC/ssbevd.f        |  6 ----
 lapack-netlib/SRC/ssbevd_2stage.f |  6 ----
 lapack-netlib/SRC/ssbgvd.f        |  6 ----
 lapack-netlib/SRC/sspevd.f        |  6 ----
 lapack-netlib/SRC/sspgvd.f        |  6 ----
 lapack-netlib/SRC/sstedc.f        |  6 ----
 lapack-netlib/SRC/sstevd.f        |  6 ----
 lapack-netlib/SRC/ssyevd.f        |  7 ----
 lapack-netlib/SRC/ssyevd_2stage.f |  6 ----
 lapack-netlib/SRC/ssygvd.f        |  6 ----
 lapack-netlib/SRC/zgelsd.f        |  6 ----
 lapack-netlib/SRC/zgesdd.f        |  6 ----
 lapack-netlib/SRC/zhbevd.f        |  6 ----
 lapack-netlib/SRC/zhbevd_2stage.f |  6 ----
 lapack-netlib/SRC/zhbgvd.f        |  6 ----
 lapack-netlib/SRC/zheevd.f        |  6 ----
 lapack-netlib/SRC/zheevd_2stage.f |  6 ----
 lapack-netlib/SRC/zhegvd.f        |  6 ----
 lapack-netlib/SRC/zhpevd.f        |  6 ----
 lapack-netlib/SRC/zhpgvd.f        |  6 ----
 lapack-netlib/SRC/zlaed8.f        | 30 ++++++++---------
 lapack-netlib/SRC/zlals0.f        | 10 ++++++
 lapack-netlib/SRC/zlalsd.f        |  6 ----
 lapack-netlib/SRC/zstedc.f        |  6 ----
 74 files changed, 206 insertions(+), 681 deletions(-)

diff --git a/lapack-netlib/SRC/cgelsd.f b/lapack-netlib/SRC/cgelsd.f
index fce4ca6e2..c3c77bf63 100644
--- a/lapack-netlib/SRC/cgelsd.f
+++ b/lapack-netlib/SRC/cgelsd.f
@@ -60,12 +60,6 @@
 *> singular values which are less than RCOND times the largest singular
 *> value.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f
index b824374d3..1838629ae 100644
--- a/lapack-netlib/SRC/cgesdd.f
+++ b/lapack-netlib/SRC/cgesdd.f
@@ -53,12 +53,6 @@
 *>
 *> Note that the routine returns VT = V**H, not V.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/chbevd.f b/lapack-netlib/SRC/chbevd.f
index 1598f4de5..de33c9039 100644
--- a/lapack-netlib/SRC/chbevd.f
+++ b/lapack-netlib/SRC/chbevd.f
@@ -41,12 +41,6 @@
 *> a complex Hermitian band matrix A.  If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/chbevd_2stage.f b/lapack-netlib/SRC/chbevd_2stage.f
index 340c546e8..3c9c8ecc0 100644
--- a/lapack-netlib/SRC/chbevd_2stage.f
+++ b/lapack-netlib/SRC/chbevd_2stage.f
@@ -47,12 +47,6 @@
 *> the reduction to tridiagonal.  If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/chbgvd.f b/lapack-netlib/SRC/chbgvd.f
index c4ad20753..655006370 100644
--- a/lapack-netlib/SRC/chbgvd.f
+++ b/lapack-netlib/SRC/chbgvd.f
@@ -46,12 +46,6 @@
 *> and banded, and B is also positive definite.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f
index 2ddf74b98..dce0b2083 100644
--- a/lapack-netlib/SRC/cheevd.f
+++ b/lapack-netlib/SRC/cheevd.f
@@ -41,12 +41,6 @@
 *> complex Hermitian matrix A.  If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/cheevd_2stage.f b/lapack-netlib/SRC/cheevd_2stage.f
index 830e13d30..a0e8843ae 100644
--- a/lapack-netlib/SRC/cheevd_2stage.f
+++ b/lapack-netlib/SRC/cheevd_2stage.f
@@ -46,12 +46,6 @@
 *> the reduction to tridiagonal.  If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f
index c96f011af..4edc36f2a 100644
--- a/lapack-netlib/SRC/chegvd.f
+++ b/lapack-netlib/SRC/chegvd.f
@@ -43,12 +43,6 @@
 *> B are assumed to be Hermitian and B is also positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/chpevd.f b/lapack-netlib/SRC/chpevd.f
index c44462394..06d01064d 100644
--- a/lapack-netlib/SRC/chpevd.f
+++ b/lapack-netlib/SRC/chpevd.f
@@ -41,12 +41,6 @@
 *> a complex Hermitian matrix A in packed storage.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f
index 5c9e417d3..c24ca1360 100644
--- a/lapack-netlib/SRC/chpgvd.f
+++ b/lapack-netlib/SRC/chpgvd.f
@@ -44,12 +44,6 @@
 *> positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/claed8.f b/lapack-netlib/SRC/claed8.f
index c15a0365f..1600087ab 100644
--- a/lapack-netlib/SRC/claed8.f
+++ b/lapack-netlib/SRC/claed8.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA,
+*       SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA,
 *                          Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR,
 *                          GIVCOL, GIVNUM, INFO )
 *
@@ -29,7 +29,7 @@
 *       .. Array Arguments ..
 *       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
 *      $                   INDXQ( * ), PERM( * )
-*       REAL               D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ),
+*       REAL               D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ),
 *      $                   Z( * )
 *       COMPLEX            Q( LDQ, * ), Q2( LDQ2, * )
 *       ..
@@ -122,9 +122,9 @@
 *>         destroyed during the updating process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (N)
+*>          DLAMBDA is REAL array, dimension (N)
 *>         Contains a copy of the first K eigenvalues which will be used
 *>         by SLAED3 to form the secular equation.
 *> \endverbatim
@@ -222,7 +222,7 @@
 *> \ingroup complexOTHERcomputational
 *
 *  =====================================================================
-      SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA,
+      SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA,
      $                   Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR,
      $                   GIVCOL, GIVNUM, INFO )
 *
@@ -237,7 +237,7 @@
 *     .. Array Arguments ..
       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
      $                   INDXQ( * ), PERM( * )
-      REAL               D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ),
+      REAL               D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ),
      $                   Z( * )
       COMPLEX            Q( LDQ, * ), Q2( LDQ2, * )
 *     ..
@@ -322,14 +322,14 @@
          INDXQ( I ) = INDXQ( I ) + CUTPNT
    20 CONTINUE
       DO 30 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
          W( I ) = Z( INDXQ( I ) )
    30 CONTINUE
       I = 1
       J = CUTPNT + 1
-      CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX )
+      CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX )
       DO 40 I = 1, N
-         D( I ) = DLAMDA( INDX( I ) )
+         D( I ) = DLAMBDA( INDX( I ) )
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
@@ -438,7 +438,7 @@
          ELSE
             K = K + 1
             W( K ) = Z( JLAM )
-            DLAMDA( K ) = D( JLAM )
+            DLAMBDA( K ) = D( JLAM )
             INDXP( K ) = JLAM
             JLAM = J
          END IF
@@ -450,19 +450,19 @@
 *
       K = K + 1
       W( K ) = Z( JLAM )
-      DLAMDA( K ) = D( JLAM )
+      DLAMBDA( K ) = D( JLAM )
       INDXP( K ) = JLAM
 *
   100 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       DO 110 J = 1, N
          JP = INDXP( J )
-         DLAMDA( J ) = D( JP )
+         DLAMBDA( J ) = D( JP )
          PERM( J ) = INDXQ( INDX( JP ) )
          CALL CCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 )
   110 CONTINUE
@@ -471,7 +471,7 @@
 *     into the last N - K slots of D and Q respectively.
 *
       IF( K.LT.N ) THEN
-         CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+         CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
          CALL CLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ),
      $                LDQ )
       END IF
diff --git a/lapack-netlib/SRC/clals0.f b/lapack-netlib/SRC/clals0.f
index e981fc36f..0b545d5d7 100644
--- a/lapack-netlib/SRC/clals0.f
+++ b/lapack-netlib/SRC/clals0.f
@@ -392,6 +392,11 @@
      $                ( POLES( I, 2 ).EQ.ZERO ) ) THEN
                      RWORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine SLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      RWORK( I ) = POLES( I, 2 )*Z( I ) /
      $                            ( SLAMC3( POLES( I, 2 ), DSIGJ )-
      $                            DIFLJ ) / ( POLES( I, 2 )+DJ )
@@ -470,6 +475,11 @@
                   IF( Z( J ).EQ.ZERO ) THEN
                      RWORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine SLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent optimizing
+*                    compilers from doing x+(y+z).
+*
                      RWORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1,
      $                            2 ) )-DIFR( I, 1 ) ) /
      $                            ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 )
diff --git a/lapack-netlib/SRC/clalsd.f b/lapack-netlib/SRC/clalsd.f
index a2da9a925..bdd6b31c5 100644
--- a/lapack-netlib/SRC/clalsd.f
+++ b/lapack-netlib/SRC/clalsd.f
@@ -48,12 +48,6 @@
 *> problem; in this case a minimum norm solution is returned.
 *> The actual singular values are returned in D in ascending order.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/cstedc.f b/lapack-netlib/SRC/cstedc.f
index a57d9eaef..77a4ec3be 100644
--- a/lapack-netlib/SRC/cstedc.f
+++ b/lapack-netlib/SRC/cstedc.f
@@ -43,12 +43,6 @@
 *> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this
 *> matrix to tridiagonal form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See SLAED3 for details.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dbdsdc.f b/lapack-netlib/SRC/dbdsdc.f
index 99fe82296..4b6c3e694 100644
--- a/lapack-netlib/SRC/dbdsdc.f
+++ b/lapack-netlib/SRC/dbdsdc.f
@@ -45,13 +45,6 @@
 *> respectively. DBDSDC can be used to compute all singular values,
 *> and optionally, singular vectors or singular vectors in compact form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See DLASD3 for details.
-*>
 *> The code currently calls DLASDQ if singular values only are desired.
 *> However, it can be slightly modified to compute singular values
 *> using the divide and conquer method.
diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f
index b3b3d8b2d..b1f45a2c6 100644
--- a/lapack-netlib/SRC/dgelsd.f
+++ b/lapack-netlib/SRC/dgelsd.f
@@ -59,12 +59,6 @@
 *> singular values which are less than RCOND times the largest singular
 *> value.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f
index 738a122e8..87a4e702d 100644
--- a/lapack-netlib/SRC/dgesdd.f
+++ b/lapack-netlib/SRC/dgesdd.f
@@ -55,12 +55,6 @@
 *>
 *> Note that the routine returns VT = V**T, not V.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dlaed2.f b/lapack-netlib/SRC/dlaed2.f
index 9b1f1e093..1a53650e8 100644
--- a/lapack-netlib/SRC/dlaed2.f
+++ b/lapack-netlib/SRC/dlaed2.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W,
+*       SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W,
 *                          Q2, INDX, INDXC, INDXP, COLTYP, INFO )
 *
 *       .. Scalar Arguments ..
@@ -28,7 +28,7 @@
 *       .. Array Arguments ..
 *       INTEGER            COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ),
 *      $                   INDXQ( * )
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
 *      $                   W( * ), Z( * )
 *       ..
 *
@@ -123,9 +123,9 @@
 *>         process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (N)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (N)
 *>         A copy of the first K eigenvalues which will be used by
 *>         DLAED3 to form the secular equation.
 *> \endverbatim
@@ -148,7 +148,7 @@
 *> \param[out] INDX
 *> \verbatim
 *>          INDX is INTEGER array, dimension (N)
-*>         The permutation used to sort the contents of DLAMDA into
+*>         The permutation used to sort the contents of DLAMBDA into
 *>         ascending order.
 *> \endverbatim
 *>
@@ -207,7 +207,7 @@
 *>  Modified by Francoise Tisseur, University of Tennessee
 *>
 *  =====================================================================
-      SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W,
+      SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W,
      $                   Q2, INDX, INDXC, INDXP, COLTYP, INFO )
 *
 *  -- LAPACK computational routine --
@@ -221,7 +221,7 @@
 *     .. Array Arguments ..
       INTEGER            COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ),
      $                   INDXQ( * )
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
      $                   W( * ), Z( * )
 *     ..
 *
@@ -300,9 +300,9 @@
 *     re-integrate the deflated parts from the last pass
 *
       DO 20 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
    20 CONTINUE
-      CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDXC )
+      CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC )
       DO 30 I = 1, N
          INDX( I ) = INDXQ( INDXC( I ) )
    30 CONTINUE
@@ -324,11 +324,11 @@
          DO 40 J = 1, N
             I = INDX( J )
             CALL DCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 )
-            DLAMDA( J ) = D( I )
+            DLAMBDA( J ) = D( I )
             IQ2 = IQ2 + N
    40    CONTINUE
          CALL DLACPY( 'A', N, N, Q2, N, Q, LDQ )
-         CALL DCOPY( N, DLAMDA, 1, D, 1 )
+         CALL DCOPY( N, DLAMBDA, 1, D, 1 )
          GO TO 190
       END IF
 *
@@ -421,7 +421,7 @@
             PJ = NJ
          ELSE
             K = K + 1
-            DLAMDA( K ) = D( PJ )
+            DLAMBDA( K ) = D( PJ )
             W( K ) = Z( PJ )
             INDXP( K ) = PJ
             PJ = NJ
@@ -433,7 +433,7 @@
 *     Record the last eigenvalue.
 *
       K = K + 1
-      DLAMDA( K ) = D( PJ )
+      DLAMBDA( K ) = D( PJ )
       W( K ) = Z( PJ )
       INDXP( K ) = PJ
 *
@@ -470,9 +470,9 @@
          PSM( CT ) = PSM( CT ) + 1
   130 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       I = 1
diff --git a/lapack-netlib/SRC/dlaed3.f b/lapack-netlib/SRC/dlaed3.f
index c58944e60..f9982c89e 100644
--- a/lapack-netlib/SRC/dlaed3.f
+++ b/lapack-netlib/SRC/dlaed3.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX,
+*       SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX,
 *                          CTOT, W, S, INFO )
 *
 *       .. Scalar Arguments ..
@@ -27,7 +27,7 @@
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            CTOT( * ), INDX( * )
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
 *      $                   S( * ), W( * )
 *       ..
 *
@@ -44,12 +44,6 @@
 *> being combined by the matrix of eigenvectors of the K-by-K system
 *> which is solved here.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -104,14 +98,12 @@
 *>          RHO >= 0 required.
 *> \endverbatim
 *>
-*> \param[in,out] DLAMDA
+*> \param[in] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (K)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (K)
 *>          The first K elements of this array contain the old roots
 *>          of the deflated updating problem.  These are the poles
-*>          of the secular equation. May be changed on output by
-*>          having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
-*>          Cray-2, or Cray C-90, as described above.
+*>          of the secular equation.
 *> \endverbatim
 *>
 *> \param[in] Q2
@@ -180,7 +172,7 @@
 *>  Modified by Francoise Tisseur, University of Tennessee
 *>
 *  =====================================================================
-      SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX,
+      SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX,
      $                   CTOT, W, S, INFO )
 *
 *  -- LAPACK computational routine --
@@ -193,7 +185,7 @@
 *     ..
 *     .. Array Arguments ..
       INTEGER            CTOT( * ), INDX( * )
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
      $                   S( * ), W( * )
 *     ..
 *
@@ -208,8 +200,8 @@
       DOUBLE PRECISION   TEMP
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DLAMC3, DNRM2
-      EXTERNAL           DLAMC3, DNRM2
+      DOUBLE PRECISION   DNRM2
+      EXTERNAL           DNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DCOPY, DGEMM, DLACPY, DLAED4, DLASET, XERBLA
@@ -240,29 +232,9 @@
       IF( K.EQ.0 )
      $   RETURN
 *
-*     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DLAMDA(I) if it is 1; this makes the subsequent
-*     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DLAMDA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DLAMDA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, K
-         DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
-   10 CONTINUE
 *
       DO 20 J = 1, K
-         CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO )
+         CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO )
 *
 *        If the zero finder fails, the computation is terminated.
 *
@@ -293,10 +265,10 @@
       CALL DCOPY( K, Q, LDQ+1, W, 1 )
       DO 60 J = 1, K
          DO 40 I = 1, J - 1
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    40    CONTINUE
          DO 50 I = J + 1, K
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    50    CONTINUE
    60 CONTINUE
       DO 70 I = 1, K
diff --git a/lapack-netlib/SRC/dlaed8.f b/lapack-netlib/SRC/dlaed8.f
index 3631fb456..5d1d9144d 100644
--- a/lapack-netlib/SRC/dlaed8.f
+++ b/lapack-netlib/SRC/dlaed8.f
@@ -19,7 +19,7 @@
 *  ===========
 *
 *       SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO,
-*                          CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR,
+*                          CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR,
 *                          GIVCOL, GIVNUM, INDXP, INDX, INFO )
 *
 *       .. Scalar Arguments ..
@@ -30,7 +30,7 @@
 *       .. Array Arguments ..
 *       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
 *      $                   INDXQ( * ), PERM( * )
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), GIVNUM( 2, * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), GIVNUM( 2, * ),
 *      $                   Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * )
 *       ..
 *
@@ -141,9 +141,9 @@
 *>         process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (N)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (N)
 *>         A copy of the first K eigenvalues which will be used by
 *>         DLAED3 to form the secular equation.
 *> \endverbatim
@@ -238,7 +238,7 @@
 *
 *  =====================================================================
       SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO,
-     $                   CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR,
+     $                   CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR,
      $                   GIVCOL, GIVNUM, INDXP, INDX, INFO )
 *
 *  -- LAPACK computational routine --
@@ -253,7 +253,7 @@
 *     .. Array Arguments ..
       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
      $                   INDXQ( * ), PERM( * )
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), GIVNUM( 2, * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), GIVNUM( 2, * ),
      $                   Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * )
 *     ..
 *
@@ -339,14 +339,14 @@
          INDXQ( I ) = INDXQ( I ) + CUTPNT
    20 CONTINUE
       DO 30 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
          W( I ) = Z( INDXQ( I ) )
    30 CONTINUE
       I = 1
       J = CUTPNT + 1
-      CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX )
+      CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX )
       DO 40 I = 1, N
-         D( I ) = DLAMDA( INDX( I ) )
+         D( I ) = DLAMBDA( INDX( I ) )
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
@@ -464,7 +464,7 @@
          ELSE
             K = K + 1
             W( K ) = Z( JLAM )
-            DLAMDA( K ) = D( JLAM )
+            DLAMBDA( K ) = D( JLAM )
             INDXP( K ) = JLAM
             JLAM = J
          END IF
@@ -476,26 +476,26 @@
 *
       K = K + 1
       W( K ) = Z( JLAM )
-      DLAMDA( K ) = D( JLAM )
+      DLAMBDA( K ) = D( JLAM )
       INDXP( K ) = JLAM
 *
   110 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       IF( ICOMPQ.EQ.0 ) THEN
          DO 120 J = 1, N
             JP = INDXP( J )
-            DLAMDA( J ) = D( JP )
+            DLAMBDA( J ) = D( JP )
             PERM( J ) = INDXQ( INDX( JP ) )
   120    CONTINUE
       ELSE
          DO 130 J = 1, N
             JP = INDXP( J )
-            DLAMDA( J ) = D( JP )
+            DLAMBDA( J ) = D( JP )
             PERM( J ) = INDXQ( INDX( JP ) )
             CALL DCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 )
   130    CONTINUE
@@ -506,9 +506,9 @@
 *
       IF( K.LT.N ) THEN
          IF( ICOMPQ.EQ.0 ) THEN
-            CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+            CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
          ELSE
-            CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+            CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
             CALL DLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2,
      $                   Q( 1, K+1 ), LDQ )
          END IF
diff --git a/lapack-netlib/SRC/dlaed9.f b/lapack-netlib/SRC/dlaed9.f
index b88cdd907..0d209c2c2 100644
--- a/lapack-netlib/SRC/dlaed9.f
+++ b/lapack-netlib/SRC/dlaed9.f
@@ -18,15 +18,15 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W,
-*                          S, LDS, INFO )
+*       SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA,
+*                          W, S, LDS, INFO )
 *
 *       .. Scalar Arguments ..
 *       INTEGER            INFO, K, KSTART, KSTOP, LDQ, LDS, N
 *       DOUBLE PRECISION   RHO
 *       ..
 *       .. Array Arguments ..
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ),
 *      $                   W( * )
 *       ..
 *
@@ -96,9 +96,9 @@
 *>          RHO >= 0 required.
 *> \endverbatim
 *>
-*> \param[in] DLAMDA
+*> \param[in] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (K)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (K)
 *>          The first K elements of this array contain the old roots
 *>          of the deflated updating problem.  These are the poles
 *>          of the secular equation.
@@ -151,8 +151,8 @@
 *> at Berkeley, USA
 *
 *  =====================================================================
-      SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W,
-     $                   S, LDS, INFO )
+      SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA,
+     $                   W, S, LDS, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -163,7 +163,7 @@
       DOUBLE PRECISION   RHO
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ),
      $                   W( * )
 *     ..
 *
@@ -174,8 +174,8 @@
       DOUBLE PRECISION   TEMP
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DLAMC3, DNRM2
-      EXTERNAL           DLAMC3, DNRM2
+      DOUBLE PRECISION   DNRM2
+      EXTERNAL           DNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DCOPY, DLAED4, XERBLA
@@ -212,30 +212,9 @@
 *
       IF( K.EQ.0 )
      $   RETURN
-*
-*     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DLAMDA(I) if it is 1; this makes the subsequent
-*     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DLAMDA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DLAMDA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, N
-         DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
-   10 CONTINUE
 *
       DO 20 J = KSTART, KSTOP
-         CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO )
+         CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO )
 *
 *        If the zero finder fails, the computation is terminated.
 *
@@ -261,10 +240,10 @@
       CALL DCOPY( K, Q, LDQ+1, W, 1 )
       DO 70 J = 1, K
          DO 50 I = 1, J - 1
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    50    CONTINUE
          DO 60 I = J + 1, K
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    60    CONTINUE
    70 CONTINUE
       DO 80 I = 1, K
diff --git a/lapack-netlib/SRC/dlals0.f b/lapack-netlib/SRC/dlals0.f
index cfca22280..928405e22 100644
--- a/lapack-netlib/SRC/dlals0.f
+++ b/lapack-netlib/SRC/dlals0.f
@@ -389,6 +389,11 @@
      $                ( POLES( I, 2 ).EQ.ZERO ) ) THEN
                      WORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine DLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      WORK( I ) = POLES( I, 2 )*Z( I ) /
      $                           ( DLAMC3( POLES( I, 2 ), DSIGJ )-
      $                           DIFLJ ) / ( POLES( I, 2 )+DJ )
@@ -440,6 +445,11 @@
                   IF( Z( J ).EQ.ZERO ) THEN
                      WORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine DLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      WORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1,
      $                           2 ) )-DIFR( I, 1 ) ) /
      $                           ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 )
diff --git a/lapack-netlib/SRC/dlalsd.f b/lapack-netlib/SRC/dlalsd.f
index d22c45dc6..706ac4c90 100644
--- a/lapack-netlib/SRC/dlalsd.f
+++ b/lapack-netlib/SRC/dlalsd.f
@@ -47,12 +47,6 @@
 *> problem; in this case a minimum norm solution is returned.
 *> The actual singular values are returned in D in ascending order.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dlas2.f b/lapack-netlib/SRC/dlas2.f
index ea929e86f..ff75e3942 100644
--- a/lapack-netlib/SRC/dlas2.f
+++ b/lapack-netlib/SRC/dlas2.f
@@ -93,9 +93,7 @@
 *>  infinite.
 *>
 *>  Overflow will not occur unless the largest singular value itself
-*>  overflows, or is within a few ulps of overflow. (On machines with
-*>  partial overflow, like the Cray, overflow may occur if the largest
-*>  singular value is within a factor of 2 of overflow.)
+*>  overflows, or is within a few ulps of overflow.
 *>
 *>  Underflow is harmless if underflow is gradual. Otherwise, results
 *>  may correspond to a matrix modified by perturbations of size near
diff --git a/lapack-netlib/SRC/dlasd3.f b/lapack-netlib/SRC/dlasd3.f
index df939efc5..44957377b 100644
--- a/lapack-netlib/SRC/dlasd3.f
+++ b/lapack-netlib/SRC/dlasd3.f
@@ -44,13 +44,6 @@
 *> appropriate calls to DLASD4 and then updates the singular
 *> vectors by matrix multiplication.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
-*>
 *> DLASD3 is called from DLASD1.
 *> \endverbatim
 *
@@ -103,7 +96,7 @@
 *>         The leading dimension of the array Q.  LDQ >= K.
 *> \endverbatim
 *>
-*> \param[in,out] DSIGMA
+*> \param[in] DSIGMA
 *> \verbatim
 *>          DSIGMA is DOUBLE PRECISION array, dimension(K)
 *>         The first K elements of this array contain the old roots
@@ -249,8 +242,8 @@
       DOUBLE PRECISION   RHO, TEMP
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DLAMC3, DNRM2
-      EXTERNAL           DLAMC3, DNRM2
+      DOUBLE PRECISION   DNRM2
+      EXTERNAL           DNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DCOPY, DGEMM, DLACPY, DLASCL, DLASD4, XERBLA
@@ -310,27 +303,6 @@
          RETURN
       END IF
 *
-*     Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DSIGMA(I) if it is 1; this makes the subsequent
-*     subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DSIGMA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DSIGMA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DSIGMA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 20 I = 1, K
-         DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I )
-   20 CONTINUE
-*
 *     Keep a copy of Z.
 *
       CALL DCOPY( K, Z, 1, Q, 1 )
diff --git a/lapack-netlib/SRC/dlasd8.f b/lapack-netlib/SRC/dlasd8.f
index a769bdb22..73c3ef0b4 100644
--- a/lapack-netlib/SRC/dlasd8.f
+++ b/lapack-netlib/SRC/dlasd8.f
@@ -121,14 +121,12 @@
 *>          The leading dimension of DIFR, must be at least K.
 *> \endverbatim
 *>
-*> \param[in,out] DSIGMA
+*> \param[in] DSIGMA
 *> \verbatim
 *>          DSIGMA is DOUBLE PRECISION array, dimension ( K )
 *>          On entry, the first K elements of this array contain the old
 *>          roots of the deflated updating problem.  These are the poles
 *>          of the secular equation.
-*>          On exit, the elements of DSIGMA may be very slightly altered
-*>          in value.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -227,27 +225,6 @@
          RETURN
       END IF
 *
-*     Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DSIGMA(I) if it is 1; this makes the subsequent
-*     subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DSIGMA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DSIGMA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, K
-         DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I )
-   10 CONTINUE
-*
 *     Book keeping.
 *
       IWK1 = 1
@@ -312,6 +289,11 @@
             DSIGJP = -DSIGMA( J+1 )
          END IF
          WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ )
+*
+*        Use calls to the subroutine DLAMC3 to enforce the parentheses
+*        (x+y)+z. The goal is to prevent optimizing compilers
+*        from doing x+(y+z).
+*
          DO 60 I = 1, J - 1
             WORK( I ) = Z( I ) / ( DLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ )
      $                   / ( DSIGMA( I )+DJ )
diff --git a/lapack-netlib/SRC/dlasv2.f b/lapack-netlib/SRC/dlasv2.f
index 64a06dee1..cb2bf51c4 100644
--- a/lapack-netlib/SRC/dlasv2.f
+++ b/lapack-netlib/SRC/dlasv2.f
@@ -124,9 +124,7 @@
 *>  infinite.
 *>
 *>  Overflow will not occur unless the largest singular value itself
-*>  overflows or is within a few ulps of overflow. (On machines with
-*>  partial overflow, like the Cray, overflow may occur if the largest
-*>  singular value is within a factor of 2 of overflow.)
+*>  overflows or is within a few ulps of overflow.
 *>
 *>  Underflow is harmless if underflow is gradual. Otherwise, results
 *>  may correspond to a matrix modified by perturbations of size near
diff --git a/lapack-netlib/SRC/dsbevd.f b/lapack-netlib/SRC/dsbevd.f
index 3eb4ed8df..350c0a9f0 100644
--- a/lapack-netlib/SRC/dsbevd.f
+++ b/lapack-netlib/SRC/dsbevd.f
@@ -40,12 +40,6 @@
 *> a real symmetric band matrix A. If eigenvectors are desired, it uses
 *> a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsbevd_2stage.f b/lapack-netlib/SRC/dsbevd_2stage.f
index 45a64b478..82997c850 100644
--- a/lapack-netlib/SRC/dsbevd_2stage.f
+++ b/lapack-netlib/SRC/dsbevd_2stage.f
@@ -45,12 +45,6 @@
 *> the reduction to tridiagonal. If eigenvectors are desired, it uses
 *> a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsbgvd.f b/lapack-netlib/SRC/dsbgvd.f
index 30b016611..0ab3177ac 100644
--- a/lapack-netlib/SRC/dsbgvd.f
+++ b/lapack-netlib/SRC/dsbgvd.f
@@ -43,12 +43,6 @@
 *> banded, and B is also positive definite.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dspevd.f b/lapack-netlib/SRC/dspevd.f
index d9d6c8917..05aa91b03 100644
--- a/lapack-netlib/SRC/dspevd.f
+++ b/lapack-netlib/SRC/dspevd.f
@@ -40,12 +40,6 @@
 *> of a real symmetric matrix A in packed storage. If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f
index ec3cdc1ac..24c2309c3 100644
--- a/lapack-netlib/SRC/dspgvd.f
+++ b/lapack-netlib/SRC/dspgvd.f
@@ -44,12 +44,6 @@
 *> positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dstedc.f b/lapack-netlib/SRC/dstedc.f
index 2ed84afaa..6d533664b 100644
--- a/lapack-netlib/SRC/dstedc.f
+++ b/lapack-netlib/SRC/dstedc.f
@@ -42,12 +42,6 @@
 *> found if DSYTRD or DSPTRD or DSBTRD has been used to reduce this
 *> matrix to tridiagonal form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See DLAED3 for details.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dstevd.f b/lapack-netlib/SRC/dstevd.f
index 507f39b2b..54717df3d 100644
--- a/lapack-netlib/SRC/dstevd.f
+++ b/lapack-netlib/SRC/dstevd.f
@@ -40,12 +40,6 @@
 *> real symmetric tridiagonal matrix. If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f
index eaaecd8d9..b27f4cdc7 100644
--- a/lapack-netlib/SRC/dsyevd.f
+++ b/lapack-netlib/SRC/dsyevd.f
@@ -40,13 +40,6 @@
 *> real symmetric matrix A. If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
-*>
 *> Because of large use of BLAS of level 3, DSYEVD needs N**2 more
 *> workspace than DSYEVX.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsyevd_2stage.f b/lapack-netlib/SRC/dsyevd_2stage.f
index 0eae8ad06..d5a68c35d 100644
--- a/lapack-netlib/SRC/dsyevd_2stage.f
+++ b/lapack-netlib/SRC/dsyevd_2stage.f
@@ -45,12 +45,6 @@
 *> the reduction to tridiagonal. If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f
index d6682d4e5..41a384c80 100644
--- a/lapack-netlib/SRC/dsygvd.f
+++ b/lapack-netlib/SRC/dsygvd.f
@@ -42,12 +42,6 @@
 *> B are assumed to be symmetric and B is also positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/sbdsdc.f b/lapack-netlib/SRC/sbdsdc.f
index 18a404497..2a6cc9970 100644
--- a/lapack-netlib/SRC/sbdsdc.f
+++ b/lapack-netlib/SRC/sbdsdc.f
@@ -45,13 +45,6 @@
 *> respectively. SBDSDC can be used to compute all singular values,
 *> and optionally, singular vectors or singular vectors in compact form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See SLASD3 for details.
-*>
 *> The code currently calls SLASDQ if singular values only are desired.
 *> However, it can be slightly modified to compute singular values
 *> using the divide and conquer method.
diff --git a/lapack-netlib/SRC/sgelsd.f b/lapack-netlib/SRC/sgelsd.f
index f5f17d34c..9fda7b593 100644
--- a/lapack-netlib/SRC/sgelsd.f
+++ b/lapack-netlib/SRC/sgelsd.f
@@ -59,12 +59,6 @@
 *> singular values which are less than RCOND times the largest singular
 *> value.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f
index d3b5e3ba1..d271bb757 100644
--- a/lapack-netlib/SRC/sgesdd.f
+++ b/lapack-netlib/SRC/sgesdd.f
@@ -55,12 +55,6 @@
 *>
 *> Note that the routine returns VT = V**T, not V.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/slaed2.f b/lapack-netlib/SRC/slaed2.f
index 16500e74c..cadf53555 100644
--- a/lapack-netlib/SRC/slaed2.f
+++ b/lapack-netlib/SRC/slaed2.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W,
+*       SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W,
 *                          Q2, INDX, INDXC, INDXP, COLTYP, INFO )
 *
 *       .. Scalar Arguments ..
@@ -28,7 +28,7 @@
 *       .. Array Arguments ..
 *       INTEGER            COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ),
 *      $                   INDXQ( * )
-*       REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+*       REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
 *      $                   W( * ), Z( * )
 *       ..
 *
@@ -123,9 +123,9 @@
 *>         process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (N)
+*>          DLAMBDA is REAL array, dimension (N)
 *>         A copy of the first K eigenvalues which will be used by
 *>         SLAED3 to form the secular equation.
 *> \endverbatim
@@ -148,7 +148,7 @@
 *> \param[out] INDX
 *> \verbatim
 *>          INDX is INTEGER array, dimension (N)
-*>         The permutation used to sort the contents of DLAMDA into
+*>         The permutation used to sort the contents of DLAMBDA into
 *>         ascending order.
 *> \endverbatim
 *>
@@ -207,7 +207,7 @@
 *>  Modified by Francoise Tisseur, University of Tennessee
 *>
 *  =====================================================================
-      SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W,
+      SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W,
      $                   Q2, INDX, INDXC, INDXP, COLTYP, INFO )
 *
 *  -- LAPACK computational routine --
@@ -221,7 +221,7 @@
 *     .. Array Arguments ..
       INTEGER            COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ),
      $                   INDXQ( * )
-      REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+      REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
      $                   W( * ), Z( * )
 *     ..
 *
@@ -300,9 +300,9 @@
 *     re-integrate the deflated parts from the last pass
 *
       DO 20 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
    20 CONTINUE
-      CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDXC )
+      CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC )
       DO 30 I = 1, N
          INDX( I ) = INDXQ( INDXC( I ) )
    30 CONTINUE
@@ -324,11 +324,11 @@
          DO 40 J = 1, N
             I = INDX( J )
             CALL SCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 )
-            DLAMDA( J ) = D( I )
+            DLAMBDA( J ) = D( I )
             IQ2 = IQ2 + N
    40    CONTINUE
          CALL SLACPY( 'A', N, N, Q2, N, Q, LDQ )
-         CALL SCOPY( N, DLAMDA, 1, D, 1 )
+         CALL SCOPY( N, DLAMBDA, 1, D, 1 )
          GO TO 190
       END IF
 *
@@ -421,7 +421,7 @@
             PJ = NJ
          ELSE
             K = K + 1
-            DLAMDA( K ) = D( PJ )
+            DLAMBDA( K ) = D( PJ )
             W( K ) = Z( PJ )
             INDXP( K ) = PJ
             PJ = NJ
@@ -433,7 +433,7 @@
 *     Record the last eigenvalue.
 *
       K = K + 1
-      DLAMDA( K ) = D( PJ )
+      DLAMBDA( K ) = D( PJ )
       W( K ) = Z( PJ )
       INDXP( K ) = PJ
 *
@@ -470,9 +470,9 @@
          PSM( CT ) = PSM( CT ) + 1
   130 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       I = 1
diff --git a/lapack-netlib/SRC/slaed3.f b/lapack-netlib/SRC/slaed3.f
index e84f22be1..44c601f91 100644
--- a/lapack-netlib/SRC/slaed3.f
+++ b/lapack-netlib/SRC/slaed3.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX,
+*       SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX,
 *                          CTOT, W, S, INFO )
 *
 *       .. Scalar Arguments ..
@@ -27,7 +27,7 @@
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            CTOT( * ), INDX( * )
-*       REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+*       REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
 *      $                   S( * ), W( * )
 *       ..
 *
@@ -44,12 +44,6 @@
 *> being combined by the matrix of eigenvectors of the K-by-K system
 *> which is solved here.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -104,14 +98,12 @@
 *>          RHO >= 0 required.
 *> \endverbatim
 *>
-*> \param[in,out] DLAMDA
+*> \param[in] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (K)
+*>          DLAMBDA is REAL array, dimension (K)
 *>          The first K elements of this array contain the old roots
 *>          of the deflated updating problem.  These are the poles
-*>          of the secular equation. May be changed on output by
-*>          having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
-*>          Cray-2, or Cray C-90, as described above.
+*>          of the secular equation.
 *> \endverbatim
 *>
 *> \param[in] Q2
@@ -180,7 +172,7 @@
 *>  Modified by Francoise Tisseur, University of Tennessee
 *>
 *  =====================================================================
-      SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX,
+      SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX,
      $                   CTOT, W, S, INFO )
 *
 *  -- LAPACK computational routine --
@@ -193,7 +185,7 @@
 *     ..
 *     .. Array Arguments ..
       INTEGER            CTOT( * ), INDX( * )
-      REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+      REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
      $                   S( * ), W( * )
 *     ..
 *
@@ -208,8 +200,8 @@
       REAL               TEMP
 *     ..
 *     .. External Functions ..
-      REAL               SLAMC3, SNRM2
-      EXTERNAL           SLAMC3, SNRM2
+      REAL               SNRM2
+      EXTERNAL           SNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SGEMM, SLACPY, SLAED4, SLASET, XERBLA
@@ -239,30 +231,9 @@
 *
       IF( K.EQ.0 )
      $   RETURN
-*
-*     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DLAMDA(I) if it is 1; this makes the subsequent
-*     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DLAMDA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DLAMDA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, K
-         DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
-   10 CONTINUE
 *
       DO 20 J = 1, K
-         CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO )
+         CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO )
 *
 *        If the zero finder fails, the computation is terminated.
 *
@@ -293,10 +264,10 @@
       CALL SCOPY( K, Q, LDQ+1, W, 1 )
       DO 60 J = 1, K
          DO 40 I = 1, J - 1
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    40    CONTINUE
          DO 50 I = J + 1, K
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    50    CONTINUE
    60 CONTINUE
       DO 70 I = 1, K
diff --git a/lapack-netlib/SRC/slaed8.f b/lapack-netlib/SRC/slaed8.f
index 9c8ba440c..9dd8a15f6 100644
--- a/lapack-netlib/SRC/slaed8.f
+++ b/lapack-netlib/SRC/slaed8.f
@@ -19,7 +19,7 @@
 *  ===========
 *
 *       SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO,
-*                          CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR,
+*                          CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR,
 *                          GIVCOL, GIVNUM, INDXP, INDX, INFO )
 *
 *       .. Scalar Arguments ..
@@ -30,7 +30,7 @@
 *       .. Array Arguments ..
 *       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
 *      $                   INDXQ( * ), PERM( * )
-*       REAL               D( * ), DLAMDA( * ), GIVNUM( 2, * ),
+*       REAL               D( * ), DLAMBDA( * ), GIVNUM( 2, * ),
 *      $                   Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * )
 *       ..
 *
@@ -141,9 +141,9 @@
 *>         process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (N)
+*>          DLAMBDA is REAL array, dimension (N)
 *>         A copy of the first K eigenvalues which will be used by
 *>         SLAED3 to form the secular equation.
 *> \endverbatim
@@ -238,7 +238,7 @@
 *
 *  =====================================================================
       SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO,
-     $                   CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR,
+     $                   CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR,
      $                   GIVCOL, GIVNUM, INDXP, INDX, INFO )
 *
 *  -- LAPACK computational routine --
@@ -253,7 +253,7 @@
 *     .. Array Arguments ..
       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
      $                   INDXQ( * ), PERM( * )
-      REAL               D( * ), DLAMDA( * ), GIVNUM( 2, * ),
+      REAL               D( * ), DLAMBDA( * ), GIVNUM( 2, * ),
      $                   Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * )
 *     ..
 *
@@ -339,14 +339,14 @@
          INDXQ( I ) = INDXQ( I ) + CUTPNT
    20 CONTINUE
       DO 30 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
          W( I ) = Z( INDXQ( I ) )
    30 CONTINUE
       I = 1
       J = CUTPNT + 1
-      CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX )
+      CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX )
       DO 40 I = 1, N
-         D( I ) = DLAMDA( INDX( I ) )
+         D( I ) = DLAMBDA( INDX( I ) )
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
@@ -464,7 +464,7 @@
          ELSE
             K = K + 1
             W( K ) = Z( JLAM )
-            DLAMDA( K ) = D( JLAM )
+            DLAMBDA( K ) = D( JLAM )
             INDXP( K ) = JLAM
             JLAM = J
          END IF
@@ -476,26 +476,26 @@
 *
       K = K + 1
       W( K ) = Z( JLAM )
-      DLAMDA( K ) = D( JLAM )
+      DLAMBDA( K ) = D( JLAM )
       INDXP( K ) = JLAM
 *
   110 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       IF( ICOMPQ.EQ.0 ) THEN
          DO 120 J = 1, N
             JP = INDXP( J )
-            DLAMDA( J ) = D( JP )
+            DLAMBDA( J ) = D( JP )
             PERM( J ) = INDXQ( INDX( JP ) )
   120    CONTINUE
       ELSE
          DO 130 J = 1, N
             JP = INDXP( J )
-            DLAMDA( J ) = D( JP )
+            DLAMBDA( J ) = D( JP )
             PERM( J ) = INDXQ( INDX( JP ) )
             CALL SCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 )
   130    CONTINUE
@@ -506,9 +506,9 @@
 *
       IF( K.LT.N ) THEN
          IF( ICOMPQ.EQ.0 ) THEN
-            CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+            CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
          ELSE
-            CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+            CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
             CALL SLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2,
      $                   Q( 1, K+1 ), LDQ )
          END IF
diff --git a/lapack-netlib/SRC/slaed9.f b/lapack-netlib/SRC/slaed9.f
index 4d07416e9..d1b7b29fd 100644
--- a/lapack-netlib/SRC/slaed9.f
+++ b/lapack-netlib/SRC/slaed9.f
@@ -18,15 +18,15 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W,
-*                          S, LDS, INFO )
+*       SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA,
+*                          W, S, LDS, INFO )
 *
 *       .. Scalar Arguments ..
 *       INTEGER            INFO, K, KSTART, KSTOP, LDQ, LDS, N
 *       REAL               RHO
 *       ..
 *       .. Array Arguments ..
-*       REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ),
+*       REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ),
 *      $                   W( * )
 *       ..
 *
@@ -96,9 +96,9 @@
 *>          RHO >= 0 required.
 *> \endverbatim
 *>
-*> \param[in] DLAMDA
+*> \param[in] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (K)
+*>          DLAMBDA is REAL array, dimension (K)
 *>          The first K elements of this array contain the old roots
 *>          of the deflated updating problem.  These are the poles
 *>          of the secular equation.
@@ -151,8 +151,8 @@
 *> at Berkeley, USA
 *
 *  =====================================================================
-      SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W,
-     $                   S, LDS, INFO )
+      SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA,
+     $                   W, S, LDS, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -163,7 +163,7 @@
       REAL               RHO
 *     ..
 *     .. Array Arguments ..
-      REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ),
+      REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ),
      $                   W( * )
 *     ..
 *
@@ -174,8 +174,8 @@
       REAL               TEMP
 *     ..
 *     .. External Functions ..
-      REAL               SLAMC3, SNRM2
-      EXTERNAL           SLAMC3, SNRM2
+      REAL               SNRM2
+      EXTERNAL           SNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLAED4, XERBLA
@@ -212,30 +212,9 @@
 *
       IF( K.EQ.0 )
      $   RETURN
-*
-*     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DLAMDA(I) if it is 1; this makes the subsequent
-*     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DLAMDA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DLAMDA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, N
-         DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
-   10 CONTINUE
 *
       DO 20 J = KSTART, KSTOP
-         CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO )
+         CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO )
 *
 *        If the zero finder fails, the computation is terminated.
 *
@@ -261,10 +240,10 @@
       CALL SCOPY( K, Q, LDQ+1, W, 1 )
       DO 70 J = 1, K
          DO 50 I = 1, J - 1
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    50    CONTINUE
          DO 60 I = J + 1, K
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    60    CONTINUE
    70 CONTINUE
       DO 80 I = 1, K
diff --git a/lapack-netlib/SRC/slals0.f b/lapack-netlib/SRC/slals0.f
index 7d44e2864..f168dc653 100644
--- a/lapack-netlib/SRC/slals0.f
+++ b/lapack-netlib/SRC/slals0.f
@@ -389,6 +389,11 @@
      $                ( POLES( I, 2 ).EQ.ZERO ) ) THEN
                      WORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine SLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      WORK( I ) = POLES( I, 2 )*Z( I ) /
      $                           ( SLAMC3( POLES( I, 2 ), DSIGJ )-
      $                           DIFLJ ) / ( POLES( I, 2 )+DJ )
@@ -440,6 +445,11 @@
                   IF( Z( J ).EQ.ZERO ) THEN
                      WORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine SLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      WORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1,
      $                           2 ) )-DIFR( I, 1 ) ) /
      $                           ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 )
diff --git a/lapack-netlib/SRC/slalsd.f b/lapack-netlib/SRC/slalsd.f
index 2197f728e..9943a52d9 100644
--- a/lapack-netlib/SRC/slalsd.f
+++ b/lapack-netlib/SRC/slalsd.f
@@ -47,12 +47,6 @@
 *> problem; in this case a minimum norm solution is returned.
 *> The actual singular values are returned in D in ascending order.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/slas2.f b/lapack-netlib/SRC/slas2.f
index 6ae86204c..02ecbf434 100644
--- a/lapack-netlib/SRC/slas2.f
+++ b/lapack-netlib/SRC/slas2.f
@@ -93,9 +93,7 @@
 *>  infinite.
 *>
 *>  Overflow will not occur unless the largest singular value itself
-*>  overflows, or is within a few ulps of overflow. (On machines with
-*>  partial overflow, like the Cray, overflow may occur if the largest
-*>  singular value is within a factor of 2 of overflow.)
+*>  overflows, or is within a few ulps of overflow.
 *>
 *>  Underflow is harmless if underflow is gradual. Otherwise, results
 *>  may correspond to a matrix modified by perturbations of size near
diff --git a/lapack-netlib/SRC/slasd3.f b/lapack-netlib/SRC/slasd3.f
index f9420f88a..8f74743c2 100644
--- a/lapack-netlib/SRC/slasd3.f
+++ b/lapack-netlib/SRC/slasd3.f
@@ -44,13 +44,6 @@
 *> appropriate calls to SLASD4 and then updates the singular
 *> vectors by matrix multiplication.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
-*>
 *> SLASD3 is called from SLASD1.
 *> \endverbatim
 *
@@ -103,7 +96,7 @@
 *>         The leading dimension of the array Q.  LDQ >= K.
 *> \endverbatim
 *>
-*> \param[in,out] DSIGMA
+*> \param[in] DSIGMA
 *> \verbatim
 *>          DSIGMA is REAL array, dimension(K)
 *>         The first K elements of this array contain the old roots
@@ -249,8 +242,8 @@
       REAL               RHO, TEMP
 *     ..
 *     .. External Functions ..
-      REAL               SLAMC3, SNRM2
-      EXTERNAL           SLAMC3, SNRM2
+      REAL               SNRM2
+      EXTERNAL           SNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SGEMM, SLACPY, SLASCL, SLASD4, XERBLA
@@ -310,27 +303,6 @@
          RETURN
       END IF
 *
-*     Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DSIGMA(I) if it is 1; this makes the subsequent
-*     subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DSIGMA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DSIGMA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DSIGMA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 20 I = 1, K
-         DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I )
-   20 CONTINUE
-*
 *     Keep a copy of Z.
 *
       CALL SCOPY( K, Z, 1, Q, 1 )
diff --git a/lapack-netlib/SRC/slasd8.f b/lapack-netlib/SRC/slasd8.f
index 43b171e5f..df5002367 100644
--- a/lapack-netlib/SRC/slasd8.f
+++ b/lapack-netlib/SRC/slasd8.f
@@ -121,14 +121,12 @@
 *>          The leading dimension of DIFR, must be at least K.
 *> \endverbatim
 *>
-*> \param[in,out] DSIGMA
+*> \param[in] DSIGMA
 *> \verbatim
 *>          DSIGMA is REAL array, dimension ( K )
 *>          On entry, the first K elements of this array contain the old
 *>          roots of the deflated updating problem.  These are the poles
 *>          of the secular equation.
-*>          On exit, the elements of DSIGMA may be very slightly altered
-*>          in value.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -227,27 +225,6 @@
          RETURN
       END IF
 *
-*     Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DSIGMA(I) if it is 1; this makes the subsequent
-*     subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DSIGMA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DSIGMA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, K
-         DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I )
-   10 CONTINUE
-*
 *     Book keeping.
 *
       IWK1 = 1
@@ -312,6 +289,11 @@
             DSIGJP = -DSIGMA( J+1 )
          END IF
          WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ )
+*
+*        Use calls to the subroutine SLAMC3 to enforce the parentheses
+*        (x+y)+z. The goal is to prevent optimizing compilers
+*        from doing x+(y+z).
+*
          DO 60 I = 1, J - 1
             WORK( I ) = Z( I ) / ( SLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ )
      $                   / ( DSIGMA( I )+DJ )
diff --git a/lapack-netlib/SRC/slasv2.f b/lapack-netlib/SRC/slasv2.f
index 6b98e9a0c..bf5d3ea0e 100644
--- a/lapack-netlib/SRC/slasv2.f
+++ b/lapack-netlib/SRC/slasv2.f
@@ -124,9 +124,7 @@
 *>  infinite.
 *>
 *>  Overflow will not occur unless the largest singular value itself
-*>  overflows or is within a few ulps of overflow. (On machines with
-*>  partial overflow, like the Cray, overflow may occur if the largest
-*>  singular value is within a factor of 2 of overflow.)
+*>  overflows or is within a few ulps of overflow.
 *>
 *>  Underflow is harmless if underflow is gradual. Otherwise, results
 *>  may correspond to a matrix modified by perturbations of size near
diff --git a/lapack-netlib/SRC/ssbevd.f b/lapack-netlib/SRC/ssbevd.f
index bcf14ce85..e87f9a030 100644
--- a/lapack-netlib/SRC/ssbevd.f
+++ b/lapack-netlib/SRC/ssbevd.f
@@ -40,12 +40,6 @@
 *> a real symmetric band matrix A. If eigenvectors are desired, it uses
 *> a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/ssbevd_2stage.f b/lapack-netlib/SRC/ssbevd_2stage.f
index 9687ee024..014bade48 100644
--- a/lapack-netlib/SRC/ssbevd_2stage.f
+++ b/lapack-netlib/SRC/ssbevd_2stage.f
@@ -45,12 +45,6 @@
 *> the reduction to tridiagonal. If eigenvectors are desired, it uses
 *> a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/ssbgvd.f b/lapack-netlib/SRC/ssbgvd.f
index 6dd1fe952..7c21ee455 100644
--- a/lapack-netlib/SRC/ssbgvd.f
+++ b/lapack-netlib/SRC/ssbgvd.f
@@ -43,12 +43,6 @@
 *> banded, and B is also positive definite.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/sspevd.f b/lapack-netlib/SRC/sspevd.f
index 56329da34..0872e95ac 100644
--- a/lapack-netlib/SRC/sspevd.f
+++ b/lapack-netlib/SRC/sspevd.f
@@ -40,12 +40,6 @@
 *> of a real symmetric matrix A in packed storage. If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f
index 8ce2311fa..1a88365f2 100644
--- a/lapack-netlib/SRC/sspgvd.f
+++ b/lapack-netlib/SRC/sspgvd.f
@@ -44,12 +44,6 @@
 *> positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/sstedc.f b/lapack-netlib/SRC/sstedc.f
index 925b03422..61e3c2fda 100644
--- a/lapack-netlib/SRC/sstedc.f
+++ b/lapack-netlib/SRC/sstedc.f
@@ -42,12 +42,6 @@
 *> found if SSYTRD or SSPTRD or SSBTRD has been used to reduce this
 *> matrix to tridiagonal form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See SLAED3 for details.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/sstevd.f b/lapack-netlib/SRC/sstevd.f
index bc5b5aaab..218af8c76 100644
--- a/lapack-netlib/SRC/sstevd.f
+++ b/lapack-netlib/SRC/sstevd.f
@@ -40,12 +40,6 @@
 *> real symmetric tridiagonal matrix. If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f
index ac0d0284d..ee0e33384 100644
--- a/lapack-netlib/SRC/ssyevd.f
+++ b/lapack-netlib/SRC/ssyevd.f
@@ -40,13 +40,6 @@
 *> real symmetric matrix A. If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
-*>
 *> Because of large use of BLAS of level 3, SSYEVD needs N**2 more
 *> workspace than SSYEVX.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ssyevd_2stage.f b/lapack-netlib/SRC/ssyevd_2stage.f
index f3fde6b4a..e63e280a7 100644
--- a/lapack-netlib/SRC/ssyevd_2stage.f
+++ b/lapack-netlib/SRC/ssyevd_2stage.f
@@ -45,12 +45,6 @@
 *> the reduction to tridiagonal. If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f
index 79f12a6f9..3c8bd2a0e 100644
--- a/lapack-netlib/SRC/ssygvd.f
+++ b/lapack-netlib/SRC/ssygvd.f
@@ -42,12 +42,6 @@
 *> B are assumed to be symmetric and B is also positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zgelsd.f b/lapack-netlib/SRC/zgelsd.f
index 01793e16c..b5bc768e8 100644
--- a/lapack-netlib/SRC/zgelsd.f
+++ b/lapack-netlib/SRC/zgelsd.f
@@ -60,12 +60,6 @@
 *> singular values which are less than RCOND times the largest singular
 *> value.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f
index 30d18a3a0..7f203afa5 100644
--- a/lapack-netlib/SRC/zgesdd.f
+++ b/lapack-netlib/SRC/zgesdd.f
@@ -53,12 +53,6 @@
 *>
 *> Note that the routine returns VT = V**H, not V.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhbevd.f b/lapack-netlib/SRC/zhbevd.f
index 0db551540..be9f01556 100644
--- a/lapack-netlib/SRC/zhbevd.f
+++ b/lapack-netlib/SRC/zhbevd.f
@@ -41,12 +41,6 @@
 *> a complex Hermitian band matrix A.  If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhbevd_2stage.f b/lapack-netlib/SRC/zhbevd_2stage.f
index 4522d5e79..e32c7125c 100644
--- a/lapack-netlib/SRC/zhbevd_2stage.f
+++ b/lapack-netlib/SRC/zhbevd_2stage.f
@@ -47,12 +47,6 @@
 *> the reduction to tridiagonal.  If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhbgvd.f b/lapack-netlib/SRC/zhbgvd.f
index b0664750e..4bd02168d 100644
--- a/lapack-netlib/SRC/zhbgvd.f
+++ b/lapack-netlib/SRC/zhbgvd.f
@@ -46,12 +46,6 @@
 *> and banded, and B is also positive definite.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f
index 7f58c7f72..ba52f9e72 100644
--- a/lapack-netlib/SRC/zheevd.f
+++ b/lapack-netlib/SRC/zheevd.f
@@ -41,12 +41,6 @@
 *> complex Hermitian matrix A.  If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zheevd_2stage.f b/lapack-netlib/SRC/zheevd_2stage.f
index 9859b0d67..e697a9823 100644
--- a/lapack-netlib/SRC/zheevd_2stage.f
+++ b/lapack-netlib/SRC/zheevd_2stage.f
@@ -46,12 +46,6 @@
 *> the reduction to tridiagonal.  If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f
index 2c3586517..c9ff55e3d 100644
--- a/lapack-netlib/SRC/zhegvd.f
+++ b/lapack-netlib/SRC/zhegvd.f
@@ -43,12 +43,6 @@
 *> B are assumed to be Hermitian and B is also positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhpevd.f b/lapack-netlib/SRC/zhpevd.f
index 7625c8fe8..5260aaf14 100644
--- a/lapack-netlib/SRC/zhpevd.f
+++ b/lapack-netlib/SRC/zhpevd.f
@@ -41,12 +41,6 @@
 *> a complex Hermitian matrix A in packed storage.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f
index e9688f0c7..dfe92067c 100644
--- a/lapack-netlib/SRC/zhpgvd.f
+++ b/lapack-netlib/SRC/zhpgvd.f
@@ -44,12 +44,6 @@
 *> positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zlaed8.f b/lapack-netlib/SRC/zlaed8.f
index 995a673de..003725820 100644
--- a/lapack-netlib/SRC/zlaed8.f
+++ b/lapack-netlib/SRC/zlaed8.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA,
+*       SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA,
 *                          Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR,
 *                          GIVCOL, GIVNUM, INFO )
 *
@@ -29,7 +29,7 @@
 *       .. Array Arguments ..
 *       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
 *      $                   INDXQ( * ), PERM( * )
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ),
 *      $                   Z( * )
 *       COMPLEX*16         Q( LDQ, * ), Q2( LDQ2, * )
 *       ..
@@ -122,9 +122,9 @@
 *>         destroyed during the updating process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (N)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (N)
 *>         Contains a copy of the first K eigenvalues which will be used
 *>         by DLAED3 to form the secular equation.
 *> \endverbatim
@@ -222,7 +222,7 @@
 *> \ingroup complex16OTHERcomputational
 *
 *  =====================================================================
-      SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA,
+      SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA,
      $                   Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR,
      $                   GIVCOL, GIVNUM, INFO )
 *
@@ -237,7 +237,7 @@
 *     .. Array Arguments ..
       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
      $                   INDXQ( * ), PERM( * )
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ),
      $                   Z( * )
       COMPLEX*16         Q( LDQ, * ), Q2( LDQ2, * )
 *     ..
@@ -322,14 +322,14 @@
          INDXQ( I ) = INDXQ( I ) + CUTPNT
    20 CONTINUE
       DO 30 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
          W( I ) = Z( INDXQ( I ) )
    30 CONTINUE
       I = 1
       J = CUTPNT + 1
-      CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX )
+      CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX )
       DO 40 I = 1, N
-         D( I ) = DLAMDA( INDX( I ) )
+         D( I ) = DLAMBDA( INDX( I ) )
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
@@ -438,7 +438,7 @@
          ELSE
             K = K + 1
             W( K ) = Z( JLAM )
-            DLAMDA( K ) = D( JLAM )
+            DLAMBDA( K ) = D( JLAM )
             INDXP( K ) = JLAM
             JLAM = J
          END IF
@@ -450,19 +450,19 @@
 *
       K = K + 1
       W( K ) = Z( JLAM )
-      DLAMDA( K ) = D( JLAM )
+      DLAMBDA( K ) = D( JLAM )
       INDXP( K ) = JLAM
 *
   100 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       DO 110 J = 1, N
          JP = INDXP( J )
-         DLAMDA( J ) = D( JP )
+         DLAMBDA( J ) = D( JP )
          PERM( J ) = INDXQ( INDX( JP ) )
          CALL ZCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 )
   110 CONTINUE
@@ -471,7 +471,7 @@
 *     into the last N - K slots of D and Q respectively.
 *
       IF( K.LT.N ) THEN
-         CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+         CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
          CALL ZLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ),
      $                LDQ )
       END IF
diff --git a/lapack-netlib/SRC/zlals0.f b/lapack-netlib/SRC/zlals0.f
index 7a7310042..79c0cf5e4 100644
--- a/lapack-netlib/SRC/zlals0.f
+++ b/lapack-netlib/SRC/zlals0.f
@@ -392,6 +392,11 @@
      $                ( POLES( I, 2 ).EQ.ZERO ) ) THEN
                      RWORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine DLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      RWORK( I ) = POLES( I, 2 )*Z( I ) /
      $                            ( DLAMC3( POLES( I, 2 ), DSIGJ )-
      $                            DIFLJ ) / ( POLES( I, 2 )+DJ )
@@ -470,6 +475,11 @@
                   IF( Z( J ).EQ.ZERO ) THEN
                      RWORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine DLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      RWORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1,
      $                            2 ) )-DIFR( I, 1 ) ) /
      $                            ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 )
diff --git a/lapack-netlib/SRC/zlalsd.f b/lapack-netlib/SRC/zlalsd.f
index dca308e56..1d7358aa9 100644
--- a/lapack-netlib/SRC/zlalsd.f
+++ b/lapack-netlib/SRC/zlalsd.f
@@ -48,12 +48,6 @@
 *> problem; in this case a minimum norm solution is returned.
 *> The actual singular values are returned in D in ascending order.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zstedc.f b/lapack-netlib/SRC/zstedc.f
index 74d390af7..e62063a19 100644
--- a/lapack-netlib/SRC/zstedc.f
+++ b/lapack-netlib/SRC/zstedc.f
@@ -43,12 +43,6 @@
 *> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this
 *> matrix to tridiagonal form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See DLAED3 for details.
 *> \endverbatim
 *
 *  Arguments:

From b9fb63c05e06684653b5e44e8a6a3d5037bbbd89 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Jun 2023 16:25:08 +0200
Subject: [PATCH 165/718] Remove OMP-private variable outside OMP context
 (LAPACK PR860)

---
 lapack-netlib/SRC/chetrd_hb2st.F | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F
index e047ab720..30b01ed83 100644
--- a/lapack-netlib/SRC/chetrd_hb2st.F
+++ b/lapack-netlib/SRC/chetrd_hb2st.F
@@ -537,7 +537,7 @@ C                 END IF
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
      $                         WORK ( INDA ), LDA, 
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
-     $                         WORK( INDW + TID*KD ) )
+     $                         WORK( INDW ) )
 #endif 
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1

From 5b6921780868806c14a9a8123c649f31216a9a9c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Jun 2023 16:26:21 +0200
Subject: [PATCH 166/718] Update zhetrd_hb2st.FRemove OMP-private variable
 outside OMP context (LAPACK PR860)

---
 lapack-netlib/SRC/zhetrd_hb2st.F | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F
index e839271a4..1d39ac942 100644
--- a/lapack-netlib/SRC/zhetrd_hb2st.F
+++ b/lapack-netlib/SRC/zhetrd_hb2st.F
@@ -538,7 +538,7 @@ C                 END IF
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
      $                         WORK ( INDA ), LDA, 
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
-     $                         WORK( INDW + TID*KD ) )
+     $                         WORK( INDW ) )
 #endif 
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1

From a0e413ef054993a79cc7909cbc147e22ec2d60c9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Jun 2023 16:28:15 +0200
Subject: [PATCH 167/718] Update ssytrd_sb2st.FRemove OMP-private variable
 outside OMP context (LAPACK PR860)

---
 lapack-netlib/SRC/ssytrd_sb2st.F | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F
index faeeff5f4..b8386670a 100644
--- a/lapack-netlib/SRC/ssytrd_sb2st.F
+++ b/lapack-netlib/SRC/ssytrd_sb2st.F
@@ -506,7 +506,7 @@
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
      $                         WORK ( INDA ), LDA, 
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
-     $                         WORK( INDW + TID*KD ) )
+     $                         WORK( INDW ) )
 #endif 
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1

From 49202228d75b806c6e1ac7fe63e61b3b6493b856 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Jun 2023 16:29:04 +0200
Subject: [PATCH 168/718] Remove OMP-private variable outside OMP context
 (LAPACK PR860)

---
 lapack-netlib/SRC/dsytrd_sb2st.F | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F
index fabc10756..bb74dd491 100644
--- a/lapack-netlib/SRC/dsytrd_sb2st.F
+++ b/lapack-netlib/SRC/dsytrd_sb2st.F
@@ -506,7 +506,7 @@
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
      $                         WORK ( INDA ), LDA, 
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
-     $                         WORK( INDW + TID*KD ) )
+     $                         WORK( INDW ) )
 #endif 
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1

From d6be5036d7495d3471d4e7c0cf8bb791b029e811 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Jun 2023 21:19:33 +0200
Subject: [PATCH 169/718] Fix IDAMAX

---
 kernel/riscv64/iamax_vector.c | 40 +++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c
index 9fea522f7..4242af6ea 100644
--- a/kernel/riscv64/iamax_vector.c
+++ b/kernel/riscv64/iamax_vector.c
@@ -29,8 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <math.h>
 
 #if defined(DOUBLE)
-
-#define ABS fabs
+#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
 #define VSETVL(n) vsetvl_e64m8(n)
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
@@ -54,8 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VADDVX_UINT vadd_vx_u64m8
 #define VMVVX_UINT vmv_v_x_u64m8
 #else
-
-#define ABS fabsf
+#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
 #define VSETVL(n) vsetvl_e32m8(n)
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
@@ -85,7 +83,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
 	FLOAT maxf=0.0;
+#ifdef DOUBLE
+        BLASLONG max_index = 0;
+#else
         unsigned int max_index = 0;
+#endif
 	if (n <= 0 || inc_x <= 0) return(max_index);
 
         FLOAT_V_T vx, v_max;
@@ -117,11 +119,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         j += gvl;
                 }
                 v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                maxf = *((FLOAT*)&v_res);
+                maxf = VFMVFS_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
                 max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
-
+#ifdef DOUBLE
+		max_index = *((BLASLONG *)&v_max_index+max_index);
+#else
+		max_index = *((unsigned int *)&v_max_index+max_index);
+#endif
                 if(j < n){
                         gvl = VSETVL(n-j);
                         vx = VLEV_FLOAT(&x[j], gvl);
@@ -130,7 +135,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
 
                         v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
+                        FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
                         if(cur_maxf > maxf){
                                 //tail index
                                 v_max_index = VIDV_UINT(gvl);
@@ -138,7 +143,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
                                 max_index = VMFIRSTM(mask,gvl);
+#ifdef DOUBLE
+                                max_index = *((BLASLONG*)&v_max_index+max_index);
+#else
                                 max_index = *((unsigned int*)&v_max_index+max_index);
+#endif
                         }
                 }
         }else{
@@ -165,11 +174,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         idx += inc_v;
                 }
                 v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                maxf = *((FLOAT*)&v_res);
+                maxf = VFMVFS_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
                 max_index = VMFIRSTM(mask,gvl);
+#ifdef DOUBLE
+                max_index = *((BLASLONG*)&v_max_index+max_index);
+#else
                 max_index = *((unsigned int*)&v_max_index+max_index);
-
+#endif
                 if(j < n){
                         gvl = VSETVL(n-j);
                         vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
@@ -178,7 +190,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
 
                         v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
+                        FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
                         if(cur_maxf > maxf){
                                 //tail index
                                 v_max_index = VIDV_UINT(gvl);
@@ -186,11 +198,13 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
                                 max_index = VMFIRSTM(mask,gvl);
+#ifdef DOUBLE
+                                max_index = *((BLASLONG*)&v_max_index+max_index);
+#else
                                 max_index = *((unsigned int*)&v_max_index+max_index);
+#endif
                         }
                 }
         }
 	return(max_index+1);
 }
-
-

From 772b0cc71514409511eb4efd4b5770ae40b2f4e7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Jun 2023 16:12:27 +0200
Subject: [PATCH 170/718] Fix early bailout

---
 kernel/riscv64/dot_vector.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c
index f47e0c0b5..cc27d68ed 100644
--- a/kernel/riscv64/dot_vector.c
+++ b/kernel/riscv64/dot_vector.c
@@ -63,7 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG i=0, j=0;
 	double dot = 0.0 ;
 
-	if ( n < 0 )  return(dot);
+	if ( n < 1 )  return(dot);
 
         FLOAT_V_T vr, vx, vy;
         unsigned int gvl = 0;

From ceaee7dc645da97e5e99ee7c0b2a9be8709b32c2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Jun 2023 16:13:23 +0200
Subject: [PATCH 171/718] remove the limitation to -O1 again

---
 Makefile.riscv64 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.riscv64 b/Makefile.riscv64
index 0246c0f7d..ce91e03ec 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -1,4 +1,4 @@
 ifeq ($(CORE), C910V)
-CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -O1
+CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif

From 616fdea82ad91ebb481d7da469d0b238473bc9b7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 28 Jun 2023 09:45:17 +0200
Subject: [PATCH 172/718] Revert "Improve Windows threading performance
 scaling"

---
 CONTRIBUTORS.md                   |   5 --
 driver/others/blas_server_win32.c | 110 +++++++++++++++++-------------
 2 files changed, 61 insertions(+), 54 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 7efc04092..71df13634 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -216,8 +216,3 @@ In chronological order:
   
 * Pablo Romero <https://github.com/pablorcum>
   * [2022-08] Fix building from sources for QNX
-
-* Mark Seminatore <https://github.com/mseminatore>
-  * [2023-06-23] Fix bounds issue in goto_set_num_threads
-  * [2023-06-23] Improve Windows threading performance scaling
-  
\ No newline at end of file
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 0b213bf2c..afa33cccc 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -50,19 +50,11 @@
 
 /* This is a thread implementation for Win32 lazy implementation */
 
-#if defined (__GNUC__) && (__GNUC__ < 6)
-	#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
-#else
-	#if defined(_WIN64)
-		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
-	#else
-		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
-	#endif
-#endif
-
 /* Thread server common information */
 typedef struct{
-  HANDLE taskSemaphore;
+  CRITICAL_SECTION lock;
+  HANDLE filled;
+  HANDLE killed;
 
   blas_queue_t	*queue;    /* Parameter Pointer */
   int		shutdown;  /* server shutdown flag */
@@ -79,6 +71,8 @@ static blas_pool_t   pool;
 static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
 
+
+
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 
       if (!(mode & BLAS_COMPLEX)){
@@ -204,6 +198,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 
 /* This is a main routine of threads. Each thread waits until job is */
 /* queued.                                                           */
+
 static DWORD WINAPI blas_thread_server(void *arg){
 
   /* Thread identifier */
@@ -212,7 +207,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #endif
 
   void *buffer, *sa, *sb;
-  volatile blas_queue_t	*queue;
+  blas_queue_t	*queue;
+  DWORD action;
+  HANDLE handles[] = {pool.filled, pool.killed};
 
   /* Each server needs each buffer */
   buffer   = blas_memory_alloc(2);
@@ -229,32 +226,28 @@ static DWORD WINAPI blas_thread_server(void *arg){
     fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
 #endif
 
-	// all worker threads wait on the semaphore
-	WaitForSingleObject(pool.taskSemaphore, INFINITE);
+    do {
+      action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
+    } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
+
+    if (action == WAIT_OBJECT_0 + 1) break;
 
-	// kill the thread if we are shutting down the server
-	if (pool.shutdown)
-		break;
-    
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
 #endif
 
-	// grab a queued task and update the list
-	volatile blas_queue_t* queue_next;
-	INT_PTR prev_value;
-	do {
-		queue = (volatile blas_queue_t*)pool.queue;
-		if (!queue)
-			break;
+    EnterCriticalSection(&pool.lock);
+
+    queue = pool.queue;
+    if (queue) pool.queue = queue->next;
 
-		queue_next = (volatile blas_queue_t*)queue->next;
-		prev_value = WIN_CAS((INT_PTR*)&pool.queue, (INT_PTR)queue_next, (INT_PTR)queue);
-	} while (prev_value != queue);
+    LeaveCriticalSection(&pool.lock);
 
     if (queue)  {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 
+      if (pool.queue) SetEvent(pool.filled);
+
       sa = queue -> sa;
       sb = queue -> sb;
 
@@ -339,8 +332,13 @@ static DWORD WINAPI blas_thread_server(void *arg){
     fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
 #endif
 
-	// mark our sub-task as complete
-	InterlockedDecrement(&queue->status);
+    EnterCriticalSection(&queue->lock);
+
+    queue -> status = BLAS_STATUS_FINISHED;
+
+    LeaveCriticalSection(&queue->lock);
+
+    SetEvent(queue->finish);
   }
 
   /* Shutdown procedure */
@@ -355,7 +353,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
   }
 
 /* Initializing routine */
-  int blas_thread_init(void){
+int blas_thread_init(void){
   BLASLONG i;
 
   if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
@@ -369,7 +367,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
 
   if (!blas_server_avail){
 
-	pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL);
+    InitializeCriticalSection(&pool.lock);
+    pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
+    pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
 
     pool.shutdown = 0;
     pool.queue    = NULL;
@@ -391,10 +391,11 @@ static DWORD WINAPI blas_thread_server(void *arg){
 /*
    User can call one of two routines.
 
-	 exec_blas_async ... immediately returns after jobs are queued.
+     exec_blas_async ... immediately returns after jobs are queued.
 
-	 exec_blas       ... returns after jobs are finished.
+     exec_blas       ... returns after jobs are finished.
 */
+
 int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
 #if defined(SMP_SERVER)
@@ -408,7 +409,8 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
   current = queue;
 
   while (current) {
-	current->status = 1;
+    InitializeCriticalSection(&current -> lock);
+    current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
     current -> position = pos;
 
 #ifdef CONSISTENT_FPCSR
@@ -420,10 +422,19 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
     pos ++;
   }
 
-  pool.queue = queue;
+  EnterCriticalSection(&pool.lock);
+
+  if (pool.queue) {
+    current = pool.queue;
+    while (current -> next) current = current -> next;
+    current -> next = queue;
+  } else {
+    pool.queue = queue;
+  }
+
+  LeaveCriticalSection(&pool.lock);
 
-  // start up worker threads
-  ReleaseSemaphore(pool.taskSemaphore, pos - 1, NULL);
+  SetEvent(pool.filled);
 
   return 0;
 }
@@ -439,9 +450,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
     fprintf(STDERR, "Waiting Queue ..\n");
 #endif
 
-	// spin-wait on each sub-task to finish
-	while (*((volatile int*)&queue->status))
-		YIELDING;
+      WaitForSingleObject(queue->finish, INFINITE);
+
+      CloseHandle(queue->finish);
+      DeleteCriticalSection(&queue -> lock);
 
       queue = queue -> next;
       num --;
@@ -489,21 +501,18 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
 
 /* Shutdown procedure, but user don't have to call this routine. The */
 /* kernel automatically kill threads.                                */
+
 int BLASFUNC(blas_thread_shutdown)(void){
 
   int i;
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "blas_thread_shutdown..\n");
-#endif
-
   if (!blas_server_avail) return 0;
 
   LOCK_COMMAND(&server_lock);
 
   if (blas_server_avail){
 
-	  pool.shutdown = 1;
+    SetEvent(pool.killed);
 
     for(i = 0; i < blas_num_threads - 1; i++){
       // Could also just use WaitForMultipleObjects
@@ -519,7 +528,8 @@ int BLASFUNC(blas_thread_shutdown)(void){
       CloseHandle(blas_threads[i]);
     }
 
-	CloseHandle(pool.taskSemaphore);
+    CloseHandle(pool.filled);
+    CloseHandle(pool.killed);
 
     blas_server_avail = 0;
   }
@@ -549,14 +559,16 @@ void goto_set_num_threads(int num_threads)
 		//increased_threads = 1;
 	    if (!blas_server_avail){
 
-			pool.taskSemaphore = CreateSemaphore(NULL, 0, blas_cpu_number - 1, NULL);
+			InitializeCriticalSection(&pool.lock);
+			pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
+			pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
 
 			pool.shutdown = 0;
 			pool.queue    = NULL;
 			blas_server_avail = 1;
 		}
 
-		for(i = blas_num_threads; i < num_threads - 1; i++){
+		for(i = blas_num_threads - 1; i < num_threads - 1; i++){
 
 			blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,

From e14a025bb1f5abbd7a71c1524882526d7bc5e28f Mon Sep 17 00:00:00 2001
From: Xianyi Zhang <xianyi@perfxlab.com>
Date: Wed, 28 Jun 2023 11:17:38 +0000
Subject: [PATCH 173/718] Temporily walk around zaxpy vector kernel bug.

---
 kernel/riscv64/KERNEL.C910V | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V
index 0da66fa35..18cb3bafd 100644
--- a/kernel/riscv64/KERNEL.C910V
+++ b/kernel/riscv64/KERNEL.C910V
@@ -42,8 +42,8 @@ ZSUMKERNEL  = ../arm/zsum.c
 
 SAXPYKERNEL  = axpy_vector.c
 DAXPYKERNEL  = axpy_vector.c
-CAXPYKERNEL  = zaxpy_vector.c
-ZAXPYKERNEL  = zaxpy_vector.c
+CAXPYKERNEL  = zaxpy.c
+ZAXPYKERNEL  = zaxpy.c
 
 SAXPBYKERNEL  = axpby_vector.c
 DAXPBYKERNEL  = axpby_vector.c

From f7b916076b1f32ccedbbd44c84ab9784d6fbfda2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 28 Jun 2023 21:30:59 +0200
Subject: [PATCH 174/718] Add Jenkins configuration files for OSUOSL powerci
 and ibmz-ci

---
 Jenkinsfile     | 15 ++++++++++-----
 Jenkinsfile.pwr | 16 ++++++++++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)
 create mode 100644 Jenkinsfile.pwr

diff --git a/Jenkinsfile b/Jenkinsfile
index 2b61bed9f..5fad6a95b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,9 +1,14 @@
-node {
-        stage('Checkout') {
-            checkout
+pipeline {
+    agent { 
+        docker {
+            image 'osuosl/ubuntu-s390x'
         }
-
+    }
+    stages {
         stage('Build') {
-            sh("make")
+            steps {
+                sh 'make'
+            }
         }
+    }
 }
diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr
new file mode 100644
index 000000000..d141ed8a5
--- /dev/null
+++ b/Jenkinsfile.pwr
@@ -0,0 +1,16 @@
+pipeline {
+    agent { 
+        docker {
+            image 'osuosl/ubuntu-ppc64le'
+        }
+    }
+    stages {
+        stage('Build') {
+            steps {
+                sh 'sudo apt update'
+                sh 'sudo apt install gfortran -y'
+                sh 'make'
+            }
+        }
+    }
+}

From bb862b82d54bc3c29e905179035e8839742f1ea9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 29 Jun 2023 23:59:25 +0200
Subject: [PATCH 175/718] Fix integer overflow in multithreading threshold
 calculation for SYMM/SYRK (#4116)

* Fix potential integer overflow
---
 interface/symm.c | 4 ++--
 interface/syrk.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/interface/symm.c b/interface/symm.c
index 3e65e69b1..3e6e0fd48 100644
--- a/interface/symm.c
+++ b/interface/symm.c
@@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO,
   int nodes;
 #endif
 # if defined(SMP)
-  int MN;
+  double MN;
 #endif
   blasint info;
   int side;
@@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
   int nodes;
 #endif
 #if defined(SMP)
-  int MN;
+  double MN;
 #endif
 
   PRINT_DEBUG_CNAME;
diff --git a/interface/syrk.c b/interface/syrk.c
index 3b056aec8..69f2328a4 100644
--- a/interface/syrk.c
+++ b/interface/syrk.c
@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS,
   FLOAT *sa, *sb;
 
 #ifdef SMP
-  int NNK;
+  double NNK;
 #ifdef USE_SIMPLE_THREADED_LEVEL3
 #ifndef COMPLEX
 #ifdef XDOUBLE
@@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
   FLOAT *sa, *sb;
 
 #ifdef SMP
-int NNK;
+double NNK;
 
 #ifdef USE_SIMPLE_THREADED_LEVEL3
 #ifndef COMPLEX

From 6deb52812d61dffcc8e60dd355ca2d763ddbe55e Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Fri, 30 Jun 2023 14:11:01 +0800
Subject: [PATCH 176/718] LoongArch64: Add WhereAmI()

---
 common_loongarch64.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/common_loongarch64.h b/common_loongarch64.h
index e15539b5f..ce1fcf091 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){
   return x / y;
 }
 
+#ifndef NO_AFFINITY
+static inline int WhereAmI(void){
+  int ret = 0, counter = 0;
+  __asm__ volatile (
+    "rdtimel.w  %[counter],   %[id]"
+    : [id]"=r"(ret), [counter]"=r"(counter)
+    :
+    : "memory"
+  );
+  return ret;
+}
+#endif
+
 #ifdef DOUBLE
 #define GET_IMAGE(res)  __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res)  : : "memory")
 #else

From 4d0b7fbec04c95c90291938a5974f00673e10e68 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 2 Jul 2023 12:38:57 +0200
Subject: [PATCH 177/718] Adjust M1 crossbuilds after image autoupgrade and add
 an M1-to-Android crossbuild on Cirrus CI (#4117)

* Fix Apple crossbuilds after Cirrus updated the image; add a crossbuild to Android/ARMV7
---
 .cirrus.yml | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index bef2b1e86..8a1c4a0a8 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -32,7 +32,7 @@ task:
   - make
   
 macos_instance:
-  image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
 task:
   name: AppleM1/LLVM x86_64 xbuild
   compile_script:
@@ -47,10 +47,11 @@ task:
   - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
   - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
   - export VALID_ARCHS="i386 x86_64"
-  - #find /Applications/Xcode-13.4.1.app -name libunwind.dylib
-  - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator15.5.sdk -arch x86_64 -miphoneos-version-min=10.0"
-  - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
+  - xcrun --sdk macosx --show-sdk-path
+  - xcodebuild -version
+  - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
+  - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
   always:
     config_artifacts:
       path: "*conf*"
@@ -60,7 +61,7 @@ task:
 #    type: application/octet-streamm
 
 macos_instance:
-  image: ghcr.io/cirruslabs/macos-monterey-xcode:13.4
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
 task:
   name: AppleM1/LLVM armv8-ios xbuild
   compile_script:
@@ -68,15 +69,33 @@ task:
   - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
   - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
   - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" 
-  - find /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms -name "IPhoneOS*sdk"
-  - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
-  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS15.5.sdk -arch arm64 -miphoneos-version-min=10.0"
+  - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
   - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
   always:
     config_artifacts:
       path: "*conf*"
       type: text/plain
 
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+task:
+  name: AppleM1/LLVM armv7-androidndk xbuild
+  compile_script:
+  - #brew install android-ndk
+  - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" 
+  - find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
+  - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
+  - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
+  - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
+  always:
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
+
 task:
   name: NeoverseN1
   arm_container:

From a32af56761f2df742baf976b07b9b5ff72df5342 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 2 Jul 2023 21:50:15 +0200
Subject: [PATCH 178/718] Add status badges for OSUOSL's POWERCI and IBMZ-CI
 services

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index a2eac07be..081d45870 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,12 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n
 Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
 <!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->
 
+
 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
 
+OSUOSL POWERCI [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
 
+OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
 ## Introduction
 
 OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.

From ac698cedad4bd0840e165a22c0aafdc35a281ae3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 5 Jul 2023 09:47:49 +0200
Subject: [PATCH 179/718] Add compiler options for ARM64 SVE targets in
 DYNAMIC_ARCH builds

---
 cmake/system.cmake | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 3dc6c863e..414193ec8 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -280,7 +280,29 @@ if (DEFINED TARGET)
   if (${TARGET} STREQUAL POWER8)
     set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
   endif()
+
+if (${TARGET} STREQUAL NEOVERSEV1)
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
+    else ()
+	    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
+    endif()
+  endif()
+  if (${TARGET} STREQUAL NEOVERSEN2)
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+    else ()
+	    message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.")
+    endif()
+  endif()
+  if (${TARGET} STREQUAL ARMV8SVE)
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
+  endif()
+
 endif()
+
 if (DEFINED BINARY)
   message(STATUS "Compiling a ${BINARY}-bit binary.")
 endif ()

From 2f9f70584e4d1bb34dee6d273ffa38e9b4474710 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 5 Jul 2023 09:48:54 +0200
Subject: [PATCH 180/718] Fix target list syntax for ARM64 DYNAMIC_ARCH

---
 cmake/arch.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index f70019800..e6e434a0a 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
   if (ARM64)
 	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
     if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
-          set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
+          set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2)
     endif ()
     if (DYNAMIC_LIST)
 	  set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

From 1363a7c4f12f4188869feea554e66585fc52db3b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 5 Jul 2023 10:10:13 +0200
Subject: [PATCH 181/718] Correct the order of eigenvalues/vector for 2x2
 matrices (Reference-LAPACK PR 867)

---
 lapack-netlib/SRC/cstemr.f | 36 +++++++++++++++++++++++++++++-------
 lapack-netlib/SRC/dstemr.f | 36 +++++++++++++++++++++++++++++-------
 lapack-netlib/SRC/sstemr.f | 36 +++++++++++++++++++++++++++++-------
 lapack-netlib/SRC/zstemr.f | 34 ++++++++++++++++++++++++++++------
 4 files changed, 115 insertions(+), 27 deletions(-)

diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f
index d49684db3..9d47450e3 100644
--- a/lapack-netlib/SRC/cstemr.f
+++ b/lapack-netlib/SRC/cstemr.f
@@ -320,7 +320,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@@ -329,7 +329,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
+*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
       SUBROUTINE CSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@@ -361,7 +362,8 @@
      $                     MINRGP = 3.0E-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
+     $                   LAESWAP
       INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
      $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
      $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@@ -397,6 +399,7 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
       ZQUERY = ( NZC.EQ.-1 )
+      LAESWAP = .FALSE.
 
 *     SSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N.
@@ -519,6 +522,15 @@
          ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
             CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
          END IF
+*        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
+*        the following code requires R1 >= R2. Hence, we correct
+*        the order of R1, R2, CS, SN if R1 < R2 before further processing.
+         IF( R1.LT.R2 ) THEN
+            E(2) = R1
+            R1 = R2
+            R2 = E(2)
+            LAESWAP = .TRUE.
+         ENDIF
          IF( ALLEIG.OR.
      $      (VALEIG.AND.(R2.GT.WL).AND.
      $                  (R2.LE.WU)).OR.
@@ -526,8 +538,13 @@
             M = M+1
             W( M ) = R2
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
-               Z( 2, M ) = CS
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ELSE
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -550,8 +567,13 @@
             M = M+1
             W( M ) = R1
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
-               Z( 2, M ) = SN
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ELSE
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
diff --git a/lapack-netlib/SRC/dstemr.f b/lapack-netlib/SRC/dstemr.f
index d0c71ddd9..44a33423e 100644
--- a/lapack-netlib/SRC/dstemr.f
+++ b/lapack-netlib/SRC/dstemr.f
@@ -303,7 +303,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@@ -312,7 +312,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
+*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
       SUBROUTINE DSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@@ -344,7 +345,8 @@
      $                     MINRGP = 1.0D-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
+     $                   LAESWAP
       INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
      $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
      $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@@ -380,6 +382,7 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
       ZQUERY = ( NZC.EQ.-1 )
+      LAESWAP = .FALSE.
 
 *     DSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N.
@@ -502,6 +505,15 @@
          ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
             CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
          END IF
+*        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
+*        the following code requires R1 >= R2. Hence, we correct
+*        the order of R1, R2, CS, SN if R1 < R2 before further processing.
+         IF( R1.LT.R2 ) THEN
+            E(2) = R1
+            R1 = R2
+            R2 = E(2)
+            LAESWAP = .TRUE.
+         ENDIF
          IF( ALLEIG.OR.
      $      (VALEIG.AND.(R2.GT.WL).AND.
      $                  (R2.LE.WU)).OR.
@@ -509,8 +521,13 @@
             M = M+1
             W( M ) = R2
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
-               Z( 2, M ) = CS
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ELSE
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -533,8 +550,13 @@
             M = M+1
             W( M ) = R1
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
-               Z( 2, M ) = SN
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ELSE
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f
index 3a9bbe784..2ed697b69 100644
--- a/lapack-netlib/SRC/sstemr.f
+++ b/lapack-netlib/SRC/sstemr.f
@@ -303,7 +303,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@@ -312,7 +312,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
+*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
       SUBROUTINE SSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@@ -344,7 +345,8 @@
      $                     MINRGP = 3.0E-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
+     $                   LAESWAP
       INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
      $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
      $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@@ -378,6 +380,7 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
       ZQUERY = ( NZC.EQ.-1 )
+      LAESWAP = .FALSE.
 
 *     SSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N.
@@ -500,6 +503,15 @@
          ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
             CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
          END IF
+*        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
+*        the following code requires R1 >= R2. Hence, we correct
+*        the order of R1, R2, CS, SN if R1 < R2 before further processing.
+         IF( R1.LT.R2 ) THEN
+            E(2) = R1
+            R1 = R2
+            R2 = E(2)
+            LAESWAP = .TRUE.
+         ENDIF
          IF( ALLEIG.OR.
      $      (VALEIG.AND.(R2.GT.WL).AND.
      $                  (R2.LE.WU)).OR.
@@ -507,8 +519,13 @@
             M = M+1
             W( M ) = R2
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
-               Z( 2, M ) = CS
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ELSE
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -531,8 +548,13 @@
             M = M+1
             W( M ) = R1
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
-               Z( 2, M ) = SN
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ELSE
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
diff --git a/lapack-netlib/SRC/zstemr.f b/lapack-netlib/SRC/zstemr.f
index b034198de..4eaf5ef97 100644
--- a/lapack-netlib/SRC/zstemr.f
+++ b/lapack-netlib/SRC/zstemr.f
@@ -320,7 +320,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@@ -330,6 +330,7 @@
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
 *> Christof Voemel, University of California, Berkeley, USA \n
+*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
       SUBROUTINE ZSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@@ -361,7 +362,8 @@
      $                     MINRGP = 1.0D-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
+     $                   LAESWAP
       INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
      $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
      $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@@ -397,6 +399,7 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
       ZQUERY = ( NZC.EQ.-1 )
+      LAESWAP = .FALSE.
 
 *     DSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N.
@@ -519,6 +522,15 @@
          ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
             CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
          END IF
+*        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
+*        the following code requires R1 >= R2. Hence, we correct
+*        the order of R1, R2, CS, SN if R1 < R2 before further processing.
+         IF( R1.LT.R2 ) THEN
+            E(2) = R1
+            R1 = R2
+            R2 = E(2)
+            LAESWAP = .TRUE.
+         ENDIF
          IF( ALLEIG.OR.
      $      (VALEIG.AND.(R2.GT.WL).AND.
      $                  (R2.LE.WU)).OR.
@@ -526,8 +538,13 @@
             M = M+1
             W( M ) = R2
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
-               Z( 2, M ) = CS
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ELSE
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -550,8 +567,13 @@
             M = M+1
             W( M ) = R1
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
-               Z( 2, M ) = SN
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ELSE
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN

From e1958eb70529c36d7dc4f3baf9e7bf37524053ab Mon Sep 17 00:00:00 2001
From: Octavian Maghiar <octavian.maghiar@imgtec.com>
Date: Wed, 5 Jul 2023 11:34:00 +0100
Subject: [PATCH 182/718] Fixes RVV masked intrinsics for iamax/iamin/imax/imin
 kernels

Changes masked intrinsics from _m to _mu and reintroduces maskedoff argument.
---
 kernel/riscv64/iamax_rvv.c | 16 ++++++++--------
 kernel/riscv64/iamin_rvv.c | 16 ++++++++--------
 kernel/riscv64/imax_rvv.c  | 16 ++++++++--------
 kernel/riscv64/imin_rvv.c  | 16 ++++++++--------
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c
index ef7850a55..d3508a91d 100644
--- a/kernel/riscv64/iamax_rvv.c
+++ b/kernel/riscv64/iamax_rvv.c
@@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint64m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_mu
 #define VIDV_UINT               __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -71,9 +71,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
 #define VFIRSTM                 __riscv_vfirst_m_b4
 #define UINT_V_T                vuint32m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_mu
 #define VIDV_UINT               __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -106,8 +106,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max
             v_max = VFMAXVV_FLOAT(v_max, vx, vl);
@@ -125,8 +125,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max
             v_max = VFMAXVV_FLOAT(v_max, vx, vl);
diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c
index 56a086fed..ae1d4f726 100644
--- a/kernel/riscv64/iamin_rvv.c
+++ b/kernel/riscv64/iamin_rvv.c
@@ -46,9 +46,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint64m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_mu
 #define VIDV_UINT               __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -72,9 +72,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
 #define VFIRSTM                 __riscv_vfirst_m_b4
 #define UINT_V_T                vuint32m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_mu
 #define VIDV_UINT               __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -107,8 +107,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx, vl);
@@ -126,8 +126,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx, vl);
diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c
index 5b60a56f7..33250568d 100644
--- a/kernel/riscv64/imax_rvv.c
+++ b/kernel/riscv64/imax_rvv.c
@@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint64m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_mu
 #define VIDV_UINT               __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -70,9 +70,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
 #define VFIRSTM                 __riscv_vfirst_m_b4
 #define UINT_V_T                vuint32m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_mu
 #define VIDV_UINT               __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max and start_index j
             v_max = VFMAXVV_FLOAT(v_max, vx, vl);
@@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max and start_index j
             v_max = VFMAXVV_FLOAT(v_max, vx, vl);
diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c
index b49544a1b..4ce49c3af 100644
--- a/kernel/riscv64/imin_rvv.c
+++ b/kernel/riscv64/imin_rvv.c
@@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint64m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_mu
 #define VIDV_UINT               __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -70,9 +70,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
 #define VFIRSTM                 __riscv_vfirst_m_b4
 #define UINT_V_T                vuint32m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_mu
 #define VIDV_UINT               __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -104,8 +104,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx, vl);
@@ -122,8 +122,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx, vl);

From 069353bd445235deca47b4c902346432b24450d1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 5 Jul 2023 14:13:58 +0200
Subject: [PATCH 183/718] Add reciprocal scaling of a complex vector and use it
 in C/ZGETF2 (Reference-LAPACK PR839)

---
 lapack-netlib/SRC/cgetf2.f |  25 ++---
 lapack-netlib/SRC/crscl.f  | 202 ++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zgetf2.f |  17 +---
 lapack-netlib/SRC/zrscl.f  | 203 +++++++++++++++++++++++++++++++++++++
 4 files changed, 416 insertions(+), 31 deletions(-)
 create mode 100644 lapack-netlib/SRC/crscl.f
 create mode 100644 lapack-netlib/SRC/zrscl.f

diff --git a/lapack-netlib/SRC/cgetf2.f b/lapack-netlib/SRC/cgetf2.f
index aac989970..995ee40ec 100644
--- a/lapack-netlib/SRC/cgetf2.f
+++ b/lapack-netlib/SRC/cgetf2.f
@@ -101,7 +101,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup getf2
 *
 *  =====================================================================
       SUBROUTINE CGETF2( M, N, A, LDA, IPIV, INFO )
@@ -126,16 +126,14 @@
      $                   ZERO = ( 0.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
-      REAL               SFMIN
-      INTEGER            I, J, JP
+      INTEGER            J, JP
 *     ..
 *     .. External Functions ..
-      REAL               SLAMCH
       INTEGER            ICAMAX
-      EXTERNAL           SLAMCH, ICAMAX
+      EXTERNAL           ICAMAX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGERU, CSCAL, CSWAP, XERBLA
+      EXTERNAL           CGERU, CRSCL, CSWAP, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -161,10 +159,6 @@
 *
       IF( M.EQ.0 .OR. N.EQ.0 )
      $   RETURN
-*
-*     Compute machine safe minimum
-*
-      SFMIN = SLAMCH('S')
 *
       DO 10 J = 1, MIN( M, N )
 *
@@ -181,15 +175,8 @@
 *
 *           Compute elements J+1:M of J-th column.
 *
-            IF( J.LT.M ) THEN
-               IF( ABS(A( J, J )) .GE. SFMIN ) THEN
-                  CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 )
-               ELSE
-                  DO 20 I = 1, M-J
-                     A( J+I, J ) = A( J+I, J ) / A( J, J )
-   20             CONTINUE
-               END IF
-            END IF
+            IF( J.LT.M )
+     $         CALL CRSCL( M-J, A( J, J ), A( J+1, J ), 1 )
 *
          ELSE IF( INFO.EQ.0 ) THEN
 *
diff --git a/lapack-netlib/SRC/crscl.f b/lapack-netlib/SRC/crscl.f
new file mode 100644
index 000000000..22919cd62
--- /dev/null
+++ b/lapack-netlib/SRC/crscl.f
@@ -0,0 +1,202 @@
+*> \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CRSCL + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crscl.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crscl.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crscl.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CRSCL( N, A, X, INCX )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       COMPLEX            A
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            X( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CRSCL multiplies an n-element complex vector x by the complex scalar
+*> 1/a.  This is done without overflow or underflow as long as
+*> the final result x/a does not overflow or underflow.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of components of the vector x.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX
+*>          The scalar a which is used to divide each component of x.
+*>          A must not be 0, or the subroutine will divide by zero.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is COMPLEX array, dimension
+*>                         (1+(N-1)*abs(INCX))
+*>          The n-element vector x.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between successive values of the vector X.
+*>          > 0:  X(1) = X(1) and X(1+(i-1)*INCX) = x(i),     1< i<= n
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE CRSCL( N, A, X, INCX )
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      COMPLEX            A
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            X( * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      REAL               SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR
+     %                   , UI
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH
+      COMPLEX            CLADIV
+      EXTERNAL           SLAMCH, CLADIV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CSCAL, CSSCAL, CSRSCL
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.LE.0 )
+     $   RETURN
+*
+*     Get machine parameters
+*
+      SAFMIN = SLAMCH( 'S' )
+      SAFMAX = ONE / SAFMIN
+      OV   = SLAMCH( 'O' )
+*
+*     Initialize constants related to A.
+*
+      AR = REAL( A )
+      AI = AIMAG( A )
+      ABSR = ABS( AR )
+      ABSI = ABS( AI )
+*
+      IF( AI.EQ.ZERO ) THEN
+*        If alpha is real, then we can use csrscl
+         CALL CSRSCL( N, AR, X, INCX )
+*
+      ELSE IF( AR.EQ.ZERO ) THEN
+*        If alpha has a zero real part, then we follow the same rules as if
+*        alpha were real.
+         IF( ABSI.GT.SAFMAX ) THEN
+            CALL CSSCAL( N, SAFMIN, X, INCX )
+            CALL CSCAL( N, CMPLX( ZERO, -SAFMAX / AI ), X, INCX )
+         ELSE IF( ABSI.LT.SAFMIN ) THEN
+            CALL CSCAL( N, CMPLX( ZERO, -SAFMIN / AI ), X, INCX )
+            CALL CSSCAL( N, SAFMAX, X, INCX )
+         ELSE
+            CALL CSCAL( N, CMPLX( ZERO, -ONE / AI ), X, INCX )
+         END IF
+*
+      ELSE
+*        The following numbers can be computed.
+*        They are the inverse of the real and imaginary parts of 1/alpha.
+*        Note that a and b are always different from zero.
+*        NaNs are only possible if either:
+*        1. alphaR or alphaI is NaN.
+*        2. alphaR and alphaI are both infinite, in which case it makes sense
+*        to propagate a NaN.
+         UR = AR + AI * ( AI / AR )
+         UI = AI + AR * ( AR / AI )
+*
+         IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN
+*           This means that both alphaR and alphaI are very small.
+            CALL CSCAL( N, CMPLX( SAFMIN / UR, -SAFMIN / UI ), X, INCX )
+            CALL CSSCAL( N, SAFMAX, X, INCX )
+         ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN
+            IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN
+*              This means that a and b are both Inf. No need for scaling.
+               CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
+            ELSE
+               CALL CSSCAL( N, SAFMIN, X, INCX )
+               IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN
+*                 Infs were generated. We do proper scaling to avoid them.
+                  IF( ABSR.GE.ABSI ) THEN
+*                    ABS( UR ) <= ABS( UI )
+                     UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR ))
+                     UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI )
+                  ELSE
+*                    ABS( UR ) > ABS( UI )
+                     UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR )
+                     UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI ))
+                  END IF
+                  CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
+               ELSE
+                  CALL CSCAL( N, CMPLX( SAFMAX / UR, -SAFMAX / UI ),
+     $                        X, INCX )
+               END IF
+            END IF
+         ELSE
+            CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CRSCL
+*
+      END
diff --git a/lapack-netlib/SRC/zgetf2.f b/lapack-netlib/SRC/zgetf2.f
index c247f8645..7c63dbbee 100644
--- a/lapack-netlib/SRC/zgetf2.f
+++ b/lapack-netlib/SRC/zgetf2.f
@@ -101,7 +101,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup getf2
 *
 *  =====================================================================
       SUBROUTINE ZGETF2( M, N, A, LDA, IPIV, INFO )
@@ -127,7 +127,7 @@
 *     ..
 *     .. Local Scalars ..
       DOUBLE PRECISION   SFMIN
-      INTEGER            I, J, JP
+      INTEGER            J, JP
 *     ..
 *     .. External Functions ..
       DOUBLE PRECISION   DLAMCH
@@ -135,7 +135,7 @@
       EXTERNAL           DLAMCH, IZAMAX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZGERU, ZSCAL, ZSWAP
+      EXTERNAL           XERBLA, ZGERU, ZRSCL, ZSWAP
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -181,15 +181,8 @@
 *
 *           Compute elements J+1:M of J-th column.
 *
-            IF( J.LT.M ) THEN
-               IF( ABS(A( J, J )) .GE. SFMIN ) THEN
-                  CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 )
-               ELSE
-                  DO 20 I = 1, M-J
-                     A( J+I, J ) = A( J+I, J ) / A( J, J )
-   20             CONTINUE
-               END IF
-            END IF
+            IF( J.LT.M )
+     $         CALL ZRSCL( M-J, A( J, J ), A( J+1, J ), 1 )
 *
          ELSE IF( INFO.EQ.0 ) THEN
 *
diff --git a/lapack-netlib/SRC/zrscl.f b/lapack-netlib/SRC/zrscl.f
new file mode 100644
index 000000000..970f6de75
--- /dev/null
+++ b/lapack-netlib/SRC/zrscl.f
@@ -0,0 +1,203 @@
+*> \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZDRSCL + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zdrscl.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zdrscl.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zdrscl.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZRSCL( N, A, X, INCX )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       COMPLEX*16         A
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         X( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZRSCL multiplies an n-element complex vector x by the complex scalar
+*> 1/a.  This is done without overflow or underflow as long as
+*> the final result x/a does not overflow or underflow.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of components of the vector x.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX*16
+*>          The scalar a which is used to divide each component of x.
+*>          A must not be 0, or the subroutine will divide by zero.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is COMPLEX*16 array, dimension
+*>                         (1+(N-1)*abs(INCX))
+*>          The n-element vector x.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between successive values of the vector SX.
+*>          > 0:  SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i),     1< i<= n
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE ZRSCL( N, A, X, INCX )
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      COMPLEX*16         A
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         X( * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION   SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR, UI
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH
+      COMPLEX*16         ZLADIV
+      EXTERNAL           DLAMCH, ZLADIV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DSCAL, ZDSCAL, ZDRSCL
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.LE.0 )
+     $   RETURN
+*
+*     Get machine parameters
+*
+      SAFMIN = DLAMCH( 'S' )
+      SAFMAX = ONE / SAFMIN
+      OV   = DLAMCH( 'O' )
+*
+*     Initialize constants related to A.
+*
+      AR = DBLE( A )
+      AI = DIMAG( A )
+      ABSR = ABS( AR )
+      ABSI = ABS( AI )
+*
+      IF( AI.EQ.ZERO ) THEN
+*        If alpha is real, then we can use csrscl
+         CALL ZDRSCL( N, AR, X, INCX )
+*
+      ELSE IF( AR.EQ.ZERO ) THEN
+*        If alpha has a zero real part, then we follow the same rules as if
+*        alpha were real.
+         IF( ABSI.GT.SAFMAX ) THEN
+            CALL ZDSCAL( N, SAFMIN, X, INCX )
+            CALL ZSCAL( N, DCMPLX( ZERO, -SAFMAX / AI ), X, INCX )
+         ELSE IF( ABSI.LT.SAFMIN ) THEN
+            CALL ZSCAL( N, DCMPLX( ZERO, -SAFMIN / AI ), X, INCX )
+            CALL ZDSCAL( N, SAFMAX, X, INCX )
+         ELSE
+            CALL ZSCAL( N, DCMPLX( ZERO, -ONE / AI ), X, INCX )
+         END IF
+*
+      ELSE
+*        The following numbers can be computed.
+*        They are the inverse of the real and imaginary parts of 1/alpha.
+*        Note that a and b are always different from zero.
+*        NaNs are only possible if either:
+*        1. alphaR or alphaI is NaN.
+*        2. alphaR and alphaI are both infinite, in which case it makes sense
+*        to propagate a NaN.
+         UR = AR + AI * ( AI / AR )
+         UI = AI + AR * ( AR / AI )
+*
+         IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN
+*           This means that both alphaR and alphaI are very small.
+            CALL ZSCAL( N, DCMPLX( SAFMIN / UR, -SAFMIN / UI ), X,
+     $                  INCX )
+            CALL ZDSCAL( N, SAFMAX, X, INCX )
+         ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN
+            IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN
+*              This means that a and b are both Inf. No need for scaling.
+               CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX )
+            ELSE
+               CALL ZDSCAL( N, SAFMIN, X, INCX )
+               IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN
+*                 Infs were generated. We do proper scaling to avoid them.
+                  IF( ABSR.GE.ABSI ) THEN
+*                    ABS( UR ) <= ABS( UI )
+                     UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR ))
+                     UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI )
+                  ELSE
+*                    ABS( UR ) > ABS( UI )
+                     UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR )
+                     UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI ))
+                  END IF
+                  CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X,
+     $                        INCX )
+               ELSE
+                  CALL ZSCAL( N, DCMPLX( SAFMAX / UR, -SAFMAX / UI ),
+     $                        X, INCX )
+               END IF
+            END IF
+         ELSE
+            CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX )
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZRSCL
+*
+      END

From 1d399ad0d6c7b7d1cb99f7385ac9c30d82fbd41e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 5 Jul 2023 14:17:26 +0200
Subject: [PATCH 184/718] Add CRSCL/ZRSCL (Reference-LAPACK PR839)

---
 lapack-netlib/SRC/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 74db14e46..c75fd5f49 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -280,7 +280,7 @@ CLASRC_O = \
    cposv.o  cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \
    cppcon.o cppequ.o cpprfs.o cppsv.o  cppsvx.o cpptrf.o cpptri.o cpptrs.o \
    cptcon.o cpteqr.o cptrfs.o cptsv.o  cptsvx.o cpttrf.o cpttrs.o cptts2.o \
-   crot.o   cspcon.o cspmv.o  cspr.o   csprfs.o cspsv.o \
+   crot.o crscl.o cspcon.o cspmv.o  cspr.o   csprfs.o cspsv.o \
    cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \
    cstegr.o cstein.o csteqr.o \
    csycon.o csymv.o \
@@ -488,7 +488,7 @@ ZLASRC_O = \
    zposv.o  zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \
    zppcon.o zppequ.o zpprfs.o zppsv.o  zppsvx.o zpptrf.o zpptri.o zpptrs.o \
    zptcon.o zpteqr.o zptrfs.o zptsv.o  zptsvx.o zpttrf.o zpttrs.o zptts2.o \
-   zrot.o   zspcon.o zspmv.o  zspr.o   zsprfs.o zspsv.o \
+   zrot.o zrscl.o  zspcon.o zspmv.o  zspr.o   zsprfs.o zspsv.o \
    zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \
    zstegr.o zstein.o zsteqr.o \
    zsycon.o zsymv.o \

From e3277e134cafff50bfd335da1d3a39a5112cf651 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 5 Jul 2023 14:22:59 +0200
Subject: [PATCH 185/718] Add CRSCL/ZRSCL (Reference-LAPACK PR 839)

---
 cmake/lapack.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index d339f0ce9..ce5d0831f 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -187,7 +187,7 @@ set(CLASRC
    cposv.f  cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
    cppcon.f cppequ.f cpprfs.f cppsv.f  cppsvx.f cpptrf.f cpptri.f cpptrs.f
    cptcon.f cpteqr.f cptrfs.f cptsv.f  cptsvx.f cpttrf.f cpttrs.f cptts2.f
-   crot.f   cspcon.f csprfs.f cspsv.f
+   crot.f crscl.f cspcon.f csprfs.f cspsv.f
    cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
    cstegr.f cstein.f csteqr.f csycon.f
    csyrfs.f csysv.f  csysvx.f csytf2.f csytrf.f csytri.f
@@ -381,7 +381,7 @@ set(ZLASRC
    zposv.f  zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
    zppcon.f zppequ.f zpprfs.f zppsv.f  zppsvx.f zpptrf.f zpptri.f zpptrs.f
    zptcon.f zpteqr.f zptrfs.f zptsv.f  zptsvx.f zpttrf.f zpttrs.f zptts2.f
-   zrot.f   zspcon.f zsprfs.f zspsv.f
+   zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
    zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
    zstegr.f zstein.f zsteqr.f zsycon.f
    zsyrfs.f zsysv.f  zsysvx.f zsytf2.f zsytrf.f zsytri.f

From bed3a6a30404ae54770a8b059b32e5f6b0d1fb96 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 7 Jul 2023 10:13:41 +0200
Subject: [PATCH 186/718] Fix segfault when NRHS is zero (Reference-LAPACK PR
 876)

---
 lapack-netlib/SRC/cgelss.f | 12 +++++-------
 lapack-netlib/SRC/dgelss.f | 11 +++++------
 lapack-netlib/SRC/sgelss.f | 11 +++++------
 lapack-netlib/SRC/zgelss.f | 16 +++++++---------
 4 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f
index da6b9092f..d1e38c504 100644
--- a/lapack-netlib/SRC/cgelss.f
+++ b/lapack-netlib/SRC/cgelss.f
@@ -170,7 +170,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
       SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@@ -214,8 +214,7 @@
 *     .. External Subroutines ..
       EXTERNAL           CBDSQR, CCOPY, CGEBRD, CGELQF, CGEMM, CGEMV,
      $                   CGEQRF, CLACPY, CLASCL, CLASET, CSRSCL, CUNGBR,
-     $                   CUNMBR, CUNMLQ, CUNMQR, SLABAD, SLASCL, SLASET,
-     $                   XERBLA
+     $                   CUNMBR, CUNMLQ, CUNMQR, SLASCL, SLASET, XERBLA
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
@@ -388,7 +387,6 @@
       SFMIN = SLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@@ -540,7 +538,7 @@
      $                     LDB, CZERO, WORK, N )
                CALL CLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
    20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL CGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
             CALL CCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -645,7 +643,7 @@
                CALL CLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
      $                      LDB )
    40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL CGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ),
      $                  1, CZERO, WORK( IWORK ), 1 )
             CALL CCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@@ -737,7 +735,7 @@
      $                     LDB, CZERO, WORK, N )
                CALL CLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
    60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL CGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
             CALL CCOPY( N, WORK, 1, B, 1 )
          END IF
diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f
index c4190f2e0..38449be7f 100644
--- a/lapack-netlib/SRC/dgelss.f
+++ b/lapack-netlib/SRC/dgelss.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
       SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@@ -203,7 +203,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DBDSQR, DCOPY, DGEBRD, DGELQF, DGEMM, DGEMV,
-     $                   DGEQRF, DLABAD, DLACPY, DLASCL, DLASET, DORGBR,
+     $                   DGEQRF, DLACPY, DLASCL, DLASET, DORGBR,
      $                   DORMBR, DORMLQ, DORMQR, DRSCL, XERBLA
 *     ..
 *     .. External Functions ..
@@ -385,7 +385,6 @@
       SFMIN = DLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@@ -529,7 +528,7 @@
      $                     LDB, ZERO, WORK, N )
                CALL DLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
    20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL DGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
             CALL DCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -626,7 +625,7 @@
                CALL DLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
      $                      LDB )
    40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL DGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ),
      $                  1, ZERO, WORK( IWORK ), 1 )
             CALL DCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@@ -712,7 +711,7 @@
      $                     LDB, ZERO, WORK, N )
                CALL DLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
    60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL DGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
             CALL DCOPY( N, WORK, 1, B, 1 )
          END IF
diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f
index 9aed4329f..89d3a6e4f 100644
--- a/lapack-netlib/SRC/sgelss.f
+++ b/lapack-netlib/SRC/sgelss.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
       SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@@ -202,7 +202,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SBDSQR, SCOPY, SGEBRD, SGELQF, SGEMM, SGEMV,
-     $                   SGEQRF, SLABAD, SLACPY, SLASCL, SLASET, SORGBR,
+     $                   SGEQRF, SLACPY, SLASCL, SLASET, SORGBR,
      $                   SORMBR, SORMLQ, SORMQR, SRSCL, XERBLA
 *     ..
 *     .. External Functions ..
@@ -381,7 +381,6 @@
       SFMIN = SLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@@ -525,7 +524,7 @@
      $                     LDB, ZERO, WORK, N )
                CALL SLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
    20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL SGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
             CALL SCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -622,7 +621,7 @@
                CALL SLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
      $                      LDB )
    40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL SGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ),
      $                  1, ZERO, WORK( IWORK ), 1 )
             CALL SCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@@ -708,7 +707,7 @@
      $                     LDB, ZERO, WORK, N )
                CALL SLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
    60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL SGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
             CALL SCOPY( N, WORK, 1, B, 1 )
          END IF
diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f
index be53ba95b..afdbaecf0 100644
--- a/lapack-netlib/SRC/zgelss.f
+++ b/lapack-netlib/SRC/zgelss.f
@@ -170,7 +170,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
       SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@@ -212,10 +212,9 @@
       COMPLEX*16         DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY,
-     $                   ZDRSCL, ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF,
-     $                   ZLACPY, ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ,
-     $                   ZUNMQR
+      EXTERNAL           DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, ZDRSCL,
+     $                   ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, ZLACPY,
+     $                   ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
@@ -388,7 +387,6 @@
       SFMIN = DLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@@ -540,7 +538,7 @@
      $                     LDB, CZERO, WORK, N )
                CALL ZLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
    20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL ZGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
             CALL ZCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -645,7 +643,7 @@
                CALL ZLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
      $                      LDB )
    40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL ZGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ),
      $                  1, CZERO, WORK( IWORK ), 1 )
             CALL ZCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@@ -737,7 +735,7 @@
      $                     LDB, CZERO, WORK, N )
                CALL ZLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
    60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL ZGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
             CALL ZCOPY( N, WORK, 1, B, 1 )
          END IF

From 63ec2ad3954347fcca04e2d4687a076205ba19f6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 8 Jul 2023 10:00:05 +0200
Subject: [PATCH 187/718] Fix computation of UPLO in LAPACKE_?larfb
 (Reference-LAPACK PR 878)

---
 lapack-netlib/LAPACKE/src/lapacke_clarfb.c      | 2 +-
 lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c | 2 +-
 lapack-netlib/LAPACKE/src/lapacke_dlarfb.c      | 2 +-
 lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c | 2 +-
 lapack-netlib/LAPACKE/src/lapacke_slarfb.c      | 2 +-
 lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c | 2 +-
 lapack-netlib/LAPACKE/src/lapacke_zlarfb.c      | 2 +-
 lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c
index ed12b476e..aac7b551d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c
@@ -58,7 +58,7 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
             LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c
index 545769b83..67bbbd34f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c
@@ -60,7 +60,7 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans,
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         ldc_t = MAX(1,m);
         ldt_t = MAX(1,k);
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c
index f4ddc62a5..aeebd8dec 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c
@@ -57,7 +57,7 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
             LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c
index de444c146..de2f41e66 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c
@@ -59,7 +59,7 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans,
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         ldc_t = MAX(1,m);
         ldt_t = MAX(1,k);
diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c
index d36958f93..3d6c29f88 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c
@@ -57,7 +57,7 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
             LAPACKE_xerbla( "LAPACKE_slarfb", -8 );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c
index 8b6127633..72a392a77 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c
@@ -59,7 +59,7 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans,
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         ldc_t = MAX(1,m);
         ldt_t = MAX(1,k);
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c
index 85355b202..c5edbbc0e 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c
@@ -58,7 +58,7 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
             LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c
index 72d85ec82..232c8ef58 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c
@@ -60,7 +60,7 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans,
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         ldc_t = MAX(1,m);
         ldt_t = MAX(1,k);

From 7e93ab1b9e831c5bc229c3b4989f2d813472d928 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 9 Jul 2023 17:00:25 +0200
Subject: [PATCH 188/718] Fix info code returned for invalid ldb

---
 interface/imatcopy.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/interface/imatcopy.c b/interface/imatcopy.c
index c4417e99c..4cf0966cc 100644
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTrans    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < *rows ) info = 8;
+        	if ( trans == BlasTrans    &&  *ldb < *cols ) info = 8;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTrans    &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < *cols ) info = 8;
+        	if ( trans == BlasTrans    &&  *ldb < *rows ) info = 8;
 	}
 
 	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;

From a721fccfdcbf54ab9d573e8e8259c6272636edc7 Mon Sep 17 00:00:00 2001
From: Felix Yan <felixonmars@archlinux.org>
Date: Tue, 11 Jul 2023 16:34:20 +0300
Subject: [PATCH 189/718] Fix riscv64 detection in system_check.cmake

---
 cmake/system_check.cmake | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index c59e85d54..2b43a5a14 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
   set(MIPS64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
   set(LOONGARCH64 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
+  set(RISCV64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
   if (NOT BINARY)
     if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
@@ -107,7 +109,7 @@ else()
 endif ()
 
 if (NOT BINARY)
-  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
+  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
     set(BINARY 64)
   else ()
     set(BINARY 32)

From 35dedb68ce9b6be845ac900fc645d996753ac9e6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Jul 2023 17:07:30 +0200
Subject: [PATCH 190/718] Add C versions of C/ZRSCL

---
 cmake/lapack.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index ce5d0831f..5c6290484 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -686,7 +686,7 @@ set(CLASRC
    cposv.c  cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
    cppcon.c cppequ.c cpprfs.c cppsv.c  cppsvx.c cpptrf.c cpptri.c cpptrs.c
    cptcon.c cpteqr.c cptrfs.c cptsv.c  cptsvx.c cpttrf.c cpttrs.c cptts2.c
-   crot.c   cspcon.c csprfs.c cspsv.c
+   crot.c crscl.c  cspcon.c csprfs.c cspsv.c
    cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
    cstegr.c cstein.c csteqr.c csycon.c
    csyrfs.c csysv.c  csysvx.c csytf2.c csytrf.c csytri.c
@@ -878,7 +878,7 @@ set(ZLASRC
    zposv.c  zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
    zppcon.c zppequ.c zpprfs.c zppsv.c  zppsvx.c zpptrf.c zpptri.c zpptrs.c
    zptcon.c zpteqr.c zptrfs.c zptsv.c  zptsvx.c zpttrf.c zpttrs.c zptts2.c
-   zrot.c   zspcon.c zsprfs.c zspsv.c
+   zrot.c zrscl.c  zspcon.c zsprfs.c zspsv.c
    zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
    zstegr.c zstein.c zsteqr.c zsycon.c
    zsyrfs.c zsysv.c  zsysvx.c zsytf2.c zsytrf.c zsytri.c

From afef854863fc48ee6a520636af9e49216f18d82e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 11 Jul 2023 17:08:27 +0200
Subject: [PATCH 191/718] Add C versions of C/ZRSCL

---
 lapack-netlib/SRC/crscl.c | 735 ++++++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zrscl.c | 735 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 1470 insertions(+)
 create mode 100644 lapack-netlib/SRC/crscl.c
 create mode 100644 lapack-netlib/SRC/zrscl.c

diff --git a/lapack-netlib/SRC/crscl.c b/lapack-netlib/SRC/crscl.c
new file mode 100644
index 000000000..7c87553d5
--- /dev/null
+++ b/lapack-netlib/SRC/crscl.c
@@ -0,0 +1,735 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* > \brief \b CRSCL multiplies a vector by the reciprocal of a real scalar. */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download CRSCL + dependencies */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crscl.f
+"> */
+/* > [TGZ]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crscl.f
+"> */
+/* > [ZIP]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crscl.f
+"> */
+/* > [TXT]</a> */
+/* > \endhtmlonly */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE CRSCL( N, A, X, INCX ) */
+
+/*       INTEGER            INCX, N */
+/*       COMPLEX            A */
+/*       COMPLEX            X( * ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > CRSCL multiplies an n-element complex vector x by the complex scalar */
+/* > 1/a.  This is done without overflow or underflow as long as */
+/* > the final result x/a does not overflow or underflow. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of components of the vector x. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX */
+/* >          The scalar a which is used to divide each component of x. */
+/* >          A must not be 0, or the subroutine will divide by zero. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] X */
+/* > \verbatim */
+/* >          X is COMPLEX array, dimension */
+/* >                         (1+(N-1)*abs(INCX)) */
+/* >          The n-element vector x. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* >          INCX is INTEGER */
+/* >          The increment between successive values of the vector X. */
+/* >          > 0:  X(1) = X(1) and X(1+(i-1)*INCX) = x(i),     1< i<= n */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complexOTHERauxiliary */
+
+/*  ===================================================================== */
+/* Subroutine */ int crscl_(integer *n, complex *a, complex *x, integer *incx)
+{
+    /* System generated locals */
+    real r__1, r__2;
+    complex q__1;
+
+    /* Local variables */
+    real absi, absr;
+    extern /* Subroutine */ int cscal_(integer *, complex *, complex *, 
+	    integer *);
+    real ai, ar, ui, ov, ur;
+    extern real slamch_(char *);
+    extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer 
+	    *);
+    real safmin, safmax;
+    extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer 
+	    *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/* ===================================================================== */
+
+
+/*     Quick return if possible */
+
+    /* Parameter adjustments */
+    --x;
+
+    /* Function Body */
+    if (*n <= 0) {
+	return 0;
+    }
+
+/*     Get machine parameters */
+
+    safmin = slamch_("S");
+    safmax = 1.f / safmin;
+    ov = slamch_("O");
+
+/*     Initialize constants related to A. */
+
+    ar = a->r;
+    ai = r_imag(a);
+    absr = abs(ar);
+    absi = abs(ai);
+
+    if (ai == 0.f) {
+/*        If alpha is real, then we can use csrscl */
+	csrscl_(n, &ar, &x[1], incx);
+
+    } else if (ar == 0.f) {
+/*        If alpha has a zero real part, then we follow the same rules as if */
+/*        alpha were real. */
+	if (absi > safmax) {
+	    csscal_(n, &safmin, &x[1], incx);
+	    r__1 = -safmax / ai;
+	    q__1.r = 0.f, q__1.i = r__1;
+	    cscal_(n, &q__1, &x[1], incx);
+	} else if (absi < safmin) {
+	    r__1 = -safmin / ai;
+	    q__1.r = 0.f, q__1.i = r__1;
+	    cscal_(n, &q__1, &x[1], incx);
+	    csscal_(n, &safmax, &x[1], incx);
+	} else {
+	    r__1 = -1.f / ai;
+	    q__1.r = 0.f, q__1.i = r__1;
+	    cscal_(n, &q__1, &x[1], incx);
+	}
+
+    } else {
+/*        The following numbers can be computed. */
+/*        They are the inverse of the real and imaginary parts of 1/alpha. */
+/*        Note that a and b are always different from zero. */
+/*        NaNs are only possible if either: */
+/*        1. alphaR or alphaI is NaN. */
+/*        2. alphaR and alphaI are both infinite, in which case it makes sense */
+/*        to propagate a NaN. */
+	ur = ar + ai * (ai / ar);
+	ui = ai + ar * (ar / ai);
+
+	if (abs(ur) < safmin || abs(ui) < safmin) {
+/*           This means that both alphaR and alphaI are very small. */
+	    r__1 = safmin / ur;
+	    r__2 = -safmin / ui;
+	    q__1.r = r__1, q__1.i = r__2;
+	    cscal_(n, &q__1, &x[1], incx);
+	    csscal_(n, &safmax, &x[1], incx);
+	} else if (abs(ur) > safmax || abs(ui) > safmax) {
+	    if (absr > ov || absi > ov) {
+/*              This means that a and b are both Inf. No need for scaling. */
+		r__1 = 1.f / ur;
+		r__2 = -1.f / ui;
+		q__1.r = r__1, q__1.i = r__2;
+		cscal_(n, &q__1, &x[1], incx);
+	    } else {
+		csscal_(n, &safmin, &x[1], incx);
+		if (abs(ur) > ov || abs(ui) > ov) {
+/*                 Infs were generated. We do proper scaling to avoid them. */
+		    if (absr >= absi) {
+/*                    ABS( UR ) <= ABS( UI ) */
+			ur = safmin * ar + safmin * (ai * (ai / ar));
+			ui = safmin * ai + ar * (safmin * ar / ai);
+		    } else {
+/*                    ABS( UR ) > ABS( UI ) */
+			ur = safmin * ar + ai * (safmin * ai / ar);
+			ui = safmin * ai + safmin * (ar * (ar / ai));
+		    }
+		    r__1 = 1.f / ur;
+		    r__2 = -1.f / ui;
+		    q__1.r = r__1, q__1.i = r__2;
+		    cscal_(n, &q__1, &x[1], incx);
+		} else {
+		    r__1 = safmax / ur;
+		    r__2 = -safmax / ui;
+		    q__1.r = r__1, q__1.i = r__2;
+		    cscal_(n, &q__1, &x[1], incx);
+		}
+	    }
+	} else {
+	    r__1 = 1.f / ur;
+	    r__2 = -1.f / ui;
+	    q__1.r = r__1, q__1.i = r__2;
+	    cscal_(n, &q__1, &x[1], incx);
+	}
+    }
+
+    return 0;
+
+/*     End of CRSCL */
+
+} /* crscl_ */
+
diff --git a/lapack-netlib/SRC/zrscl.c b/lapack-netlib/SRC/zrscl.c
new file mode 100644
index 000000000..2264b5465
--- /dev/null
+++ b/lapack-netlib/SRC/zrscl.c
@@ -0,0 +1,735 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* > \brief \b ZDRSCL multiplies a vector by the reciprocal of a real scalar. */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download ZDRSCL + dependencies */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zdrscl.
+f"> */
+/* > [TGZ]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zdrscl.
+f"> */
+/* > [ZIP]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zdrscl.
+f"> */
+/* > [TXT]</a> */
+/* > \endhtmlonly */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE ZRSCL( N, A, X, INCX ) */
+
+/*       INTEGER            INCX, N */
+/*       COMPLEX*16         A */
+/*       COMPLEX*16         X( * ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > ZRSCL multiplies an n-element complex vector x by the complex scalar */
+/* > 1/a.  This is done without overflow or underflow as long as */
+/* > the final result x/a does not overflow or underflow. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of components of the vector x. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX*16 */
+/* >          The scalar a which is used to divide each component of x. */
+/* >          A must not be 0, or the subroutine will divide by zero. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] X */
+/* > \verbatim */
+/* >          X is COMPLEX*16 array, dimension */
+/* >                         (1+(N-1)*abs(INCX)) */
+/* >          The n-element vector x. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* >          INCX is INTEGER */
+/* >          The increment between successive values of the vector SX. */
+/* >          > 0:  SX(1) = X(1) and SX(1+(i-1)*INCX) = x(i),     1< i<= n */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16OTHERauxiliary */
+
+/*  ===================================================================== */
+/* Subroutine */ int zrscl_(integer *n, doublecomplex *a, doublecomplex *x, 
+	integer *incx)
+{
+    /* System generated locals */
+    doublereal d__1, d__2;
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublereal absi, absr;
+    extern /* Subroutine */ int zscal_(integer *, doublecomplex *, 
+	    doublecomplex *, integer *);
+    doublereal ai, ar;
+    extern doublereal dlamch_(char *);
+    doublereal ui, ov, ur, safmin, safmax;
+    extern /* Subroutine */ int zdscal_(integer *, doublereal *, 
+	    doublecomplex *, integer *), zdrscl_(integer *, doublereal *, 
+	    doublecomplex *, integer *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/* ===================================================================== */
+
+
+/*     Quick return if possible */
+
+    /* Parameter adjustments */
+    --x;
+
+    /* Function Body */
+    if (*n <= 0) {
+	return 0;
+    }
+
+/*     Get machine parameters */
+
+    safmin = dlamch_("S");
+    safmax = 1. / safmin;
+    ov = dlamch_("O");
+
+/*     Initialize constants related to A. */
+
+    ar = a->r;
+    ai = d_imag(a);
+    absr = abs(ar);
+    absi = abs(ai);
+
+    if (ai == 0.) {
+/*        If alpha is real, then we can use csrscl */
+	zdrscl_(n, &ar, &x[1], incx);
+
+    } else if (ar == 0.) {
+/*        If alpha has a zero real part, then we follow the same rules as if */
+/*        alpha were real. */
+	if (absi > safmax) {
+	    zdscal_(n, &safmin, &x[1], incx);
+	    d__1 = -safmax / ai;
+	    z__1.r = 0., z__1.i = d__1;
+	    zscal_(n, &z__1, &x[1], incx);
+	} else if (absi < safmin) {
+	    d__1 = -safmin / ai;
+	    z__1.r = 0., z__1.i = d__1;
+	    zscal_(n, &z__1, &x[1], incx);
+	    zdscal_(n, &safmax, &x[1], incx);
+	} else {
+	    d__1 = -1. / ai;
+	    z__1.r = 0., z__1.i = d__1;
+	    zscal_(n, &z__1, &x[1], incx);
+	}
+
+    } else {
+/*        The following numbers can be computed. */
+/*        They are the inverse of the real and imaginary parts of 1/alpha. */
+/*        Note that a and b are always different from zero. */
+/*        NaNs are only possible if either: */
+/*        1. alphaR or alphaI is NaN. */
+/*        2. alphaR and alphaI are both infinite, in which case it makes sense */
+/*        to propagate a NaN. */
+	ur = ar + ai * (ai / ar);
+	ui = ai + ar * (ar / ai);
+
+	if (abs(ur) < safmin || abs(ui) < safmin) {
+/*           This means that both alphaR and alphaI are very small. */
+	    d__1 = safmin / ur;
+	    d__2 = -safmin / ui;
+	    z__1.r = d__1, z__1.i = d__2;
+	    zscal_(n, &z__1, &x[1], incx);
+	    zdscal_(n, &safmax, &x[1], incx);
+	} else if (abs(ur) > safmax || abs(ui) > safmax) {
+	    if (absr > ov || absi > ov) {
+/*              This means that a and b are both Inf. No need for scaling. */
+		d__1 = 1. / ur;
+		d__2 = -1. / ui;
+		z__1.r = d__1, z__1.i = d__2;
+		zscal_(n, &z__1, &x[1], incx);
+	    } else {
+		zdscal_(n, &safmin, &x[1], incx);
+		if (abs(ur) > ov || abs(ui) > ov) {
+/*                 Infs were generated. We do proper scaling to avoid them. */
+		    if (absr >= absi) {
+/*                    ABS( UR ) <= ABS( UI ) */
+			ur = safmin * ar + safmin * (ai * (ai / ar));
+			ui = safmin * ai + ar * (safmin * ar / ai);
+		    } else {
+/*                    ABS( UR ) > ABS( UI ) */
+			ur = safmin * ar + ai * (safmin * ai / ar);
+			ui = safmin * ai + safmin * (ar * (ar / ai));
+		    }
+		    d__1 = 1. / ur;
+		    d__2 = -1. / ui;
+		    z__1.r = d__1, z__1.i = d__2;
+		    zscal_(n, &z__1, &x[1], incx);
+		} else {
+		    d__1 = safmax / ur;
+		    d__2 = -safmax / ui;
+		    z__1.r = d__1, z__1.i = d__2;
+		    zscal_(n, &z__1, &x[1], incx);
+		}
+	    }
+	} else {
+	    d__1 = 1. / ur;
+	    d__2 = -1. / ui;
+	    z__1.r = d__1, z__1.i = d__2;
+	    zscal_(n, &z__1, &x[1], incx);
+	}
+    }
+
+    return 0;
+
+/*     End of ZRSCL */
+
+} /* zrscl_ */
+

From 1e4a3a2b5e111a6a94eb53946fa92c1715c5dd5e Mon Sep 17 00:00:00 2001
From: Octavian Maghiar <octavian.maghiar@imgtec.com>
Date: Wed, 12 Jul 2023 12:55:50 +0100
Subject: [PATCH 192/718] Fixes RVV masked intrinsics for izamax/izamin kernels

---
 kernel/riscv64/izamax_rvv.c | 18 +++++++++---------
 kernel/riscv64/izamin_rvv.c | 16 ++++++++--------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c
index e61d0cbec..e43ded820 100644
--- a/kernel/riscv64/izamax_rvv.c
+++ b/kernel/riscv64/izamax_rvv.c
@@ -48,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
 #define VFIRSTM                 __riscv_vfirst_m_b16
 #define UINT_V_T                vuint64m4_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m4_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m4_mu
 #define VIDV_UINT               __riscv_vid_v_u64m4
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m4_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m4_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m4
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -77,9 +77,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint32m4_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m4_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m4_mu
 #define VIDV_UINT               __riscv_vid_v_u32m4
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m4_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m4_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m4
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -116,8 +116,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx0, vl);
-            v_max_index = VIDV_MASK_UINT(mask, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max and start_index j
             v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
@@ -138,9 +138,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx0, vl);
-            v_max_index = VIDV_MASK_UINT(mask, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, j, vl);
-        
+            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+
             //update v_max and start_index j
             v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
         }
diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c
index 297b3c99a..cc3c37c8e 100644
--- a/kernel/riscv64/izamin_rvv.c
+++ b/kernel/riscv64/izamin_rvv.c
@@ -47,9 +47,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
 #define VFIRSTM                 __riscv_vfirst_m_b16
 #define UINT_V_T                vuint64m4_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m4_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u64m4_mu
 #define VIDV_UINT               __riscv_vid_v_u64m4
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m4_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m4_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m4
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -74,9 +74,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint32m4_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m4_m
+#define VIDV_MASK_UINT          __riscv_vid_v_u32m4_mu
 #define VIDV_UINT               __riscv_vid_v_u32m4
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m4_m
+#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m4_mu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m4
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -113,8 +113,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx0, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx0, vl);
@@ -136,8 +136,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx0, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
             v_min = VFMINVV_FLOAT(v_min, vx0, vl);

From 45b2cd2fb299efdda04e539467fc3b15c2bd6d63 Mon Sep 17 00:00:00 2001
From: Andy Mroczkowski <amroczkowski@exyntechnologies.com>
Date: Wed, 12 Jul 2023 09:37:45 -0400
Subject: [PATCH 193/718] treat armv8 CMAKE_SYSTEM_PROCESSOR as arm64

The cmake scripts incorrectly treated armv8 as 32-bit arm, causing
compilation issues. This just adds 'armv8' to the arm64 condition check.
---
 cmake/system_check.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index fdc79c8ce..bf3c23d23 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -54,7 +54,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
   endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
   set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
   if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
     set(ARM64 1)
   else()

From e08743d9771c96afa6dce3d2326aab3554db6337 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 12 Jul 2023 23:02:36 +0200
Subject: [PATCH 194/718] Update to use safe scaling algorithm from
 Reference-LAPACK PR 527

---
 interface/rotg.c  | 62 ++++++++++++++++++++++++++++++++++-------------
 interface/zrotg.c | 40 ++++++++++++++++++++++++++----
 2 files changed, 80 insertions(+), 22 deletions(-)

diff --git a/interface/rotg.c b/interface/rotg.c
index 69443a5a0..3ccf4f7eb 100644
--- a/interface/rotg.c
+++ b/interface/rotg.c
@@ -1,9 +1,11 @@
 #include <math.h>
+#include <float.h>
 #include "common.h"
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
 #endif
 
+
 #ifndef CBLAS
 
 void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
@@ -14,17 +16,27 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
 
 #endif
 
+#ifdef DOUBLE
+  long double safmin = DBL_MIN;
+#else
+  long double safmin = FLT_MIN;
+#endif
+
 #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
 
   long double da = *DA;
   long double db = *DB;
   long double c;
   long double s;
-  long double r, roe, z;
+  long double r, z;
+  long double sigma, dascal,dbscal;
 
   long double ada = fabsl(da);
   long double adb = fabsl(db);
-  long double scale = ada + adb;
+  long double maxab = MAX(ada,adb);
+  long double safmax;
+  long double scale;
+
 
 #ifndef CBLAS
   PRINT_DEBUG_NAME;
@@ -32,17 +44,25 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   PRINT_DEBUG_CNAME;
 #endif
 
-  roe = db;
-  if (ada > adb) roe = da;
-
-  if (scale == ZERO) {
+  if (adb == ZERO) {
     *C = ONE;
     *S = ZERO;
-    *DA = ZERO;
     *DB = ZERO;
+  } else if (ada == ZERO) {
+    *C = ZERO;
+    *S = ONE;
+    *DA = *DB;
+    *DB = ONE;
   } else {
-    r = sqrt(da * da + db * db);
-    if (roe < 0) r = -r;
+  safmax = 1./safmin;
+  scale = MIN(MAX(safmin,maxab), safmax);
+    if (ada > adb)
+	sigma = copysign(1.,da);
+    else
+	sigma = copysign(1.,db);
+    dascal = da / scale;
+    dbscal = db / scale;
+    r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal));
     c = da / r;
     s = db / r;
     z = ONE;
@@ -65,11 +85,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   FLOAT db = *DB;
   FLOAT c  = *C;
   FLOAT s  = *S;
-  FLOAT r, roe, z;
+  FLOAT sigma;
+  FLOAT r, z;
 
   FLOAT ada = fabs(da);
   FLOAT adb = fabs(db);
-  FLOAT scale = ada + adb;
+  FLOAT maxab = MAX(ada,adb);
+  long double safmax ;
+  FLOAT scale ;
+
+  safmax = 1./safmin;
+  scale = MIN(MAX(safmin,maxab), safmax);
+
+  if (ada > adb)
+	sigma = sign(1.,da);
+    else
+	sigma = sign(1.,db);
 
 #ifndef CBLAS
   PRINT_DEBUG_NAME;
@@ -77,20 +108,17 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   PRINT_DEBUG_CNAME;
 #endif
 
-  roe = db;
-  if (ada > adb) roe = da;
 
-  if (scale == ZERO) {
+  if (adb == ZERO) {
     *C = ONE;
     *S = ZERO;
-    *DA = ZERO;
+    DA = ZERO;
     *DB = ZERO;
   } else {
     FLOAT aa = da / scale;
     FLOAT bb = db / scale;
 
-    r = scale * sqrt(aa * aa + bb * bb);
-    if (roe < 0) r = -r;
+    r = sigma * scale * sqrt(aa * aa + bb * bb);
     c = da / r;
     s = db / r;
     z = ONE;
diff --git a/interface/zrotg.c b/interface/zrotg.c
index 123f4da85..7c4e2ed08 100644
--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@@ -1,9 +1,11 @@
 #include <math.h>
+#include <float.h>
 #include "common.h"
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
 #endif
 
+
 #ifndef CBLAS
 void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
 
@@ -14,6 +16,12 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
     FLOAT *S  = (FLOAT*) VS;
 #endif /* CBLAS */
 
+#ifdef DOUBLE
+  long double safmin = DBL_MIN;
+#else
+  long double safmin = FLT_MIN;
+#endif
+
 #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
 
   long double da_r = *(DA + 0);
@@ -23,6 +31,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
   long double r;
 
   long double ada = fabsl(da_r) + fabsl(da_i);
+  long double adb = sqrt(db_r * db_r + db_i * db_i);
 
   PRINT_DEBUG_NAME;
 
@@ -38,10 +47,24 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
     *(DA + 1) = db_i;
   } else {
     long double alpha_r, alpha_i;
+    long double safmax = 1./safmin;
+    long double sigma;
+    long double maxab = MAX(ada,adb);
+    long double scale = MIN(MAX(safmin,maxab), safmax);
 
-    ada = sqrt(da_r * da_r + da_i * da_i);
 
-    r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i);
+    long double aa_r    = da_r / scale;
+    long double aa_i    = da_i / scale;
+    long double bb_r    = db_r / scale;
+    long double bb_i    = db_i / scale;
+
+    if (ada > adb)
+        sigma = copysign(1.,da_r);
+    else
+        sigma = copysign(1.,db_r);
+
+    r = sigma * scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
+
 
     alpha_r = da_r / ada;
     alpha_i = da_i / ada;
@@ -60,7 +83,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
   FLOAT r;
 
   FLOAT ada = fabs(da_r) + fabs(da_i);
-  FLOAT adb;
+  FLOAT ada = fabs(db_r) + fabs(db_i);
 
   PRINT_DEBUG_NAME;
 
@@ -75,6 +98,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
     *(DA + 0) = db_r;
     *(DA + 1) = db_i;
   } else {
+    long double safmax = 1./safmin;
     FLOAT scale;
     FLOAT aa_r, aa_i, bb_r, bb_i;
     FLOAT alpha_r, alpha_i;
@@ -108,14 +132,20 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
     	scale = (bb_i / bb_r);
     	adb = bb_r * sqrt(ONE + scale * scale);
     }
-    scale = ada + adb;
+    FLOAT maxab = MAX(ada,adb);
+    scale = MIN(MAX(safmin,maxab), safmax);
 
     aa_r    = da_r / scale;
     aa_i    = da_i / scale;
     bb_r    = db_r / scale;
     bb_i    = db_i / scale;
 
-    r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
+    if (ada > adb)
+        sigma = copysign(1.,da_r);
+    else
+        sigma = copysign(1.,db_r);
+
+    r = sigma * scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
 
     alpha_r = da_r / ada;
     alpha_i = da_i / ada;

From affeef0b9c1ef99fca1a97dd4635e4f6552f6b71 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Jul 2023 08:38:03 +0200
Subject: [PATCH 195/718] Fix gmake build not always picking the right ARM64
 arch options for clang (#4136)

* Fix gcc version checks erroneously excluding clang

* Avoid some mtune names not supported by (Apple)Clang
---
 Makefile.arm64 | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/Makefile.arm64 b/Makefile.arm64
index 064e84cbb..1b10446f7 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -69,7 +69,7 @@ endif
 # in GCC>=9
 ifeq ($(CORE), NEOVERSEN1)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
@@ -92,9 +92,14 @@ endif
 # in GCC>=10.4
 ifeq ($(CORE), NEOVERSEV1)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
-CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
+CCOMMON_OPT += -march=armv8.4-a+sve
+ifeq (1, $(ISCLANG))
+CCOMMON_OPT += -mtune=cortex-x1
+else
+CCOMMON_OPT += -mtune=neoverse-v1
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
 endif
@@ -122,8 +127,8 @@ endif
 # in GCC>=10.4
 ifeq ($(CORE), NEOVERSEN2)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
 ifneq ($(OSNAME), Darwin)
 CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 else
@@ -155,7 +160,7 @@ endif
 # Use a53 tunings because a55 is only available in GCC>=8.1
 ifeq ($(CORE), CORTEXA55)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ8), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
@@ -196,8 +201,13 @@ endif
 endif
 
 ifeq ($(CORE), THUNDERX3T110)
-ifeq ($(GCCVERSIONGTEQ10), 1)
-CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+CCOMMON_OPT += -march=armv8.3-a 
+ifeq (0, $(ISCLANG))
+CCOMMON_OPT += -mtune=thunderx3t110
+else
+CCOMMON_OPT += -mtune=thunderx2t99
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
 endif
@@ -225,9 +235,12 @@ endif
 endif
 endif
 
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 ifeq ($(CORE), EMAG8180)
-CCOMMON_OPT += -march=armv8-a -mtune=emag
+CCOMMON_OPT += -march=armv8-a
+ifeq  ($(ISCLANG), 0)
+CCOMMON_OPT += -mtune=emag
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=emag
 endif

From 0f2ce93904272255a0e0a60d8d296ab5974ef235 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Jul 2023 10:56:59 +0200
Subject: [PATCH 196/718] typo fix

---
 interface/rotg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/rotg.c b/interface/rotg.c
index 3ccf4f7eb..530dce16a 100644
--- a/interface/rotg.c
+++ b/interface/rotg.c
@@ -112,7 +112,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   if (adb == ZERO) {
     *C = ONE;
     *S = ZERO;
-    DA = ZERO;
+    *DA = ZERO;
     *DB = ZERO;
   } else {
     FLOAT aa = da / scale;

From 7c75c8b2fe3fc4b2e6c5c09f63d7ce2b8d5b5c18 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Jul 2023 21:40:12 +0200
Subject: [PATCH 197/718] fix truncated edit

---
 interface/rotg.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/interface/rotg.c b/interface/rotg.c
index 530dce16a..8d6df531a 100644
--- a/interface/rotg.c
+++ b/interface/rotg.c
@@ -112,8 +112,12 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   if (adb == ZERO) {
     *C = ONE;
     *S = ZERO;
-    *DA = ZERO;
     *DB = ZERO;
+  else if (ada == ZERO) {
+    *C = ZERO;
+    *S = ONE;
+    *DA = *DB;
+    *DB = ONE;
   } else {
     FLOAT aa = da / scale;
     FLOAT bb = db / scale;

From 9567305e4c4711c49530917419dc391c00d414b7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Jul 2023 23:21:18 +0200
Subject: [PATCH 198/718] Restore initialization of data01,data02

---
 kernel/generic/ztrsm_utcopy_1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c
index 08f85e891..5833a64ef 100644
--- a/kernel/generic/ztrsm_utcopy_1.c
+++ b/kernel/generic/ztrsm_utcopy_1.c
@@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
   BLASLONG i, ii, j, jj;
 
-  FLOAT data01, data02;
+  FLOAT data01=0.0, data02=0.0;
   FLOAT *a1;
 
   lda *= 2;

From cfa0a80664591d0243903d626aefec922cb69e6a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Jul 2023 23:23:12 +0200
Subject: [PATCH 199/718] Restore initialization of data variables

---
 kernel/generic/ztrsm_utcopy_2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c
index 387bb2532..bc495f7c6 100644
--- a/kernel/generic/ztrsm_utcopy_2.c
+++ b/kernel/generic/ztrsm_utcopy_2.c
@@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   FLOAT data05, data06, data07, data08;
   FLOAT *a1, *a2;
 
+  data01=data02=data07=data08=0.0;
   lda *= 2;
 
   jj = offset;

From 5e1103b8d7cd3ff60dc7ebddbcc8f47c9c927f98 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 13 Jul 2023 23:35:38 +0200
Subject: [PATCH 200/718] Update rotg.c

---
 interface/rotg.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/interface/rotg.c b/interface/rotg.c
index 8d6df531a..8d40d9c53 100644
--- a/interface/rotg.c
+++ b/interface/rotg.c
@@ -98,9 +98,9 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   scale = MIN(MAX(safmin,maxab), safmax);
 
   if (ada > adb)
-	sigma = sign(1.,da);
+	sigma = copysign(1.,da);
     else
-	sigma = sign(1.,db);
+	sigma = copysign(1.,db);
 
 #ifndef CBLAS
   PRINT_DEBUG_NAME;
@@ -113,7 +113,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
     *C = ONE;
     *S = ZERO;
     *DB = ZERO;
-  else if (ada == ZERO) {
+  } else if (ada == ZERO) {
     *C = ZERO;
     *S = ONE;
     *DA = *DB;

From 04cdf5efb4a13010874bed854df4264a5730819e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 14 Jul 2023 00:05:00 +0200
Subject: [PATCH 201/718] fix typo and missing declaration

---
 interface/zrotg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/zrotg.c b/interface/zrotg.c
index 7c4e2ed08..dd765f05f 100644
--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@@ -83,7 +83,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
   FLOAT r;
 
   FLOAT ada = fabs(da_r) + fabs(da_i);
-  FLOAT ada = fabs(db_r) + fabs(db_i);
+  FLOAT adb = fabs(db_r) + fabs(db_i);
 
   PRINT_DEBUG_NAME;
 
@@ -99,7 +99,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
     *(DA + 1) = db_i;
   } else {
     long double safmax = 1./safmin;
-    FLOAT scale;
+    FLOAT scale, sigma;
     FLOAT aa_r, aa_i, bb_r, bb_i;
     FLOAT alpha_r, alpha_i;
 

From 3d31191b0fc5f25946b45bf16708aacd0e61c40b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 14 Jul 2023 11:06:48 +0200
Subject: [PATCH 202/718] Work around Clang failing to disambiguate SVE
 intrinsics and add AppleClang crossbuild to MacOS/arm64 DYNAMIC_ARCH in
 AzureCI (#4140)

* Add AppleClang crossbuild to MacOS/arm64 DYNAMIC_ARCH

* add casts to disambiguate svwhilelt for clang
---
 azure-pipelines.yml                | 13 +++++++++++++
 kernel/arm64/dot_kernel_sve.c      |  4 ++--
 kernel/arm64/gemm_ncopy_sve_v1x8.c |  2 +-
 kernel/arm64/gemm_tcopy_sve_v1x8.c |  2 +-
 kernel/arm64/trmm_lncopy_sve_v1.c  |  8 ++++----
 kernel/arm64/trmm_ltcopy_sve_v1.c  |  8 ++++----
 kernel/arm64/trmm_uncopy_sve_v1.c  |  8 ++++----
 kernel/arm64/trmm_utcopy_sve_v1.c  |  8 ++++----
 kernel/arm64/trsm_lncopy_sve.c     |  8 ++++----
 9 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 65ef538e9..ff56ad00b 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -271,6 +271,19 @@ jobs:
   - script: |
      make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
 
+- job: OSX_xbuild_DYNAMIC_ARM64
+  pool:
+     vmImage: 'macOS-11'
+  variables:
+     CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+     CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
+  steps:
+  - script: |
+     ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
+     /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
+     /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
+     make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
+
 - job: ALPINE_MUSL
   pool:
      vmImage: 'ubuntu-latest'
diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c
index 8460e0d5e..9c057551e 100644
--- a/kernel/arm64/dot_kernel_sve.c
+++ b/kernel/arm64/dot_kernel_sve.c
@@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
         BLASLONG sve_width = SVE_WIDTH;
 
         for (BLASLONG i = 0; i < n; i += sve_width * 2) {
-                svbool_t pg_a = SVE_WHILELT(i, n);
-                svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
+                svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
+                svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
 
                 SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
                 SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
diff --git a/kernel/arm64/gemm_ncopy_sve_v1x8.c b/kernel/arm64/gemm_ncopy_sve_v1x8.c
index 113b1ee40..7b2a2e767 100644
--- a/kernel/arm64/gemm_ncopy_sve_v1x8.c
+++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c
@@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
     BLASLONG remaining_n = n - single_vectors_n;
     if (remaining_n) {
         a_offset_inner = a_offset;
-        svbool_t pg = SV_WHILE(0L, remaining_n);
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
         uint64_t active = remaining_n;
         uint64_t i_cnt = m >> 2;
         while (i_cnt--) {
diff --git a/kernel/arm64/gemm_tcopy_sve_v1x8.c b/kernel/arm64/gemm_tcopy_sve_v1x8.c
index 68a2cc07c..9a93b6cb7 100644
--- a/kernel/arm64/gemm_tcopy_sve_v1x8.c
+++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c
@@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     BLASLONG remaining_n = n - single_vectors_n;
     if (remaining_n) {
         a_offset_inner = a_offset;
-        svbool_t pg = SV_WHILE(0L, remaining_n);
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
         uint64_t active = remaining_n;
         uint64_t i_cnt = m >> 2;
         while (i_cnt--) {
diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c
index 918e945ac..c7f79e3fd 100644
--- a/kernel/arm64/trmm_lncopy_sve_v1.c
+++ b/kernel/arm64/trmm_lncopy_sve_v1.c
@@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
 #ifdef DOUBLE
     svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
     svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c
index b76cc56de..b3ba68973 100644
--- a/kernel/arm64/trmm_ltcopy_sve_v1.c
+++ b/kernel/arm64/trmm_ltcopy_sve_v1.c
@@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
     js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c
index 75fa163ae..a47d2096c 100644
--- a/kernel/arm64/trmm_uncopy_sve_v1.c
+++ b/kernel/arm64/trmm_uncopy_sve_v1.c
@@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
 #ifdef DOUBLE
     svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
     svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c
index 36a03242a..c5188beb4 100644
--- a/kernel/arm64/trmm_utcopy_sve_v1.c
+++ b/kernel/arm64/trmm_utcopy_sve_v1.c
@@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
     js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c
index 5a9d4194a..2895eb85d 100644
--- a/kernel/arm64/trsm_lncopy_sve.c
+++ b/kernel/arm64/trsm_lncopy_sve.c
@@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
   int64_t js = 0;
   svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
   svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif

From 4c43d1eebadf32956efc7a7deb172a6259dd1ace Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 15 Jul 2023 07:47:19 +0200
Subject: [PATCH 203/718] Fix C prototypes and LAPACKE headers for
 ?GEDMD/?GEDMDQ (#4134)

* Fix prototypes for ?GEDMD/?GEDMDQ and their LAPACKE interfaces
---
 lapack-netlib/LAPACKE/include/lapack.h        | 176 +++++++++++++-----
 lapack-netlib/LAPACKE/include/lapacke.h       | 106 +++++------
 lapack-netlib/LAPACKE/src/lapacke_cgedmd.c    |  52 ++++--
 .../LAPACKE/src/lapacke_cgedmd_work.c         |  32 ++--
 lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c   |  42 +++--
 .../LAPACKE/src/lapacke_cgedmdq_work.c        |  30 +--
 lapack-netlib/LAPACKE/src/lapacke_dgedmd.c    |  28 +--
 .../LAPACKE/src/lapacke_dgedmd_work.c         |  38 ++--
 lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c   |   2 +-
 .../LAPACKE/src/lapacke_dgedmdq_work.c        |  14 +-
 lapack-netlib/LAPACKE/src/lapacke_sgedmd.c    |  30 +--
 .../LAPACKE/src/lapacke_sgedmd_work.c         |  39 ++--
 lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c   |   2 +-
 .../LAPACKE/src/lapacke_sgedmdq_work.c        |  14 +-
 lapack-netlib/LAPACKE/src/lapacke_zgedmd.c    |  54 +++---
 .../LAPACKE/src/lapacke_zgedmd_work.c         |  36 ++--
 lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c   |  44 +++--
 .../LAPACKE/src/lapacke_zgedmdq_work.c        |  28 +--
 18 files changed, 454 insertions(+), 313 deletions(-)

diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index f510c8c80..28f8ad655 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -3323,68 +3323,107 @@ void LAPACK_zgesdd_base(
     #define LAPACK_zgesdd(...) LAPACK_zgesdd_base(__VA_ARGS__)
 #endif
 
-#define LAPACK_cgedmd LAPACK_GLOBAL(cgedmd,CGEDMD)
-void LAPACK_cgedmd(
-    char const* jobs, char const* jobz, char const* jobf,
+#define LAPACK_cgedmd_base LAPACK_GLOBAL(cgedmd,CGEDMD)
+void LAPACK_cgedmd_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobf,
     lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
     lapack_complex_float* x, lapack_int const* ldx,
-    lapack_complex_float* y, lapack_int const* ldy, lapack_int const* k,
-    lapack_complex_float* reig, lapack_complex_float* imeig,
-    lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res,
+    lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk,
+    const float* tol, lapack_int* k, lapack_complex_float* eigs,
+    lapack_complex_float* z, lapack_int const* ldz, float* res,
     lapack_complex_float* b, lapack_int const* ldb,
     lapack_complex_float* w, lapack_int const* ldw,
     lapack_complex_float* s, lapack_int const* lds,
-    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_complex_float* zwork, lapack_int const* lzwork,
+    float* work, lapack_int const* lwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info );
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__, 1, 1, 1, 1)
+#else
+    #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__)
+#endif
+
 
-#define LAPACK_dgedmd LAPACK_GLOBAL(dgedmd,DGEDMD)
-void LAPACK_dgedmd(
-    char const* jobs, char const* jobz, char const* jobf,
+#define LAPACK_dgedmd_base LAPACK_GLOBAL(dgedmd,DGEDMD)
+void LAPACK_dgedmd_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobf,
     lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
     double* x, lapack_int const* ldx,
-    double* y, lapack_int const* ldy, lapack_int const* k,
-    double* reig, double* imeig,
+    double* y, lapack_int const* ldy, lapack_int const* nrnk,
+    const double* tol, lapack_int* k, double* reig, double* imeig,
     double* z, lapack_int const* ldz, double* res,
     double* b, lapack_int const* ldb,
     double* w, lapack_int const* ldw,
     double* s, lapack_int const* lds,
     double* work, lapack_int const* lwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info );
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__, 1, 1, 1, 1)
+#else
+    #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__)
+#endif
 
-#define LAPACK_sgedmd LAPACK_GLOBAL(sgedmd,SGEDMD)
-void LAPACK_sgedmd(
-    char const* jobs, char const* jobz, char const* jobf,
+#define LAPACK_sgedmd_base LAPACK_GLOBAL(sgedmd,SGEDMD)
+void LAPACK_sgedmd_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobf,
     lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
     float* x, lapack_int const* ldx,
-    float* y, lapack_int const* ldy, lapack_int const* k,
-    float* reig, float* imeig,
+    float* y, lapack_int const* ldy, lapack_int const* nrnk,
+    const float* tol, lapack_int* k, float* reig, float *imeig,
     float* z, lapack_int const* ldz, float* res,
     float* b, lapack_int const* ldb,
     float* w, lapack_int const* ldw,
     float* s, lapack_int const* lds,
     float* work, lapack_int const* lwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info );
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_sgedmd(...) LAPACK_sgedmd_base(__VA_ARGS__, 1, 1, 1, 1)
+#else
+    #define LAPACK_sgedmd(...) LAPACK_sgedmd_base(__VA_ARGS__)
+#endif
 
-#define LAPACK_zgedmd LAPACK_GLOBAL(zgedmd,ZGEDMD)
-void LAPACK_zgedmd(
-    char const* jobs, char const* jobz, char const* jobf,
+#define LAPACK_zgedmd_base LAPACK_GLOBAL(zgedmd,ZGEDMD)
+void LAPACK_zgedmd_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobf,
     lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
     lapack_complex_double* x, lapack_int const* ldx,
-    lapack_complex_double* y, lapack_int const* ldy, lapack_int const* k,
-    lapack_complex_double* reig, lapack_complex_double* imeig,
-    lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res,
+    lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk,
+    const double* tol, lapack_int *k, lapack_complex_double* eigs,
+    lapack_complex_double* z, lapack_int const* ldz, double* res,
     lapack_complex_double* b, lapack_int const* ldb,
     lapack_complex_double* w, lapack_int const* ldw,
     lapack_complex_double* s, lapack_int const* lds,
-    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_complex_double* zwork, lapack_int const* lzwork,
+    double* rwork, lapack_int const* lrwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info );
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__, 1, 1, 1, 1)
+#else
+    #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__)
+#endif
 
-#define LAPACK_cgedmdq LAPACK_GLOBAL(cgedmdq,CGEDMDQ)
-void LAPACK_cgedmdq(
+#define LAPACK_cgedmdq_base LAPACK_GLOBAL(cgedmdq,CGEDMDQ)
+void LAPACK_cgedmdq_base(
     char const* jobs, char const* jobz, char const* jobr, char const* jobq,
     char const* jobt, char const* jobf, lapack_int const* whtsvd,
     lapack_int const* m, lapack_int const* n,
@@ -3392,35 +3431,54 @@ void LAPACK_cgedmdq(
     lapack_complex_float* x, lapack_int const* ldx,
     lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk,
     float const* tol, lapack_int const* k,
-    lapack_complex_float* reig, lapack_complex_float* imeig,
-    lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res,
+    lapack_complex_float* eigs,
+    lapack_complex_float* z, lapack_int const* ldz, float* res,
     lapack_complex_float* b, lapack_int const* ldb,
     lapack_complex_float* v, lapack_int const* ldv,
     lapack_complex_float* s, lapack_int const* lds,
-    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_complex_float* zwork, lapack_int const* lzwork,
+    float* work, lapack_int const* lwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info );
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1)
+#else
+    #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__)
+#endif
 
-#define LAPACK_dgedmdq LAPACK_GLOBAL(dgedmdq,DGEDMDQ)
-void LAPACK_dgedmdq(
+#define LAPACK_dgedmdq_base LAPACK_GLOBAL(dgedmdq,DGEDMDQ)
+void LAPACK_dgedmdq_base(
     char const* jobs, char const* jobz, char const* jobr, char const* jobq,
     char const* jobt, char const* jobf, lapack_int const* whtsvd,
     lapack_int const* m, lapack_int const* n,
     double* f, lapack_int const* ldf,
     double* x, lapack_int const* ldx,
     double* y, lapack_int const* ldy, lapack_int const* nrnk,
-    double const* tol, lapack_int const* k,
-    double* reig, double* imeig,
+    double const* tol, lapack_int* k,
+    double* reig, double *imeig,
     double* z, lapack_int const* ldz, double* res,
     double* b, lapack_int const* ldb,
     double* v, lapack_int const* ldv,
     double* s, lapack_int const* lds,
     double* work, lapack_int const* lwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info );
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1)
+#else
+    #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__)
+#endif
 
-#define LAPACK_sgedmdq LAPACK_GLOBAL(sgedmdq,SGEDMDQ)
-void LAPACK_sgedmdq(
+#define LAPACK_sgedmdq_base LAPACK_GLOBAL(sgedmdq,SGEDMDQ)
+void LAPACK_sgedmdq_base(
     char const* jobs, char const* jobz, char const* jobr, char const* jobq,
     char const* jobt, char const* jobf, lapack_int const* whtsvd,
     lapack_int const* m, lapack_int const* n,
@@ -3435,10 +3493,19 @@ void LAPACK_sgedmdq(
     float* s, lapack_int const* lds,
     float* work, lapack_int const* lwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info );
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1)
+#else
+    #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__)
+#endif
 
-#define LAPACK_zgedmdq LAPACK_GLOBAL(zgedmdq,ZGEDMDQ)
-void LAPACK_zgedmdq(
+#define LAPACK_zgedmdq_base LAPACK_GLOBAL(zgedmdq,ZGEDMDQ)
+void LAPACK_zgedmdq_base(
     char const* jobs, char const* jobz, char const* jobr, char const* jobq,
     char const* jobt, char const* jobf, lapack_int const* whtsvd,
     lapack_int const* m, lapack_int const* n,
@@ -3446,14 +3513,25 @@ void LAPACK_zgedmdq(
     lapack_complex_double* x, lapack_int const* ldx,
     lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk,
     double const* tol, lapack_int const* k,
-    lapack_complex_double* reig, lapack_complex_double* imeig,
-    lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res,
+    lapack_complex_double* eigs,
+    lapack_complex_double* z, lapack_int const* ldz, double* res,
     lapack_complex_double* b, lapack_int const* ldb,
     lapack_complex_double* v, lapack_int const* ldv,
     lapack_complex_double* s, lapack_int const* lds,
-    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_complex_double* zwork, lapack_int const* lzwork,
+    double* work, lapack_int const* lwork,
     lapack_int* iwork, lapack_int const* liwork,
-    lapack_int* info );
+    lapack_int* info
+
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1)
+#else
+    #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__)
+#endif
 
 #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
 lapack_int LAPACK_cgesv(
@@ -21649,7 +21727,7 @@ void LAPACK_ztrevc_base(
 #endif
 );
 #ifdef LAPACK_FORTRAN_STRLEN_END
-    #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, 1, 1)
+    #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, (size_t)1, 1)
 #else
     #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__)
 #endif
diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 9a9ab4753..377e2a6bb 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -5713,55 +5713,59 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m,
                                 double* rwork, lapack_int* iwork );
 
 lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz,
-                                char jobf, lapack_int whtsvd, lapack_int m,
-                                lapack_int n, float* x, lapack_int ldx,
-                                float* y, lapack_int ldy, lapack_int k,
-                                float* reig, float* imeig, float* z,
-                                lapack_int ldz, float* res, float* b,
-                                lapack_int ldb, float* w, lapack_int ldw,
-                                float* s, lapack_int lds, float* work,
-                                lapack_int lwork, lapack_int* iwork,
-                                lapack_int liwork );
+                                char jobr, char jobf, lapack_int whtsvd,
+				lapack_int m, lapack_int n, float* x,
+				lapack_int ldx, float* y, lapack_int ldy,
+				lapack_int nrnk, float* tol, lapack_int k,
+			       	float* reig, float* imeig,
+				float* z, lapack_int ldz, float* res,
+				float* b, lapack_int ldb, float* w,
+				lapack_int ldw, float* s, lapack_int lds,
+				float* work, lapack_int lwork,
+				lapack_int* iwork, lapack_int liwork );
 
 lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz,
-                                char jobf, lapack_int whtsvd, lapack_int m,
-                                lapack_int n, double* x, lapack_int ldx,
-                                double* y, lapack_int ldy, lapack_int k,
-                                double* reig, double* imeig, double* z,
-                                lapack_int ldz, double* res, double* b,
-                                lapack_int ldb, double* w, lapack_int ldw,
-                                double* s, lapack_int lds, double* work,
-                                lapack_int lwork, lapack_int* iwork,
-                                lapack_int liwork );
+                                char jobr, char jobf, lapack_int whtsvd,
+				lapack_int m, lapack_int n, double* x,
+				lapack_int ldx, double* y, lapack_int ldy,
+				lapack_int nrnk, double* tol, lapack_int k,
+			       	double* reig, double *imeig,
+				double* z, lapack_int ldz, double* res,
+				double* b, lapack_int ldb, double* w,
+				lapack_int ldw, double* s, lapack_int lds,
+				double* work, lapack_int lwork,
+				lapack_int* iwork, lapack_int liwork );
 
 lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz,
-                                char jobf, lapack_int whtsvd, lapack_int m,
-                                lapack_int n, lapack_complex_float* x,
-                                lapack_int ldx, lapack_complex_float* y,
-                                lapack_int ldy, lapack_int k,
-                                lapack_complex_float* reig,
-                                lapack_complex_float* imeig,
+                                char jobr, char jobf, lapack_int whtsvd,
+				lapack_int m, lapack_int n,
+				lapack_complex_float* x, lapack_int ldx,
+				lapack_complex_float* y, lapack_int ldy,
+				lapack_int nrnk, float* tol, lapack_int k,
+			       	lapack_complex_float* eigs,
                                 lapack_complex_float* z, lapack_int ldz,
-                                lapack_complex_float* res,
+                                float* res,
                                 lapack_complex_float* b, lapack_int ldb,
                                 lapack_complex_float* w, lapack_int ldw,
                                 lapack_complex_float* s, lapack_int lds,
-                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_complex_float* zwork, lapack_int lzwork,
+                                float* work, lapack_int lwork,
                                 lapack_int* iwork, lapack_int liwork );
 
 lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz,
-                                char jobf, lapack_int whtsvd, lapack_int m,
-                                lapack_int n, lapack_complex_double* x,
-                                lapack_int ldx, lapack_complex_double* y,
-                                lapack_int ldy, lapack_int k,
-                                lapack_complex_double* reig,
-                                lapack_complex_double* imeig,
+                                char jobr, char jobf, lapack_int whtsvd,
+				lapack_int m, lapack_int n,
+				lapack_complex_double* x, lapack_int ldx,
+				lapack_complex_double* y, lapack_int ldy,
+				lapack_int nrnk, double* tol, lapack_int k, 
+				lapack_complex_double* eigs,
                                 lapack_complex_double* z, lapack_int ldz,
-                                lapack_complex_double* res,
+                                double* res,
                                 lapack_complex_double* b, lapack_int ldb,
                                 lapack_complex_double* w, lapack_int ldw,
                                 lapack_complex_double* s, lapack_int lds,
-                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_complex_double* zwork, lapack_int lzwork,
+                                double* work, lapack_int lwork,
                                 lapack_int* iwork, lapack_int liwork );
 
 lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
@@ -5769,8 +5773,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  lapack_int whtsvd, lapack_int m, lapack_int n,
                                  float* f, lapack_int ldf, float* x,
                                  lapack_int ldx, float* y, lapack_int ldy,
-                                 lapack_int nrnk, float tol, lapack_int k,
-                                 float* reig, float* imeig, float* z,
+                                 lapack_int nrnk, float* tol, lapack_int k,
+				 float* reig, float *imeig, float* z,
                                  lapack_int ldz, float* res, float* b,
                                  lapack_int ldb, float* v, lapack_int ldv,
                                  float* s, lapack_int lds, float* work,
@@ -5782,8 +5786,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  lapack_int whtsvd, lapack_int m, lapack_int n,
                                  double* f, lapack_int ldf, double* x,
                                  lapack_int ldx, double* y, lapack_int ldy,
-                                 lapack_int nrnk, double tol, lapack_int k,
-                                 double* reig, double* imeig, double* z,
+                                 lapack_int nrnk, double* tol, lapack_int k,
+				 double* reig, double* imeig, double* z,
                                  lapack_int ldz, double* res, double* b,
                                  lapack_int ldb, double* v, lapack_int ldv,
                                  double* s, lapack_int lds, double* work,
@@ -5796,17 +5800,16 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  lapack_complex_float* f, lapack_int ldf,
                                  lapack_complex_float* x, lapack_int ldx,
                                  lapack_complex_float* y, lapack_int ldy,
-                                 lapack_int nrnk, float tol, lapack_int k,
-                                 lapack_complex_float* reig,
-                                 lapack_complex_float* imeig,
+                                 lapack_int nrnk, float* tol, lapack_int k,
+                                 lapack_complex_float* eigs,
                                  lapack_complex_float* z, lapack_int ldz,
-                                 lapack_complex_float* res,
+                                 float* res,
                                  lapack_complex_float* b, lapack_int ldb,
                                  lapack_complex_float* v, lapack_int ldv,
                                  lapack_complex_float* s, lapack_int lds,
-                                 lapack_complex_float* work, lapack_int lwork,
-                                 lapack_int* iwork,
-                                 lapack_int liwork );
+                                 lapack_complex_float* zwork, lapack_int lzwork,
+                                 float* work, lapack_int lwork,
+                                 lapack_int* iwork, lapack_int liwork);
 
 lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  char jobr, char jobq, char jobt, char jobf,
@@ -5814,17 +5817,16 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  lapack_complex_double* f, lapack_int ldf,
                                  lapack_complex_double* x, lapack_int ldx,
                                  lapack_complex_double* y, lapack_int ldy,
-                                 lapack_int nrnk, double tol, lapack_int k,
-                                 lapack_complex_double* reig,
-                                 lapack_complex_double* imeig,
+                                 lapack_int nrnk, double* tol, lapack_int k,
+                                 lapack_complex_double* eigs,
                                  lapack_complex_double* z, lapack_int ldz,
-                                 lapack_complex_double* res,
+                                 double* res,
                                  lapack_complex_double* b, lapack_int ldb,
                                  lapack_complex_double* v, lapack_int ldv,
                                  lapack_complex_double* s, lapack_int lds,
-                                 lapack_complex_double* work, lapack_int lwork,
-                                 lapack_int* iwork,
-                                 lapack_int liwork );
+                                 lapack_complex_double* zwork, lapack_int lzwork,
+                                 double* work, lapack_int lwork,
+                                 lapack_int* iwork, lapack_int liwork);
 
 lapack_int LAPACKE_sgesv_work( int matrix_layout, lapack_int n, lapack_int nrhs,
                                float* a, lapack_int lda, lapack_int* ipiv,
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c
index a269b0daf..6c77e199e 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c
@@ -32,22 +32,26 @@
 
 #include "lapacke_utils.h"
 
-lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobf,
-                           lapack_int whtsvd, lapack_int m, lapack_int n,
-                           lapack_complex_float* x, lapack_int ldx,
-                           lapack_complex_float* y, lapack_int ldy, lapack_int k,
-                           lapack_complex_float* reig, lapack_complex_float* imeig,
+lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobr,
+			   char jobf, lapack_int whtsvd, lapack_int m,
+		 	   lapack_int n, lapack_complex_float* x,
+			   lapack_int ldx, lapack_complex_float* y,
+			   lapack_int ldy, lapack_int nrnk, float* tol,
+			   lapack_int k, lapack_complex_float* eigs,
                            lapack_complex_float* z, lapack_int ldz,
-                           lapack_complex_float* res, lapack_complex_float* b,
+                           float* res, lapack_complex_float* b,
                            lapack_int ldb, lapack_complex_float* w,
                            lapack_int ldw, lapack_complex_float* s, lapack_int lds)
 {
     lapack_int info = 0;
     lapack_int lwork = -1;
     lapack_int liwork = -1;
-    lapack_complex_float* work = NULL;
+    lapack_int lzwork = -1;
+    lapack_complex_float* zwork = NULL;
+    float* work = NULL;
     lapack_int* iwork = NULL;
-    lapack_complex_float work_query;
+    lapack_complex_float zwork_query;
+    float work_query;
     lapack_int iwork_query;
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
         LAPACKE_xerbla( "LAPACKE_cgedmd", -1 );
@@ -77,36 +81,44 @@ lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobf,
     }
 #endif
     /* Query optimal working array(s) size */
-    info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
-                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
-                                b, ldb, w, ldw, s, lds, &work_query, lwork,
-                                &iwork_query, liwork );
+    info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+				m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz,
+				res, b, ldb, w, ldw, s, lds, &zwork_query, 
+				lzwork, &work_query, lwork, &iwork_query, liwork );
 
     if( info != 0 ) {
         goto exit_level_0;
     }
+    lzwork  = LAPACK_C2INT( zwork_query );
     lwork  = LAPACK_C2INT( work_query );
     liwork = iwork_query;
     /* Allocate memory for work arrays */
-    work  = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
-    if( work == NULL ) {
+    zwork  = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork );
+    if( zwork == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
         goto exit_level_0;
     }
+    work  = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
     if( iwork == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
-        goto exit_level_1;
+        goto exit_level_2;
     }
     /* Call middle-level interface */
-    info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
-                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
-                                b, ldb, w, ldw, s, lds, work, lwork, iwork,
-                                liwork );
+    info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+				m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz,
+				res, b, ldb, w, ldw, s, lds, zwork, lzwork,
+				work, lwork, iwork, liwork );
     /* Release memory and exit */
     LAPACKE_free( iwork );
-exit_level_1:
+exit_level_2:
     LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
 exit_level_0:
     if( info == LAPACK_WORK_MEMORY_ERROR ) {
         LAPACKE_xerbla( "LAPACKE_cgedmd", info );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c
index 534934efb..08d8b91f5 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c
@@ -33,23 +33,25 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz,
-                                char jobf, lapack_int whtsvd, lapack_int m,
+                                char jobr, char jobf, lapack_int whtsvd, lapack_int m,
                                 lapack_int n, lapack_complex_float* x, lapack_int ldx,
-                                lapack_complex_float* y, lapack_int ldy, lapack_int k,
-                                lapack_complex_float* reig, lapack_complex_float* imeig,
+                                lapack_complex_float* y, lapack_int ldy, lapack_int nrnk,
+                                float* tol, lapack_int k, lapack_complex_float* eigs,
                                 lapack_complex_float* z, lapack_int ldz,
-                                lapack_complex_float* res, lapack_complex_float* b,
+                                float* res, lapack_complex_float* b,
                                 lapack_int ldb, lapack_complex_float* w,
                                 lapack_int ldw, lapack_complex_float* s, lapack_int lds,
-                                lapack_complex_float* work, lapack_int lwork,
+                                lapack_complex_float* zwork, lapack_int lzwork,
+                                float* work, lapack_int lwork,
                                 lapack_int* iwork, lapack_int liwork )
 {
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
-                       &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
-                       work, &lwork, iwork, &liwork, &info );
+        LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y,
+ 	               &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, &ldw,
+		       s, &lds, zwork, &lzwork, work, &lwork, iwork, &liwork,
+		       &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -99,9 +101,10 @@ lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
-                           &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
-                           work, &lwork, iwork, &liwork, &info );
+            LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x,
+			   &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b,
+			   &ldb, w, &ldw, s, &lds, zwork, &lzwork, 
+			   work, &lwork, iwork, &liwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -143,9 +146,10 @@ lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz,
         LAPACKE_cge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
         LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t,
-                       &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t,
-                       w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t,
+		       &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t,
+		       res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork,
+		       &lzwork, work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c
index 60e83729b..b0b258f97 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c
@@ -37,20 +37,22 @@ lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
                             lapack_int m, lapack_int n, lapack_complex_float* f,
                             lapack_int ldf, lapack_complex_float* x,
                             lapack_int ldx, lapack_complex_float* y,
-                            lapack_int ldy, lapack_int nrnk, float tol,
-                            lapack_int k, lapack_complex_float* reig,
-                            lapack_complex_float* imeig,
+                            lapack_int ldy, lapack_int nrnk, float* tol,
+                            lapack_int k, lapack_complex_float* eigs,
                             lapack_complex_float* z, lapack_int ldz,
-                            lapack_complex_float* res, lapack_complex_float* b,
+                            float* res, lapack_complex_float* b,
                             lapack_int ldb, lapack_complex_float* v,
                             lapack_int ldv, lapack_complex_float* s, lapack_int lds)
 {
     lapack_int info = 0;
     lapack_int lwork = -1;
     lapack_int liwork = -1;
-    lapack_complex_float* work = NULL;
+    lapack_int lzwork = -1;
+    lapack_complex_float* zwork = NULL;
+    float* work = NULL;
     lapack_int* iwork = NULL;
-    lapack_complex_float work_query;
+    lapack_complex_float zwork_query;
+    float work_query;
     lapack_int iwork_query;
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
         LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 );
@@ -85,36 +87,44 @@ lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
     /* Query optimal working array(s) size */
     info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
                                  jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
-                                 nrnk, tol, k, reig, imeig, z, ldz, res,
-                                 b, ldb, v, ldv, s, lds, &work_query, lwork,
-                                 &iwork_query, liwork );
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &zwork_query, lzwork,
+				 &work_query, lwork, &iwork_query, liwork );
 
     if( info != 0 ) {
         goto exit_level_0;
     }
+    lzwork  = LAPACK_C2INT( zwork_query );
     lwork  = LAPACK_C2INT( work_query );
     liwork = iwork_query;
     /* Allocate memory for work arrays */
-    work  = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
-    if( work == NULL ) {
+    zwork  = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork );
+    if( zwork == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
         goto exit_level_0;
     }
+    work  = (float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
     if( iwork == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
-        goto exit_level_1;
+        goto exit_level_2;
     }
     /* Call middle-level interface */
     info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
                                  jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
-                                 nrnk, tol, k, reig, imeig, z, ldz, res,
-                                 b, ldb, v, ldv, s, lds, work, lwork, iwork,
-                                 liwork );
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, zwork, lzwork,
+				 work, lwork, iwork, liwork );
     /* Release memory and exit */
     LAPACKE_free( iwork );
-exit_level_1:
+exit_level_2:
     LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
 exit_level_0:
     if( info == LAPACK_WORK_MEMORY_ERROR ) {
         LAPACKE_xerbla( "LAPACKE_cgedmdq", info );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
index 5bdbd3f56..05287c1bc 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
@@ -38,15 +38,15 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  lapack_complex_float* f, lapack_int ldf,
                                  lapack_complex_float* x, lapack_int ldx,
                                  lapack_complex_float* y, lapack_int ldy,
-                                 lapack_int nrnk, float tol, lapack_int k,
-                                 lapack_complex_float* reig,
-                                 lapack_complex_float* imeig,
+                                 lapack_int nrnk, float* tol, lapack_int k,
+                                 lapack_complex_float* eigs,
                                  lapack_complex_float* z,
-                                 lapack_int ldz, lapack_complex_float* res,
+                                 lapack_int ldz, float* res,
                                  lapack_complex_float* b,
                                  lapack_int ldb, lapack_complex_float* v,
                                  lapack_int ldv, lapack_complex_float* s,
-                                 lapack_int lds, lapack_complex_float* work,
+                                 lapack_int lds, lapack_complex_float *zwork,
+				 lapack_int lzwork, float* work,
                                  lapack_int lwork, lapack_int* iwork,
                                  lapack_int liwork )
 {
@@ -54,9 +54,9 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
-                        work, &lwork, iwork, &liwork, &info );
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -112,11 +112,11 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
             return info;
         }
         /* Query optimal working array(s) size if requested */
-        if( lwork == -1 || liwork == -1 ) {
+        if( lzwork == -1 || lwork == -1 || liwork == -1 ) {
             LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                            imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
-                            work, &lwork, iwork, &liwork, &info );
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                            z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -165,9 +165,9 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
         LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
         /* Call LAPACK function and adjust info */
         LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
-                        work, &lwork, iwork, &liwork, &info );
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
index 246d7f649..6802378da 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
@@ -32,11 +32,13 @@
 
 #include "lapacke_utils.h"
 
-lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf,
-                           lapack_int whtsvd, lapack_int m, lapack_int n,
-                           double* x, lapack_int ldx, double* y, lapack_int ldy,
-                           lapack_int k, double* reig, double* imeig, double* z,
-                           lapack_int ldz, double* res, double* b, lapack_int ldb,
+lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobr,
+			   char jobf, lapack_int whtsvd, lapack_int m,
+			   lapack_int n, double* x, lapack_int ldx, double* y,
+			   lapack_int ldy, lapack_int nrnk, double* tol, 
+			   lapack_int k, double* reig, double* imeig,
+			   double* z, lapack_int ldz,
+			   double* res, double* b, lapack_int ldb,
                            double* w, lapack_int ldw, double* s, lapack_int lds)
 {
     lapack_int info = 0;
@@ -74,10 +76,10 @@ lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf,
     }
 #endif
     /* Query optimal working array(s) size */
-    info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
-                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
-                                b, ldb, w, ldw, s, lds, &work_query, lwork,
-                                &iwork_query, liwork );
+    info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+				m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz,
+				res, b, ldb, w, ldw, s, lds, &work_query,
+				lwork, &iwork_query, liwork );
 
     if( info != 0 ) {
         goto exit_level_0;
@@ -96,10 +98,10 @@ lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf,
         goto exit_level_1;
     }
     /* Call middle-level interface */
-    info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
-                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
-                                b, ldb, w, ldw, s, lds, work, lwork, iwork,
-                                liwork );
+    info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+				m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz,
+				res, b, ldb, w, ldw, s, lds, work, lwork,
+				iwork, liwork );
     /* Release memory and exit */
     LAPACKE_free( iwork );
 exit_level_1:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c
index 4d1169de9..987709a1b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c
@@ -33,22 +33,23 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz,
-                                char jobf, lapack_int whtsvd, lapack_int m,
-                                lapack_int n, double* x, lapack_int ldx,
-                                double* y, lapack_int ldy, lapack_int k,
-                                double* reig, double* imeig, double* z,
-                                lapack_int ldz, double* res, double* b,
-                                lapack_int ldb, double* w, lapack_int ldw,
-                                double* s, lapack_int lds, double* work,
-                                lapack_int lwork, lapack_int* iwork,
-                                lapack_int liwork )
+                                char jobr, char jobf, lapack_int whtsvd,
+				lapack_int m, lapack_int n, double* x,
+				lapack_int ldx, double* y, lapack_int ldy,
+				lapack_int nrnk, double* tol, lapack_int k,
+			       	double* reig, double* imeig,
+				double* z, lapack_int ldz, double* res,
+				double* b, lapack_int ldb, double* w,
+				lapack_int ldw, double* s, lapack_int lds,
+				double* work, lapack_int lwork,
+				lapack_int* iwork, lapack_int liwork )
 {
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
-                       &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
-                       work, &lwork, iwork, &liwork, &info );
+        LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y,
+		       &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw,
+		       s, &lds, work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -98,9 +99,9 @@ lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
-                           &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
-                           work, &lwork, iwork, &liwork, &info );
+            LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx,
+			   y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w,
+			   &ldw, s, &lds, work, &lwork, iwork, &liwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -142,9 +143,10 @@ lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz,
         LAPACKE_dge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
         LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t,
-                       &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t,
-                       w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, &ldx_t,
+		       y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, res, b_t,
+		       &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, &lwork,
+		       iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c
index f3d621ba9..5c3c39308 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c
@@ -36,7 +36,7 @@ lapack_int LAPACKE_dgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
                             char jobq, char jobt, char jobf, lapack_int whtsvd,
                             lapack_int m, lapack_int n, double* f, lapack_int ldf,
                             double* x, lapack_int ldx, double* y, lapack_int ldy,
-                            lapack_int nrnk, double tol, lapack_int k,
+                            lapack_int nrnk, double* tol, lapack_int k,
                             double* reig, double* imeig, double* z,
                             lapack_int ldz, double* res, double* b, lapack_int ldb,
                             double* v, lapack_int ldv, double* s, lapack_int lds)
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c
index 51b2a66d8..149e6d24f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c
@@ -37,7 +37,7 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  lapack_int whtsvd, lapack_int m, lapack_int n,
                                  double* f, lapack_int ldf, double* x,
                                  lapack_int ldx, double* y, lapack_int ldy,
-                                 lapack_int nrnk, double tol, lapack_int k,
+                                 lapack_int nrnk, double* tol, lapack_int k,
                                  double* reig, double* imeig, double* z,
                                  lapack_int ldz, double* res, double* b,
                                  lapack_int ldb, double* v, lapack_int ldv,
@@ -49,8 +49,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
                         work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
@@ -109,8 +109,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 || liwork == -1 ) {
             LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                            imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                            z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
                             work, &lwork, iwork, &liwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
@@ -160,8 +160,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
         LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
         /* Call LAPACK function and adjust info */
         LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
                         work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
index 879631b1d..6865fcf65 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
@@ -32,12 +32,14 @@
 
 #include "lapacke_utils.h"
 
-lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf,
-                           lapack_int whtsvd, lapack_int m, lapack_int n,
-                           float* x, lapack_int ldx, float* y, lapack_int ldy,
-                           lapack_int k, float* reig, float* imeig, float* z,
-                           lapack_int ldz, float* res, float* b, lapack_int ldb,
-                           float* w, lapack_int ldw, float* s, lapack_int lds)
+lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobr,
+			   char jobf, lapack_int whtsvd, lapack_int m,
+			   lapack_int n, float* x, lapack_int ldx, float* y,
+			   lapack_int ldy, lapack_int nrnk, float* tol, 
+			   lapack_int k, float* reig, float* imeig,
+			   float* z, lapack_int ldz, float* res,
+			   float* b, lapack_int ldb, float* w, lapack_int ldw,
+			   float* s, lapack_int lds)
 {
     lapack_int info = 0;
     lapack_int lwork = -1;
@@ -74,10 +76,10 @@ lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf,
     }
 #endif
     /* Query optimal working array(s) size */
-    info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
-                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
-                                b, ldb, w, ldw, s, lds, &work_query, lwork,
-                                &iwork_query, liwork );
+    info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+				m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz,
+				res, b, ldb, w, ldw, s, lds, &work_query,
+				lwork, &iwork_query, liwork );
 
     if( info != 0 ) {
         goto exit_level_0;
@@ -96,10 +98,10 @@ lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf,
         goto exit_level_1;
     }
     /* Call middle-level interface */
-    info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
-                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
-                                b, ldb, w, ldw, s, lds, work, lwork, iwork,
-                                liwork );
+    info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+				m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz,
+				res, b, ldb, w, ldw, s, lds, work, lwork,
+				iwork, liwork );
     /* Release memory and exit */
     LAPACKE_free( iwork );
 exit_level_1:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c
index 762a9b271..5b24152da 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c
@@ -33,22 +33,23 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz,
-                                char jobf, lapack_int whtsvd, lapack_int m,
-                                lapack_int n, float* x, lapack_int ldx,
-                                float* y, lapack_int ldy, lapack_int k,
-                                float* reig, float* imeig, float* z,
-                                lapack_int ldz, float* res, float* b,
-                                lapack_int ldb, float* w, lapack_int ldw,
-                                float* s, lapack_int lds, float* work,
-                                lapack_int lwork, lapack_int* iwork,
-                                lapack_int liwork )
+				char jobr, char jobf, lapack_int whtsvd,
+				lapack_int m, lapack_int n, float* x,
+				lapack_int ldx, float* y, lapack_int ldy,
+				lapack_int nrnk, float* tol, lapack_int k, 
+				float* reig, float* imeig,
+				float* z, lapack_int ldz, float* res,
+				float* b, lapack_int ldb, float* w,
+				lapack_int ldw, float* s, lapack_int lds,
+				float* work, lapack_int lwork,
+				lapack_int* iwork, lapack_int liwork )
 {
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
-                       &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
-                       work, &lwork, iwork, &liwork, &info );
+        LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y,
+		       &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw,
+		       s, &lds, work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -98,9 +99,10 @@ lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
-                           &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
-                           work, &lwork, iwork, &liwork, &info );
+            LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x,
+			   &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b,
+			   &ldb, w, &ldw, s, &lds, work, &lwork, iwork,
+			   &liwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -142,9 +144,10 @@ lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz,
         LAPACKE_sge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
         LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t,
-                       &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t,
-                       w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t,
+	    	       &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t,
+		       res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, work,
+		       &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c
index e202d7fbd..e65c2094f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c
@@ -36,7 +36,7 @@ lapack_int LAPACKE_sgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
                             char jobq, char jobt, char jobf, lapack_int whtsvd,
                             lapack_int m, lapack_int n, float* f, lapack_int ldf,
                             float* x, lapack_int ldx, float* y, lapack_int ldy,
-                            lapack_int nrnk, float tol, lapack_int k,
+                            lapack_int nrnk, float* tol, lapack_int k,
                             float* reig, float* imeig, float* z,
                             lapack_int ldz, float* res, float* b, lapack_int ldb,
                             float* v, lapack_int ldv, float* s, lapack_int lds)
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c
index 9039898d2..e1c1f5c98 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c
@@ -37,7 +37,7 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  lapack_int whtsvd, lapack_int m, lapack_int n,
                                  float* f, lapack_int ldf, float* x,
                                  lapack_int ldx, float* y, lapack_int ldy,
-                                 lapack_int nrnk, float tol, lapack_int k,
+                                 lapack_int nrnk, float* tol, lapack_int k,
                                  float* reig, float* imeig, float* z,
                                  lapack_int ldz, float* res, float* b,
                                  lapack_int ldb, float* v, lapack_int ldv,
@@ -49,8 +49,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
                         work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
@@ -109,8 +109,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 || liwork == -1 ) {
             LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                            imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                            z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
                             work, &lwork, iwork, &liwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
@@ -160,8 +160,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
         LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
         /* Call LAPACK function and adjust info */
         LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
                         work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c
index f3f421c54..e4ea4fe10 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c
@@ -32,24 +32,28 @@
 
 #include "lapacke_utils.h"
 
-lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobf,
-                           lapack_int whtsvd, lapack_int m, lapack_int n,
-                           lapack_complex_double* x, lapack_int ldx,
-                           lapack_complex_double* y, lapack_int ldy,
-                           lapack_int k, lapack_complex_double* reig,
-                           lapack_complex_double* imeig, lapack_complex_double* z,
-                           lapack_int ldz, lapack_complex_double* res,
+lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobr,
+			   char jobf, lapack_int whtsvd, lapack_int m,
+			   lapack_int n, lapack_complex_double* x,
+			   lapack_int ldx, lapack_complex_double* y,
+			   lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k,
+                           lapack_complex_double* eigs, lapack_complex_double* z,
+                           lapack_int ldz, double* res,
                            lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* zw, lapack_int lzw,
                            lapack_complex_double* w, lapack_int ldw,
                            lapack_complex_double* s, lapack_int lds)
 {
     lapack_int info = 0;
     lapack_int lwork = -1;
     lapack_int liwork = -1;
-    lapack_complex_double* work = NULL;
+    lapack_int lzwork = -1;
+    lapack_complex_double* zwork = NULL;
+    double* work = NULL;
     lapack_int* iwork = NULL;
-    lapack_complex_double work_query;
+    double work_query;
     lapack_int iwork_query;
+    lapack_complex_double zwork_query;
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
         LAPACKE_xerbla( "LAPACKE_zgedmd", -1 );
         return -1;
@@ -78,36 +82,44 @@ lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobf,
     }
 #endif
     /* Query optimal working array(s) size */
-    info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
-                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
-                                b, ldb, w, ldw, s, lds, &work_query, lwork,
-                                &iwork_query, liwork );
+    info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+				m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz,
+				res, b, ldb, w, ldw, s, lds, &zwork_query, lzwork,
+				&work_query, lwork, &iwork_query, liwork );
 
     if( info != 0 ) {
         goto exit_level_0;
     }
     lwork  = LAPACK_Z2INT( work_query );
     liwork = iwork_query;
+    lzwork = LAPACK_Z2INT( zwork_query );
     /* Allocate memory for work arrays */
-    work  = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork );
-    if( work == NULL ) {
+    zwork  = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork );
+    if( zwork == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
         goto exit_level_0;
     }
+    work  = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
     if( iwork == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
-        goto exit_level_1;
+        goto exit_level_2;
     }
     /* Call middle-level interface */
-    info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n,
-                                x, ldx, y, ldy, k, reig, imeig, z, ldz, res,
-                                b, ldb, w, ldw, s, lds, work, lwork, iwork,
-                                liwork );
+    info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+				m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz,
+				res, b, ldb, w, ldw, s, lds, zwork, lzwork,
+				work, lwork, iwork, liwork );
     /* Release memory and exit */
     LAPACKE_free( iwork );
-exit_level_1:
+exit_level_2:
     LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
 exit_level_0:
     if( info == LAPACK_WORK_MEMORY_ERROR ) {
         LAPACKE_xerbla( "LAPACKE_zgedmd", info );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c
index 2554411ec..ebacfaa94 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c
@@ -33,25 +33,27 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz,
-                                char jobf, lapack_int whtsvd, lapack_int m,
-                                lapack_int n, lapack_complex_double* x,
+                                char jobr, char jobf, lapack_int whtsvd,
+				lapack_int m, lapack_int n,
+				lapack_complex_double* x,
                                 lapack_int ldx, lapack_complex_double* y,
-                                lapack_int ldy, lapack_int k,
-                                lapack_complex_double* reig,
-                                lapack_complex_double* imeig, lapack_complex_double* z,
-                                lapack_int ldz, lapack_complex_double* res,
+                                lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k,
+                                lapack_complex_double* eigs, lapack_complex_double* z,
+                                lapack_int ldz, double* res,
                                 lapack_complex_double* b, lapack_int ldb,
                                 lapack_complex_double* w, lapack_int ldw,
                                 lapack_complex_double* s, lapack_int lds,
-                                lapack_complex_double* work, lapack_int lwork,
+                                lapack_complex_double* zwork, lapack_int lzwork,
+                                double* work, lapack_int lwork,
                                 lapack_int* iwork, lapack_int liwork )
 {
     lapack_int info = 0;
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
-        LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
-                       &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
-                       work, &lwork, iwork, &liwork, &info );
+        LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx,
+		       y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w,
+		       &ldw, s, &lds, zwork, &lzwork, work, &lwork, iwork, 
+		       &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -101,9 +103,10 @@ lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz,
         }
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 ) {
-            LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy,
-                           &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds,
-                           work, &lwork, iwork, &liwork, &info );
+            LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x,
+			   &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b,
+			   &ldb, w, &ldw, s, &lds, zwork, &lzwork, work, 
+			   &lwork, iwork, &liwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -145,9 +148,10 @@ lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz,
         LAPACKE_zge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
         LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
         /* Call LAPACK function and adjust info */
-        LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t,
-                       &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t,
-                       w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t,
+		       &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t,
+		       res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork,
+		       &lzwork, work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c
index 3648ffdf2..368d48e20 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c
@@ -37,23 +37,25 @@ lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
                             lapack_int m, lapack_int n, lapack_complex_double* f,
                             lapack_int ldf, lapack_complex_double* x,
                             lapack_int ldx, lapack_complex_double* y,
-                            lapack_int ldy, lapack_int nrnk, double tol,
-                            lapack_int k, lapack_complex_double* reig,
-                            lapack_complex_double* imeig,
+                            lapack_int ldy, lapack_int nrnk, double* tol,
+                            lapack_int k, lapack_complex_double* eigs,
                             lapack_complex_double* z, lapack_int ldz,
-                            lapack_complex_double* res, lapack_complex_double* b,
+                            double* res, lapack_complex_double* b,
                             lapack_int ldb, lapack_complex_double* v,
                             lapack_int ldv, lapack_complex_double* s, lapack_int lds)
 {
     lapack_int info = 0;
     lapack_int lwork = -1;
     lapack_int liwork = -1;
-    lapack_complex_double* work = NULL;
+    lapack_int lzwork = -1;
+    lapack_complex_double* zwork = NULL;
+    double* work = NULL;
     lapack_int* iwork = NULL;
-    lapack_complex_double work_query;
+    double work_query;
+    lapack_complex_double zwork_query;
     lapack_int iwork_query;
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
-        LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 );
+        LAPACKE_xerbla( "LAPACKE_zgedmdq", -1 );
         return -1;
     }
 #ifndef LAPACK_DISABLE_NAN_CHECK
@@ -85,36 +87,44 @@ lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
     /* Query optimal working array(s) size */
     info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
                                  jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
-                                 nrnk, tol, k, reig, imeig, z, ldz, res,
-                                 b, ldb, v, ldv, s, lds, &work_query, lwork,
-                                 &iwork_query, liwork );
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &zwork_query, lzwork, 
+				 &work_query, lwork, &iwork_query, liwork );
 
     if( info != 0 ) {
         goto exit_level_0;
     }
     lwork  = LAPACK_Z2INT( work_query );
+    lzwork  = LAPACK_Z2INT( zwork_query );
     liwork = iwork_query;
     /* Allocate memory for work arrays */
-    work  = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork );
-    if( work == NULL ) {
+    zwork  = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork );
+    if( zwork == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
         goto exit_level_0;
     }
+    work  = (double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
     if( iwork == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
-        goto exit_level_1;
+        goto exit_level_2;
     }
     /* Call middle-level interface */
     info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
                                  jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
-                                 nrnk, tol, k, reig, imeig, z, ldz, res,
-                                 b, ldb, v, ldv, s, lds, work, lwork, iwork,
-                                 liwork );
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, zwork, lzwork, 
+				 work, lwork, iwork, liwork );
     /* Release memory and exit */
     LAPACKE_free( iwork );
-exit_level_1:
+exit_level_2:
     LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
 exit_level_0:
     if( info == LAPACK_WORK_MEMORY_ERROR ) {
         LAPACKE_xerbla( "LAPACKE_zgedmdq", info );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c
index 9afceba07..131e4f9ad 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c
@@ -38,15 +38,15 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
                                  lapack_complex_double* f, lapack_int ldf,
                                  lapack_complex_double* x, lapack_int ldx,
                                  lapack_complex_double* y, lapack_int ldy,
-                                 lapack_int nrnk, double tol, lapack_int k,
-                                 lapack_complex_double* reig,
-                                 lapack_complex_double* imeig,
+                                 lapack_int nrnk, double* tol, lapack_int k,
+                                 lapack_complex_double* eigs,
                                  lapack_complex_double* z,
-                                 lapack_int ldz, lapack_complex_double* res,
+                                 lapack_int ldz, double* res,
                                  lapack_complex_double* b,
                                  lapack_int ldb, lapack_complex_double* v,
                                  lapack_int ldv, lapack_complex_double* s,
-                                 lapack_int lds, lapack_complex_double* work,
+                                 lapack_int lds, lapack_complex_double* zwork,
+				 lapack_int lzwork, double* work,
                                  lapack_int lwork, lapack_int* iwork,
                                  lapack_int liwork )
 {
@@ -54,9 +54,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
-                        work, &lwork, iwork, &liwork, &info );
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
@@ -114,9 +114,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
         /* Query optimal working array(s) size if requested */
         if( lwork == -1 || liwork == -1 ) {
             LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                            imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
-                            work, &lwork, iwork, &liwork, &info );
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                            z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
             return (info < 0) ? (info - 1) : info;
         }
         /* Allocate memory for temporary array(s) */
@@ -165,9 +165,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
         LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
         /* Call LAPACK function and adjust info */
         LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
-                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig,
-                        imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
-                        work, &lwork, iwork, &liwork, &info );
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }

From 0a637cc403b2af09b20c48238e2376793a25c8c2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 15 Jul 2023 16:37:42 +0200
Subject: [PATCH 204/718] Fix workspace query corner cases to always return at
 least 1 (Reference-LAPACK PR 883) (#4146)

* Fix workspace query corner cases to always return at least 1
---
 lapack-netlib/SRC/csytrf.f    | 2 +-
 lapack-netlib/SRC/csytrf_rk.f | 2 +-
 lapack-netlib/SRC/ctrevc3.f   | 4 ++--
 lapack-netlib/SRC/dsytrf.f    | 2 +-
 lapack-netlib/SRC/dsytrf_rk.f | 2 +-
 lapack-netlib/SRC/dtrevc3.f   | 2 +-
 lapack-netlib/SRC/ssytrf.f    | 2 +-
 lapack-netlib/SRC/ssytrf_rk.f | 2 +-
 lapack-netlib/SRC/strevc3.f   | 2 +-
 lapack-netlib/SRC/zsytrf.f    | 2 +-
 lapack-netlib/SRC/zsytrf_rk.f | 2 +-
 lapack-netlib/SRC/ztrevc3.f   | 4 ++--
 12 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f
index ebf228f18..951196b83 100644
--- a/lapack-netlib/SRC/csytrf.f
+++ b/lapack-netlib/SRC/csytrf.f
@@ -232,7 +232,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/csytrf_rk.f b/lapack-netlib/SRC/csytrf_rk.f
index 9c2b7182f..996801e7d 100644
--- a/lapack-netlib/SRC/csytrf_rk.f
+++ b/lapack-netlib/SRC/csytrf_rk.f
@@ -310,7 +310,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'CSYTRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f
index 0f58696b2..11b32104d 100644
--- a/lapack-netlib/SRC/ctrevc3.f
+++ b/lapack-netlib/SRC/ctrevc3.f
@@ -321,9 +321,9 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CTREVC', SIDE // HOWMNY, N, -1, -1, -1 )
-      MAXWRK = N + 2*N*NB
+      MAXWRK = MAX( 1, N + 2*N*NB )
       WORK(1) = MAXWRK
-      RWORK(1) = N
+      RWORK(1) = MAX( 1, N )
       LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
          INFO = -1
diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f
index a39b03283..aee9b3f6a 100644
--- a/lapack-netlib/SRC/dsytrf.f
+++ b/lapack-netlib/SRC/dsytrf.f
@@ -232,7 +232,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f
index 7341b9263..086586968 100644
--- a/lapack-netlib/SRC/dsytrf_rk.f
+++ b/lapack-netlib/SRC/dsytrf_rk.f
@@ -310,7 +310,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'DSYTRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f
index a4651e788..c8c04ad13 100644
--- a/lapack-netlib/SRC/dtrevc3.f
+++ b/lapack-netlib/SRC/dtrevc3.f
@@ -298,7 +298,7 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'DTREVC', SIDE // HOWMNY, N, -1, -1, -1 )
-      MAXWRK = N + 2*N*NB
+      MAXWRK = MAX( 1, N + 2*N*NB )
       WORK(1) = MAXWRK
       LQUERY = ( LWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f
index d188589b9..31e38e466 100644
--- a/lapack-netlib/SRC/ssytrf.f
+++ b/lapack-netlib/SRC/ssytrf.f
@@ -232,7 +232,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f
index ec84fcb1b..8e1ef460a 100644
--- a/lapack-netlib/SRC/ssytrf_rk.f
+++ b/lapack-netlib/SRC/ssytrf_rk.f
@@ -310,7 +310,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f
index 5af57123b..253cbc24c 100644
--- a/lapack-netlib/SRC/strevc3.f
+++ b/lapack-netlib/SRC/strevc3.f
@@ -298,7 +298,7 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'STREVC', SIDE // HOWMNY, N, -1, -1, -1 )
-      MAXWRK = N + 2*N*NB
+      MAXWRK = MAX( 1, N + 2*N*NB )
       WORK(1) = MAXWRK
       LQUERY = ( LWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
diff --git a/lapack-netlib/SRC/zsytrf.f b/lapack-netlib/SRC/zsytrf.f
index a775a8758..dc9016c69 100644
--- a/lapack-netlib/SRC/zsytrf.f
+++ b/lapack-netlib/SRC/zsytrf.f
@@ -232,7 +232,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/zsytrf_rk.f b/lapack-netlib/SRC/zsytrf_rk.f
index 3b398ce6c..af8b8d501 100644
--- a/lapack-netlib/SRC/zsytrf_rk.f
+++ b/lapack-netlib/SRC/zsytrf_rk.f
@@ -310,7 +310,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'ZSYTRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/ztrevc3.f b/lapack-netlib/SRC/ztrevc3.f
index 6300e80ae..8fb144e0c 100644
--- a/lapack-netlib/SRC/ztrevc3.f
+++ b/lapack-netlib/SRC/ztrevc3.f
@@ -321,9 +321,9 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'ZTREVC', SIDE // HOWMNY, N, -1, -1, -1 )
-      MAXWRK = N + 2*N*NB
+      MAXWRK = MAX( 1, N + 2*N*NB )
       WORK(1) = MAXWRK
-      RWORK(1) = N
+      RWORK(1) = MAX( 1, N )
       LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
          INFO = -1

From 8da6aca2ecf557b6250631414a114bdb3f743c86 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 16 Jul 2023 22:15:15 +0200
Subject: [PATCH 205/718] Support Alder Lake N (fam 6 exmodel 11 model 14) as
 Haswell

---
 cpuid_x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index c2486e380..c485f3ddf 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1551,6 +1551,7 @@ int get_cpuname(void){
           case 7: // Raptor Lake
           case 10:
           case 15:
+	  case 14: // Alder Lake N
             if(support_avx2())
               return CPUTYPE_HASWELL;
 	    if(support_avx())
@@ -2360,6 +2361,7 @@ int get_coretype(void){
 	  case 7: // Raptor Lake
           case 10:
           case 15:
+	  case 14: // Alder Lake N	  
 #ifndef NO_AVX2
 	  if(support_avx2())
             return CORE_HASWELL;

From ada9e442ebfe945895c444903f55b51b9daf8776 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 17 Jul 2023 23:13:56 +0200
Subject: [PATCH 206/718] Add Apple M1 build using gcc,gmake and OpenMP

---
 .cirrus.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.cirrus.yml b/.cirrus.yml
index 8a1c4a0a8..02cd40997 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -30,6 +30,15 @@ task:
   - cd build
   - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
   - make
+
+task:
+  name: AppleM1/GCC/MAKE/OPENMP
+  compile_script:
+  - brew install gcc@11
+  - export PATH=/opt/homebrew/bin:$PATH
+  - export LDFLAGS="-L/opt/homebrew/lib"
+  - export CPPFLAGS="-I/opt/homebrew/include"
+  - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
   
 macos_instance:
   image: ghcr.io/cirruslabs/macos-monterey-xcode:latest

From ca7199f249bb6a87f201a1cd564d42fef338f29a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Jul 2023 14:48:42 +0200
Subject: [PATCH 207/718] Treat newer Neoverse as N1 if SVE unavailable (may be
 disabled in container/cloud env)

---
 driver/others/dynamic_arm64.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 0f47b287c..b29e6e46c 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -147,6 +147,9 @@ extern void openblas_warning(int verbose, const char * msg);
 #ifndef HWCAP_CPUID
 #define HWCAP_CPUID (1 << 11)
 #endif
+#ifndef HWCAP_SVE
+#define HWCAP_SVE (1 << 22)
+#endif
 
 #define get_cpu_ftr(id, var) ({					\
 		__asm__ __volatile__ ("mrs %0, "#id : "=r" (var));		\
@@ -281,9 +284,15 @@ static gotoblas_t *get_coretype(void) {
           return &gotoblas_NEOVERSEN1;
 #ifndef NO_SVE
         case 0xd49:
-          return &gotoblas_NEOVERSEN2;
+	  if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) 
+	    return &gotoblas_NEOVERSEN1;
+	  else
+            return &gotoblas_NEOVERSEN2;
 	case 0xd40:
-	  return &gotoblas_NEOVERSEV1;
+	  if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) 
+	    return &gotoblas_NEOVERSEN1;
+	  else
+	    return &gotoblas_NEOVERSEV1;
 #endif
 	case 0xd05: // Cortex A55
 	  return &gotoblas_CORTEXA55;

From 5c58994eb26bfd5715f4100a213a0e69b280ea16 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Jul 2023 18:27:41 +0200
Subject: [PATCH 208/718] Add fallback warning

---
 driver/others/dynamic_arm64.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index b29e6e46c..ef2597234 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -137,6 +137,8 @@ extern gotoblas_t  gotoblas_CORTEXA55;
 #endif
 
 extern void openblas_warning(int verbose, const char * msg);
+#define FALLBACK_VERBOSE 1
+#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
 
 #define NUM_CORETYPES   13
 
@@ -284,14 +286,16 @@ static gotoblas_t *get_coretype(void) {
           return &gotoblas_NEOVERSEN1;
 #ifndef NO_SVE
         case 0xd49:
-	  if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) 
+	  if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
+	    openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);  
 	    return &gotoblas_NEOVERSEN1;
-	  else
+	  } else
             return &gotoblas_NEOVERSEN2;
 	case 0xd40:
-	  if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) 
+	  if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
+		  openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
 	    return &gotoblas_NEOVERSEN1;
-	  else
+      	  }else
 	    return &gotoblas_NEOVERSEV1;
 #endif
 	case 0xd05: // Cortex A55

From 66904f814853d1d34aed020eebbd826e8466f029 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Jul 2023 22:14:34 +0200
Subject: [PATCH 209/718] Ensure that a premature call will not overwrite
 unrelated memory

---
 driver/others/blas_server_win32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index afa33cccc..5bdfc1276 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads)
 			blas_server_avail = 1;
 		}
 
-		for(i = blas_num_threads - 1; i < num_threads - 1; i++){
+		for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
 
 			blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,

From b34f19a365a43ce8a05c29a97036bfc9e300c289 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Jul 2023 22:19:22 +0200
Subject: [PATCH 210/718] Ensure that a premature call to set_num_threads will
 not overwrite unrelated memory

---
 driver/others/blas_server.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 051513f27..a8a84acbb 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) {
 
     increased_threads = 1;
 
-    for(i = blas_num_threads - 1; i < num_threads - 1; i++){
+    for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
 
       atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
       thread_status[i].status = THREAD_STATUS_WAKEUP;

From 76ef1672f84bb90574f451d4f4677c37aa6751b7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 19 Jul 2023 22:31:07 +0200
Subject: [PATCH 211/718] Override DSDOT with generic code to get rid of qemu
 precision error

---
 kernel/riscv64/KERNEL.C910V | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V
index 18cb3bafd..e6f2b3314 100644
--- a/kernel/riscv64/KERNEL.C910V
+++ b/kernel/riscv64/KERNEL.C910V
@@ -59,6 +59,7 @@ SDOTKERNEL   = dot_vector.c
 DDOTKERNEL   = dot_vector.c
 CDOTKERNEL   = zdot_vector.c
 ZDOTKERNEL   = zdot_vector.c
+DSDOTKERNEL  = ../generic/dot.c
 
 SNRM2KERNEL  = nrm2_vector.c
 DNRM2KERNEL  = nrm2_vector.c

From 8df0289db61ea5a3e461c94c51a5798e2dd18b86 Mon Sep 17 00:00:00 2001
From: Octavian Maghiar <octavian.maghiar@imgtec.com>
Date: Thu, 20 Jul 2023 15:28:35 +0100
Subject: [PATCH 212/718] Adds tail undisturbed for RVV Level 1 operations

During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic.
Commit changes intrinsics tail policy to undistrubed.
---
 kernel/riscv64/amax_rvv.c   |  8 +++---
 kernel/riscv64/amin_rvv.c   |  8 +++---
 kernel/riscv64/asum_rvv.c   |  8 +++---
 kernel/riscv64/dot_rvv.c    | 16 +++++------
 kernel/riscv64/iamax_rvv.c  | 24 ++++++++--------
 kernel/riscv64/iamin_rvv.c  | 24 ++++++++--------
 kernel/riscv64/imax_rvv.c   | 24 ++++++++--------
 kernel/riscv64/imin_rvv.c   | 24 ++++++++--------
 kernel/riscv64/izamax_rvv.c | 24 ++++++++--------
 kernel/riscv64/izamin_rvv.c | 24 ++++++++--------
 kernel/riscv64/max_rvv.c    |  8 +++---
 kernel/riscv64/min_rvv.c    |  8 +++---
 kernel/riscv64/nrm2_rvv.c   |  8 +++---
 kernel/riscv64/sum_rvv.c    |  8 +++---
 kernel/riscv64/zamax_rvv.c  |  8 +++---
 kernel/riscv64/zamin_rvv.c  |  8 +++---
 kernel/riscv64/zasum_rvv.c  | 12 ++++----
 kernel/riscv64/zdot_rvv.c   | 56 ++++++++++++++++++-------------------
 kernel/riscv64/znrm2_rvv.c  | 24 ++++++++--------
 kernel/riscv64/zsum_rvv.c   | 12 ++++----
 20 files changed, 168 insertions(+), 168 deletions(-)

diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c
index be0bdbea0..451fbc834 100644
--- a/kernel/riscv64/amax_rvv.c
+++ b/kernel/riscv64/amax_rvv.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m8_tu
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m8_tu
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
@@ -78,7 +78,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             vx = VLEV_FLOAT(x, vl);
             vx = VFABSV_FLOAT(vx, vl);
-            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
         }
 
     } else {
@@ -90,7 +90,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             vx = VLSEV_FLOAT(x, stride_x, vl);
             vx = VFABSV_FLOAT(vx, vl);
-            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
         }
 
     }
diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c
index d4926084b..5186d7b12 100644
--- a/kernel/riscv64/amin_rvv.c
+++ b/kernel/riscv64/amin_rvv.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
@@ -78,7 +78,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             vx = VLEV_FLOAT(x, vl);
             vx = VFABSV_FLOAT(vx, vl);
-            vmin = VFMINVV_FLOAT(vmin, vx, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
         }
 
     } else {
@@ -90,7 +90,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             vx = VLSEV_FLOAT(x, stride_x, vl);
             vx = VFABSV_FLOAT(vx, vl);
-            vmin = VFMINVV_FLOAT(vmin, vx, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
         }
 
     }
diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c
index 691591e22..0ea610cbb 100644
--- a/kernel/riscv64/asum_rvv.c
+++ b/kernel/riscv64/asum_rvv.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLEV_FLOAT              __riscv_vle32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
-#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f32m8_tu
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
 #define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
@@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLEV_FLOAT              __riscv_vle64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
-#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f64m8_tu
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
 #define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
@@ -76,7 +76,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             vx = VLEV_FLOAT(x, vl);
             vx = VFABSV_FLOAT(vx, vl);
-            vsum = VFADDVV_FLOAT(vsum, vx, vl);
+            vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);
         }
 
     } else {
@@ -88,7 +88,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             vx = VLSEV_FLOAT(x, stride_x, vl);
             vx = VFABSV_FLOAT(vx, vl);
-            vsum = VFADDVV_FLOAT(vsum, vx, vl);
+            vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);
         }
 
     }
diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c
index 3276695b6..837badf41 100644
--- a/kernel/riscv64/dot_rvv.c
+++ b/kernel/riscv64/dot_rvv.c
@@ -49,12 +49,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
             vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
 
-            vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
 #else
             vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);
             vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);
 
-            vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
 #endif
         }
 
@@ -69,12 +69,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
             vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl);
 
-            vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
 #else
             vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);
             vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl);
 
-            vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
 #endif
         }
     } else if (1 == inc_y) {
@@ -88,12 +88,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl);
             vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
 
-            vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
 #else
             vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl);
             vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);
 
-            vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
 #endif
         }
     } else {
@@ -108,12 +108,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
             vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl);
             vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl);
 
-            vr = __riscv_vfwmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
 #else
             vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl);
             vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl);
 
-            vr = __riscv_vfmacc_vv_f64m8(vr, vx, vy, vl);
+            vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
 #endif
         }
     }
diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c
index d3508a91d..8362d7cef 100644
--- a/kernel/riscv64/iamax_rvv.c
+++ b/kernel/riscv64/iamax_rvv.c
@@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m8_tu
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint64m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu
 #define VIDV_UINT               __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -68,12 +68,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m8_tu
 #define VFIRSTM                 __riscv_vfirst_m_b4
 #define UINT_V_T                vuint32m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu
 #define VIDV_UINT               __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -106,11 +106,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max
-            v_max = VFMAXVV_FLOAT(v_max, vx, vl);
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
         }
 
     } else {
@@ -125,11 +125,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max
-            v_max = VFMAXVV_FLOAT(v_max, vx, vl);
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
         }
   
     }
diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c
index ae1d4f726..f90dbb545 100644
--- a/kernel/riscv64/iamin_rvv.c
+++ b/kernel/riscv64/iamin_rvv.c
@@ -43,12 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint64m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu
 #define VIDV_UINT               __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -69,12 +69,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu
 #define VFIRSTM                 __riscv_vfirst_m_b4
 #define UINT_V_T                vuint32m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu
 #define VIDV_UINT               __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -107,11 +107,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
-            v_min = VFMINVV_FLOAT(v_min, vx, vl);
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
         }
 
     } else {
@@ -126,11 +126,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
-            v_min = VFMINVV_FLOAT(v_min, vx, vl);
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
         }
   
     }
diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c
index 33250568d..b1a77b178 100644
--- a/kernel/riscv64/imax_rvv.c
+++ b/kernel/riscv64/imax_rvv.c
@@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VMFGEVF_FLOAT           __riscv_vmfge_vf_f64m8_b8
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m8_tu
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint64m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu
 #define VIDV_UINT               __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -67,12 +67,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VMFGEVF_FLOAT           __riscv_vmfge_vf_f32m8_b4
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m8_tu
 #define VFIRSTM                 __riscv_vfirst_m_b4
 #define UINT_V_T                vuint32m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu
 #define VIDV_UINT               __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -104,11 +104,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max and start_index j
-            v_max = VFMAXVV_FLOAT(v_max, vx, vl);
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
         }
 
     } else {
@@ -122,11 +122,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max and start_index j
-            v_max = VFMAXVV_FLOAT(v_max, vx, vl);
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
         }
   
     }
diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c
index 4ce49c3af..1de7f3233 100644
--- a/kernel/riscv64/imin_rvv.c
+++ b/kernel/riscv64/imin_rvv.c
@@ -42,12 +42,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VMFLEVF_FLOAT           __riscv_vmfle_vf_f64m8_b8
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint64m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m8_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu
 #define VIDV_UINT               __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m8_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -67,12 +67,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VMFLEVF_FLOAT           __riscv_vmfle_vf_f32m8_b4
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu
 #define VFIRSTM                 __riscv_vfirst_m_b4
 #define UINT_V_T                vuint32m8_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m8_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu
 #define VIDV_UINT               __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m8_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m8
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m8
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -104,11 +104,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
-            v_min = VFMINVV_FLOAT(v_min, vx, vl);
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
         }
 
     } else {
@@ -122,11 +122,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
-            v_min = VFMINVV_FLOAT(v_min, vx, vl);
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
         }
   
     }
diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c
index e43ded820..e93f0056c 100644
--- a/kernel/riscv64/izamax_rvv.c
+++ b/kernel/riscv64/izamax_rvv.c
@@ -44,13 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m4
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m4_tu
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
 #define VFIRSTM                 __riscv_vfirst_m_b16
 #define UINT_V_T                vuint64m4_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m4_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m4_tumu
 #define VIDV_UINT               __riscv_vid_v_u64m4
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m4_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m4_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m4
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -73,13 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m4
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m4_tu
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint32m4_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m4_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m4_tumu
 #define VIDV_UINT               __riscv_vid_v_u32m4
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m4_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m4_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m4
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -116,11 +116,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx0, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max and start_index j
-            v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl);
         }
     }
     else {
@@ -138,11 +138,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         
             //index where element greater than v_max
             mask = VMFLTVV_FLOAT(v_max, vx0, vl);
-            v_max_index = VIDV_MASK_UINT(mask, v_max_index, vl);
-            v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
 
             //update v_max and start_index j
-            v_max = VFMAXVV_FLOAT(v_max, vx0, vl);
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl);
         }
 
     }
diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c
index cc3c37c8e..b5bc27404 100644
--- a/kernel/riscv64/izamin_rvv.c
+++ b/kernel/riscv64/izamin_rvv.c
@@ -43,13 +43,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m4
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m4_tu
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
 #define VFIRSTM                 __riscv_vfirst_m_b16
 #define UINT_V_T                vuint64m4_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u64m4_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m4_tumu
 #define VIDV_UINT               __riscv_vid_v_u64m4
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u64m4_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m4_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u64m4
 #define VMVVX_UINT              __riscv_vmv_v_x_u64m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -70,13 +70,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m4
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m4_tu
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
 #define VFIRSTM                 __riscv_vfirst_m_b8
 #define UINT_V_T                vuint32m4_t
-#define VIDV_MASK_UINT          __riscv_vid_v_u32m4_mu
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m4_tumu
 #define VIDV_UINT               __riscv_vid_v_u32m4
-#define VADDVX_MASK_UINT        __riscv_vadd_vx_u32m4_mu
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m4_tumu
 #define VADDVX_UINT             __riscv_vadd_vx_u32m4
 #define VMVVX_UINT              __riscv_vmv_v_x_u32m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -113,11 +113,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx0, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
-            v_min = VFMINVV_FLOAT(v_min, vx0, vl);
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl);
         }
 
     } else {
@@ -136,11 +136,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             // index where element less than v_min
             mask = VMFLTVV_FLOAT(vx0, v_min, vl);
-            v_min_index = VIDV_MASK_UINT(mask, v_min_index, vl);
-            v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
 
             //update v_min and start_index j
-            v_min = VFMINVV_FLOAT(v_min, vx0, vl);
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl);
         }
 
     }
diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c
index 9315321f4..745c27bf4 100644
--- a/kernel/riscv64/max_rvv.c
+++ b/kernel/riscv64/max_rvv.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m8_tu
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m8(n)
@@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m8_tu
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
@@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             vl = VSETVL(n);
 
             vx = VLEV_FLOAT(x, vl);
-            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
        }
 
     } else {
@@ -86,7 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             vl = VSETVL(n);
 
             vx = VLSEV_FLOAT(x, stride_x, vl);
-            vmax = VFMAXVV_FLOAT(vmax, vx, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
         }
 
     }
diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c
index 158b682fd..78528fef9 100644
--- a/kernel/riscv64/min_rvv.c
+++ b/kernel/riscv64/min_rvv.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m8(n)
@@ -52,7 +52,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
@@ -75,7 +75,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             vl = VSETVL(n);
 
             vx = VLEV_FLOAT(x, vl);
-            vmin = VFMINVV_FLOAT(vmin, vx, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
         }
 
     } else {
@@ -86,7 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             vl = VSETVL(n);
 
             vx = VLSEV_FLOAT(x, stride_x, vl);
-            vmin = VFMINVV_FLOAT(vmin, vx, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
         }
 
     }
diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c
index 42abfa119..994fadb70 100644
--- a/kernel/riscv64/nrm2_rvv.c
+++ b/kernel/riscv64/nrm2_rvv.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLEV_FLOAT              __riscv_vle32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m8_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLEV_FLOAT              __riscv_vle64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m8_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -79,7 +79,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             v0 = VLEV_FLOAT(x, vl);
 
-            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+            vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl);
         }
 
     } else {
@@ -91,7 +91,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             v0 = VLSEV_FLOAT(x, stride_x, vl);
 
-            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+            vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl);
         }
     }
 
diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c
index 9715faf22..c5629197f 100644
--- a/kernel/riscv64/sum_rvv.c
+++ b/kernel/riscv64/sum_rvv.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLEV_FLOAT              __riscv_vle32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
-#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f32m8_tu
 #define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLEV_FLOAT              __riscv_vle64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
-#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f64m8_tu
 #define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -73,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             vl = VSETVL(n);
 
             vx = VLEV_FLOAT(x, vl);
-            vsum = VFADDVV_FLOAT(vsum, vx, vl);
+            vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);
         }
 
     } else {
@@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             vl = VSETVL(n);
 
             vx = VLSEV_FLOAT(x, stride_x, vl);
-            vsum = VFADDVV_FLOAT(vsum, vx, vl);
+            vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);
         }
 
     }
diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c
index 615b7519c..bbb1e876b 100644
--- a/kernel/riscv64/zamax_rvv.c
+++ b/kernel/riscv64/zamax_rvv.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f32m4
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m4_tu
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT           __riscv_vfmax_vv_f64m4
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m4_tu
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v1 = VFABSV_FLOAT(v1, vl);
 
             v0 = VFADDVV_FLOAT(v0, v1, vl);
-            vmax = VFMAXVV_FLOAT(vmax, v0, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl);
   
         }
 
@@ -101,7 +101,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v1 = VFABSV_FLOAT(v1, vl);
 
             v0 = VFADDVV_FLOAT(v0, v1, vl);
-            vmax = VFMAXVV_FLOAT(vmax, v0, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl);
         }
 
     }
diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c
index a0d36d46f..c5453121b 100644
--- a/kernel/riscv64/zamin_rvv.c
+++ b/kernel/riscv64/zamin_rvv.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m4_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f32m4
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m4_tu
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m4_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMINVV_FLOAT           __riscv_vfmin_vv_f64m4
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m4_tu
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -84,7 +84,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v1 = VFABSV_FLOAT(v1, vl);
 
             v0 = VFADDVV_FLOAT(v0, v1, vl);
-            vmin = VFMINVV_FLOAT(vmin, v0, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl);
         }
 
     } else {
@@ -100,7 +100,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v1 = VFABSV_FLOAT(v1, vl);
 
             v0 = VFADDVV_FLOAT(v0, v1, vl);
-            vmin = VFMINVV_FLOAT(vmin, v0, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl);
         }
 
     }
diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c
index 1d2f0e1fe..ebec1b19c 100644
--- a/kernel/riscv64/zasum_rvv.c
+++ b/kernel/riscv64/zasum_rvv.c
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
-#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f32m8_tu
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m8(n)
@@ -51,7 +51,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
-#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f64m8_tu
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
 #endif
 
@@ -75,8 +75,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
 
-            v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
-            v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
         }
 
     }
@@ -93,8 +93,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
 
-            v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
-            v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
         }
 
     }
diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c
index 1543c513d..fa0e89353 100644
--- a/kernel/riscv64/zdot_rvv.c
+++ b/kernel/riscv64/zdot_rvv.c
@@ -36,12 +36,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
 #define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFMULVV_FLOAT           __riscv_vfmul_vv_f32m4
 #define VFMSACVV_FLOAT          __riscv_vfmsac_vv_f32m4
-#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f32m4_tu
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
@@ -52,12 +52,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
 #define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFMULVV_FLOAT           __riscv_vfmul_vv_f64m4
 #define VFMSACVV_FLOAT          __riscv_vfmsac_vv_f64m4
-#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f64m4_tu
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #endif
 
@@ -86,14 +86,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
             VLSEG_FLOAT(&vx0, &vx1, x, vl);
             VLSEG_FLOAT(&vy0, &vy1, y, vl);
 
-            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
-            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
         #if !defined(CONJ)
-            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
-            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
+            vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl);
         #else
-            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
-            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl);
         #endif
         }
 
@@ -107,14 +107,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
             VLSEG_FLOAT(&vx0, &vx1, x, vl);
             VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
 
-            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
-            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
         #if !defined(CONJ)
-            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
-            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
+            vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl);
         #else
-            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
-            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl);
         #endif
         }
     } else if (inc_y == 1){
@@ -127,14 +127,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
             VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
             VLSEG_FLOAT(&vy0, &vy1, y, vl);
 
-            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
-            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
         #if !defined(CONJ)
-            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
-            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
+            vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl);
         #else
-            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
-            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl);
         #endif
         }
     }else {
@@ -148,14 +148,14 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
             VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
             VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
 
-            vr0 = VFMACCVV_FLOAT(vr0, vx0, vy0, vl);
-            vr1 = VFMACCVV_FLOAT(vr1, vx0, vy1, vl);
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
         #if !defined(CONJ)
-            vr0 = VFNMSACVV_FLOAT(vr0, vx1, vy1, vl);
-            vr1 = VFMACCVV_FLOAT(vr1, vx1, vy0, vl);
+            vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl);
         #else
-            vr0 = VFMACCVV_FLOAT(vr0, vx1, vy1, vl);
-            vr1 = VFNMSACVV_FLOAT(vr1, vx1, vy0, vl);
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl);
         #endif
         }
     }
diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c
index 5f7873b5a..d2b27aa8d 100644
--- a/kernel/riscv64/znrm2_rvv.c
+++ b/kernel/riscv64/znrm2_rvv.c
@@ -36,10 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
 #define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1
+#define VFREDMAXVS_FLOAT_TU     __riscv_vfredmax_vs_f32m4_f32m1_tu
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
 #define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
 #else
@@ -51,10 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
 #define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
+#define VFREDMAXVS_FLOAT_TU     __riscv_vfredmax_vs_f64m4_f64m1_tu
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
 #define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
 #endif
@@ -85,11 +85,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
 
-            v_max = VFREDMAXVS_FLOAT(v0, v_max, vl);
-            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+            v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl);
+            vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl);
 
-            v_max = VFREDMAXVS_FLOAT(v1, v_max, vl);
-            vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
+            v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl);
+            vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl);
         }
 
     } else {
@@ -103,11 +103,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
 
-            v_max = VFREDMAXVS_FLOAT(v0, v_max, vl);
-            vr = VFMACCVV_FLOAT(vr, v0, v0, vl);
+            v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl);
+            vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl);
 
-            v_max = VFREDMAXVS_FLOAT(v1, v_max, vl);
-            vr = VFMACCVV_FLOAT(vr, v1, v1, vl);
+            v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl);
+            vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl);
         }
 
     }
diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c
index 44df112c6..b41f70eb5 100644
--- a/kernel/riscv64/zsum_rvv.c
+++ b/kernel/riscv64/zsum_rvv.c
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
-#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f32m4_tu
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
@@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
-#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f64m4_tu
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
@@ -69,8 +69,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             VLSEG_FLOAT(&v0, &v1, x, vl);
 
-            v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
-            v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
         }
 
     } else {
@@ -82,8 +82,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
             VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
 
-            v_sum = VFADDVV_FLOAT(v_sum, v0, vl);
-            v_sum = VFADDVV_FLOAT(v_sum, v1, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
         }
 
     }

From 25037ae87535c188a5635b471d566dc3a51e71d8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 22 Jul 2023 23:14:25 +0200
Subject: [PATCH 213/718] Fix actual arguments in some LAPACK procedure calls
 (Reference-LAPACK PR 885) (#4155)

* Fix actual arguments (Reference-LAPACK PR 885)
---
 lapack-netlib/SRC/cgelqt3.f        |  3 ++-
 lapack-netlib/SRC/dgelqt3.f        |  3 ++-
 lapack-netlib/SRC/dlatrs.f         |  7 ++++--
 lapack-netlib/SRC/dtrsyl3.f        |  2 +-
 lapack-netlib/SRC/sgelqt3.f        |  3 ++-
 lapack-netlib/SRC/slatrs.f         |  7 ++++--
 lapack-netlib/SRC/strsyl3.f        |  2 +-
 lapack-netlib/SRC/zgelqt3.f        |  3 ++-
 lapack-netlib/TESTING/EIG/cerrst.f |  6 ++---
 lapack-netlib/TESTING/EIG/derrst.f |  6 ++---
 lapack-netlib/TESTING/EIG/serrst.f |  6 ++---
 lapack-netlib/TESTING/EIG/zerrst.f |  6 ++---
 lapack-netlib/TESTING/LIN/cchktr.f |  9 +++----
 lapack-netlib/TESTING/LIN/cerrtr.f | 38 +++++++++++++++---------------
 lapack-netlib/TESTING/LIN/derrtr.f | 38 +++++++++++++++---------------
 lapack-netlib/TESTING/LIN/serrtr.f | 38 +++++++++++++++---------------
 lapack-netlib/TESTING/LIN/zchktr.f |  7 +++---
 lapack-netlib/TESTING/LIN/zerrtr.f | 38 +++++++++++++++---------------
 18 files changed, 117 insertions(+), 105 deletions(-)

diff --git a/lapack-netlib/SRC/cgelqt3.f b/lapack-netlib/SRC/cgelqt3.f
index 1dfbd3f2b..553087bf4 100644
--- a/lapack-netlib/SRC/cgelqt3.f
+++ b/lapack-netlib/SRC/cgelqt3.f
@@ -159,7 +159,8 @@
 *
 *        Compute Householder transform when M=1
 *
-         CALL CLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T )
+         CALL CLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA,
+     &                T( 1, 1 ) )
          T(1,1)=CONJG(T(1,1))
 *
       ELSE
diff --git a/lapack-netlib/SRC/dgelqt3.f b/lapack-netlib/SRC/dgelqt3.f
index 5bcc06a80..ee3bdceb4 100644
--- a/lapack-netlib/SRC/dgelqt3.f
+++ b/lapack-netlib/SRC/dgelqt3.f
@@ -173,7 +173,8 @@
 *
 *        Compute Householder transform when M=1
 *
-         CALL DLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T )
+         CALL DLARFG( N, A ( 1, 1 ), A( 1, MIN( 2, N ) ), LDA,
+     &                T( 1, 1) )
 *
       ELSE
 *
diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f
index be156bee2..b282f4227 100644
--- a/lapack-netlib/SRC/dlatrs.f
+++ b/lapack-netlib/SRC/dlatrs.f
@@ -261,6 +261,9 @@
       DOUBLE PRECISION   BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS,
      $                   TMAX, TSCAL, USCAL, XBND, XJ, XMAX
 *     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   WORK(1)
+*     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            IDAMAX
@@ -362,7 +365,7 @@
 *              A is upper triangular.
 *
                DO J = 2, N
-                  TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ),
+                  TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ),
      $                        TMAX )
                END DO
             ELSE
@@ -371,7 +374,7 @@
 *
                DO J = 1, N - 1
                   TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1,
-     $                        SUMJ ), TMAX )
+     $                        WORK ), TMAX )
                END DO
             END IF
 *
diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f
index c44ec3808..31a5230ba 100644
--- a/lapack-netlib/SRC/dtrsyl3.f
+++ b/lapack-netlib/SRC/dtrsyl3.f
@@ -1220,7 +1220,7 @@
 *
          SCALOC = MIN( BIGNUM / SCAL, ONE / BUF )
          BUF = BUF * SCALOC
-         CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK )
+         CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) )
       END IF
 *
 *     Combine with buffer scaling factor. SCALE will be flushed if
diff --git a/lapack-netlib/SRC/sgelqt3.f b/lapack-netlib/SRC/sgelqt3.f
index 23816b4c8..82f5c1cf4 100644
--- a/lapack-netlib/SRC/sgelqt3.f
+++ b/lapack-netlib/SRC/sgelqt3.f
@@ -158,7 +158,8 @@
 *
 *        Compute Householder transform when M=1
 *
-         CALL SLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T )
+         CALL SLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA,
+     &                T( 1, 1 ) )
 *
       ELSE
 *
diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f
index 0761d656f..9765ea3d7 100644
--- a/lapack-netlib/SRC/slatrs.f
+++ b/lapack-netlib/SRC/slatrs.f
@@ -261,6 +261,9 @@
       REAL               BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS,
      $                   TMAX, TSCAL, USCAL, XBND, XJ, XMAX
 *     ..
+*     .. Local Arrays ..
+      REAL               WORK (1)
+*     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX
@@ -362,7 +365,7 @@
 *              A is upper triangular.
 *
                DO J = 2, N
-                  TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ),
+                  TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ),
      $                        TMAX )
                END DO
             ELSE
@@ -371,7 +374,7 @@
 *
                DO J = 1, N - 1
                   TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1,
-     $                        SUMJ ), TMAX )
+     $                        WORK ), TMAX )
                END DO
             END IF
 *
diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f
index 28762c2ed..ef3f2da83 100644
--- a/lapack-netlib/SRC/strsyl3.f
+++ b/lapack-netlib/SRC/strsyl3.f
@@ -1223,7 +1223,7 @@
 *
          SCALOC = MIN( BIGNUM / SCAL, ONE / BUF )
          BUF = BUF * SCALOC
-         CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK )
+         CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) )
       END IF
 *
 *     Combine with buffer scaling factor. SCALE will be flushed if
diff --git a/lapack-netlib/SRC/zgelqt3.f b/lapack-netlib/SRC/zgelqt3.f
index 629a09472..1a71dc44e 100644
--- a/lapack-netlib/SRC/zgelqt3.f
+++ b/lapack-netlib/SRC/zgelqt3.f
@@ -174,7 +174,8 @@
 *
 *        Compute Householder transform when M=1
 *
-         CALL ZLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T )
+         CALL ZLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA,
+     &                T( 1, 1 ) )
          T(1,1)=CONJG(T(1,1))
 *
       ELSE
diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f
index ba97afbe5..1748a2aad 100644
--- a/lapack-netlib/TESTING/EIG/cerrst.f
+++ b/lapack-netlib/TESTING/EIG/cerrst.f
@@ -160,13 +160,13 @@
 *
          SRNAMT = 'CHETD2'
          INFOT = 1
-         CALL CHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHETD2( '/', 0, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL CHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHETD2( 'U', -1, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL CHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO )
+         CALL CHETD2( 'U', 2, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
          NT = NT + 3
 *
diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f
index a55b6eea9..059538644 100644
--- a/lapack-netlib/TESTING/EIG/derrst.f
+++ b/lapack-netlib/TESTING/EIG/derrst.f
@@ -161,13 +161,13 @@
 *
          SRNAMT = 'DSYTD2'
          INFOT = 1
-         CALL DSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO )
+         CALL DSYTD2( '/', 0, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO )
+         CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO )
+         CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
          NT = NT + 3
 *
diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f
index 8c9c0f306..b87fc42ef 100644
--- a/lapack-netlib/TESTING/EIG/serrst.f
+++ b/lapack-netlib/TESTING/EIG/serrst.f
@@ -161,13 +161,13 @@
 *
          SRNAMT = 'SSYTD2'
          INFOT = 1
-         CALL SSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO )
+         CALL SSYTD2( '/', 0, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO )
+         CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO )
+         CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
          NT = NT + 3
 *
diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f
index 948f94bc2..d7b41c053 100644
--- a/lapack-netlib/TESTING/EIG/zerrst.f
+++ b/lapack-netlib/TESTING/EIG/zerrst.f
@@ -160,13 +160,13 @@
 *
          SRNAMT = 'ZHETD2'
          INFOT = 1
-         CALL ZHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO )
+         CALL ZHETD2( '/', 0, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO )
+         CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO )
+         CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, INFO )
          CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
          NT = NT + 3
 *
diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f
index 4b09361d8..2953a2bd5 100644
--- a/lapack-netlib/TESTING/LIN/cchktr.f
+++ b/lapack-netlib/TESTING/LIN/cchktr.f
@@ -201,7 +201,8 @@
 *     .. Local Arrays ..
       CHARACTER          TRANSS( NTRAN ), UPLOS( 2 )
       INTEGER            ISEED( 4 ), ISEEDY( 4 )
-      REAL               RESULT( NTESTS ), SCALE3( 2 )
+      REAL               RESULT( NTESTS ), RWORK2( 2*NMAX ),
+     $                   SCALE3( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -542,10 +543,10 @@
                   SRNAMT = 'CLATRS3'
                   CALL CCOPY( N, X, 1, B, 1 )
                   CALL CCOPY( N, X, 1, B( N+1 ), 1 )
-                  CALL CSCAL( N, BIGNUM, B( N+1 ), 1 )
+                  CALL CSSCAL( N, BIGNUM, B( N+1 ), 1 )
                   CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA,
-     $                          B, MAX(1, N), SCALE3, RWORK, WORK, NMAX,
-     $                          INFO )
+     $                          B, MAX(1, N), SCALE3, RWORK, RWORK2,
+     $                          2*NMAX, INFO )
 *
 *                 Check error code from CLATRS3.
 *
diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f
index 9ba784f62..ab83357f8 100644
--- a/lapack-netlib/TESTING/LIN/cerrtr.f
+++ b/lapack-netlib/TESTING/LIN/cerrtr.f
@@ -70,7 +70,7 @@
 *     .. Local Scalars ..
       CHARACTER*2        C2
       INTEGER            INFO
-      REAL               RCOND, SCALE
+      REAL               RCOND, SCALE, SCALES(0)
 *     ..
 *     .. Local Arrays ..
       REAL               R1( NMAX ), R2( NMAX ), RW( NMAX )
@@ -245,40 +245,40 @@
 *
          SRNAMT = 'CLATRS3'
          INFOT = 1
-         CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 3
-         CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 5
-         CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 6
-         CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 8
-         CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 10
-         CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 14
-         CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 0, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 0, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
 *
 *     Test error exits for the packed triangular routines.
diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f
index d0580497d..878d9070c 100644
--- a/lapack-netlib/TESTING/LIN/derrtr.f
+++ b/lapack-netlib/TESTING/LIN/derrtr.f
@@ -71,7 +71,7 @@
 *     .. Local Scalars ..
       CHARACTER*2        C2
       INTEGER            INFO
-      DOUBLE PRECISION   RCOND, SCALE
+      DOUBLE PRECISION   RCOND, SCALE, SCALES(0)
 *     ..
 *     .. Local Arrays ..
       INTEGER            IW( NMAX )
@@ -250,40 +250,40 @@
 *
          SRNAMT = 'DLATRS3'
          INFOT = 1
-         CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 3
-         CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 5
-         CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 6
-         CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 8
-         CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 10
-         CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 14
-         CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 0, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 0, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN
diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f
index af1ce0a8e..391b54c3f 100644
--- a/lapack-netlib/TESTING/LIN/serrtr.f
+++ b/lapack-netlib/TESTING/LIN/serrtr.f
@@ -71,7 +71,7 @@
 *     .. Local Scalars ..
       CHARACTER*2        C2
       INTEGER            INFO
-      REAL               RCOND, SCALE
+      REAL               RCOND, SCALE, SCALES(0)
 *     ..
 *     .. Local Arrays ..
       INTEGER            IW( NMAX )
@@ -250,40 +250,40 @@
 *
          SRNAMT = 'SLATRS3'
          INFOT = 1
-         CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 3
-         CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 5
-         CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 6
-         CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 8
-         CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 10
-         CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 14
-         CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 0, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 0, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN
diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f
index 275ca2857..4af538124 100644
--- a/lapack-netlib/TESTING/LIN/zchktr.f
+++ b/lapack-netlib/TESTING/LIN/zchktr.f
@@ -201,7 +201,8 @@
 *     .. Local Arrays ..
       CHARACTER          TRANSS( NTRAN ), UPLOS( 2 )
       INTEGER            ISEED( 4 ), ISEEDY( 4 )
-      DOUBLE PRECISION   RESULT( NTESTS ), SCALE3( 2 )
+      DOUBLE PRECISION   RESULT( NTESTS ), RWORK2( 2*NMAX), 
+     $                   SCALE3( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -544,8 +545,8 @@
                   CALL ZCOPY( N, X, 1, B( N+1 ), 1 )
                   CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 )
                   CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA,
-     $                          B, MAX(1, N), SCALE3, RWORK, WORK, NMAX,
-     $                          INFO )
+     $                          B, MAX(1, N), SCALE3, RWORK, RWORK2,
+     $                          2*NMAX, INFO )
 *
 *                 Check error code from ZLATRS3.
 *
diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f
index 211b92154..640c39793 100644
--- a/lapack-netlib/TESTING/LIN/zerrtr.f
+++ b/lapack-netlib/TESTING/LIN/zerrtr.f
@@ -70,7 +70,7 @@
 *     .. Local Scalars ..
       CHARACTER*2        C2
       INTEGER            INFO
-      DOUBLE PRECISION   RCOND, SCALE
+      DOUBLE PRECISION   RCOND, SCALE, SCALES(0)
 *     ..
 *     .. Local Arrays ..
       DOUBLE PRECISION   R1( NMAX ), R2( NMAX ), RW( NMAX )
@@ -245,40 +245,40 @@
 *
          SRNAMT = 'ZLATRS3'
          INFOT = 1
-         CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 3
-         CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 5
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 6
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 8
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 10
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 14
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 0, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 0, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
 *
 *     Test error exits for the packed triangular routines.

From fec4867748ef1c2141fe0c7aa3070f00d5048aab Mon Sep 17 00:00:00 2001
From: martin-frbg <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 23 Jul 2023 20:31:55 +0200
Subject: [PATCH 214/718] Fix file permissions (issue 4095)

---
 benchmark/spr.c  | 0
 benchmark/spr2.c | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 benchmark/spr.c
 mode change 100755 => 100644 benchmark/spr2.c

diff --git a/benchmark/spr.c b/benchmark/spr.c
old mode 100755
new mode 100644
diff --git a/benchmark/spr2.c b/benchmark/spr2.c
old mode 100755
new mode 100644

From 7976deff801b2dc7fa26278b075cf6d4bdbaf56e Mon Sep 17 00:00:00 2001
From: martin-frbg <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 23 Jul 2023 20:37:07 +0200
Subject: [PATCH 215/718] Fix file permissions (issue 4095)

---
 kernel/arm64/sgemm_beta.S        | 0
 kernel/generic/ztrmmkernel_4x4.c | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 kernel/arm64/sgemm_beta.S
 mode change 100755 => 100644 kernel/generic/ztrmmkernel_4x4.c

diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S
old mode 100755
new mode 100644
diff --git a/kernel/generic/ztrmmkernel_4x4.c b/kernel/generic/ztrmmkernel_4x4.c
old mode 100755
new mode 100644

From 7c8ea130a35400ee15320b4747d52ff1c8349773 Mon Sep 17 00:00:00 2001
From: steppi <albert.steppi@gmail.com>
Date: Tue, 18 Jul 2023 14:36:03 -0400
Subject: [PATCH 216/718] Set up cirun workflow for arm64 graviton

---
 .cirun.yml                           |  16 ++++
 .github/workflows/arm64_graviton.yml | 126 +++++++++++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 .cirun.yml
 create mode 100644 .github/workflows/arm64_graviton.yml

diff --git a/.cirun.yml b/.cirun.yml
new file mode 100644
index 000000000..f0e0149d3
--- /dev/null
+++ b/.cirun.yml
@@ -0,0 +1,16 @@
+# Self-Hosted Github Action Runners on AWS via Cirun.io
+# Reference: https://docs.cirun.io/Reference/yml.html
+runners:
+  - name: "aws-runner-graviton"
+    # Cloud Provider: AWS
+    cloud: "aws"
+    region: "us-east-1"
+    # Cheapest VM on AWS
+    instance_type: "c7g.large"
+    # Ubuntu-22.04, ami image
+    machine_image: "ami-0a0c8eebcdd6dcbd0"
+    preemptible: false
+    # Add this label in the "runs-on" param in .github/workflows/<workflow-name>.yml
+    # So that this runner is created for running the workflow
+    labels:
+      - "cirun-aws-runner-graviton"
diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml
new file mode 100644
index 000000000..bcb05047c
--- /dev/null
+++ b/.github/workflows/arm64_graviton.yml
@@ -0,0 +1,126 @@
+name: arm64 graviton cirun
+
+on: [push, pull_request]
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  build:
+    runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}"
+
+    strategy:
+      fail-fast: false
+      matrix:
+        fortran: [gfortran]
+        build: [cmake, make]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Print system information
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            cat /proc/cpuinfo
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Install Dependencies
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt update
+            sudo apt-get install -y gfortran cmake ccache libtinfo5
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          # We include the commit sha in the cache key, as new cache entries are
+          # only created if there is no existing entry for the key yet.
+          # GNU make and cmake call the compilers differently. It looks like
+          # that causes the cache to mismatch. Keep the ccache for both build
+          # tools separate to avoid polluting each other.
+          key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
+          # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
+            ccache-${{ runner.os }}-${{ matrix.build }}
+
+      - name: Configure ccache
+        run: |
+          if [ "${{ matrix.build }}" = "make" ]; then
+            # Add ccache to path
+            if [ "$RUNNER_OS" = "Linux" ]; then
+              echo "/usr/lib/ccache" >> $GITHUB_PATH
+            else
+              echo "::error::$RUNNER_OS not supported"
+              exit 1
+            fi
+          fi
+          # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: Build OpenBLAS
+        run: |
+          case "${{ matrix.build }}" in
+            "make")
+              make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
+              ;;
+            "cmake")
+              mkdir build && cd build
+              cmake -DDYNAMIC_ARCH=1 \
+                    -DNOFORTRAN=0 \
+                    -DBUILD_WITHOUT_LAPACK=0 \
+                    -DCMAKE_VERBOSE_MAKEFILE=ON \
+                    -DCMAKE_BUILD_TYPE=Release \
+                    -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
+                    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+                    -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
+                    ..
+              cmake --build .
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac
+
+      - name: Show ccache status
+        continue-on-error: true
+        run: ccache -s
+
+      - name: Run tests
+        timeout-minutes: 60
+        run: |
+          case "${{ matrix.build }}" in
+            "make")
+              MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
+              echo "::group::Tests in 'test' directory"
+              make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              echo "::group::Tests in 'ctest' directory"
+              make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              echo "::group::Tests in 'utest' directory"
+              make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              ;;
+            "cmake")
+              cd build && ctest
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac

From 826a9d5fa47f20f23f42c97385e72e121a2efb4f Mon Sep 17 00:00:00 2001
From: Octavian Maghiar <octavian.maghiar@imgtec.com>
Date: Tue, 25 Jul 2023 11:36:23 +0100
Subject: [PATCH 219/718] Adds tail undisturbed for RVV Level 2 operations

During the last iteration of some RVV operations, accumulators can get overwritten when VL < VLMAX and tail policy is agnostic.
Commit changes intrinsics tail policy to undistrubed.
---
 kernel/riscv64/gemv_t_rvv.c  |  8 ++++----
 kernel/riscv64/symv_L_rvv.c  | 12 +++++------
 kernel/riscv64/symv_U_rvv.c  | 12 +++++------
 kernel/riscv64/zgemv_t_rvv.c | 40 ++++++++++++++++++------------------
 4 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c
index f0c834866..9c859aa50 100644
--- a/kernel/riscv64/gemv_t_rvv.c
+++ b/kernel/riscv64/gemv_t_rvv.c
@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLEV_FLOAT              __riscv_vle32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m8_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLEV_FLOAT              __riscv_vle64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m8_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
@@ -79,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 
                 va = VLEV_FLOAT(a_ptr, vl);
                 vx = VLEV_FLOAT(x_ptr, vl);
-                vr = VFMACCVV_FLOAT(vr, va, vx, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl);
             }
 
             v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
@@ -103,7 +103,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 
                 va = VLEV_FLOAT(a_ptr, vl);
                 vx = VLSEV_FLOAT(x_ptr, stride_x, vl);
-                vr = VFMACCVV_FLOAT(vr, va, vx, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl);
             }
 
             v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c
index b27db2e37..888d628a5 100644
--- a/kernel/riscv64/symv_L_rvv.c
+++ b/kernel/riscv64/symv_L_rvv.c
@@ -37,7 +37,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSEV_FLOAT              __riscv_vse32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m8_tu
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m8
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
@@ -56,7 +56,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSEV_FLOAT              __riscv_vse64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m8_tu
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m8
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
@@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 VSEV_FLOAT(&y[i], vy, vl);
 
                                 vx = VLEV_FLOAT(&x[i], vl);
-                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
 
                         }
                         v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
@@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
 
                                 vx = VLEV_FLOAT(&x[i], vl);
-                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
 
                                 iy += inc_yv;
                         }
@@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 VSEV_FLOAT(&y[i], vy, vl);
 
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
-                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
 
                                 ix += inc_xv;
                         }
@@ -201,7 +201,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
 
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
-                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
 
                                 ix += inc_xv;
                                 iy += inc_yv;
diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c
index 7e45b1a01..3cfd3ee4c 100644
--- a/kernel/riscv64/symv_U_rvv.c
+++ b/kernel/riscv64/symv_U_rvv.c
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSEV_FLOAT              __riscv_vse32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m8_tu
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m8
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
@@ -57,7 +57,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSEV_FLOAT              __riscv_vse64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m8_tu
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m8
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
@@ -101,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 VSEV_FLOAT(&y[i], vy, vl);
 
                                 vx = VLEV_FLOAT(&x[i], vl);
-                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
                         }
                         v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
 
@@ -130,7 +130,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
 
                                 vx = VLEV_FLOAT(&x[i], vl);
-                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
 
                                 iy += inc_yv;
                         }
@@ -163,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 VSEV_FLOAT(&y[i], vy, vl);
 
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
-                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
 
                                 ix += inc_xv;
                         }
@@ -200,7 +200,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                 VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
 
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
-                                vr = VFMACCVV_FLOAT(vr, vx, va, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
                                 ix += inc_xv;
                                 iy += inc_yv;
                         }
diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c
index 15795cc3a..2f0380530 100644
--- a/kernel/riscv64/zgemv_t_rvv.c
+++ b/kernel/riscv64/zgemv_t_rvv.c
@@ -35,8 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
 #define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m4
-#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f32m4
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m4_tu
+#define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f32m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #define VFMULVV_FLOAT           __riscv_vfmul_vv_f32m4
@@ -49,8 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
 #define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m4
-#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f64m4
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m4_tu
+#define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f64m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #define VFMULVV_FLOAT           __riscv_vfmul_vv_f64m4
@@ -90,15 +90,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl);
 
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
-                vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl);
-                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
-                vi = VFMACCVV_FLOAT(vi, va1, vx0, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
+                vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl);
 #else
-                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
-                vr = VFMACCVV_FLOAT(vr, va1, vx1, vl);
-                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
-                vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl);
+                vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl);
 #endif
                 j += vl * 2;
                 ix += vl * inc_x * 2;
@@ -134,15 +134,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl);
     
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
-                vr = VFNMSACVV_FLOAT(vr, va1, vx1, vl);
-                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
-                vi = VFMACCVV_FLOAT(vi, va1, vx0, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
+                vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl);
 #else
-                vr = VFMACCVV_FLOAT(vr, va0, vx0, vl);
-                vr = VFMACCVV_FLOAT(vr, va1, vx1, vl);
-                vi = VFMACCVV_FLOAT(vi, va0, vx1, vl);
-                vi = VFNMSACVV_FLOAT(vi, va1, vx0, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl);
+                vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl);
 #endif
                 j += vl * 2;
                 ix += vl * inc_x * 2;

From aea2a4622b2cc3ccb5e3c58dc2e11b6c4471f6cd Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Tue, 25 Jul 2023 14:05:17 +0100
Subject: [PATCH 220/718] Use latest non-SVE kernels in ARMV8SVE

These are generally better and, in some cases, include threading which helps in the cores we're targeting here.
---
 kernel/arm64/KERNEL.ARMV8SVE   | 70 +++++++++++++++-------------------
 kernel/arm64/KERNEL.NEOVERSEV1 | 32 ----------------
 2 files changed, 30 insertions(+), 72 deletions(-)

diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE
index 07393624c..38d7ff8f1 100644
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -57,7 +57,7 @@ CAMAXKERNEL  = zamax.S
 ZAMAXKERNEL  = zamax.S
 
 SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
+DAXPYKERNEL  = daxpy_thunderx2t99.S
 CAXPYKERNEL  = zaxpy.S
 ZAXPYKERNEL  = zaxpy.S
 
@@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S
 CGEMVTKERNEL = zgemv_t.S
 ZGEMVTKERNEL = zgemv_t.S
 
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
+SASUMKERNEL    = sasum_thunderx2t99.c
+DASUMKERNEL    = dasum_thunderx2t99.c
+CASUMKERNEL    = casum_thunderx2t99.c
+ZASUMKERNEL    = zasum_thunderx2t99.c
+
+SCOPYKERNEL    = copy_thunderx2t99.c
+DCOPYKERNEL    = copy_thunderx2t99.c
+CCOPYKERNEL    = copy_thunderx2t99.c
+ZCOPYKERNEL    = copy_thunderx2t99.c
+
+SSWAPKERNEL    = swap_thunderx2t99.S
+DSWAPKERNEL    = swap_thunderx2t99.S
+CSWAPKERNEL    = swap_thunderx2t99.S
+ZSWAPKERNEL    = swap_thunderx2t99.S
+
+ISAMAXKERNEL   = iamax_thunderx2t99.c
+IDAMAXKERNEL   = iamax_thunderx2t99.c
+ICAMAXKERNEL   = izamax_thunderx2t99.c
+IZAMAXKERNEL   = izamax_thunderx2t99.c
+
+SNRM2KERNEL    = scnrm2_thunderx2t99.c
+DNRM2KERNEL    = dznrm2_thunderx2t99.c
+CNRM2KERNEL    = scnrm2_thunderx2t99.c
+ZNRM2KERNEL    = dznrm2_thunderx2t99.c
+
+DDOTKERNEL     = dot.c
+SDOTKERNEL     = dot.c
+CDOTKERNEL     = zdot_thunderx2t99.c
+ZDOTKERNEL     = zdot_thunderx2t99.c
 DSDOTKERNEL    = dot.S
 
 DGEMM_BETA     = dgemm_beta.S
diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1
index d6617e8a4..a8dafe862 100644
--- a/kernel/arm64/KERNEL.NEOVERSEV1
+++ b/kernel/arm64/KERNEL.NEOVERSEV1
@@ -1,37 +1,5 @@
 include $(KERNELDIR)/KERNEL.ARMV8SVE
 
-DAXPYKERNEL  = daxpy_thunderx2t99.S
-
-SASUMKERNEL    = sasum_thunderx2t99.c
-DASUMKERNEL    = dasum_thunderx2t99.c
-CASUMKERNEL    = casum_thunderx2t99.c
-ZASUMKERNEL    = zasum_thunderx2t99.c
-
-SCOPYKERNEL    = copy_thunderx2t99.c
-DCOPYKERNEL    = copy_thunderx2t99.c
-CCOPYKERNEL    = copy_thunderx2t99.c
-ZCOPYKERNEL    = copy_thunderx2t99.c
-
-SSWAPKERNEL    = swap_thunderx2t99.S
-DSWAPKERNEL    = swap_thunderx2t99.S
-CSWAPKERNEL    = swap_thunderx2t99.S
-ZSWAPKERNEL    = swap_thunderx2t99.S
-
-ISAMAXKERNEL   = iamax_thunderx2t99.c
-IDAMAXKERNEL   = iamax_thunderx2t99.c
-ICAMAXKERNEL   = izamax_thunderx2t99.c
-IZAMAXKERNEL   = izamax_thunderx2t99.c
-
-SNRM2KERNEL    = scnrm2_thunderx2t99.c
-DNRM2KERNEL    = dznrm2_thunderx2t99.c
-CNRM2KERNEL    = scnrm2_thunderx2t99.c
-ZNRM2KERNEL    = dznrm2_thunderx2t99.c
-
-DDOTKERNEL     = dot.c
-SDOTKERNEL     = dot.c
-CDOTKERNEL     = zdot_thunderx2t99.c
-ZDOTKERNEL     = zdot_thunderx2t99.c
-
 CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c

From f971ef55f2ce09d60b08137ca0608a6475a7611f Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Tue, 25 Jul 2023 11:56:33 +0100
Subject: [PATCH 221/718] Add ARMV8SVE to AArch64 Dynamic Dispatch

In order to enable support for future cores which have similar tunings
(in this case I'm doing this for the Arm(R) Neoverse(TM) V2 core), this generically detects SVE support and enables it. This should better manage the size and complexity of dynamic dispatch rather than just copy pasting the same parameters.

To make `ARMV8SVE` more representive of the common 128-bit SVE case,
I've split it and similar parameters from A64FX which has the wider
512-bit SVE.
---
 Makefile.system               |  1 +
 cmake/arch.cmake              |  2 +-
 driver/others/dynamic_arm64.c | 19 +++++++++++++++-
 param.h                       | 41 +++++++++++++++++++++++++++++++++--
 4 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 7d26eccc3..62926b380 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1
 ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
+DYNAMIC_CORE += ARMV8SVE
 endif
 DYNAMIC_CORE += CORTEXA55
 DYNAMIC_CORE += FALKOR
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index e6e434a0a..07df31b89 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
   if (ARM64)
 	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
     if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
-          set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2)
+          set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
     endif ()
     if (DYNAMIC_LIST)
 	  set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index ef2597234..530d18115 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
 #else
 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
 #endif
+#ifdef DYN_ARMV8SVE
+extern gotoblas_t gotoblas_ARMV8SVE;
+#else
+#define gotoblas_ARMV8SVE gotoblas_ARMV8
+#endif
 #ifdef DYN_CORTEX_A55
 extern gotoblas_t  gotoblas_CORTEXA55;
 #else
@@ -128,9 +134,11 @@ extern gotoblas_t  gotoblas_NEOVERSEN1;
 #ifndef NO_SVE
 extern gotoblas_t  gotoblas_NEOVERSEV1;
 extern gotoblas_t  gotoblas_NEOVERSEN2;
+extern gotoblas_t  gotoblas_ARMV8SVE;
 #else
 #define gotoblas_NEOVERSEV1 gotoblas_ARMV8
 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
+#define gotoblas_ARMV8SVE   gotoblas_ARMV8
 #endif
 extern gotoblas_t  gotoblas_THUNDERX3T110;
 extern gotoblas_t  gotoblas_CORTEXA55;
@@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg);
 #define FALLBACK_VERBOSE 1
 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
 
-#define NUM_CORETYPES   13
+#define NUM_CORETYPES   16
 
 /*
  * In case asm/hwcap.h is outdated on the build system, make sure
@@ -173,6 +181,7 @@ static char *corename[] = {
   "neoversen2",
   "thunderx3t110",
   "cortexa55",
+  "armv8sve",
   "unknown"
 };
 
@@ -192,6 +201,7 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_NEOVERSEN2)   return corename[12];
   if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
   if (gotoblas == &gotoblas_CORTEXA55)    return corename[14];
+  if (gotoblas == &gotoblas_ARMV8SVE)     return corename[15];
   return corename[NUM_CORETYPES];
 }
 
@@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
     case 12: return (&gotoblas_NEOVERSEN2);
     case 13: return (&gotoblas_THUNDERX3T110);
     case 14: return (&gotoblas_CORTEXA55);
+    case 15: return (&gotoblas_ARMV8SVE);
   }
   snprintf(message, 128, "Core not found: %s\n", coretype);
   openblas_warning(1, message);
@@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
       snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
       openblas_warning(1, coremsg);
   }
+#ifndef NO_SVE
+  if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
+    return &gotoblas_ARMV8SVE;
+  }
+#endif
+
   return NULL;
 #endif
 }
diff --git a/param.h b/param.h
index aa193a284..84e0c2ac7 100644
--- a/param.h
+++ b/param.h
@@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#elif defined(NEOVERSEV1)
+#elif defined(NEOVERSEV1) // 256-bit SVE
 
 #if defined(XDOUBLE) || defined(DOUBLE)
 #define SWITCH_RATIO            8
@@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2)
+#elif defined(A64FX) // 512-bit SVE
 
 /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
 Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */
@@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
+#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE
+
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            8
+#else
+#define SWITCH_RATIO            16
+#endif
+
+#define SGEMM_DEFAULT_UNROLL_M  4 // Actually 1VL (8) but kept seperate to keep copies seperate
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  4
+#define DGEMM_DEFAULT_UNROLL_N  8
+
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_MN  16
+
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_MN  16
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
 #else /* Other/undetected ARMv8 cores */
 
 #define SGEMM_DEFAULT_UNROLL_M  16

From 24586bc4ffc0486b9f7051ca21a6a16e3c2f5797 Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Tue, 25 Jul 2023 14:50:38 +0100
Subject: [PATCH 222/718] Disambiguate whilelt

---
 kernel/arm64/cgemm_ncopy_sve_v1.c  |  5 +++--
 kernel/arm64/cgemm_tcopy_sve_v1.c  |  5 +++--
 kernel/arm64/symm_lcopy_sve.c      |  9 +++++----
 kernel/arm64/symm_ucopy_sve.c      |  9 +++++----
 kernel/arm64/trsm_ltcopy_sve.c     |  9 +++++----
 kernel/arm64/trsm_uncopy_sve.c     |  9 +++++----
 kernel/arm64/trsm_utcopy_sve.c     |  9 +++++----
 kernel/arm64/zgemm_ncopy_sve_v1.c  |  5 +++--
 kernel/arm64/zgemm_tcopy_sve_v1.c  |  5 +++--
 kernel/arm64/zhemm_ltcopy_sve.c    | 13 +++++++------
 kernel/arm64/zhemm_utcopy_sve.c    | 13 +++++++------
 kernel/arm64/zsymm_lcopy_sve.c     |  9 +++++----
 kernel/arm64/zsymm_ucopy_sve.c     |  9 +++++----
 kernel/arm64/ztrmm_lncopy_sve_v1.c |  9 +++++----
 kernel/arm64/ztrmm_ltcopy_sve_v1.c |  9 +++++----
 kernel/arm64/ztrmm_uncopy_sve_v1.c |  9 +++++----
 kernel/arm64/ztrmm_utcopy_sve_v1.c |  9 +++++----
 kernel/arm64/ztrsm_lncopy_sve.c    |  9 +++++----
 kernel/arm64/ztrsm_ltcopy_sve.c    |  9 +++++----
 kernel/arm64/ztrsm_uncopy_sve.c    |  9 +++++----
 kernel/arm64/ztrsm_utcopy_sve.c    |  9 +++++----
 21 files changed, 101 insertions(+), 80 deletions(-)

diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c
index 6aa44a8f6..2fdaf5fcd 100644
--- a/kernel/arm64/cgemm_ncopy_sve_v1.c
+++ b/kernel/arm64/cgemm_ncopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     boffset = b;
 
     j = 0;
-    svbool_t pg = svwhilelt_b32(j, n);
+    svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
     uint32_t active = svcntp_b32(svptrue_b32(), pg);
     do {
 
@@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         aoffset += active * lda * 2;
 
         j += svcntw();
-        pg = svwhilelt_b32(j, n);
+        pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
         active = svcntp_b32(svptrue_b32(), pg);
 
 
diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c
index 748cd954e..086a2fed1 100644
--- a/kernel/arm64/cgemm_tcopy_sve_v1.c
+++ b/kernel/arm64/cgemm_tcopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     boffset = b;
 
     j = 0;
-    svbool_t pg = svwhilelt_b32(j, n);
+    svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
     uint32_t active = svcntp_b32(svptrue_b32(), pg);
     do {
 
@@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         aoffset += active * 2;
 
         j += svcntw();
-        pg = svwhilelt_b32(j, n);
+        pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
         active = svcntp_b32(svptrue_b32(), pg);
 
     } while (svptest_any(svptrue_b32(), pg));
diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c
index 6ba4afc8b..e138f0647 100644
--- a/kernel/arm64/symm_lcopy_sve.c
+++ b/kernel/arm64/symm_lcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t N = n;
   int32_t j = 0;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c
index 32da5bd16..9a4cb6d4f 100644
--- a/kernel/arm64/symm_ucopy_sve.c
+++ b/kernel/arm64/symm_ucopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t N = n;
   int32_t j = 0;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c
index ac4019e26..fdda992e0 100644
--- a/kernel/arm64/trsm_ltcopy_sve.c
+++ b/kernel/arm64/trsm_ltcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   jj = offset;
 #ifdef DOUBLE
   int64_t js = 0;
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c
index 8fdcd0f4b..1a03aa974 100644
--- a/kernel/arm64/trsm_uncopy_sve.c
+++ b/kernel/arm64/trsm_uncopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
   int64_t js = 0;
   svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
   svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c
index 0f5f0dccd..b06166f36 100644
--- a/kernel/arm64/trsm_utcopy_sve.c
+++ b/kernel/arm64/trsm_utcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   jj = offset;
 #ifdef DOUBLE
   int64_t js = 0;
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c
index 8f9b4268a..6b8c93baf 100644
--- a/kernel/arm64/zgemm_ncopy_sve_v1.c
+++ b/kernel/arm64/zgemm_ncopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     boffset = b;
 
     j = 0;
-    svbool_t pg = svwhilelt_b64(j, n);
+    svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     uint64_t active = svcntp_b64(svptrue_b64(), pg);
     do {
 
@@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         aoffset += active * lda * 2;
 
         j += svcntd();
-        pg = svwhilelt_b64(j, n);
+        pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
         active = svcntp_b64(svptrue_b64(), pg);
 
 
diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c
index c6e50bc1c..fd8d2190f 100644
--- a/kernel/arm64/zgemm_tcopy_sve_v1.c
+++ b/kernel/arm64/zgemm_tcopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     boffset = b;
 
     j = 0;
-    svbool_t pg = svwhilelt_b64(j, n);
+    svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     uint64_t active = svcntp_b64(svptrue_b64(), pg);
     do {
 
@@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         aoffset += active * 2;
 
         j += svcntd();
-        pg = svwhilelt_b64(j, n);
+        pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
         active = svcntp_b64(svptrue_b64(), pg);
 
     } while (svptest_any(svptrue_b64(), pg));
diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c
index 37dbfe4e1..615667264 100644
--- a/kernel/arm64/zhemm_ltcopy_sve.c
+++ b/kernel/arm64/zhemm_ltcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(cmp, gat_ind, lda_vec);
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b64(offset, 0LL);
+            svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t j = 0;
   int32_t N = n;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(cmp, gat_ind, lda_vec);
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b32(offset, 0);
+            svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c
index 21e03b7be..f763d5607 100644
--- a/kernel/arm64/zhemm_utcopy_sve.c
+++ b/kernel/arm64/zhemm_utcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
         data_vec_imag = svneg_z(pg, data_vec_imag);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b64(offset, 0LL);
+            svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 #else
@@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t j = 0;
   int32_t N = n;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
         data_vec_imag = svneg_z(pg, data_vec_imag);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b32(offset, 0);
+            svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c
index 6f18aa956..5a17d3b19 100644
--- a/kernel/arm64/zsymm_lcopy_sve.c
+++ b/kernel/arm64/zsymm_lcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t N = n;
   int32_t j = 0;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c
index 6be48cdaf..06989e3aa 100644
--- a/kernel/arm64/zsymm_ucopy_sve.c
+++ b/kernel/arm64/zsymm_ucopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t N = n;
   int32_t j = 0;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c
index d34f607ab..5a7171d9d 100644
--- a/kernel/arm64/ztrmm_lncopy_sve_v1.c
+++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
 #ifdef DOUBLE
     svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
     svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c
index 7f34c9857..3a88f26b2 100644
--- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c
+++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
     js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c
index 7eb9452c9..c3dbdcbe3 100644
--- a/kernel/arm64/ztrmm_uncopy_sve_v1.c
+++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
 #ifdef DOUBLE
     svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
     svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c
index 60c8ff3b4..ddfa7ba4e 100644
--- a/kernel/arm64/ztrmm_utcopy_sve_v1.c
+++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
     js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c
index eb7cd0294..f81ba14c2 100644
--- a/kernel/arm64/ztrsm_lncopy_sve.c
+++ b/kernel/arm64/ztrsm_lncopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
   int64_t js = 0;
   svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
   svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c
index 34dbf8a30..46a11abed 100644
--- a/kernel/arm64/ztrsm_ltcopy_sve.c
+++ b/kernel/arm64/ztrsm_ltcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   jj = offset;
 #ifdef DOUBLE
   int64_t js = 0;
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c
index 92e086b75..436112130 100644
--- a/kernel/arm64/ztrsm_uncopy_sve.c
+++ b/kernel/arm64/ztrsm_uncopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
   int64_t js = 0;
   svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
   svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c
index ccb942e1b..ddf3e265f 100644
--- a/kernel/arm64/ztrsm_utcopy_sve.c
+++ b/kernel/arm64/ztrsm_utcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   jj = offset;
 #ifdef DOUBLE
   int64_t js = 0;
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif

From ea669c8ae938450e101140d634cca291bffd3898 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 26 Jul 2023 00:27:14 +0200
Subject: [PATCH 223/718] simplify openmp thread limit handling

---
 common_thread.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/common_thread.h b/common_thread.h
index 05e1d5489..0a4c703b7 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -136,15 +136,13 @@ typedef struct blas_queue {
 #ifdef SMP_SERVER
 
 extern int blas_server_avail;
+extern int blas_omp_number_max;
 
 static __inline int num_cpu_avail(int level) {
 
 #ifdef USE_OPENMP
 int openmp_nthreads;
-	if (blas_num_threads_set == 0)
 	openmp_nthreads=omp_get_max_threads();
-	else
-	openmp_nthreads=blas_cpu_number;
 #endif
 
 #ifndef USE_OPENMP 
@@ -156,7 +154,13 @@ int openmp_nthreads;
       ) return 1;        
 
 #ifdef USE_OPENMP
-  if (blas_cpu_number != openmp_nthreads) {
+     if (openmp_nthreads > blas_omp_number_max){
+#ifdef DEBUG
+     fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
+#endif
+     openmp_nthreads = blas_omp_number_max;
+     }
+     if (blas_cpu_number != openmp_nthreads) {
 	  goto_set_num_threads(openmp_nthreads);
   }
 #endif

From 3326b924b324d49960e0f657f049e5658e7e6c69 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 26 Jul 2023 00:31:24 +0200
Subject: [PATCH 224/718] remove status variable blas_num_threads_set;
 initialize openmp thread maximum on startup

---
 driver/others/blas_server_omp.c | 4 +++-
 driver/others/memory.c          | 4 ----
 driver/others/memory_qalloc.c   | 1 -
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 2e0c0f38c..43764df00 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -68,6 +68,7 @@
 #endif
 
 int blas_server_avail = 0;
+int blas_omp_number_max = 0;
 
 extern int openblas_omp_adaptive_env();
 
@@ -100,7 +101,6 @@ static void adjust_thread_buffers() {
 
 void goto_set_num_threads(int num_threads) {
 
-  blas_num_threads_set = 1;
   if (num_threads < 0) blas_num_threads_set = 0;
   if (num_threads < 1) num_threads = blas_num_threads;
 
@@ -125,6 +125,8 @@ void openblas_set_num_threads(int num_threads) {
 }
 
 int blas_thread_init(void){
+if(blas_omp_number_max <= 0)
+  blas_omp_number_max = omp_get_max_threads();
 
   blas_get_cpu_number();
 
diff --git a/driver/others/memory.c b/driver/others/memory.c
index 3cbd17bc2..4fceae754 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
 */
 int blas_num_threads = 0;
 
-int blas_num_threads_set = 0;
-
 int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
@@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
 */
 int blas_num_threads = 0;
 
-int blas_num_threads_set = 0;
-
 int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c
index 0b38d1887..6174d9b75 100644
--- a/driver/others/memory_qalloc.c
+++ b/driver/others/memory_qalloc.c
@@ -283,7 +283,6 @@ The numbers of threads in the thread pool.
 This value is equal or large than blas_cpu_number. This means some threads are sleep.
 */
 int blas_num_threads = 0;
-int blas_num_threads_set = 0;
 
 int  goto_get_num_procs  (void) {
   return blas_cpu_number;

From 94adf98bb80b36c9faa2fa90fe7f33b36d30748c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 26 Jul 2023 08:31:37 +0200
Subject: [PATCH 225/718] remove unused status variable

---
 common_thread.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common_thread.h b/common_thread.h
index 0a4c703b7..06a7a1a38 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads);
 /* Global Parameter */
 extern int blas_cpu_number;
 extern int blas_num_threads;
-extern int blas_num_threads_set;
 extern int blas_omp_linked;
 
 #define BLAS_LEGACY	0x8000U

From 9ff84dc3f2536ce94e0a300a098485c7ff3d771d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 26 Jul 2023 10:02:44 +0200
Subject: [PATCH 226/718] remove unused status variable

---
 driver/others/blas_server_omp.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 43764df00..fe6b4a7c0 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -101,7 +101,6 @@ static void adjust_thread_buffers() {
 
 void goto_set_num_threads(int num_threads) {
 
-  if (num_threads < 0) blas_num_threads_set = 0;
   if (num_threads < 1) num_threads = blas_num_threads;
 
   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;

From 76aa6bac4df3014acaad26390e6c7e3085d25806 Mon Sep 17 00:00:00 2001
From: steppi <albert.steppi@gmail.com>
Date: Wed, 26 Jul 2023 12:01:12 -0400
Subject: [PATCH 227/718] Fix cirun url [skip actions]

---
 .cirun.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirun.yml b/.cirun.yml
index f0e0149d3..bfc6494d0 100644
--- a/.cirun.yml
+++ b/.cirun.yml
@@ -1,5 +1,5 @@
 # Self-Hosted Github Action Runners on AWS via Cirun.io
-# Reference: https://docs.cirun.io/Reference/yml.html
+# Reference: https://docs.cirun.io/reference/yaml
 runners:
   - name: "aws-runner-graviton"
     # Cloud Provider: AWS

From 2a62d2df96beb60e2048543aca130eb267f94741 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 26 Jul 2023 19:39:11 +0200
Subject: [PATCH 228/718] Enable use of AVX512 microkernels with NVIDIA HPC
 from version 22.3

---
 kernel/x86_64/casum_microk_skylakex-2.c        | 5 ++++-
 kernel/x86_64/cscal_microk_skylakex-2.c        | 5 ++++-
 kernel/x86_64/dasum_microk_haswell-2.c         | 5 ++++-
 kernel/x86_64/dasum_microk_skylakex-2.c        | 5 ++++-
 kernel/x86_64/daxpy_microk_skylakex-2.c        | 5 ++++-
 kernel/x86_64/ddot_microk_skylakex-2.c         | 5 ++++-
 kernel/x86_64/dgemm_small_kernel_nn_skylakex.c | 5 ++++-
 kernel/x86_64/dgemm_small_kernel_tn_skylakex.c | 5 ++++-
 kernel/x86_64/dgemv_n_microk_skylakex-4.c      | 5 ++++-
 kernel/x86_64/drot_microk_skylakex-2.c         | 5 ++++-
 kernel/x86_64/dscal_microk_skylakex-2.c        | 5 ++++-
 kernel/x86_64/dsymv_L_microk_skylakex-2.c      | 5 ++++-
 kernel/x86_64/dtobf16_microk_cooperlake.c      | 5 ++++-
 kernel/x86_64/sasum_microk_haswell-2.c         | 5 ++++-
 kernel/x86_64/sasum_microk_skylakex-2.c        | 5 ++++-
 kernel/x86_64/saxpy_microk_skylakex-2.c        | 5 ++++-
 kernel/x86_64/sbdot_microk_cooperlake.c        | 5 ++++-
 kernel/x86_64/sbgemv_n_microk_cooperlake.c     | 5 ++++-
 kernel/x86_64/sbgemv_t_microk_cooperlake.c     | 5 ++++-
 kernel/x86_64/sdot_microk_skylakex-2.c         | 5 ++++-
 kernel/x86_64/sgemm_small_kernel_nn_skylakex.c | 6 +++++-
 kernel/x86_64/sgemm_small_kernel_tn_skylakex.c | 6 +++++-
 kernel/x86_64/sgemv_n_microk_skylakex-8.c      | 8 ++++++--
 kernel/x86_64/sgemv_t_microk_skylakex.c        | 5 ++++-
 kernel/x86_64/srot_microk_skylakex-2.c         | 5 ++++-
 kernel/x86_64/sscal_microk_skylakex-2.c        | 6 +++++-
 kernel/x86_64/stobf16_microk_cooperlake.c      | 6 +++++-
 kernel/x86_64/zasum_microk_skylakex-2.c        | 6 +++++-
 kernel/x86_64/zscal_microk_skylakex-2.c        | 6 +++++-
 29 files changed, 124 insertions(+), 30 deletions(-)

diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index b398aa6e1..ab0eea2ac 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_CASUM_KERNEL 1
 
diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c
index 8a622427b..a6c012a4c 100644
--- a/kernel/x86_64/cscal_microk_skylakex-2.c
+++ b/kernel/x86_64/cscal_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c
index fd9da7ebe..bc27c7647 100644
--- a/kernel/x86_64/dasum_microk_haswell-2.c
+++ b/kernel/x86_64/dasum_microk_haswell-2.c
@@ -1,4 +1,7 @@
-#if (( defined(__GNUC__)  && __GNUC__   > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DASUM_KERNEL
 
diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c
index 83bc078b3..76b9fbef0 100644
--- a/kernel/x86_64/dasum_microk_skylakex-2.c
+++ b/kernel/x86_64/dasum_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DASUM_KERNEL 1
 
diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c
index e785a39f1..5b9147d10 100644
--- a/kernel/x86_64/daxpy_microk_skylakex-2.c
+++ b/kernel/x86_64/daxpy_microk_skylakex-2.c
@@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c
index 8eabf225a..f076862f7 100644
--- a/kernel/x86_64/ddot_microk_skylakex-2.c
+++ b/kernel/x86_64/ddot_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_8 1
 
diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
index a98772b94..da57a18a7 100644
--- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
@@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
index 37d1ca497..69ad6d94e 100644
--- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
+++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
@@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
index 4030399ab..4e8739864 100644
--- a/kernel/x86_64/dgemv_n_microk_skylakex-4.c
+++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_4x4 1
 
diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c
index 4e862e663..bf9c044d4 100644
--- a/kernel/x86_64/drot_microk_skylakex-2.c
+++ b/kernel/x86_64/drot_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DROT_KERNEL 1
 
diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c
index e0598272e..381136414 100644
--- a/kernel/x86_64/dscal_microk_skylakex-2.c
+++ b/kernel/x86_64/dscal_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
index f0df5aaa8..ca4773a4b 100644
--- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c
+++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
@@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/dtobf16_microk_cooperlake.c b/kernel/x86_64/dtobf16_microk_cooperlake.c
index 9b8ac4714..b713b39be 100644
--- a/kernel/x86_64/dtobf16_microk_cooperlake.c
+++ b/kernel/x86_64/dtobf16_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_TOBF16_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c
index 2eb5b9538..3b4d65cfc 100644
--- a/kernel/x86_64/sasum_microk_haswell-2.c
+++ b/kernel/x86_64/sasum_microk_haswell-2.c
@@ -1,4 +1,7 @@
-#if (( defined(__GNUC__)  && __GNUC__   > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SASUM_KERNEL 1
 
diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c
index fbc91b558..f193053ee 100644
--- a/kernel/x86_64/sasum_microk_skylakex-2.c
+++ b/kernel/x86_64/sasum_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SASUM_KERNEL 1
 
diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c
index 950f10ba2..bbe4d2bc5 100644
--- a/kernel/x86_64/saxpy_microk_skylakex-2.c
+++ b/kernel/x86_64/saxpy_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_16 1
 
diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c
index 2aefe46ff..ccec98e34 100644
--- a/kernel/x86_64/sbdot_microk_cooperlake.c
+++ b/kernel/x86_64/sbdot_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBDOT_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
index d875e0d96..c87f9fa5b 100644
--- a/kernel/x86_64/sbgemv_n_microk_cooperlake.c
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBGEMV_N_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
index 23da2e809..5b7a2e147 100644
--- a/kernel/x86_64/sbgemv_t_microk_cooperlake.c
+++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBGEMV_T_ACCL_KERNEL 1
 
diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c
index 1fcb7f27c..f14632f94 100644
--- a/kernel/x86_64/sdot_microk_skylakex-2.c
+++ b/kernel/x86_64/sdot_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_16 1
 
diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
index 2366fe3aa..6f4309c30 100644
--- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
@@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
index 308f5e35e..987b090ba 100644
--- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
@@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/sgemv_n_microk_skylakex-8.c b/kernel/x86_64/sgemv_n_microk_skylakex-8.c
index fba9cedcd..199621712 100644
--- a/kernel/x86_64/sgemv_n_microk_skylakex-8.c
+++ b/kernel/x86_64/sgemv_n_microk_skylakex-8.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && __NVCOMPVERS >= 2203 )
+
 
 #define HAVE_SGEMV_N_SKYLAKE_KERNEL 1
 #include "common.h"
@@ -255,4 +259,4 @@ static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLAS
 }
 
 
-#endif
\ No newline at end of file
+#endif
diff --git a/kernel/x86_64/sgemv_t_microk_skylakex.c b/kernel/x86_64/sgemv_t_microk_skylakex.c
index dca12acfc..d4f675a1e 100644
--- a/kernel/x86_64/sgemv_t_microk_skylakex.c
+++ b/kernel/x86_64/sgemv_t_microk_skylakex.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SGEMV_T_SKYLAKE_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c
index a21d1cf64..aec25ac56 100644
--- a/kernel/x86_64/srot_microk_skylakex-2.c
+++ b/kernel/x86_64/srot_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SROT_KERNEL 1
 
diff --git a/kernel/x86_64/sscal_microk_skylakex-2.c b/kernel/x86_64/sscal_microk_skylakex-2.c
index c4fa160f0..5c13cba55 100644
--- a/kernel/x86_64/sscal_microk_skylakex-2.c
+++ b/kernel/x86_64/sscal_microk_skylakex-2.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/stobf16_microk_cooperlake.c b/kernel/x86_64/stobf16_microk_cooperlake.c
index 2756a6934..e7d20ddfa 100644
--- a/kernel/x86_64/stobf16_microk_cooperlake.c
+++ b/kernel/x86_64/stobf16_microk_cooperlake.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #define HAVE_TOBF16_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index e257a5456..e60abc28b 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -1,5 +1,9 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #define HAVE_ZASUM_KERNEL 1
 
diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c
index f9e05e333..29dc4f6df 100644
--- a/kernel/x86_64/zscal_microk_skylakex-2.c
+++ b/kernel/x86_64/zscal_microk_skylakex-2.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #include <immintrin.h>
 

From ee72575475739dac3fd64caaa31cd6f6fd006fe0 Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Wed, 26 Jul 2023 13:00:07 +0200
Subject: [PATCH 229/718] Add documentation on redistributing OpenBLAS

This touches on the following:

- build configurations
- naming of symbols, shared/static libraries and other build outputs
  like pkg-config and CMake files
- (in more detail) guidance on ILP64 builds

It tries to explain that, while this is only guidance and there may be
reasons to deviate from that, for some build options there are best
practices, and for some others there are choices to make.

It also links to a number of well-maintained build recipes in order
to help packagers of other distros make choices.

Closes gh-3798

[skip ci]
---
 docs/distributing.md | 270 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 270 insertions(+)
 create mode 100644 docs/distributing.md

diff --git a/docs/distributing.md b/docs/distributing.md
new file mode 100644
index 000000000..1e6372a28
--- /dev/null
+++ b/docs/distributing.md
@@ -0,0 +1,270 @@
+# Guidance for redistributing OpenBLAS
+
+*We note that this document contains recommendations only - packagers and other
+redistributors are in charge of how OpenBLAS is built and distributed in their
+systems, and may have good reasons to deviate from the guidance given on this
+page. These recommendations are aimed at general packaging systems, with a user
+base that typically is large, open source (or freely available at least), and
+doesn't behave uniformly or that the packager is directly connected with.*
+
+OpenBLAS has a large number of build-time options which can be used to change
+how it behaves at runtime, how artifacts or symbols are named, etc. Variation
+in build configuration can be necessary to acheive a given end goal within a
+distribution or as an end user. However, such variation can also make it more
+difficult to build on top of OpenBLAS and ship code or other packages in a way
+that works across many different distros. Here we provide guidance about the
+most important build options, what effects they may have when changed, and
+which ones to default to.
+
+The Make and CMake build systems provide equivalent options and yield more or
+less the same artifacts, but not exactly (the CMake builds are still
+experimental). You can choose either one and the options will function in the
+same way, however the CMake outputs may require some renaming. To review
+available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
+the repository.
+
+Build options typically fall into two categories: (a) options that affect the
+user interface, such as library and symbol names or APIs that are made
+available, and (b) options that affect performance and runtime behavior, such
+as threading behavior or CPU architecture-specific code paths. The user
+interface options are more important to keep aligned between distributions,
+while for the performance-related options there are typically more reasons to
+make choices that deviate from the defaults.
+
+Here are recommendations for user interface related packaging choices where it
+is not likely to be a good idea to deviate (typically these are the default
+settings):
+
+1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
+   binary size much, so don't turn it off.
+2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
+   while it does make up a significant part of the binary size of the installed
+   library, that does not outweigh the regression in usability when deviating
+   from the default here.[^1]
+3. Always distribute the pkg-config (`.pc`) and CMake `.cmake`) dependency
+   detection files. These files are used by build systems when users want to
+   link against OpenBLAS, and there is no benefit of leaving them out.
+4. Provide the LP64 interface by default, and if in addition to that you choose
+   to provide an ILP64 interface build as well, use a symbol suffix to avoid
+   symbol name clashes (see the next section).
+
+[^1] All major distributions do include LAPACK as of mid 2023 as far as we
+know. Older versions of Arch Linux did not, and that was known to cause
+problems.
+
+
+## ILP64 interface builds
+
+The LP64 (32-bit integer) interface is the default build, and has
+well-established C and Fortran APIs as determined by the reference (Netlib)
+BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
+not have a standard API: symbol names and shared/static library names can be
+produced in multiple ways, and this tends to make it difficult to use.
+As of today there is an agreed-upon way of choosing names for OpenBLAS between
+a number of key users/redistributors, which is the closest thing to a standard
+that there is now. However, there is an ongoing standardization effort in the
+reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
+agreed-upon convention. In this section we'll aim to explain both.
+
+Those two methods are fairly similar, and have a key thing in common: *using a
+symbol suffix*. This is good practice; it is recommended that if you distribute
+an ILP64 build, to have it use a symbol suffix containing `64` in the name.
+This avoids potential symbol clashes when different packages which depend on
+OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
+
+### The current OpenBLAS agreed-upon ILP64 convention
+
+This convention comprises the shared library name and the symbol suffix in the
+shared library. The symbol suffix to use is `64_`, implying that the library
+name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
+The central issue where this was discussed is
+[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
+include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well.
+
+To build shared and static libraries with the currently recommended ILP64
+conventions with Make:
+```bash
+$ make INTERFACE64=1 SYMBOLSUFFIX=64_
+```
+
+This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
+named `openblas64.pc`, and CMake and header files.
+
+Installing locally and inspecting the output will show a few more details:
+```bash
+$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
+$ tree .  # output slightly edited down
+.
+├── include
+│   ├── cblas.h
+│   ├── f77blas.h
+│   ├── lapacke_config.h
+│   ├── lapacke.h
+│   ├── lapacke_mangling.h
+│   ├── lapacke_utils.h
+│   ├── lapack.h
+│   └── openblas_config.h
+└── lib
+    ├── cmake
+    │   └── openblas
+    │       ├── OpenBLASConfig.cmake
+    │       └── OpenBLASConfigVersion.cmake
+    ├── libopenblas64_.a
+    ├── libopenblas64_.so
+    └── pkgconfig
+        └── openblas64.pc
+```
+
+A key point are the symbol names. These will equal the LP64 symbol names, then
+(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
+Hence to obtain the final symbol names, we need to take into account which
+Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
+Fortran, or Flang), that means appending a single underscore. In that case, the
+result is:
+
+| base API name | binary symbol name | call from Fortran code | call from C code      |
+|---------------|--------------------|------------------------|-----------------------|
+| `dgemm`       | `dgemm_64_`        | `dgemm_64(...)`        | `dgemm_64_(...)`      |
+| `cblas_dgemm` | `cblas_dgemm64_`   | n/a                    | `cblas_dgemm64_(...)` |
+
+It is quite useful to have these symbol names be as uniform as possible across
+different packaging systems.
+
+The equivalent build options with CMake are:
+```bash
+$ mkdir build && cd build
+$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
+$ cmake --build . -j
+```
+
+Note that the result is not 100% identical to the Make result. For example, the
+library name ends in `_64` rather than `64_` - it is recommended to rename them
+to match the Make library names (also update the `libsuffix` entry in
+`openblas64.pc` to match that rename).
+```bash
+$ cmake --install . --prefix $PWD/../../openblas/cmake64
+$ tree .
+.
+├── include
+│   └── openblas64
+│       ├── cblas.h
+│       ├── f77blas.h
+│       ├── lapacke_config.h
+│       ├── lapacke_example_aux.h
+│       ├── lapacke.h
+│       ├── lapacke_mangling.h
+│       ├── lapacke_utils.h
+│       ├── lapack.h
+│       ├── openblas64
+│       │   └── lapacke_mangling.h
+│       └── openblas_config.h
+└── lib
+    ├── cmake
+    │   └── OpenBLAS64
+    │       ├── OpenBLAS64Config.cmake
+    │       ├── OpenBLAS64ConfigVersion.cmake
+    │       ├── OpenBLAS64Targets.cmake
+    │       └── OpenBLAS64Targets-noconfig.cmake
+    ├── libopenblas_64.a
+    ├── libopenblas_64.so -> libopenblas_64.so.0
+    └── pkgconfig
+        └── openblas64.pc
+```
+
+
+### The upcoming standardized ILP64 convention
+
+While the `64_` convention above got some adoption, it's slightly hacky and is
+implemented through the use of `objcopy`. An effort is ongoing for a more
+broadly adopted convention in the reference BLAS and LAPACK libraries, using
+(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
+Fortran compiler mangling. The central issue for this is
+[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
+
+For the most common cases of compiler mangling (a single `_` appended), the end
+result will be:
+
+| base API name | binary symbol name | call from Fortran code | call from C code      |
+|---------------|--------------------|------------------------|-----------------------|
+| `dgemm`       | `dgemm_64_`        | `dgemm_64(...)`        | `dgemm_64_(...)`      |
+| `cblas_dgemm` | `cblas_dgemm_64`   | n/a                    | `cblas_dgemm_64(...)` |
+
+For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
+
+The shared library name for this `_64` convention should be `libopenblas_64.so`.
+
+Note: it is not yet possible to produce an OpenBLAS build which employs this
+convention! Once reference BLAS and LAPACK with support for `_64` have been
+released, a future OpenBLAS release will support it. For now, please use the
+older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
+considered reserved for future use of the `_64` standard as prescribed by
+reference BLAS/LAPACK.
+
+
+## Performance and runtime behavior related build options
+
+For these options there are multiple reasonable or common choices.
+
+### Threading related options
+
+OpenBLAS can be built as a multi-threaded or single-threaded library, with the
+default being multi-threaded. It's expected that the default `libopenblas`
+library is multi-threaded; if you'd like to also distribute single-threaded
+builds, consider naming them `libopenblas_sequential`.
+
+OpenBLAS can be built with pthreads or OpenMP as the threading model, with the
+default being pthreads. Both options are commonly used, and the choice here
+should not influence the shared library name. The choice will be captured by
+the `.pc` file. E.g.,:
+```bash
+$ pkg-config --libs openblas
+-fopenmp -lopenblas
+
+$ cat openblas.pc
+...
+openblas_config= ... USE_OPENMP=0 MAX_THREADS=24
+```
+
+The maximum number of threads users will be able to use is determined at build
+time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide
+range of values that are reasonable to use (up to 256). 64 is a typical choice
+here; there is a memory footprint penalty that is linear in `NUM_THREADS`.
+Please see `Makefile.rule` for more details.
+
+### CPU architecture related options
+
+OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when
+distributing to a user base with a variety of hardware, it is recommended to
+enable CPU architecture runtime detection. This will dynamically select
+optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1`
+build option. This is usually done on all common CPU families, except when
+there are known issues.
+
+In case the CPU architecture is known (e.g. you're building binaries for macOS
+M1 users), it is possible to specify the target architecture directly with the
+`TARGET=` build option.
+
+`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md`
+in this repository.
+
+
+## Real-world examples
+
+OpenBLAS is likely to be distributed in one of these distribution models:
+
+1. As a standalone package, or multiple packages, in a packaging ecosystem like
+   a Linux distro, Homebrew, conda-forge or MSYS2.
+2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R.
+3. Locally, e.g. making available as a build on a single HPC cluster.
+
+The guidance on this page is most important for models (1) and (2). These links
+to build recipes for a representative selection of packaging systems may be
+helpful as a reference:
+
+- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec)
+- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules)
+- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb)
+- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD)
+- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh)
+- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh)
+- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix)

From 730ca04b48129a3810a5881b31d5fce4cdeb37ef Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Thu, 27 Jul 2023 13:27:28 +0100
Subject: [PATCH 230/718] Fix ZHEMM copy for SVE

Whilst disambiguating whilelt, I inadvertantly used the wrong datatype
for offsets, which can be negative. This rectifies that.
---
 kernel/arm64/zhemm_ltcopy_sve.c | 4 ++--
 kernel/arm64/zhemm_utcopy_sve.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c
index 615667264..fcf7e7073 100644
--- a/kernel/arm64/zhemm_ltcopy_sve.c
+++ b/kernel/arm64/zhemm_ltcopy_sve.c
@@ -80,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(cmp, gat_ind, lda_vec);
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL);
+            svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(cmp, gat_ind, lda_vec);
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0);
+            svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c
index f763d5607..056c9824e 100644
--- a/kernel/arm64/zhemm_utcopy_sve.c
+++ b/kernel/arm64/zhemm_utcopy_sve.c
@@ -81,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
         data_vec_imag = svneg_z(pg, data_vec_imag);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL);
+            svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -143,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
         data_vec_imag = svneg_z(pg, data_vec_imag);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0);
+            svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 

From 84a268b6ca208891177f7fbb82b87c0d561de3f3 Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Thu, 27 Jul 2023 10:55:34 +0100
Subject: [PATCH 231/718] Use SVE zgemm/cgemm on Arm(R) Neoverse(TM) V1 core

This patch removes the prefetches from cgemm/zgemm which improves the performance similar to sgemm/dgemm did in #3868, this means I'm happy to enable this on any applicable cores.

I also replicated the unrolling the copies from sgemm and dgemm.
---
 kernel/arm64/KERNEL.ARMV8SVE               |   8 +-
 kernel/arm64/KERNEL.NEOVERSEV1             |  65 -----------
 kernel/arm64/cgemm_kernel_sve_v1x4.S       |  39 -------
 kernel/arm64/gemm_ncopy_complex_sve_v1x4.c | 121 +++++++++++++++++++++
 kernel/arm64/gemm_tcopy_complex_sve_v1x4.c | 115 ++++++++++++++++++++
 kernel/arm64/zgemm_kernel_sve_v1x4.S       |  41 -------
 param.h                                    |   6 +-
 7 files changed, 244 insertions(+), 151 deletions(-)
 create mode 100644 kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
 create mode 100644 kernel/arm64/gemm_tcopy_complex_sve_v1x4.c

diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE
index 38d7ff8f1..ccbce27e1 100644
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -160,8 +160,8 @@ DSYMMLCOPY_M    =  symm_lcopy_sve.c
 CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
+CGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+CGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 
@@ -184,8 +184,8 @@ CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
 ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
+ZGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+ZGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 
diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1
index a8dafe862..bc5999097 100644
--- a/kernel/arm64/KERNEL.NEOVERSEV1
+++ b/kernel/arm64/KERNEL.NEOVERSEV1
@@ -1,66 +1 @@
 include $(KERNELDIR)/KERNEL.ARMV8SVE
-
-CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
-
-CTRMMUNCOPY_M  =
-CTRMMLNCOPY_M  =
-CTRMMUTCOPY_M  =
-CTRMMLTCOPY_M  =
-CHEMMLTCOPY_M  =
-CHEMMUTCOPY_M  =
-CSYMMUCOPY_M   =
-CSYMMLCOPY_M   =
-
-CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-else
-CGEMMINCOPYOBJ =
-CGEMMITCOPYOBJ =
-endif
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
-
-ZTRSMCOPYLN_M    =
-ZTRSMCOPYLT_M    =
-ZTRSMCOPYUN_M    =
-ZTRSMCOPYUT_M    =
-
-ZTRMMUNCOPY_M  =
-ZTRMMLNCOPY_M  =
-ZTRMMUTCOPY_M  =
-ZTRMMLTCOPY_M  =
-ZHEMMLTCOPY_M  =
-ZHEMMUTCOPY_M  =
-ZSYMMUCOPY_M   =
-ZSYMMLCOPY_M   =
-
-ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-else
-ZGEMMINCOPYOBJ =
-ZGEMMITCOPYOBJ =
-endif
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S
index 38770f66b..2136ebbee 100644
--- a/kernel/arm64/cgemm_kernel_sve_v1x4.S
+++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S
@@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     add pB, pB, 32
 
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M1
@@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld1rw  z15.s, p0/z,  [pB, 28]
 
     add pB, pB, 32
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M2
@@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ri	z23.s, p1/m, z2.s, z15.s
     ld1rw  z15.s, p0/z,  [pB, 28]
 
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
     add pB, pB, 32
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_E
@@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ii	z22.s, p1/m, z3.s, z15.s
 	OP_ri	z23.s, p1/m, z2.s, z15.s
 
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-
 .endm
 
 .macro KERNELv1x4_SUB
@@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ii	z22.s, p1/m, z1.s, z15.s
 	OP_ri	z23.s, p1/m, z0.s, z15.s
 
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
 .endm
 
 .macro SAVEv1x4
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2w 	{z26.s, z27.s}, p1, [pCRow1]
 
 	add	pCRow1, pCRow1, lanes, lsl #3
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
 	ld2w	{z28.s, z29.s}, p1/z, [pCRow2]
 	fmla	z28.s, p1/m, z20.s, alphaz_R
 	fmls	z28.s, p1/m, z21.s, alphaz_I
@@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmla	z31.s, p1/m, z23.s, alphaz_R
 	st2w 	{z30.s, z31.s}, p1, [pCRow3]
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 	add	pCRow3, pCRow3, lanes, lsl #3	// pC = pC + lanes  * 2 *4
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x2
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2w 	{z26.s, z27.s}, p1, [pCRow1]
 
 	add	pCRow1, pCRow1, lanes, lsl #3
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
-	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x1
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	add	pCRow0, pCRow0, lanes, lsl #3	// pC = pC + lanes  * 2 *4
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stp	x26, x27, [sp, #(9 * 16)]
 	str	x28, [sp, #(10 * 16)]
 
-	prfm	PLDL1KEEP, [origPB]
-	prfm	PLDL1KEEP, [origPA]
-
 	fmov	alphaR, s0
 	dup	    alphaz_R, alphaR
 	fmov	alphaI, s1
@@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bne	.Lcgemm_kernel_L4_Mv1_46
 
 .Lcgemm_kernel_L4_Mv1_100:
-	prfm	PLDL1KEEP, [pA]
-	prfm	PLDL1KEEP, [pA, #64]
-	prfm	PLDL1KEEP, [origPB]
-
 	SAVEv1x4
 
 .Lcgemm_kernel_L4_Mv1_END:
diff --git a/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
new file mode 100644
index 000000000..90f867b44
--- /dev/null
+++ b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_INDEX svuint64_t
+#define SV_INDEXER svindex_u64
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_INDEX svuint32_t
+#define SV_INDEXER svindex_u32
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)           \
+    a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec);       \
+    a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec);   \
+    svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag));             \
+    a_offset_inner += 2;                                                \
+    b_offset += active * 2;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+    uint64_t sve_size;
+    asm(COUNT" %[SIZE_]" : [SIZE_]  "=r" (sve_size) : : );
+
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+
+    SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
+    SV_TYPE a_vec_real;
+    SV_TYPE a_vec_imag;
+    svbool_t pg_true = SV_TRUE();
+
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        a_offset += sve_size * lda * 2;
+    }
+
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
+
diff --git a/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c
new file mode 100644
index 000000000..975166a2e
--- /dev/null
+++ b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64x2_t
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32x2_t
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)   \
+    a_vec = svld2(pg, a_offset_inner);                          \
+    svst2(pg, b_offset, a_vec);                                 \
+    a_offset_inner += lda * 2;                                  \
+    b_offset += active * 2;
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+    uint64_t sve_size = svcntw();
+    asm(COUNT" %[SIZE_]" : [SIZE_]  "=r" (sve_size) : : );
+
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+
+    SV_TYPE a_vec;
+    svbool_t pg_true = SV_TRUE();
+
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        a_offset += sve_size * 2;
+    }
+
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
+
+
diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S
index d5b35775c..a043948d6 100644
--- a/kernel/arm64/zgemm_kernel_sve_v1x4.S
+++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S
@@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld1rd  z15.d, p0/z,  [pB, 56]
 
     add pB, pB, 64
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M1
@@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld1rd  z15.d, p0/z,  [pB, 56]
 
     add pB, pB, 64
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M2
@@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ri	z23.d, p1/m, z2.d, z15.d
     ld1rd  z15.d, p0/z,  [pB, 56]
 
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
     add pB, pB, 64
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_E
@@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ir	z23.d, p1/m, z3.d, z14.d
 	OP_ii	z22.d, p1/m, z3.d, z15.d
 	OP_ri	z23.d, p1/m, z2.d, z15.d
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-
 .endm
 
 .macro KERNELv1x4_SUB
@@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ir	z23.d, p1/m, z1.d, z14.d
 	OP_ii	z22.d, p1/m, z1.d, z15.d
 	OP_ri	z23.d, p1/m, z0.d, z15.d
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
 .endm
 
 .macro SAVEv1x4
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2d 	{z26.d, z27.d}, p1, [pCRow1]
 
 	add	pCRow1, pCRow1, lanes, lsl #4
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
 
 	ld2d	{z28.d, z29.d}, p1/z, [pCRow2]
 	fmla	z28.d, p1/m, z20.d, alphaz_R
@@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmla	z31.d, p1/m, z23.d, alphaz_R
 	st2d 	{z30.d, z31.d}, p1, [pCRow3]
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 	add	pCRow3, pCRow3, lanes, lsl #4	// pC = pC + lanes  * 2 *8
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x2
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2d 	{z26.d, z27.d}, p1, [pCRow1]
 
 	add	pCRow1, pCRow1, lanes, lsl #4
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
-	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x1
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	add	pCRow0, pCRow0, lanes, lsl #4	// pC = pC + lanes  * 2 *8
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stp	x26, x27, [sp, #(9 * 16)]
 	str	x28, [sp, #(10 * 16)]
 
-	prfm	PLDL1KEEP, [origPB]
-	prfm	PLDL1KEEP, [origPA]
-
 	fmov	alphaR, d0
 	dup	    alphaz_R, alphaR
 	fmov	alphaI, d1
@@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bne	.Lzgemm_kernel_L4_Mv1_46
 
 .Lzgemm_kernel_L4_Mv1_100:
-	prfm	PLDL1KEEP, [pA]
-	prfm	PLDL1KEEP, [pA, #64]
-	prfm	PLDL1KEEP, [origPB]
-
 	SAVEv1x4
 
 .Lzgemm_kernel_L4_Mv1_END:
diff --git a/param.h b/param.h
index 84e0c2ac7..8fb4bcc48 100644
--- a/param.h
+++ b/param.h
@@ -3385,11 +3385,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define DGEMM_DEFAULT_UNROLL_M  4 // Actually 2VL (8) but kept separate to keep copies separate
 #define DGEMM_DEFAULT_UNROLL_N  8
 
-#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_M  2
 #define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_MN 16
 
-#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_M  2
 #define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_MN 16
 
 #define SGEMM_DEFAULT_P 128
 #define DGEMM_DEFAULT_P 160

From 007cd834c198d0df59f29893e72b26dbb99cd6fa Mon Sep 17 00:00:00 2001
From: Felix Yan <felixonmars@archlinux.org>
Date: Fri, 28 Jul 2023 04:50:16 +0300
Subject: [PATCH 232/718] Use defined variable for riscv64 in arch.cmake

It's defined in #4137
---
 cmake/arch.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 07df31b89..ebdc5a833 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -135,7 +135,7 @@ if (ARM64)
   set(BINARY_DEFINED 1)
 endif ()
 
-if (${ARCH} STREQUAL "riscv64")
+if (RISCV64)
   set(NO_BINARY_MODE 1)
   set(BINARY_DEFINED 1)
 endif ()

From 4ed6414c17a9d194e8f93e5a3f1796879c49d9a9 Mon Sep 17 00:00:00 2001
From: Felix Yan <felixonmars@archlinux.org>
Date: Fri, 28 Jul 2023 04:53:27 +0300
Subject: [PATCH 233/718] Fix 64-bit fortran options for riscv64

64-bit builds are currently broken without this flag.

Makefiles have done this already: https://github.com/xianyi/OpenBLAS/blob/5720fa02c58562c7d3e6a3e97b053598548e98d9/Makefile.system#L831
---
 cmake/fc.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index a67760885..2f7ee3371 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -63,6 +63,11 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95")
         set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
       endif ()
     endif ()
+    if (RISCV64)
+      if (BINARY64)
+        set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+      endif ()
+    endif ()
   else ()
     if (BINARY64)
       set(FCOMMON_OPT "${FCOMMON_OPT} -m64")

From f5506b002cac8226be51d9ccdc45cf37f94d1d42 Mon Sep 17 00:00:00 2001
From: Felix Yan <felixonmars@archlinux.org>
Date: Fri, 28 Jul 2023 16:19:14 +0300
Subject: [PATCH 234/718] Add 64-bit flag on INTERFACE64 only

---
 cmake/fc.cmake | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index 2f7ee3371..7400cb3e6 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -65,7 +65,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95")
     endif ()
     if (RISCV64)
       if (BINARY64)
-        set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+        if (INTERFACE64)
+          set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+        endif ()
       endif ()
     endif ()
   else ()

From b209915121d7d5f38fdcfcd6029c2d14d2182ad0 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154@yahoo.com>
Date: Tue, 25 Jul 2023 15:10:50 -0700
Subject: [PATCH 235/718] Fix build with clang

There are two instances when building the tests where OpenBLAS fails to
build with OpenMP and clang due to library paths getting reset as flags
are set rather than appended. This seems to only affect certain
clang/libomp installations, but if it's already grabbing the correct
library paths we might as well use them.
---
 ctest/Makefile | 2 +-
 test/Makefile  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ctest/Makefile b/ctest/Makefile
index 0fb2450d2..9e85d23b9 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
-CEXTRALIB = -lomp
+CEXTRALIB += -lomp
 endif
 endif
 ifeq ($(F_COMPILER), NAG)
diff --git a/test/Makefile b/test/Makefile
index 923f1537c..46a7b1158 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -265,7 +265,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
-CEXTRALIB = -lomp
+CEXTRALIB += -lomp
 endif
 endif
 ifeq ($(F_COMPILER), NAG)

From fe54ee3d15a1dd48cfebd227749e7e747e54c06e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 29 Jul 2023 11:48:38 +0200
Subject: [PATCH 236/718] nvc currently miscompiles this, hopefully fixed in
 release 23.09

---
 kernel/x86_64/casum_microk_skylakex-2.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index ab0eea2ac..7161a0a62 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -2,7 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309 )
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
 
 #define HAVE_CASUM_KERNEL 1
 
@@ -350,3 +352,4 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
     return sumf;
 }
 #endif
+#endif

From 9f6847583a5e0c5360760bd247c34bf7ce8db022 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 29 Jul 2023 11:50:16 +0200
Subject: [PATCH 237/718] nvc currently miscompiles this, hopefully fixed in
 release 23.09

---
 kernel/x86_64/zasum_microk_skylakex-2.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index e60abc28b..94802e951 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -2,8 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309 )
 
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
 
 #define HAVE_ZASUM_KERNEL 1
 
@@ -342,3 +343,4 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
     return sumf;
 }
 #endif
+#endif

From ebb447e32ef95348bcea447b7b8db4cca37a2615 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 29 Jul 2023 12:23:57 +0200
Subject: [PATCH 238/718] Update zasum_microk_skylakex-2.c

---
 kernel/x86_64/zasum_microk_skylakex-2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index 94802e951..d4f35db63 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -2,7 +2,7 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309 )
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309))
 
 #if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
 

From 6a428b5629fdf09c896d7659e49796e844ba9848 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 29 Jul 2023 12:24:30 +0200
Subject: [PATCH 239/718] Update casum_microk_skylakex-2.c

---
 kernel/x86_64/casum_microk_skylakex-2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index 7161a0a62..ac1dc6fa1 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -2,7 +2,7 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309 )
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309))
 
 #if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
 

From 4664b57e6ede3c55efea2c6aba01dad59e84e67a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 4 Aug 2023 12:25:34 +0200
Subject: [PATCH 240/718] use shortcut only when both incx and incy are zero

---
 kernel/x86_64/zaxpy_sse2.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S
index a7dd054fb..3776c8910 100644
--- a/kernel/x86_64/zaxpy_sse2.S
+++ b/kernel/x86_64/zaxpy_sse2.S
@@ -1418,10 +1418,10 @@
 	movq	M,  %rax
 //If incx==0 || incy==0, avoid unloop and jump to end.
 	cmpq	$0, INCX
-	je  .L58
+	jne  	.L59
 	cmpq	$0, INCY
 	je	.L58
-
+.L59:
 	sarq	$3, %rax
 	jle	.L55
 

From d64fa286f7c5c911c7a07a5638bc9090c54bafbc Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 4 Aug 2023 12:26:36 +0200
Subject: [PATCH 241/718] add test case for zaxpy with incx=0 incy=1

---
 utest/test_axpy.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/utest/test_axpy.c b/utest/test_axpy.c
index 5fd7c1b04..33b6cf6ed 100644
--- a/utest/test_axpy.c
+++ b/utest/test_axpy.c
@@ -74,6 +74,28 @@ CTEST(axpy,zaxpy_inc_0)
 		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
 	}
 }
+
+CTEST(axpy,zaxpy_incx_0)
+{
+        blasint i;
+        blasint N=4,incX=0,incY=1;
+        double a[2]={0.25,0.5};
+        double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+        double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0};
+        double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+        double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25};
+
+        //OpenBLAS
+        BLASFUNC(zaxpy)(&N,a,x1,&incX,y1,&incY);
+
+        for(i=0; i<2*N; i++){
+//fprintf(stderr,"output X %lf\n",x1[i]);
+//fprintf(stderr,"output Y %lf\n",y1[i]);
+		ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+                ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+        }
+}
+
 #endif
 
 #ifdef BUILD_SINGLE

From 862d06ab8ae0486a41c138ed51c2cbc75722a906 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 4 Aug 2023 15:28:02 +0200
Subject: [PATCH 242/718] Add INCX=0,INCY=1 test case for CAXPY

---
 utest/test_axpy.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/utest/test_axpy.c b/utest/test_axpy.c
index 33b6cf6ed..26005e70f 100644
--- a/utest/test_axpy.c
+++ b/utest/test_axpy.c
@@ -89,8 +89,6 @@ CTEST(axpy,zaxpy_incx_0)
         BLASFUNC(zaxpy)(&N,a,x1,&incX,y1,&incY);
 
         for(i=0; i<2*N; i++){
-//fprintf(stderr,"output X %lf\n",x1[i]);
-//fprintf(stderr,"output Y %lf\n",y1[i]);
 		ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
                 ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
         }
@@ -138,5 +136,24 @@ CTEST(axpy,caxpy_inc_0)
 		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
 	}
 }
+
+CTEST(axpy,caxpy_incx_0)
+{
+	blasint i;
+	blasint N=4,incX=0,incY=1;
+	float a[2]={0.25,0.5};
+	float x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	float y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0};
+        double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+        double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25};
+
+	//OpenBLAS
+	BLASFUNC(caxpy)(&N,a,x1,&incX,y1,&incY);
+
+	for(i=0; i<2*N; i++){
+		ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+	}
+}
 #endif
 

From f2c9ae9c332a5247efc79ede4df5656331a11f65 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 4 Aug 2023 15:31:03 +0200
Subject: [PATCH 243/718] Identify the new generation of flang that comes with
 LLVM17

---
 f_check | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/f_check b/f_check
index d071e016e..526c41dc6 100755
--- a/f_check
+++ b/f_check
@@ -101,7 +101,14 @@ else
                         *flang*)
                             vendor=FLANG
                             openmp='-fopenmp'
-                            ;;
+			    data=`$compiler -v 2>&1 > /dev/null `
+                	    v="${data#*version *}"
+                	    v="${v%%*.}"
+                	    major="${v%%.*}"
+                	    if [ "$major" -ge 17 ]; then
+                        	vendor=FLANGNEW
+			    fi	
+			;;
                         *ifort*|*ifx*)
                             vendor=INTEL
                             openmp='-fopenmp'

From e8bc8a0ee7af5cf42c736253c300b0248bf9782c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 4 Aug 2023 15:32:19 +0200
Subject: [PATCH 244/718] Add support for the new generation flang that comes
 with LLVM17

---
 Makefile.system | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index 62926b380..71535b0cb 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1087,8 +1087,9 @@ endif
 endif
 endif
 
-ifeq ($(F_COMPILER), GFORTRAN)
+ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
 CCOMMON_OPT += -DF_INTERFACE_GFORT
+ifeq ($(F_COMPILER), GFORTRAN)
 FCOMMON_OPT += -Wall
 # make single-threaded LAPACK calls thread-safe #1847
 FCOMMON_OPT += -frecursive
@@ -1102,6 +1103,7 @@ EXTRALIB += -lgfortran
 endif
 endif
 endif
+endif
 ifdef NO_BINARY_MODE
 ifeq ($(ARCH), $(filter $(ARCH),mips64))
 ifdef BINARY64

From d46772e037e0eb8b5abe6e39b434105921306eed Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Fri, 30 Jun 2023 16:19:38 +0800
Subject: [PATCH 245/718] LoongArch64: Add compiler feature checks

---
 Makefile.system                       |  2 ++
 c_check                               | 35 +++++++++++++++++++++
 c_check.pl                            | 45 +++++++++++++++++++++++++++
 kernel/loongarch64/KERNEL.LOONGSON3R5 |  2 ++
 param.h                               | 10 ++++--
 5 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 71535b0cb..3c1648dc7 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1770,6 +1770,8 @@ export TARGET_CORE
 export NO_AVX512
 export NO_AVX2
 export BUILD_BFLOAT16
+export NO_LSX
+export NO_LASX
 
 export SBGEMM_UNROLL_M
 export SBGEMM_UNROLL_N
diff --git a/c_check b/c_check
index 7c8494e4a..7ee183163 100755
--- a/c_check
+++ b/c_check
@@ -185,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
     rm -rf "$tmpd"
 fi
 
+no_lsx=0
+no_lasx=0
+if [ "$architecture" = "loongarch64" ]; then
+    tmpd="$(mktemp -d)"
+    tmplsx="$tmpd/lsx.c"
+    codelsx='"vadd.b $vr0, $vr0, $vr0"'
+    lsx_flags='-march=loongarch64 -mlsx'
+    printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
+    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
+    args="$lsx_flags -o $tmplsx.o $tmplsx"
+    {
+        $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        no_lsx=1
+    }
+
+    tmplasx="$tmpd/lasx.c"
+    codelasx='"xvadd.b $xr0, $xr0, $xr0"'
+    lasx_flags='-march=loongarch64 -mlasx'
+    printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
+    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
+    args="$lasx_flags -o $tmplasx.o $tmplasx"
+    {
+        $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        no_lasx=1
+    }
+
+    rm -rf "$tmpd"
+fi
+
 case "$data" in
     *ARCH_X86_64*) architecture=x86_64 ;;
     *ARCH_X86*) architecture=x86 ;;
@@ -399,6 +430,8 @@ done
     [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
     [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
     [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
+    [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
+    [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
 } >> "$makefile"
 
 os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
@@ -414,6 +447,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
     [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
     [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
     [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
+    [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
+    [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
 } >> "$config"
 
 
diff --git a/c_check.pl b/c_check.pl
index 6ce28e11b..7a860a211 100644
--- a/c_check.pl
+++ b/c_check.pl
@@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
     }
 }
 
+$no_lsx = 0;
+$no_lasx = 0;
+if (($architecture eq "loongarch64")) {
+    eval "use File::Temp qw(tempfile)";
+    if ($@){
+	warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility";
+    } else {
+	$tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
+	$codelsx = '"vadd.b $vr0, $vr0, $vr0"';
+	$lsx_flags = "-march=loongarch64 -mlsx";
+	print $tmplsx "#include <lsxintrin.h>\n\n";
+	print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
+
+	$args = "$lsx_flags -o $tmplsx.o $tmplsx";
+	my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
+	system(@cmd) == 0;
+	if ($? != 0) {
+	    $no_lsx = 1;
+	} else {
+	    $no_lsx = 0;
+	}
+	unlink("$tmplsx.o");
+
+	$tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
+	$codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
+	$lasx_flags = "-march=loongarch64 -mlasx";
+	print $tmplasx "#include <lasxintrin.h>\n\n";
+	print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
+
+	$args = "$lasx_flags -o $tmplasx.o $tmplasx";
+	my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
+	system(@cmd) == 0;
+	if ($? != 0) {
+	    $no_lasx = 1;
+	} else {
+	    $no_lasx = 0;
+	}
+	unlink("$tmplasx.o");
+    }
+}
+
 $architecture = x86          if ($data =~ /ARCH_X86/);
 $architecture = x86_64       if ($data =~ /ARCH_X86_64/);
 $architecture = e2k          if ($data =~ /ARCH_E2K/);
@@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
 print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
 print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
 print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
+print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
+print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
 
 $os           =~ tr/[a-z]/[A-Z]/;
 $architecture =~ tr/[a-z]/[A-Z]/;
@@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n"  if $binformat eq bin64;
 print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
 print CONFFILE "#define HAVE_MSA\t1\n"  if $have_msa eq 1;
 print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
+print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
+print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
 
 
 if ($os eq "LINUX") {
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index cda359040..253aa2464 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -1,3 +1,4 @@
+ifndef NO_LASX
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
@@ -7,6 +8,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
 DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
 
 DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
diff --git a/param.h b/param.h
index 8fb4bcc48..547463b2f 100644
--- a/param.h
+++ b/param.h
@@ -2845,15 +2845,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x0ffffUL
 
-#define SGEMM_DEFAULT_UNROLL_N 8
+#if defined(NO_LASX)
+#define DGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_M 2
+#else
 #define DGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_M 16
+#endif
+
+#define SGEMM_DEFAULT_UNROLL_N 8
 #define QGEMM_DEFAULT_UNROLL_N 2
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_N 4
 #define XGEMM_DEFAULT_UNROLL_N 1
 
 #define SGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_M 16
 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 1
 #define ZGEMM_DEFAULT_UNROLL_M 1

From db9a42f8c366c46dd914a1c6057afb87b2ca4e00 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Fri, 30 Jun 2023 16:31:47 +0800
Subject: [PATCH 246/718] LoongArch64: using getauxval to do runtime check

Using the getauxval instruction can prevent errors
caused by hardware supporting vector instructions
while the kernel does not support them
---
 cpuid_loongarch64.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c
index ca07c7ffb..7c389db27 100644
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 
 #include <stdint.h>
+#include <sys/auxv.h>
 
 /*  If LASX extension instructions supported,
  *  using core LOONGSON3R5
@@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_LOONGSON3R5    1
 #define CPU_LOONGSON2K1000 2
 
-#define LOONGARCH_CFG2  0x02
-#define LOONGARCH_LASX  1<<7
-#define LOONGARCH_LSX   1<<6
+#define LA_HWCAP_LSX    (1<<4)
+#define LA_HWCAP_LASX   (1<<5)
 
 static char *cpuname[] = {
   "LOONGSONGENERIC",
@@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
 
 int detect(void) {
 #ifdef __linux
-  uint32_t reg = 0;
+  int flag  = (int)getauxval(AT_HWCAP);
 
-  __asm__ volatile (
-    "cpucfg %0, %1 \n\t"
-    : "+&r"(reg)
-    : "r"(LOONGARCH_CFG2)
-  );
-
-  if (reg & LOONGARCH_LASX)
+  if (flag & LA_HWCAP_LASX)
     return CPU_LOONGSON3R5;
-  else if (reg & LOONGARCH_LSX)
+  else if (flag & LA_HWCAP_LSX)
     return CPU_LOONGSON2K1000;
   else
     return CPU_GENERIC;

From 96bf226bca7e2f0a6359f524280222e45df6bff1 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Fri, 30 Jun 2023 17:34:08 +0800
Subject: [PATCH 247/718] gh-actions: Add loongarch64 CI

---
 .github/workflows/loongarch64.yml | 110 ++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 .github/workflows/loongarch64.yml

diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
new file mode 100644
index 000000000..5501e98e0
--- /dev/null
+++ b/.github/workflows/loongarch64.yml
@@ -0,0 +1,110 @@
+name: loongarch64 qemu test
+
+on: [push, pull_request]
+
+jobs:
+  TEST:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: LOONGSONGENERIC
+            triple:  loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
+          - target: LOONGSON3R5
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 TARGET=LOONGSON3R5
+          - target: LOONGSON2K1000
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 TARGET=LOONGSON2K1000
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Install APT deps
+        run: |
+          sudo add-apt-repository ppa:savoury1/virtualisation
+          sudo apt-get update
+          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
+          qemu-user-static
+
+      - name: Download and install loongarch64-toolchain
+        run: |
+          wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
+          tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
+
+      - name: Set env
+        run: |
+          echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
+          echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: Disable utest dsdot:dsdot_n_1
+        run: |
+          echo -n > utest/test_dsdot.c
+          echo "Due to the qemu versions 7.2 causing utest cases to fail,"
+          echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."
+
+      - name: Build OpenBLAS
+        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
+
+      - name: Test
+        run: |
+          qemu-loongarch64-static ./utest/openblas_utest
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat

From ec1e96aac851fd620a6a981b20e9dfdba6dd9493 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 11 Jul 2023 10:01:12 +0800
Subject: [PATCH 248/718] LoongArch64: Add dgemv_t_8_lasx.S and
 dgemv_n_8_lasx.S

---
 kernel/loongarch64/KERNEL.LOONGSON3R5 |   3 +
 kernel/loongarch64/KERNEL.generic     |   4 +
 kernel/loongarch64/dgemv_n_8_lasx.S   | 546 ++++++++++++++++++++++++++
 kernel/loongarch64/dgemv_t_8_lasx.S   | 468 ++++++++++++++++++++++
 kernel/loongarch64/loongarch64_asm.S  | 313 +++++++++++++++
 5 files changed, 1334 insertions(+)
 create mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S
 create mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S
 create mode 100644 kernel/loongarch64/loongarch64_asm.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index cda359040..19977c815 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -8,6 +8,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+DGEMVNKERNEL = dgemv_n_8_lasx.S
+DGEMVTKERNEL = dgemv_t_8_lasx.S
+
 DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic
index b772a6f82..213add9ee 100644
--- a/kernel/loongarch64/KERNEL.generic
+++ b/kernel/loongarch64/KERNEL.generic
@@ -132,12 +132,16 @@ CSWAPKERNEL  = ../arm/zswap.c
 ZSWAPKERNEL  = ../arm/zswap.c
 
 SGEMVNKERNEL = ../arm/gemv_n.c
+ifndef DGEMVNKERNEL
 DGEMVNKERNEL = ../arm/gemv_n.c
+endif
 CGEMVNKERNEL = ../arm/zgemv_n.c
 ZGEMVNKERNEL = ../arm/zgemv_n.c
 
 SGEMVTKERNEL = ../arm/gemv_t.c
+ifndef DGEMVTKERNEL
 DGEMVTKERNEL = ../arm/gemv_t.c
+endif
 CGEMVTKERNEL = ../arm/zgemv_t.c
 ZGEMVTKERNEL = ../arm/zgemv_t.c
 
diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S
new file mode 100644
index 000000000..940d27569
--- /dev/null
+++ b/kernel/loongarch64/dgemv_n_8_lasx.S
@@ -0,0 +1,546 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/07/14 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA   $f0
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M8      $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $xr1
+#define X0      $xr2
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10
+#define Y1      $xr11
+#define A0      $xr12
+#define A1      $xr13
+#define A2      $xr14
+#define A3      $xr15
+#define A4      $xr16
+#define A5      $xr17
+#define A6      $xr18
+#define A7      $xr19
+#define A8      $xr20
+#define A9      $xr21
+#define A10     $xr22
+#define A11     $xr23
+#define A12     $xr24
+#define A13     $xr25
+#define A14     $xr26
+#define A15     $xr27
+
+.macro DLOAD_X_8
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
+                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro DLOAD_X_4
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro DLOAD_X_2
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro DLOAD_X_1
+    GLDREPL xv, d, X0, X, 0x00
+    GMUL xvf, d, X0, X0, VALPHA
+.endm
+
+.macro DLOAD_Y_8
+    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro DLOAD_Y_4
+    GLD xv, , Y0, Y, 0
+.endm
+
+.macro DLOAD_Y_1
+    fld.d  $f10,   Y,  0
+.endm
+
+.macro DSTORE_Y_8
+    GST xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro DSTORE_Y_4
+    GST xv, , Y0, Y, 0
+.endm
+
+.macro DSTORE_Y_1
+    fst.d   $f10,   Y,  0
+.endm
+
+// Unable to use vector load/store ins
+.macro DLOAD_Y_8_GAP
+    fld.d   $f10,   Y,  0
+    fldx.d  $f13,   Y,  INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1
+    fld.d   $f14,   T0, 0
+    fldx.d  $f15,   T0, INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  2
+    fld.d   $f11,   T0, 0
+    fldx.d  $f17,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    fld.d   $f18,   T0, 0
+    fldx.d  $f19,   T0, INC_Y
+    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
+.endm
+
+.macro DLOAD_Y_4_GAP
+    fld.d   $f10,   Y,  0
+    fldx.d  $f13,   Y,  INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1
+    fld.d   $f14,   T0, 0
+    fldx.d  $f15,   T0, INC_Y
+    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3
+.endm
+
+.macro DSTORE_Y_8_GAP
+    xvstelm.d   Y0,     Y,      0,      0
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      3
+
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      0
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      3
+.endm
+
+.macro DSTORE_Y_4_GAP
+    xvstelm.d   Y0,     Y,      0,      0
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      3
+.endm
+
+.macro DLOAD_X_8_GAP
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X3,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X4,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X5,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X6,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X7,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro DLOAD_X_4_GAP
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X3,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro DLOAD_X_2_GAP
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro DGEMV_N_8x8
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
+                  Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
+                  Y0, A6,  X3, Y0, Y1, A7,  X3, Y1, \
+                  Y0, A8,  X4, Y0, Y1, A9,  X4, Y1, \
+                  Y0, A10, X5, Y0, Y1, A11, X5, Y1, \
+                  Y0, A12, X6, Y0, Y1, A13, X6, Y1, \
+                  Y0, A14, X7, Y0, Y1, A15, X7, Y1
+.endm
+
+.macro DGEMV_N_4x8
+    GLD_INC xv, , 0x20, A0,  PA0, 0, \
+                        A2,  PA1, 0, \
+                        A4,  PA2, 0, \
+                        A6,  PA3, 0, \
+                        A8,  PA4, 0, \
+                        A10, PA5, 0, \
+                        A12, PA6, 0, \
+                        A14, PA7, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, \
+                  Y0, A2,  X1, Y0, \
+                  Y0, A4,  X2, Y0, \
+                  Y0, A6,  X3, Y0, \
+                  Y0, A8,  X4, Y0, \
+                  Y0, A10, X5, Y0, \
+                  Y0, A12, X6, Y0, \
+                  Y0, A14, X7, Y0
+.endm
+
+.macro DGEMV_N_1x8
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
+                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, \
+                $f10, $f14, $f3, $f10, \
+                $f10, $f16, $f4, $f10, \
+                $f10, $f18, $f5, $f10, \
+                $f10, $f20, $f6, $f10, \
+                $f10, $f22, $f7, $f10, \
+                $f10, $f24, $f8, $f10, \
+                $f10, $f26, $f9, $f10,
+.endm
+
+.macro DGEMV_N_8x4
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
+                  Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
+                  Y0, A6,  X3, Y0, Y1, A7,  X3, Y1
+.endm
+
+.macro DGEMV_N_4x4
+    GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y0, A2,  X1, Y0, \
+                  Y0, A4,  X2, Y0, Y0, A6,  X3, Y0
+.endm
+
+.macro DGEMV_N_1x4
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \
+                $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10
+.endm
+
+.macro DGEMV_N_8x2
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1
+.endm
+
+.macro DGEMV_N_4x2
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0
+    GMADD xvf, d, Y0, A0,  X0, Y0, \
+                  Y0, A2,  X1, Y0
+.endm
+
+.macro DGEMV_N_1x2
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, \
+                $f10, $f14, $f3, $f10
+.endm
+
+.macro DGEMV_N_1x1
+    fld.d   $f12,    PA0,    0
+    PTR_ADDI PA0,   PA0,    0x08
+    fmadd.d $f10,   $f12,   $f2,    $f10
+.endm
+
+.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
+    PTR_SRLI  J,      N,      3
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8
+.L_\XW\()_N_L8:
+    DLOAD_\X_8
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x8
+    DSTORE_\Y_8
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    PTR_ADDI    K,      K,      8
+    bnez        I,      .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      4
+    beqz        I,      .L_\XW\()_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x8
+    DSTORE_\Y_4
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+    PTR_ADDI    K,      K,      4
+.L_\XW\()_M_3:
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x8
+    DSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      4
+    beqz        J,      .L_\XW\()_N_3
+    DLOAD_\X_4
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x4
+    DSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_4_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x4
+    DSTORE_\Y_4
+    PTR_ALSL  Y,    INC_Y,  Y,  2
+    PTR_ADDI  K,    K,      4
+.L_\XW\()_N_4_M_3:
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x4
+    DSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M8
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2
+.L_\XW\()_N_3:
+    andi        J,      N,      2
+    beqz        J,      .L_\XW\()_N_1
+    DLOAD_\X_2
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+    PTR_SRLI    I,      M,       3
+    beqz    I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x2
+    DSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_2_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x2
+    DSTORE_\Y_4
+    PTR_ALSL  Y,        INC_Y,      Y,      2
+    PTR_ADDI  K,        K,          4
+.L_\XW\()_N_2_M_3:
+    andi    I,      M,      3
+    beqz    I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x2
+    DSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M8
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     PA1,    PA1,    K_LDA
+    PTR_ALSL    X,      INC_X,  X,  1
+.L_\XW\()_N_1:
+    andi    J,      N,      1
+    beqz    J,      .L_END
+    DLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x1
+    DSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 7, 24 + 4
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    xvreplve0.d     VALPHA, $xr0
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 24 + 4
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S
new file mode 100644
index 000000000..be90cb1af
--- /dev/null
+++ b/kernel/loongarch64/dgemv_t_8_lasx.S
@@ -0,0 +1,468 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/07/17 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA   $f0
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M8      $r30
+
+#define VALPHA  $xr0
+#define X0      $xr1
+#define X1      $xr2
+#define A0      $xr3
+#define A1      $xr4
+#define A2      $xr5
+#define A3      $xr6
+#define A4      $xr7
+#define A5      $xr8
+#define A6      $xr9
+#define A7      $xr10
+#define A8      $xr11
+#define A9      $xr12
+#define A10     $xr13
+#define A11     $xr14
+#define A12     $xr15
+#define A13     $xr16
+#define A14     $xr17
+#define A15     $xr18
+#define TP0     $xr19
+#define TP1     $xr20
+#define TP2     $xr21
+#define TP3     $xr22
+#define TP4     $xr23
+#define TP5     $xr24
+#define TP6     $xr25
+#define TP7     $xr26
+#define Y0      $xr3
+#define Y1      $xr4
+#define Y2      $xr5
+#define Y3      $xr6
+#define Y4      $xr7
+#define Y5      $xr8
+#define Y6      $xr9
+#define Y7      $xr10
+
+.macro ZERO_Y8
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
+                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
+.endm
+
+.macro ZERO_Y4
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+
+.macro ZERO_Y2
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
+.endm
+
+.macro ZERO_Y1
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+.macro DLOAD_X8
+    GLD xv, , X0, X, 0x00, X1, X, 0x20
+.endm
+
+.macro DLOAD_X4
+    GLD xv, , X0, X, 0x00
+.endm
+
+.macro DLOAD_X8_GAP
+    fld.d       $f1,    X,    0x00
+    fldx.d      $f2,    X,    INC_X
+    PTR_ALSL    T0,     INC_X,      X,      1
+    fld.d       $f3,    T0,   0x00
+    fldx.d      $f4,    T0,   INC_X
+    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
+    PTR_ALSL    T0,     INC_X,      X,      2
+    fld.d       $f2,    T0,   0x00
+    fldx.d      $f3,    T0,   INC_X
+    PTR_ALSL    T0,     INC_X,      T0,     1
+    fld.d       $f4,    T0,   0x00
+    fldx.d      $f5,    T0,   INC_X
+    GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
+.endm
+
+.macro DLOAD_X4_GAP
+    fld.d       $f1,    X,    0x00
+    fldx.d      $f2,    X,    INC_X
+    PTR_ALSL    T0,     INC_X,      X,      1
+    fld.d       $f3,    T0,   0x00
+    fldx.d      $f4,    T0,   INC_X
+    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
+.endm
+
+.macro DGEMV_T_8x8
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
+                  TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
+                  TP3, A6,  X0, TP3, TP3, A7,  X1, TP3, \
+                  TP4, A8,  X0, TP4, TP4, A9,  X1, TP4, \
+                  TP5, A10, X0, TP5, TP5, A11, X1, TP5, \
+                  TP6, A12, X0, TP6, TP6, A13, X1, TP6, \
+                  TP7, A14, X0, TP7, TP7, A15, X1, TP7
+.endm
+
+.macro DGEMV_T_8x4
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0, \
+                        A8,  PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
+                  TP2, A4,  X0, TP2, TP3, A6,  X0, TP3, \
+                  TP4, A8,  X0, TP4, TP5, A10, X0, TP5, \
+                  TP6, A12, X0, TP6, TP7, A14, X0, TP7,
+.endm
+
+.macro DGEMV_T_4x8
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
+                  TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
+                  TP3, A6,  X0, TP3, TP3, A7,  X1, TP3
+.endm
+
+.macro DGEMV_T_4x4
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
+                  TP2, A4,  X0, TP2, TP3, A6,  X0, TP3
+.endm
+
+.macro DGEMV_T_2x8
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A1,  PA0, 0, A2, PA1, 0, A3,  PA1, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1
+.endm
+
+.macro DGEMV_T_2x4
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1
+.endm
+
+.macro DGEMV_T XW:req X8:req, X4:req
+    PTR_SRLI  J,      N,      3
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8
+.L_\XW\()_N_L8:
+    ZERO_Y8
+    move      X,      X_ORG
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    DLOAD_\X8
+    DGEMV_T_8x8
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  3
+    bnez        I,  .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      4
+    beqz        I,      .L_\XW\()_M_3
+    DLOAD_\X4
+    DGEMV_T_8x4
+    PTR_ALSL    X,  INC_X,  X,  2
+.L_\XW\()_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
+                 Y5, TP5, Y6, TP6, Y7, TP7
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.d   $f1,    X,      0x00
+    fld.d   $f11,   PA0,    0x00
+    fld.d   $f12,   PA1,    0x00
+    fld.d   $f13,   PA2,    0x00
+    fld.d   $f14,   PA3,    0x00
+    fld.d   $f15,   PA4,    0x00
+    fld.d   $f16,   PA5,    0x00
+    fld.d   $f17,   PA6,    0x00
+    fld.d   $f18,   PA7,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#else
+    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#endif
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
+                $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    PY1, INC_Y,  Y,  2
+    fld.d   $f15,   PY1,    0x00
+    fldx.d  $f16,   PY1,    INC_Y
+    PTR_ALSL    PY2, INC_Y,  PY1, 1
+    fld.d   $f17,   PY2,    0x00
+    fldx.d  $f18,   PY2,    INC_Y
+
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \
+                $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    fst.d   $f15,   PY1,    0x00
+    fstx.d  $f16,   PY1,    INC_Y
+    fst.d   $f17,   PY2,    0x00
+    fstx.d  $f18,   PY2,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      4
+    beqz        J,      .L_\XW\()_N_3
+    ZERO_Y4
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    DLOAD_\X8
+    DGEMV_T_4x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3
+    bnez      I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_4_M_3
+    DLOAD_\X4
+    DGEMV_T_4x4
+    PTR_ALSL  X,      INC_X,  X,  2
+.L_\XW\()_N_4_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    fld.d   $f1,    X,      0x00
+    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14
+
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M8
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+.L_\XW\()_N_3:
+    andi        J,      N,      2
+    beqz        J,      .L_\XW\()_N_1
+    ZERO_Y2
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    DLOAD_\X8
+    DGEMV_T_2x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3
+    bnez      I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_2_M_3
+    DLOAD_\X4
+    DGEMV_T_2x4
+    PTR_ALSL  X,        INC_X,      X,      2
+.L_\XW\()_N_2_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    fld.d   $f1,    X,      0x00
+    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12
+
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M8
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    PTR_ALSL        Y,      INC_Y,  Y,  1
+.L_\XW\()_N_1:
+    andi    J,      N,      1
+    beqz    J,      .L_END
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.d   $f3,    PA0,    0x00
+    fld.d   $f1,    X,      0x00
+    fmadd.d $f19,   $f3,    $f1,    $f19
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x08
+    bnez      I,      .L_\XW\()_N_1_M_L1
+    fld.d     $f3,    Y,      0x00
+    fmadd.d   $f3,    ALPHA,  $f19,  $f3
+    fst.d     $f3,    Y,      0x00
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 8, 24 + 3
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    xvreplve0.d     VALPHA, $xr0
+    move     X_ORG,  X
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    DGEMV_T GAP_0, X8, X4
+.L_GAP_1: /* if (incx != 1) */
+    DGEMV_T GAP_1, X8_GAP, X4_GAP
+.L_END:
+    pop_if_used 17 + 8, 24 + 3
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S
new file mode 100644
index 000000000..8876cbed9
--- /dev/null
+++ b/kernel/loongarch64/loongarch64_asm.S
@@ -0,0 +1,313 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#if __loongarch_grlen == 64
+#define LA_REG    int64_t
+#define REG_SIZE  8
+#define REG_LOG   3
+#define PTR_ADDI  addi.d
+#define PTR_ADD   add.d
+#define PTR_SUB   sub.d
+#define PTR_LD    ld.d
+#define PTR_ST    st.d
+#define PTR_SLLI  slli.d
+#define PTR_SRLI  srli.d
+#define PTR_ALSL  alsl.d
+#else
+#define LA_REG    int32_t
+#define REG_SIZE  4
+#define REG_LOG   2
+#define PTR_ADDI  addi.w
+#define PTR_ADD   add.w
+#define PTR_SUB   sub.w
+#define PTR_LD    ld.w
+#define PTR_ST    st.w
+#define PTR_SLLI  slli.w
+#define PTR_SRLI  srli.w
+#define PTR_ALSL  alsl.w
+#endif
+
+#if __loongarch_frlen == 64
+#define FREG_SIZE 8
+#define FREG_LOG  3
+#define PTR_FLD   fld.d
+#define PTR_FST   fst.d
+#else
+#define FREG_SIZE 4
+#define FREG_LOG  2
+#define PTR_FLD   fld.s
+#define PTR_FST   fst.s
+#endif
+
+// The max registers available to the user which
+// do not need to be preserved across calls.
+// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
+#define MAX_INT_CALLER_SAVED 17
+#define MAX_FP_CALLER_SAVED  24
+
+.altmacro // Enable alternate macro mode
+
+.macro push_if_used regs, fregs
+.if \regs > MAX_INT_CALLER_SAVED
+    PTR_ADDI      $sp,    $sp,    -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
+    push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
+.endif
+.if \fregs > MAX_FP_CALLER_SAVED
+    PTR_ADDI      $sp,    $sp,    -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
+    push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
+.endif
+.endm // End push_if_used
+.macro pop_if_used regs, fregs
+.if \fregs > MAX_FP_CALLER_SAVED
+    pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
+    PTR_ADDI      $sp,    $sp,    (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
+.endif
+.if \regs > MAX_INT_CALLER_SAVED
+    pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
+    PTR_ADDI      $sp,    $sp,    (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
+.endif
+.endm // End pop_if_used
+.macro push_regs from, to
+    PTR_ST    $s\()\from,     $sp,    \from << REG_LOG
+.if \to - \from
+    push_regs %from + 1, \to
+.endif
+.endm // End push_regs
+.macro pop_regs from, to
+    PTR_LD    $s\()\from,     $sp,    \from << REG_LOG
+.if \to - \from
+    pop_regs %from + 1, \to
+.endif
+.endm // End pop_regs
+.macro push_fregs from, to
+    PTR_FST   $fs\()\from,    $sp,    \from << FREG_LOG
+.if \to - \from
+    push_fregs %from + 1, \to
+.endif
+.endm // End push_fregs
+.macro pop_fregs from, to
+    PTR_FLD   $fs\()\from,    $sp,    \from << FREG_LOG
+.if \to - \from
+    pop_fregs %from + 1, \to
+.endif
+.endm // End pop_fregs
+
+//
+// Instruction Related Macros
+//
+// GLD
+//
+.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ld    \out,   \src,   \offset
+.else
+    \pre_op\()ld.\suf_op    \out,   \src,   \offset
+.endif
+.ifnb \more
+    GLD \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// GLD_INC
+//
+.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ld    \out,   \src,   \offset
+.else
+    \pre_op\()ld.\suf_op    \out,   \src,   \offset
+.endif
+    PTR_ADDI  \src,   \src,   \inc
+.ifnb \more
+    GLD_INC \pre_op, \suf_op, \inc, \more
+.endif
+.endm
+//
+// GLDX is same as GLD except the stride is a register
+//
+.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ldx    \out,   \src,   \offset
+.else
+    \pre_op\()ldx.\suf_op    \out,   \src,   \offset
+.endif
+.ifnb \more
+    GLDX \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GLDREPL
+//
+.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
+    \pre_op\()ldrepl.\suf_op    \out,   \src,   \offset
+.ifnb \more
+    GLDREPL  \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GST
+//
+.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()st    \src,   \dst,   \offset
+.else
+    \pre_op\()st.\suf_op \src,  \dst, \offset
+.endif
+.ifnb \more
+    GST \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GMUL
+//
+.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()mul.\suf_op   \out,   \in0,   \in1
+.ifnb \more
+    GMUL \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GMADD
+//
+.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
+    \pre_op\()madd.\suf_op \out, \in0, \in1, \in2
+.ifnb \more
+    GMADD \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GADD
+//
+.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()add.\suf_op \out, \in0, \in1
+.ifnb \more
+    GADD \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GADDI
+//
+.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()addi.\suf_op  \out,   \in0,   \in1
+.ifnb \more
+    GADDI \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GSLLI
+//
+.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()slli.\suf_op  \out,   \in0,   \in1
+.ifnb \more
+    GSLLI \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GINSVE0
+//
+.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()insve0.\suf_op    \out,   \in0,   \in1
+.ifnb \more
+    GINSVE0 \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GXOR
+//
+.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()xor.\suf_op    \out,   \in0,   \in1
+.ifnb \more
+    GXOR \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// Compound instructions
+//
+// GACC: Accumulate the values of vector registers
+//
+.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvpermi.q              \out,   \in,    0x01
+    \pre_op\()add.\suf_op  \in,    \out,   \in
+    xvpackod.d             \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifeqs "\suf_op", "s"
+    xvpackod.w             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+
+.ifeqs "\pre_op", "vf"
+    vpackod.d              \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifeqs "\suf_op", "s"
+    vpackod.w              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+
+.ifeqs "\pre_op", "xv"
+    xvpermi.q              \out,   \in,    0x01
+    \pre_op\()add.\suf_op  \in,    \out,   \in
+    xvpackod.d             \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "d"
+    xvpackod.w             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "w"
+    xvpackod.h             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "h"
+    xvpackod.b             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+.endif
+.endif
+
+.ifeqs "\pre_op", "v"
+    vpackod.d              \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "d"
+    vpackod.w              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "w"
+    vpackod.h              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "h"
+    vpackod.b              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+.endif
+.endif
+
+.ifnb \more
+    GACC \pre_op, \suf_op, \more
+.endif
+.endm

From 57256623f4e2e1ba9303f924d9b2af82bdc45de4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20M=C3=BCtzel?= <markus.muetzel@gmx.de>
Date: Fri, 21 Apr 2023 10:20:54 +0200
Subject: [PATCH 249/718] fc.cmake: Add support for LLVM Flang.

---
 cmake/fc.cmake | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index a67760885..b70c788c1 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -3,7 +3,8 @@
 ## Description: Ported from portion of OpenBLAS/Makefile.system
 ##              Sets Fortran related variables.
 
-if (${F_COMPILER} STREQUAL "FLANG")
+if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+  # This is for classic Flang. LLVM Flang is handled with gfortran below.
   set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
   if (BINARY64 AND INTERFACE64)
     set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
@@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95")
   endif ()
 endif ()
 
-if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95")
+if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
   set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
-  # ensure reentrancy of lapack codes
-  set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
-  # work around ABI violation in passing string arguments from C
-  set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
-  #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
-  if (NOT NO_LAPACK)
-    set(EXTRALIB "${EXTRALIB} -lgfortran")
+  if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+    # ensure reentrancy of lapack codes
+    set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
+    # work around ABI violation in passing string arguments from C
+    set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
+    if (NOT NO_LAPACK)
+      # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
+      set(EXTRALIB "${EXTRALIB} -lgfortran")
+    endif ()
   endif ()
   if (NO_BINARY_MODE)
     if (MIPS64)

From f334bd9041f52528545dd6db63d9b435f129cbd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20M=C3=BCtzel?= <markus.muetzel@gmx.de>
Date: Fri, 21 Apr 2023 10:36:21 +0200
Subject: [PATCH 250/718] CI (MSYS2): Use LLVM Flang on CLANG64 runners. Add
 CLANG32 runner.

---
 .github/workflows/dynamic_arch.yml | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index c16f87954..1b45cb184 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -151,40 +151,47 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        msystem: [MINGW64, MINGW32, CLANG64]
+        msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
         idx: [int32, int64]
         build-type: [Release]
         include:
           - msystem: MINGW64
             idx: int32
             target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            fc-pkg: fc
           - msystem: MINGW32
             idx: int32
             target-prefix: mingw-w64-i686
-            fc-pkg: mingw-w64-i686-gcc-fortran
+            fc-pkg: fc
           - msystem: CLANG64
             idx: int32
             target-prefix: mingw-w64-clang-x86_64
+            fc-pkg: fc
+          - msystem: CLANG32
+            idx: int32
+            target-prefix: mingw-w64-clang-i686
+            fc-pkg: cc
             c-lapack-flags: -DC_LAPACK=ON
           - msystem: MINGW64
             idx: int64
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
             target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            fc-pkg: fc
           - msystem: CLANG64
             idx: int64
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
             target-prefix: mingw-w64-clang-x86_64
-            c-lapack-flags: -DC_LAPACK=ON
+            fc-pkg: fc
           - msystem: MINGW64
             idx: int32
             target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            fc-pkg: fc
             build-type: None
         exclude:
           - msystem: MINGW32
             idx: int64
+          - msystem: CLANG32
+            idx: int64
 
     defaults:
       run:
@@ -209,7 +216,7 @@ jobs:
           install: >-
             base-devel
             ${{ matrix.target-prefix }}-cc
-            ${{ matrix.fc-pkg }}
+            ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
             ${{ matrix.target-prefix }}-cmake
             ${{ matrix.target-prefix }}-ninja
             ${{ matrix.target-prefix }}-ccache

From 1c3fcaaf42e3b55eef34e53b3d999a981c793e3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20M=C3=BCtzel?= <markus.muetzel@gmx.de>
Date: Mon, 24 Apr 2023 18:32:03 +0200
Subject: [PATCH 251/718] CI (MSYS2): Re-run failed tests verbosely.

---
 .github/workflows/dynamic_arch.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index 1b45cb184..85b20c060 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -287,9 +287,22 @@ jobs:
           key: ${{ steps.ccache-prepare.outputs.key }}
 
       - name: Run tests
+        id: run-ctest
         timeout-minutes: 60
         run: cd build && ctest
 
+      - name: Re-run tests
+        if: always() && (steps.run-ctest.outcome == 'failure')
+        timeout-minutes: 60
+        run: |
+          cd build
+          echo "::group::Re-run ctest"
+          ctest --rerun-failed --output-on-failure || true
+          echo "::endgroup::"
+          echo "::group::Log from these tests"
+          [ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
+          echo "::endgroup::"
+
 
   cross_build:
     runs-on: ubuntu-22.04

From 53378296c8f27b13c527700199412bc292507a7c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20M=C3=BCtzel?= <markus.muetzel@gmx.de>
Date: Sat, 5 Aug 2023 13:47:38 +0200
Subject: [PATCH 252/718] CI: Build with NO_AVX512 for the runners that use
 Flang 16.

---
 .github/workflows/dynamic_arch.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index 85b20c060..4fe6e63fc 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -167,6 +167,9 @@ jobs:
             idx: int32
             target-prefix: mingw-w64-clang-x86_64
             fc-pkg: fc
+            # Compiling with Flang 16 seems to cause test errors on machines
+            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
+            no-avx512-flags: -DNO_AVX512=1
           - msystem: CLANG32
             idx: int32
             target-prefix: mingw-w64-clang-i686
@@ -182,6 +185,9 @@ jobs:
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
             target-prefix: mingw-w64-clang-x86_64
             fc-pkg: fc
+            # Compiling with Flang 16 seems to cause test errors on machines
+            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
+            no-avx512-flags: -DNO_AVX512=1
           - msystem: MINGW64
             idx: int32
             target-prefix: mingw-w64-x86_64
@@ -268,6 +274,7 @@ jobs:
                 -DTARGET=CORE2 \
                 ${{ matrix.idx64-flags }} \
                 ${{ matrix.c-lapack-flags }} \
+                ${{ matrix.no-avx512-flags }} \
                 -DCMAKE_C_COMPILER_LAUNCHER=ccache \
                 -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
                 ..

From df978c90cdbc51e24113ff4bffd6c438fa3db3b6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 5 Aug 2023 18:32:41 +0200
Subject: [PATCH 253/718] Update Jenkinsfile.pwr

---
 Jenkinsfile.pwr | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr
index d141ed8a5..96e18b8ad 100644
--- a/Jenkinsfile.pwr
+++ b/Jenkinsfile.pwr
@@ -9,7 +9,7 @@ pipeline {
             steps {
                 sh 'sudo apt update'
                 sh 'sudo apt install gfortran -y'
-                sh 'make'
+                sh 'make clean && make'
             }
         }
     }

From 51c218d17a74c5722f76590022130b2faa3616ed Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 5 Aug 2023 18:33:15 +0200
Subject: [PATCH 254/718] Update Jenkinsfile

---
 Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 5fad6a95b..baeeee59f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -7,7 +7,7 @@ pipeline {
     stages {
         stage('Build') {
             steps {
-                sh 'make'
+                sh 'make clean && make'
             }
         }
     }

From 61d803547afb6bb5889a2f00ac169e0e44d1febe Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 6 Aug 2023 15:17:38 +0200
Subject: [PATCH 255/718] Apply USE_TRMM to MIPS64_GENERIC as to GENERIC

---
 kernel/Makefile.L3 | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index bea6cb048..174a1d41b 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -35,6 +35,12 @@ USE_TRMM = 1
 endif
 endif
 
+ifneq ($(DYNAMIC_ARCH), 1)
+ifeq ($(TARGET), MIPS64_GENERIC)
+USE_TRMM = 1
+endif
+endif
+
 ifeq ($(CORE), HASWELL)
 USE_TRMM = 1
 endif

From 41c31bc1d4dbdaea326f41cbdf7f254826f8a25a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 6 Aug 2023 16:00:03 +0200
Subject: [PATCH 256/718] Revert "LoongArch64: Add dgemv_t_8_lasx.S and
 dgemv_n_8_lasx.S"

---
 kernel/loongarch64/KERNEL.LOONGSON3R5 |   3 -
 kernel/loongarch64/KERNEL.generic     |   4 -
 kernel/loongarch64/dgemv_n_8_lasx.S   | 546 --------------------------
 kernel/loongarch64/dgemv_t_8_lasx.S   | 468 ----------------------
 kernel/loongarch64/loongarch64_asm.S  | 313 ---------------
 5 files changed, 1334 deletions(-)
 delete mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S
 delete mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S
 delete mode 100644 kernel/loongarch64/loongarch64_asm.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 79671c454..253aa2464 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -10,9 +10,6 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-DGEMVNKERNEL = dgemv_n_8_lasx.S
-DGEMVTKERNEL = dgemv_t_8_lasx.S
-
 DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic
index 213add9ee..b772a6f82 100644
--- a/kernel/loongarch64/KERNEL.generic
+++ b/kernel/loongarch64/KERNEL.generic
@@ -132,16 +132,12 @@ CSWAPKERNEL  = ../arm/zswap.c
 ZSWAPKERNEL  = ../arm/zswap.c
 
 SGEMVNKERNEL = ../arm/gemv_n.c
-ifndef DGEMVNKERNEL
 DGEMVNKERNEL = ../arm/gemv_n.c
-endif
 CGEMVNKERNEL = ../arm/zgemv_n.c
 ZGEMVNKERNEL = ../arm/zgemv_n.c
 
 SGEMVTKERNEL = ../arm/gemv_t.c
-ifndef DGEMVTKERNEL
 DGEMVTKERNEL = ../arm/gemv_t.c
-endif
 CGEMVTKERNEL = ../arm/zgemv_t.c
 ZGEMVTKERNEL = ../arm/zgemv_t.c
 
diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S
deleted file mode 100644
index 940d27569..000000000
--- a/kernel/loongarch64/dgemv_n_8_lasx.S
+++ /dev/null
@@ -1,546 +0,0 @@
-/*******************************************************************************
-Copyright (c) 2023, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*******************************************************************************/
-#define ASSEMBLER
-
-#include "common.h"
-#include "loongarch64_asm.S"
-
-/*********************************************************************
-* 2023/07/14 guxiwei
-*        UTEST                  : OK
-*        CTEST                  : OK
-*        TEST                   : OK
-*
-*
-*********************************************************************/
-
-/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
- * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
- */
-#define M       $r4
-#define N       $r5
-#define ALPHA   $f0
-#define A       $r7
-#define LDA     $r8
-#define X       $r9
-#define INC_X   $r10
-#define Y       $r11
-#define INC_Y   $r6
-
-#define J       $r12
-#define I       $r13
-#define K       $r14
-#define Y_ORG   $r15
-#define OFFSET  $r16
-#define K_LDA   $r17
-#define M8      $r18
-#define T0      $r19
-#define PA0     $r20
-#define PA1     $r23
-#define PA2     $r24
-#define PA3     $r25
-#define PA4     $r26
-#define PA5     $r27
-#define PA6     $r28
-#define PA7     $r29
-
-#define VALPHA  $xr1
-#define X0      $xr2
-#define X1      $xr3
-#define X2      $xr4
-#define X3      $xr5
-#define X4      $xr6
-#define X5      $xr7
-#define X6      $xr8
-#define X7      $xr9
-#define Y0      $xr10
-#define Y1      $xr11
-#define A0      $xr12
-#define A1      $xr13
-#define A2      $xr14
-#define A3      $xr15
-#define A4      $xr16
-#define A5      $xr17
-#define A6      $xr18
-#define A7      $xr19
-#define A8      $xr20
-#define A9      $xr21
-#define A10     $xr22
-#define A11     $xr23
-#define A12     $xr24
-#define A13     $xr25
-#define A14     $xr26
-#define A15     $xr27
-
-.macro DLOAD_X_8
-    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
-                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
-    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
-                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
-.endm
-
-.macro DLOAD_X_4
-    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
-    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
-.endm
-
-.macro DLOAD_X_2
-    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08
-    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
-.endm
-
-.macro DLOAD_X_1
-    GLDREPL xv, d, X0, X, 0x00
-    GMUL xvf, d, X0, X0, VALPHA
-.endm
-
-.macro DLOAD_Y_8
-    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
-.endm
-
-.macro DLOAD_Y_4
-    GLD xv, , Y0, Y, 0
-.endm
-
-.macro DLOAD_Y_1
-    fld.d  $f10,   Y,  0
-.endm
-
-.macro DSTORE_Y_8
-    GST xv, , Y0, Y, 0, Y1, Y, 0x20
-.endm
-
-.macro DSTORE_Y_4
-    GST xv, , Y0, Y, 0
-.endm
-
-.macro DSTORE_Y_1
-    fst.d   $f10,   Y,  0
-.endm
-
-// Unable to use vector load/store ins
-.macro DLOAD_Y_8_GAP
-    fld.d   $f10,   Y,  0
-    fldx.d  $f13,   Y,  INC_Y
-    PTR_ALSL  T0,   INC_Y,  Y,  1
-    fld.d   $f14,   T0, 0
-    fldx.d  $f15,   T0, INC_Y
-    PTR_ALSL  T0,   INC_Y,  Y,  2
-    fld.d   $f11,   T0, 0
-    fldx.d  $f17,   T0, INC_Y
-    PTR_ADD   T0,   T0, INC_Y
-    PTR_ADD   T0,   T0, INC_Y
-    fld.d   $f18,   T0, 0
-    fldx.d  $f19,   T0, INC_Y
-    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
-.endm
-
-.macro DLOAD_Y_4_GAP
-    fld.d   $f10,   Y,  0
-    fldx.d  $f13,   Y,  INC_Y
-    PTR_ALSL  T0,   INC_Y,  Y,  1
-    fld.d   $f14,   T0, 0
-    fldx.d  $f15,   T0, INC_Y
-    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3
-.endm
-
-.macro DSTORE_Y_8_GAP
-    xvstelm.d   Y0,     Y,      0,      0
-    PTR_ADD     T0,     Y,      INC_Y
-    xvstelm.d   Y0,     T0,     0,      1
-    PTR_ADD     T0,     T0,     INC_Y
-    xvstelm.d   Y0,     T0,     0,      2
-    PTR_ADD     T0,     T0,     INC_Y
-    xvstelm.d   Y0,     T0,     0,      3
-
-    PTR_ADD     T0,     T0,     INC_Y
-    xvstelm.d   Y1,     T0,     0,      0
-    PTR_ADD     T0,     T0,     INC_Y
-    xvstelm.d   Y1,     T0,     0,      1
-    PTR_ADD     T0,     T0,     INC_Y
-    xvstelm.d   Y1,     T0,     0,      2
-    PTR_ADD     T0,     T0,     INC_Y
-    xvstelm.d   Y1,     T0,     0,      3
-.endm
-
-.macro DSTORE_Y_4_GAP
-    xvstelm.d   Y0,     Y,      0,      0
-    PTR_ADD     T0,     Y,      INC_Y
-    xvstelm.d   Y0,     T0,     0,      1
-    PTR_ADD     T0,     T0,     INC_Y
-    xvstelm.d   Y0,     T0,     0,      2
-    PTR_ADD     T0,     T0,     INC_Y
-    xvstelm.d   Y0,     T0,     0,      3
-.endm
-
-.macro DLOAD_X_8_GAP
-    xvldrepl.d  X0,     X,      0x00
-    PTR_ADD     T0,     X,      INC_X
-    xvldrepl.d  X1,     T0,     0x00
-    PTR_ADD     T0,     T0,     INC_X
-    xvldrepl.d  X2,     T0,     0x00
-    PTR_ADD     T0,     T0,     INC_X
-    xvldrepl.d  X3,     T0,     0x00
-    PTR_ADD     T0,     T0,     INC_X
-    xvldrepl.d  X4,     T0,     0x00
-    PTR_ADD     T0,     T0,     INC_X
-    xvldrepl.d  X5,     T0,     0x00
-    PTR_ADD     T0,     T0,     INC_X
-    xvldrepl.d  X6,     T0,     0x00
-    PTR_ADD     T0,     T0,     INC_X
-    xvldrepl.d  X7,     T0,     0x00
-    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
-                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
-.endm
-
-.macro DLOAD_X_4_GAP
-    xvldrepl.d  X0,     X,      0x00
-    PTR_ADD     T0,     X,      INC_X
-    xvldrepl.d  X1,     T0,     0x00
-    PTR_ADD     T0,     T0,     INC_X
-    xvldrepl.d  X2,     T0,     0x00
-    PTR_ADD     T0,     T0,     INC_X
-    xvldrepl.d  X3,     T0,     0x00
-    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
-.endm
-
-.macro DLOAD_X_2_GAP
-    xvldrepl.d  X0,     X,      0x00
-    PTR_ADD     T0,     X,      INC_X
-    xvldrepl.d  X1,     T0,     0x00
-    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
-.endm
-
-.macro DGEMV_N_8x8
-    GLD_INC xv, , 0x20,       \
-    A0,  PA0, 0, A1,  PA0, 0, \
-    A2,  PA1, 0, A3,  PA1, 0, \
-    A4,  PA2, 0, A5,  PA2, 0, \
-    A6,  PA3, 0, A7,  PA3, 0, \
-    A8,  PA4, 0, A9,  PA4, 0, \
-    A10, PA5, 0, A11, PA5, 0, \
-    A12, PA6, 0, A13, PA6, 0, \
-    A14, PA7, 0, A15, PA7, 0
-
-    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
-                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
-                  Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
-                  Y0, A6,  X3, Y0, Y1, A7,  X3, Y1, \
-                  Y0, A8,  X4, Y0, Y1, A9,  X4, Y1, \
-                  Y0, A10, X5, Y0, Y1, A11, X5, Y1, \
-                  Y0, A12, X6, Y0, Y1, A13, X6, Y1, \
-                  Y0, A14, X7, Y0, Y1, A15, X7, Y1
-.endm
-
-.macro DGEMV_N_4x8
-    GLD_INC xv, , 0x20, A0,  PA0, 0, \
-                        A2,  PA1, 0, \
-                        A4,  PA2, 0, \
-                        A6,  PA3, 0, \
-                        A8,  PA4, 0, \
-                        A10, PA5, 0, \
-                        A12, PA6, 0, \
-                        A14, PA7, 0
-
-    GMADD xvf, d, Y0, A0,  X0, Y0, \
-                  Y0, A2,  X1, Y0, \
-                  Y0, A4,  X2, Y0, \
-                  Y0, A6,  X3, Y0, \
-                  Y0, A8,  X4, Y0, \
-                  Y0, A10, X5, Y0, \
-                  Y0, A12, X6, Y0, \
-                  Y0, A14, X7, Y0
-.endm
-
-.macro DGEMV_N_1x8
-    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
-                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
-    GMADD f, d, $f10, $f12, $f2, $f10, \
-                $f10, $f14, $f3, $f10, \
-                $f10, $f16, $f4, $f10, \
-                $f10, $f18, $f5, $f10, \
-                $f10, $f20, $f6, $f10, \
-                $f10, $f22, $f7, $f10, \
-                $f10, $f24, $f8, $f10, \
-                $f10, $f26, $f9, $f10,
-.endm
-
-.macro DGEMV_N_8x4
-    GLD_INC xv, , 0x20,       \
-    A0,  PA0, 0, A1,  PA0, 0, \
-    A2,  PA1, 0, A3,  PA1, 0, \
-    A4,  PA2, 0, A5,  PA2, 0, \
-    A6,  PA3, 0, A7,  PA3, 0
-
-    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
-                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
-                  Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
-                  Y0, A6,  X3, Y0, Y1, A7,  X3, Y1
-.endm
-
-.macro DGEMV_N_4x4
-    GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
-
-    GMADD xvf, d, Y0, A0,  X0, Y0, Y0, A2,  X1, Y0, \
-                  Y0, A4,  X2, Y0, Y0, A6,  X3, Y0
-.endm
-
-.macro DGEMV_N_1x4
-    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
-    GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \
-                $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10
-.endm
-
-.macro DGEMV_N_8x2
-    GLD_INC xv, , 0x20,       \
-    A0,  PA0, 0, A1,  PA0, 0, \
-    A2,  PA1, 0, A3,  PA1, 0
-    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
-                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1
-.endm
-
-.macro DGEMV_N_4x2
-    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0
-    GMADD xvf, d, Y0, A0,  X0, Y0, \
-                  Y0, A2,  X1, Y0
-.endm
-
-.macro DGEMV_N_1x2
-    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0
-    GMADD f, d, $f10, $f12, $f2, $f10, \
-                $f10, $f14, $f3, $f10
-.endm
-
-.macro DGEMV_N_1x1
-    fld.d   $f12,    PA0,    0
-    PTR_ADDI PA0,   PA0,    0x08
-    fmadd.d $f10,   $f12,   $f2,    $f10
-.endm
-
-.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
-    PTR_SRLI  J,      N,      3
-    beqz      J,      .L_\XW\()_N_7
-    PTR_SLLI  K_LDA,  LDA,    3
-    PTR_SUB   K_LDA,  K_LDA,  M8
-.L_\XW\()_N_L8:
-    DLOAD_\X_8
-    xor     K,      K,      K
-    move    Y,      Y_ORG
-    PTR_SRLI  I,      M,       3
-    beqz      I,      .L_\XW\()_M_7
-.align 5
-.L_\XW\()_M_L8:
-    DLOAD_\Y_8
-    DGEMV_N_8x8
-    DSTORE_\Y_8
-    PTR_ADDI    I,      I,      -1
-    PTR_ALSL    Y,      INC_Y,  Y,  3
-    PTR_ADDI    K,      K,      8
-    bnez        I,      .L_\XW\()_M_L8
-.L_\XW\()_M_7:
-    andi        I,      M,      4
-    beqz        I,      .L_\XW\()_M_3
-    DLOAD_\Y_4
-    DGEMV_N_4x8
-    DSTORE_\Y_4
-    PTR_ALSL    Y,      INC_Y,  Y,  2
-    PTR_ADDI    K,      K,      4
-.L_\XW\()_M_3:
-    andi        I,      M,      3
-    beqz        I,      .L_\XW\()_M_END
-.align 5
-.L_\XW\()_M_L1:
-    DLOAD_\Y_1
-    DGEMV_N_1x8
-    DSTORE_\Y_1
-    PTR_ADDI    I,      I,      -1
-    PTR_ADD     Y,      Y,      INC_Y
-    PTR_ADDI    K,      K,      1
-    bnez        I,      .L_\XW\()_M_L1
-.L_\XW\()_M_END:
-    PTR_ADDI    J,      J,      -1
-#if __loongarch_grlen == 64
-    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
-              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
-#else
-    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
-              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
-#endif
-    PTR_ALSL    X,      INC_X,  X,  3
-    bnez        J,      .L_\XW\()_N_L8
-.L_\XW\()_N_7:
-    andi        J,      N,      4
-    beqz        J,      .L_\XW\()_N_3
-    DLOAD_\X_4
-    xor         K,      K,      K
-    move        Y,      Y_ORG
-
-    PTR_SRLI  I,      M,       3
-    beqz      I,      .L_\XW\()_N_4_M_7
-.align 5
-.L_\XW\()_N_4_M_L8:
-    DLOAD_\Y_8
-    DGEMV_N_8x4
-    DSTORE_\Y_8
-    PTR_ADDI  I,      I,      -1
-    PTR_ADDI  K,      K,      8
-    PTR_ALSL  Y,      INC_Y,  Y,  3
-    bnez    I,      .L_\XW\()_N_4_M_L8
-.L_\XW\()_N_4_M_7:
-    andi    I,      M,      4
-    beqz    I,      .L_\XW\()_N_4_M_3
-    DLOAD_\Y_4
-    DGEMV_N_4x4
-    DSTORE_\Y_4
-    PTR_ALSL  Y,    INC_Y,  Y,  2
-    PTR_ADDI  K,    K,      4
-.L_\XW\()_N_4_M_3:
-    andi        I,      M,      3
-    beqz        I,      .L_\XW\()_N_4_M_END
-.align 5
-.L_\XW\()_N_4_M_L1:
-    DLOAD_\Y_1
-    DGEMV_N_1x4
-    DSTORE_\Y_1
-    PTR_ADDI    I,      I,      -1
-    PTR_ADD     Y,      Y,      INC_Y
-    PTR_ADDI    K,      K,      1
-    bnez        I,      .L_\XW\()_N_4_M_L1
-.L_\XW\()_N_4_M_END:
-    PTR_SLLI    K_LDA,  LDA,    2
-    PTR_SUB     K_LDA,  K_LDA,  M8
-#if __loongarch_grlen == 64
-    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
-#else
-    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
-#endif
-    PTR_ALSL    X,      INC_X,  X,  2
-.L_\XW\()_N_3:
-    andi        J,      N,      2
-    beqz        J,      .L_\XW\()_N_1
-    DLOAD_\X_2
-    xor         K,      K,      K
-    move        Y,      Y_ORG
-    PTR_SRLI    I,      M,       3
-    beqz    I,      .L_\XW\()_N_2_M_7
-.align 5
-.L_\XW\()_N_2_M_L8:
-    DLOAD_\Y_8
-    DGEMV_N_8x2
-    DSTORE_\Y_8
-    PTR_ADDI  I,      I,      -1
-    PTR_ADDI  K,      K,      8
-    PTR_ALSL  Y,      INC_Y,  Y,  3
-    bnez    I,      .L_\XW\()_N_2_M_L8
-.L_\XW\()_N_2_M_7:
-    andi    I,      M,      4
-    beqz    I,      .L_\XW\()_N_2_M_3
-    DLOAD_\Y_4
-    DGEMV_N_4x2
-    DSTORE_\Y_4
-    PTR_ALSL  Y,        INC_Y,      Y,      2
-    PTR_ADDI  K,        K,          4
-.L_\XW\()_N_2_M_3:
-    andi    I,      M,      3
-    beqz    I,      .L_\XW\()_N_2_M_END
-.align 5
-.L_\XW\()_N_2_M_L1:
-    DLOAD_\Y_1
-    DGEMV_N_1x2
-    DSTORE_\Y_1
-    PTR_ADDI  I,      I,      -1
-    PTR_ADD   Y,      Y,      INC_Y
-    PTR_ADDI  K,      K,      1
-    bnez    I,      .L_\XW\()_N_2_M_L1
-.L_\XW\()_N_2_M_END:
-    PTR_SLLI    K_LDA,  LDA,    1
-    PTR_SUB     K_LDA,  K_LDA,  M8
-    PTR_ADD     PA0,    PA0,    K_LDA
-    PTR_ADD     PA1,    PA1,    K_LDA
-    PTR_ALSL    X,      INC_X,  X,  1
-.L_\XW\()_N_1:
-    andi    J,      N,      1
-    beqz    J,      .L_END
-    DLOAD_\X_1
-    xor     K,      K,      K
-    move    Y,      Y_ORG
-    move    I,      M
-    beqz    I,      .L_END
-.align 5
-.L_\XW\()_N_1_M_L1:
-    DLOAD_\Y_1
-    DGEMV_N_1x1
-    DSTORE_\Y_1
-    PTR_ADDI  I,      I,      -1
-    PTR_ADD   Y,      Y,      INC_Y
-    PTR_ADDI  K,      K,      1
-    bnez    I,      .L_\XW\()_N_1_M_L1
-    b .L_END
-.endm
-
-    PROLOGUE
-    PTR_LD     INC_Y,  $sp,    0
-    push_if_used 17 + 7, 24 + 4
-    PTR_ADDI   K,      $r0,     0x01
-    PTR_SUB    I,      INC_X,   K
-    PTR_SUB    J,      INC_Y,   K
-    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
-    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
-    PTR_ALSL   I,      I,       J,      1
-    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
-    xvreplve0.d     VALPHA, $xr0
-    move     Y_ORG,  Y
-    move     PA0,    A
-#if __loongarch_grlen == 64
-    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
-              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
-#else
-    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
-              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
-#endif
-    la.local    T0,     .L_GAP_TABLE
-    PTR_ALSL    I,      I,      T0,     1
-    ld.h        K,      I,      0
-    PTR_ADD     T0,     T0,     K
-    jirl        $r0,    T0,     0
-.L_GAP_TABLE:
-    .hword  .L_GAP_0_0 - .L_GAP_TABLE
-    .hword  .L_GAP_0_1 - .L_GAP_TABLE
-    .hword  .L_GAP_1_0 - .L_GAP_TABLE
-    .hword  .L_GAP_1_1 - .L_GAP_TABLE
-.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
-    DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
-.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
-    DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
-.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
-    DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
-.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
-    DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
-.L_END:
-    pop_if_used 17 + 7, 24 + 4
-    jirl    $r0, $r1, 0x0
-    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S
deleted file mode 100644
index be90cb1af..000000000
--- a/kernel/loongarch64/dgemv_t_8_lasx.S
+++ /dev/null
@@ -1,468 +0,0 @@
-/*******************************************************************************
-Copyright (c) 2023, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*******************************************************************************/
-#define ASSEMBLER
-
-#include "common.h"
-#include "loongarch64_asm.S"
-
-/*********************************************************************
-* 2023/07/17 guxiwei
-*        UTEST                  : OK
-*        CTEST                  : OK
-*        TEST                   : OK
-*
-*
-*********************************************************************/
-
-/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
- * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
- */
-#define M       $r4
-#define N       $r5
-#define ALPHA   $f0
-#define A       $r7
-#define LDA     $r8
-#define X       $r9
-#define INC_X   $r10
-#define Y       $r11
-#define INC_Y   $r6
-
-#define J       $r12
-#define I       $r13
-#define K       $r14
-#define PY0     $r14
-#define X_ORG   $r15
-#define PY1     $r16
-#define K_LDA   $r17
-#define PY2     $r18
-#define T0      $r19
-#define PA0     $r20
-#define PA1     $r23
-#define PA2     $r24
-#define PA3     $r25
-#define PA4     $r26
-#define PA5     $r27
-#define PA6     $r28
-#define PA7     $r29
-#define M8      $r30
-
-#define VALPHA  $xr0
-#define X0      $xr1
-#define X1      $xr2
-#define A0      $xr3
-#define A1      $xr4
-#define A2      $xr5
-#define A3      $xr6
-#define A4      $xr7
-#define A5      $xr8
-#define A6      $xr9
-#define A7      $xr10
-#define A8      $xr11
-#define A9      $xr12
-#define A10     $xr13
-#define A11     $xr14
-#define A12     $xr15
-#define A13     $xr16
-#define A14     $xr17
-#define A15     $xr18
-#define TP0     $xr19
-#define TP1     $xr20
-#define TP2     $xr21
-#define TP3     $xr22
-#define TP4     $xr23
-#define TP5     $xr24
-#define TP6     $xr25
-#define TP7     $xr26
-#define Y0      $xr3
-#define Y1      $xr4
-#define Y2      $xr5
-#define Y3      $xr6
-#define Y4      $xr7
-#define Y5      $xr8
-#define Y6      $xr9
-#define Y7      $xr10
-
-.macro ZERO_Y8
-    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
-                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
-.endm
-
-.macro ZERO_Y4
-    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
-.endm
-
-.macro ZERO_Y2
-    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
-.endm
-
-.macro ZERO_Y1
-    GXOR xv, v, TP0, TP0, TP0
-.endm
-
-.macro DLOAD_X8
-    GLD xv, , X0, X, 0x00, X1, X, 0x20
-.endm
-
-.macro DLOAD_X4
-    GLD xv, , X0, X, 0x00
-.endm
-
-.macro DLOAD_X8_GAP
-    fld.d       $f1,    X,    0x00
-    fldx.d      $f2,    X,    INC_X
-    PTR_ALSL    T0,     INC_X,      X,      1
-    fld.d       $f3,    T0,   0x00
-    fldx.d      $f4,    T0,   INC_X
-    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
-    PTR_ALSL    T0,     INC_X,      X,      2
-    fld.d       $f2,    T0,   0x00
-    fldx.d      $f3,    T0,   INC_X
-    PTR_ALSL    T0,     INC_X,      T0,     1
-    fld.d       $f4,    T0,   0x00
-    fldx.d      $f5,    T0,   INC_X
-    GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
-.endm
-
-.macro DLOAD_X4_GAP
-    fld.d       $f1,    X,    0x00
-    fldx.d      $f2,    X,    INC_X
-    PTR_ALSL    T0,     INC_X,      X,      1
-    fld.d       $f3,    T0,   0x00
-    fldx.d      $f4,    T0,   INC_X
-    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
-.endm
-
-.macro DGEMV_T_8x8
-    GLD_INC xv, , 0x20,       \
-    A0,  PA0, 0, A1,  PA0, 0, \
-    A2,  PA1, 0, A3,  PA1, 0, \
-    A4,  PA2, 0, A5,  PA2, 0, \
-    A6,  PA3, 0, A7,  PA3, 0, \
-    A8,  PA4, 0, A9,  PA4, 0, \
-    A10, PA5, 0, A11, PA5, 0, \
-    A12, PA6, 0, A13, PA6, 0, \
-    A14, PA7, 0, A15, PA7, 0
-
-    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
-                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
-                  TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
-                  TP3, A6,  X0, TP3, TP3, A7,  X1, TP3, \
-                  TP4, A8,  X0, TP4, TP4, A9,  X1, TP4, \
-                  TP5, A10, X0, TP5, TP5, A11, X1, TP5, \
-                  TP6, A12, X0, TP6, TP6, A13, X1, TP6, \
-                  TP7, A14, X0, TP7, TP7, A15, X1, TP7
-.endm
-
-.macro DGEMV_T_8x4
-    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0, \
-                        A8,  PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0
-
-    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
-                  TP2, A4,  X0, TP2, TP3, A6,  X0, TP3, \
-                  TP4, A8,  X0, TP4, TP5, A10, X0, TP5, \
-                  TP6, A12, X0, TP6, TP7, A14, X0, TP7,
-.endm
-
-.macro DGEMV_T_4x8
-    GLD_INC xv, , 0x20,       \
-    A0,  PA0, 0, A1,  PA0, 0, \
-    A2,  PA1, 0, A3,  PA1, 0, \
-    A4,  PA2, 0, A5,  PA2, 0, \
-    A6,  PA3, 0, A7,  PA3, 0
-
-    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
-                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
-                  TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
-                  TP3, A6,  X0, TP3, TP3, A7,  X1, TP3
-.endm
-
-.macro DGEMV_T_4x4
-    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0
-
-    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
-                  TP2, A4,  X0, TP2, TP3, A6,  X0, TP3
-.endm
-
-.macro DGEMV_T_2x8
-    GLD_INC xv, , 0x20, A0,  PA0, 0, A1,  PA0, 0, A2, PA1, 0, A3,  PA1, 0
-
-    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
-                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1
-.endm
-
-.macro DGEMV_T_2x4
-    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0
-
-    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1
-.endm
-
-.macro DGEMV_T XW:req X8:req, X4:req
-    PTR_SRLI  J,      N,      3
-    beqz      J,      .L_\XW\()_N_7
-    PTR_SLLI  K_LDA,  LDA,    3
-    PTR_SUB   K_LDA,  K_LDA,  M8
-.L_\XW\()_N_L8:
-    ZERO_Y8
-    move      X,      X_ORG
-    PTR_SRLI  I,      M,       3
-    beqz      I,      .L_\XW\()_M_7
-.align 5
-.L_\XW\()_M_L8:
-    DLOAD_\X8
-    DGEMV_T_8x8
-    PTR_ADDI    I,  I,  -1
-    PTR_ALSL    X,  INC_X,  X,  3
-    bnez        I,  .L_\XW\()_M_L8
-.L_\XW\()_M_7:
-    andi        I,      M,      4
-    beqz        I,      .L_\XW\()_M_3
-    DLOAD_\X4
-    DGEMV_T_8x4
-    PTR_ALSL    X,  INC_X,  X,  2
-.L_\XW\()_M_3:
-    // Accumulated
-    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
-                 Y5, TP5, Y6, TP6, Y7, TP7
-    andi        I,      M,      3
-    beqz        I,      .L_\XW\()_M_END
-.align 5
-.L_\XW\()_M_L1:
-    fld.d   $f1,    X,      0x00
-    fld.d   $f11,   PA0,    0x00
-    fld.d   $f12,   PA1,    0x00
-    fld.d   $f13,   PA2,    0x00
-    fld.d   $f14,   PA3,    0x00
-    fld.d   $f15,   PA4,    0x00
-    fld.d   $f16,   PA5,    0x00
-    fld.d   $f17,   PA6,    0x00
-    fld.d   $f18,   PA7,    0x00
-#if __loongarch_grlen == 64
-    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
-               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
-#else
-    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
-               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
-#endif
-    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
-                $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
-    PTR_ADDI  I,      I,      -1
-    PTR_ADD   X,      X,      INC_X
-    bnez      I,      .L_\XW\()_M_L1
-.L_\XW\()_M_END:
-    fld.d   $f11,   Y,  0x00
-    fldx.d  $f12,   Y,  INC_Y
-    PTR_ALSL    PY0, INC_Y,  Y,  1
-    fld.d   $f13,   PY0,    0x00
-    fldx.d  $f14,   PY0,    INC_Y
-    PTR_ALSL    PY1, INC_Y,  Y,  2
-    fld.d   $f15,   PY1,    0x00
-    fldx.d  $f16,   PY1,    INC_Y
-    PTR_ALSL    PY2, INC_Y,  PY1, 1
-    fld.d   $f17,   PY2,    0x00
-    fldx.d  $f18,   PY2,    INC_Y
-
-    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \
-                $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18
-
-    PTR_ADDI    J,      J,      -1
-#if __loongarch_grlen == 64
-    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
-              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
-#else
-    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
-              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
-#endif
-    fst.d   $f11,   Y,      0x00
-    fstx.d  $f12,   Y,      INC_Y
-    fst.d   $f13,   PY0,    0x00
-    fstx.d  $f14,   PY0,    INC_Y
-    fst.d   $f15,   PY1,    0x00
-    fstx.d  $f16,   PY1,    INC_Y
-    fst.d   $f17,   PY2,    0x00
-    fstx.d  $f18,   PY2,    INC_Y
-    PTR_ALSL    Y,      INC_Y,  Y,  3
-    bnez        J,      .L_\XW\()_N_L8
-.L_\XW\()_N_7:
-    andi        J,      N,      4
-    beqz        J,      .L_\XW\()_N_3
-    ZERO_Y4
-    move        X,      X_ORG
-    PTR_SRLI    I,      M,       3
-    beqz        I,      .L_\XW\()_N_4_M_7
-.align 5
-.L_\XW\()_N_4_M_L8:
-    DLOAD_\X8
-    DGEMV_T_4x8
-    PTR_ADDI  I,      I,      -1
-    PTR_ALSL  X,      INC_X,  X,  3
-    bnez      I,      .L_\XW\()_N_4_M_L8
-.L_\XW\()_N_4_M_7:
-    andi    I,      M,      4
-    beqz    I,      .L_\XW\()_N_4_M_3
-    DLOAD_\X4
-    DGEMV_T_4x4
-    PTR_ALSL  X,      INC_X,  X,  2
-.L_\XW\()_N_4_M_3:
-    // Accumulated
-    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
-    andi        I,      M,      3
-    beqz        I,      .L_\XW\()_N_4_M_END
-.align 5
-.L_\XW\()_N_4_M_L1:
-    fld.d   $f1,    X,      0x00
-    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00
-    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
-    PTR_ADDI  I,      I,      -1
-    PTR_ADD   X,      X,      INC_X
-    bnez      I,      .L_\XW\()_N_4_M_L1
-.L_\XW\()_N_4_M_END:
-    fld.d   $f11,   Y,  0x00
-    fldx.d  $f12,   Y,  INC_Y
-    PTR_ALSL    PY0, INC_Y,  Y,  1
-    fld.d   $f13,   PY0,    0x00
-    fldx.d  $f14,   PY0,    INC_Y
-
-    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14
-
-    PTR_SLLI    K_LDA,  LDA,    2
-    PTR_SUB     K_LDA,  K_LDA,  M8
-
-#if __loongarch_grlen == 64
-    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
-#else
-    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
-#endif
-    fst.d   $f11,   Y,      0x00
-    fstx.d  $f12,   Y,      INC_Y
-    fst.d   $f13,   PY0,    0x00
-    fstx.d  $f14,   PY0,    INC_Y
-    PTR_ALSL    Y,      INC_Y,  Y,  2
-.L_\XW\()_N_3:
-    andi        J,      N,      2
-    beqz        J,      .L_\XW\()_N_1
-    ZERO_Y2
-    move        X,      X_ORG
-    PTR_SRLI    I,      M,       3
-    beqz        I,      .L_\XW\()_N_2_M_7
-.align 5
-.L_\XW\()_N_2_M_L8:
-    DLOAD_\X8
-    DGEMV_T_2x8
-    PTR_ADDI  I,      I,      -1
-    PTR_ALSL  X,      INC_X,  X,  3
-    bnez      I,      .L_\XW\()_N_2_M_L8
-.L_\XW\()_N_2_M_7:
-    andi    I,      M,      4
-    beqz    I,      .L_\XW\()_N_2_M_3
-    DLOAD_\X4
-    DGEMV_T_2x4
-    PTR_ALSL  X,        INC_X,      X,      2
-.L_\XW\()_N_2_M_3:
-    // Accumulated
-    GACC xvf, d, Y0, TP0, Y1, TP1
-    andi        I,      M,      3
-    beqz        I,      .L_\XW\()_N_2_M_END
-.align 5
-.L_\XW\()_N_2_M_L1:
-    fld.d   $f1,    X,      0x00
-    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
-    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
-    PTR_ADDI  I,      I,      -1
-    PTR_ADD   X,      X,      INC_X
-    bnez      I,      .L_\XW\()_N_2_M_L1
-.L_\XW\()_N_2_M_END:
-    fld.d   $f11,   Y,  0x00
-    fldx.d  $f12,   Y,  INC_Y
-
-    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12
-
-    PTR_SLLI    K_LDA,  LDA,    1
-    PTR_SUB     K_LDA,  K_LDA,  M8
-
-#if __loongarch_grlen == 64
-    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
-#else
-    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
-#endif
-    fst.d   $f11,   Y,      0x00
-    fstx.d  $f12,   Y,      INC_Y
-    PTR_ALSL        Y,      INC_Y,  Y,  1
-.L_\XW\()_N_1:
-    andi    J,      N,      1
-    beqz    J,      .L_END
-    ZERO_Y1
-    move    X,      X_ORG
-    move    I,      M
-    beqz    I,      .L_END
-.align 5
-.L_\XW\()_N_1_M_L1:
-    fld.d   $f3,    PA0,    0x00
-    fld.d   $f1,    X,      0x00
-    fmadd.d $f19,   $f3,    $f1,    $f19
-    PTR_ADDI  I,      I,      -1
-    PTR_ADD   X,      X,      INC_X
-    PTR_ADDI  PA0,    PA0,    0x08
-    bnez      I,      .L_\XW\()_N_1_M_L1
-    fld.d     $f3,    Y,      0x00
-    fmadd.d   $f3,    ALPHA,  $f19,  $f3
-    fst.d     $f3,    Y,      0x00
-    b .L_END
-.endm
-
-    PROLOGUE
-    PTR_LD     INC_Y,  $sp,    0
-    push_if_used 17 + 8, 24 + 3
-    PTR_ADDI   K,      $r0,     0x01
-    PTR_SUB    I,      INC_X,   K
-    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
-    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
-    xvreplve0.d     VALPHA, $xr0
-    move     X_ORG,  X
-    move     PA0,    A
-#if __loongarch_grlen == 64
-    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
-              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
-#else
-    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
-              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
-#endif
-    la.local    T0,     .L_GAP_TABLE
-    PTR_ALSL    I,      I,      T0,     1
-    ld.h        K,      I,      0
-    PTR_ADD     T0,     T0,     K
-    jirl        $r0,    T0,     0
-.L_GAP_TABLE:
-    .hword .L_GAP_0 - .L_GAP_TABLE
-    .hword .L_GAP_1 - .L_GAP_TABLE
-.L_GAP_0: /* if (incx == 1) */
-    DGEMV_T GAP_0, X8, X4
-.L_GAP_1: /* if (incx != 1) */
-    DGEMV_T GAP_1, X8_GAP, X4_GAP
-.L_END:
-    pop_if_used 17 + 8, 24 + 3
-    jirl    $r0, $r1, 0x0
-    EPILOGUE
diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S
deleted file mode 100644
index 8876cbed9..000000000
--- a/kernel/loongarch64/loongarch64_asm.S
+++ /dev/null
@@ -1,313 +0,0 @@
-/*******************************************************************************
-Copyright (c) 2023, The OpenBLAS Project
-All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in
-the documentation and/or other materials provided with the
-distribution.
-3. Neither the name of the OpenBLAS project nor the names of
-its contributors may be used to endorse or promote products
-derived from this software without specific prior written permission.
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*******************************************************************************/
-
-#if __loongarch_grlen == 64
-#define LA_REG    int64_t
-#define REG_SIZE  8
-#define REG_LOG   3
-#define PTR_ADDI  addi.d
-#define PTR_ADD   add.d
-#define PTR_SUB   sub.d
-#define PTR_LD    ld.d
-#define PTR_ST    st.d
-#define PTR_SLLI  slli.d
-#define PTR_SRLI  srli.d
-#define PTR_ALSL  alsl.d
-#else
-#define LA_REG    int32_t
-#define REG_SIZE  4
-#define REG_LOG   2
-#define PTR_ADDI  addi.w
-#define PTR_ADD   add.w
-#define PTR_SUB   sub.w
-#define PTR_LD    ld.w
-#define PTR_ST    st.w
-#define PTR_SLLI  slli.w
-#define PTR_SRLI  srli.w
-#define PTR_ALSL  alsl.w
-#endif
-
-#if __loongarch_frlen == 64
-#define FREG_SIZE 8
-#define FREG_LOG  3
-#define PTR_FLD   fld.d
-#define PTR_FST   fst.d
-#else
-#define FREG_SIZE 4
-#define FREG_LOG  2
-#define PTR_FLD   fld.s
-#define PTR_FST   fst.s
-#endif
-
-// The max registers available to the user which
-// do not need to be preserved across calls.
-// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
-#define MAX_INT_CALLER_SAVED 17
-#define MAX_FP_CALLER_SAVED  24
-
-.altmacro // Enable alternate macro mode
-
-.macro push_if_used regs, fregs
-.if \regs > MAX_INT_CALLER_SAVED
-    PTR_ADDI      $sp,    $sp,    -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
-    push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
-.endif
-.if \fregs > MAX_FP_CALLER_SAVED
-    PTR_ADDI      $sp,    $sp,    -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
-    push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
-.endif
-.endm // End push_if_used
-.macro pop_if_used regs, fregs
-.if \fregs > MAX_FP_CALLER_SAVED
-    pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
-    PTR_ADDI      $sp,    $sp,    (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
-.endif
-.if \regs > MAX_INT_CALLER_SAVED
-    pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
-    PTR_ADDI      $sp,    $sp,    (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
-.endif
-.endm // End pop_if_used
-.macro push_regs from, to
-    PTR_ST    $s\()\from,     $sp,    \from << REG_LOG
-.if \to - \from
-    push_regs %from + 1, \to
-.endif
-.endm // End push_regs
-.macro pop_regs from, to
-    PTR_LD    $s\()\from,     $sp,    \from << REG_LOG
-.if \to - \from
-    pop_regs %from + 1, \to
-.endif
-.endm // End pop_regs
-.macro push_fregs from, to
-    PTR_FST   $fs\()\from,    $sp,    \from << FREG_LOG
-.if \to - \from
-    push_fregs %from + 1, \to
-.endif
-.endm // End push_fregs
-.macro pop_fregs from, to
-    PTR_FLD   $fs\()\from,    $sp,    \from << FREG_LOG
-.if \to - \from
-    pop_fregs %from + 1, \to
-.endif
-.endm // End pop_fregs
-
-//
-// Instruction Related Macros
-//
-// GLD
-//
-.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
-.ifeqs "\suf_op", "0"
-    \pre_op\()ld    \out,   \src,   \offset
-.else
-    \pre_op\()ld.\suf_op    \out,   \src,   \offset
-.endif
-.ifnb \more
-    GLD \pre_op, \suf_op, \more
-.endif
-.endm
-
-//
-// GLD_INC
-//
-.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
-.ifeqs "\suf_op", "0"
-    \pre_op\()ld    \out,   \src,   \offset
-.else
-    \pre_op\()ld.\suf_op    \out,   \src,   \offset
-.endif
-    PTR_ADDI  \src,   \src,   \inc
-.ifnb \more
-    GLD_INC \pre_op, \suf_op, \inc, \more
-.endif
-.endm
-//
-// GLDX is same as GLD except the stride is a register
-//
-.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
-.ifeqs "\suf_op", "0"
-    \pre_op\()ldx    \out,   \src,   \offset
-.else
-    \pre_op\()ldx.\suf_op    \out,   \src,   \offset
-.endif
-.ifnb \more
-    GLDX \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GLDREPL
-//
-.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
-    \pre_op\()ldrepl.\suf_op    \out,   \src,   \offset
-.ifnb \more
-    GLDREPL  \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GST
-//
-.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
-.ifeqs "\suf_op", "0"
-    \pre_op\()st    \src,   \dst,   \offset
-.else
-    \pre_op\()st.\suf_op \src,  \dst, \offset
-.endif
-.ifnb \more
-    GST \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GMUL
-//
-.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
-    \pre_op\()mul.\suf_op   \out,   \in0,   \in1
-.ifnb \more
-    GMUL \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GMADD
-//
-.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
-    \pre_op\()madd.\suf_op \out, \in0, \in1, \in2
-.ifnb \more
-    GMADD \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GADD
-//
-.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
-    \pre_op\()add.\suf_op \out, \in0, \in1
-.ifnb \more
-    GADD \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GADDI
-//
-.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
-    \pre_op\()addi.\suf_op  \out,   \in0,   \in1
-.ifnb \more
-    GADDI \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GSLLI
-//
-.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
-    \pre_op\()slli.\suf_op  \out,   \in0,   \in1
-.ifnb \more
-    GSLLI \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GINSVE0
-//
-.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
-    \pre_op\()insve0.\suf_op    \out,   \in0,   \in1
-.ifnb \more
-    GINSVE0 \pre_op, \suf_op, \more
-.endif
-.endm
-//
-// GXOR
-//
-.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
-    \pre_op\()xor.\suf_op    \out,   \in0,   \in1
-.ifnb \more
-    GXOR \pre_op, \suf_op, \more
-.endif
-.endm
-
-//
-// Compound instructions
-//
-// GACC: Accumulate the values of vector registers
-//
-.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
-.ifeqs "\pre_op", "xvf"
-    xvpermi.q              \out,   \in,    0x01
-    \pre_op\()add.\suf_op  \in,    \out,   \in
-    xvpackod.d             \out,   \in,    \in
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.ifeqs "\suf_op", "s"
-    xvpackod.w             \in,    \out,   \out
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.endif
-.endif
-
-.ifeqs "\pre_op", "vf"
-    vpackod.d              \out,   \in,    \in
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.ifeqs "\suf_op", "s"
-    vpackod.w              \in,    \out,   \out
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.endif
-.endif
-
-.ifeqs "\pre_op", "xv"
-    xvpermi.q              \out,   \in,    0x01
-    \pre_op\()add.\suf_op  \in,    \out,   \in
-    xvpackod.d             \out,   \in,    \in
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.ifnc "\suf_op", "d"
-    xvpackod.w             \in,    \out,   \out
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.ifnc "\suf_op", "w"
-    xvpackod.h             \in,    \out,   \out
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.ifnc "\suf_op", "h"
-    xvpackod.b             \in,    \out,   \out
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.endif
-.endif
-.endif
-.endif
-
-.ifeqs "\pre_op", "v"
-    vpackod.d              \out,   \in,    \in
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.ifnc "\suf_op", "d"
-    vpackod.w              \in,    \out,   \out
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.ifnc "\suf_op", "w"
-    vpackod.h              \in,    \out,   \out
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.ifnc "\suf_op", "h"
-    vpackod.b              \in,    \out,   \out
-    \pre_op\()add.\suf_op  \out,   \out,   \in
-.endif
-.endif
-.endif
-.endif
-
-.ifnb \more
-    GACC \pre_op, \suf_op, \more
-.endif
-.endm

From 71fcee6eef23bd058d596c42f2d90494a629b401 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Thu, 29 Jun 2023 11:11:08 +0800
Subject: [PATCH 257/718] LoongArch64: Update dgemm kernel

---
 kernel/loongarch64/dgemm_kernel_16x4.S | 4058 ++++++++++--------------
 1 file changed, 1664 insertions(+), 2394 deletions(-)

diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S
index 13faa977e..f8e26fda2 100644
--- a/kernel/loongarch64/dgemm_kernel_16x4.S
+++ b/kernel/loongarch64/dgemm_kernel_16x4.S
@@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
+/*********************************************************************
+* 2023/06/28 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+* 2023/06/28 guxiwei
+* Parameter:
+*       DGEMM_DEFAULT_UNROLL_N  4
+*       DGEMM_DEFAULT_UNROLL_M  16
+*       DGEMM_DEFAULT_P         32
+*       DGEMM_DEFAULT_Q         152
+*       DGEMM_DEFAULT_R         858
+*       A_PR1                   1024
+*       B_PR1                   256
+*
+*
+* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000:
+*       1 thread:       36.0 GFLOPS
+*       2 threads:      71.6 GFLOPS
+*       3 threads:     101.5 GFLOPS
+*       4 threads:     132.8 GFLOPS
+*********************************************************************/
+
 /* Function parameters */
 #define M      $r4   // param 1: bm
 #define N      $r5   // param 2: bn
@@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define U4     $xr4
 #define U5     $xr5
 #define U6     $xr6
-#define D0     $xr7
-#define D1     $xr8
-#define D2     $xr9
-#define D3     $xr10
-#define D4     $xr11
-#define D5     $xr12
-#define D6     $xr13
-#define D7     $xr14
-#define D8     $xr15
-#define D9     $xr16
-#define D10    $xr17
-#define D11    $xr18
-#define D12    $xr19
-#define D13    $xr20
-#define D14    $xr21
-#define D15    $xr22
-#define VALPHA $xr23
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+#define VALPHA $xr15
 
 /* Prefetch interval */
-#define A_PRE  0x200
+#define A_PRE  0x400
 #define B_PRE  0x100
 
-    PROLOGUE
-
-    addi.d   $sp,   $sp,   -56
-    /* Store regs */
-    SDARG    $r23,  $sp,   0
-    SDARG    $r24,  $sp,   8
-    SDARG    $r25,  $sp,   16
-    SDARG    $r26,  $sp,   24
-    SDARG    $r27,  $sp,   32
-    ST       $f23,  $sp,   40
-    ST       ALPHA, $sp,   48
-
-    /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */
-    xvld         VALPHA, $sp,  48
-    xvreplve0.d  VALPHA, VALPHA
-
-#if defined (TRMMKERNEL) && !defined(LEFT)
-    sub.d   OFF,   ZERO,  OFFSET
-#else
-    xor     OFF,   OFF,   OFF
-#endif
-
-    /* if (!(N >> 2)) goto L_N3 */
-    srai.d   J,     N,     2     /* J = bn >> 2 */
-    andi     N,     N,     0x03
-    beq      ZERO,  J,     .L_N3
-
-.L_J1: /* J-- && This loop include Condition 1 */
-
-/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! *************************
-*                                                   dgemm_core_16x4                                */
-    move     C0,    C
-    move     A0,    A
-    slli.d   T0,    LDC,   3
-    add.d    C1,    C0,    T0
-    addi.d   J,     J,     -1   /* J-- */
-    add.d    C2,    C1,    T0
-    add.d    C3,    C2,    T0
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-    move     OFF,   OFFSET
-#endif
-
-    /* if (!(M >> 4)) goto L_M8 */
-    srai.d   I,     M,     4     /* I = bm >> 4 */
-    beq      ZERO,  I,     .L_M8
-
-.L_I1: /* I-- */
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x07
-    add.d    A0,    A0,   T0
-    slli.d   T0,    OFF,  0x05
-    add.d    B0,    B,    T0
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  16
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  4
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-    /* Calculate the first set of D0~D15,
-     * avoidig set 0 operation
-     * Load 16 * 64 from A0
-     * U0 = {a3,  a2,  a1,  a0}
-     * U1 = {a7,  a6,  a5,  a4}
-     * U2 = {a11, a10, a9,  a8}
-     * U3 = {a15, a14, a13, a12}
-     */
+.macro KERNEL2x16x4
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d     U4, B0, 0x00
-    preld          0,  C0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-    xvfmul.d  D1,  U1, U4
-    preld     0,   C0, 0x40
-    xvfmul.d  D2,  U2, U4
-    xvfmul.d  D3,  U3, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    preld          0,   C1,    0x00
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-    xvfmul.d  D5,  U1, U4
-    preld     0,   C1,    0x40
-    xvfmul.d  D6,  U2, U4
-    xvfmul.d  D7,  U3, U4
-
-    xvldrepl.d     U4, B0, 0x10
-    preld          0,   C2,    0x00
-    /* line 3 */
-    xvfmul.d  D8,  U0, U4
-    xvfmul.d  D9,  U1, U4
-    preld     0,   C2,    0x40
-    xvfmul.d  D10, U2, U4
-    xvfmul.d  D11, U3, U4
-
-    xvldrepl.d     U4, B0, 0x18
-    preld          0,   C3,    0x00
-    /* line 4 */
-    xvfmul.d  D12, U0, U4
-    xvfmul.d  D13, U1, U4
-    preld     0,   C3,    0x40
-    xvfmul.d  D14, U2, U4
-    xvfmul.d  D15, U3, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x80
-    addi.d    B0,  B0, 0x20
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_L7 */
-    beq       ZERO,TL, .L_L7
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
 
-    /* Calculate 8 sets of D0~D15 */
-.L_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
     xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
     xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
     preld      0,   B0, B_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D10, U10, U14, D10
+    xvfmadd.d  D11, U11, U14, D11
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
     preld      0,   A0, A_PRE
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
     preld      0,   A0, A_PRE + 0x40
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D14, U10, U15, D14
+    xvfmadd.d  D15, U11, U15, D15
 
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
 
-           /***8-2***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
+    xvld     U8,   A0,    0x00
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
     xvfmadd.d  D2,  U2, U4, D2
     xvfmadd.d  D3,  U3, U4, D3
+
+    xvld     U10,   A0,    0x40
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvld     U11,   A0,    0x60
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvldrepl.d U12,  B0, 0x00
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
     preld      0,   B0, B_PRE
+    xvldrepl.d U13,  B0, 0x08
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
     preld      0,   A0, A_PRE
+    xvldrepl.d U14,  B0, 0x10
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
     preld      0,   A0, A_PRE + 0x40
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
 
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
+.endm
 
-           /***8-3***/
-    /* Load 16 * 64 from A0 */
+.macro KERNEL2x16x4_END
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
     xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
     xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
     preld      0,   B0, B_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D10, U10, U14, D10
+    xvfmadd.d  D11, U11, U14, D11
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
     preld      0,   A0, A_PRE
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
     preld      0,   A0, A_PRE + 0x40
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D14, U10, U15, D14
+    xvfmadd.d  D15, U11, U15, D15
 
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
 
-           /***8-4***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
+
     xvfmadd.d  D2,  U2, U4, D2
     xvfmadd.d  D3,  U3, U4, D3
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
     preld      0,   B0, B_PRE
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
     preld      0,   A0, A_PRE
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
     preld      0,   A0, A_PRE + 0x40
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+.macro KERNEL8x16x4
+.rept 4
+    KERNEL2x16x4
+.endr
+.endm
 
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x16x4_END
+.rept 3
+    KERNEL2x16x4
+.endr
+    KERNEL2x16x4_END
+.endm
 
-           /***8-5***/
-    /* Load 16 * 64 from A0 */
+.macro KERNEL2x8x4
     xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-    preld      0,   B0, B_PRE
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-    preld      0,   A0, A_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
-    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
 
-    addi.d     A0,  A0, 0x80
+    addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x20
 
-           /***8-6***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
 
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U12, B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-    preld      0,   B0, B_PRE
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-    preld      0,   A0, A_PRE
+    xvldrepl.d U13, B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
-    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U14, B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
 
-    addi.d     A0,  A0, 0x80
+    addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x20
+.endm
 
-           /***8-7***/
-    /* Load 16 * 64 from A0 */
+.macro KERNEL2x8x4_END
     xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-    preld      0,   B0, B_PRE
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-    preld      0,   A0, A_PRE
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
-    preld      0,   A0, A_PRE + 0x40
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+.endm
 
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x8x4
+.rept 4
+    KERNEL2x8x4
+.endr
+.endm
 
-           /***8-8***/
-    /* Load 16 * 64 from A0 */
+.macro KERNEL8x8x4_END
+.rept 3
+    KERNEL2x8x4
+.endr
+    KERNEL2x8x4_END
+.endm
+
+.macro KERNEL2x4x4
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-    preld      0,   B0, B_PRE
+    xvfmadd.d  D0,  U8, U12, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-    preld      0,   A0, A_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
-    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
 
-    addi.d     A0,  A0, 0x80
+    addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x20
 
-    addi.d    TL,  TL, -1 /* TL-- */
-    blt       ZERO,TL, .L_TL1
+    xvld     U8,   A0,    0x00
 
-   /* Maybe we need calculate the last
-    * 7 sets of D0~D15?
-    */
-.L_L7:
-    /* if (!(L & 7)) goto L_L0 */
-    andi      TL,  L,   7
-    beq       TL,  ZERO,.L_L0
+    xvldrepl.d U12, B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
 
-.L_L71:
-    /* Load 16 * 64 from A0 */
+    xvldrepl.d U13, B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U14, B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x4x4_END
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    xvfmadd.d  D0,  U8, U12, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
 
-    /* Add stride for A0, B0 */
-    addi.d     A0,  A0, 0x80
+    addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x20
 
-    addi.d     TL,  TL, -1
-    blt        ZERO,TL, .L_L71
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D12, U0, U7, D12
+.endm
 
-.L_L0:
-#if defined(TRMMKERNEL)
-    xvfmul.d  D0,   D0,  VALPHA
-    xvfmul.d  D1,   D1,  VALPHA
-    xvfmul.d  D2,   D2,  VALPHA
-    xvfmul.d  D3,   D3,  VALPHA
-    xvfmul.d  D4,   D4,  VALPHA
-    xvfmul.d  D5,   D5,  VALPHA
-    xvfmul.d  D6,   D6,  VALPHA
-    xvfmul.d  D7,   D7,  VALPHA
-    xvfmul.d  D8,   D8,  VALPHA
-    xvfmul.d  D9,   D9,  VALPHA
-    xvfmul.d  D10,  D10, VALPHA
-    xvfmul.d  D11,  D11, VALPHA
-    xvfmul.d  D12,  D12, VALPHA
-    xvfmul.d  D13,  D13, VALPHA
-    xvfmul.d  D14,  D14, VALPHA
-    xvfmul.d  D15,  D15, VALPHA
-#else
-    /* Load C0  */
-    xvld      U0,  C0,  0x00
-    xvld      U1,  C0,  0x20
-    xvld      U2,  C0,  0x40
-    xvld      U3,  C0,  0x60
-    xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
-    xvfmadd.d D1,  D1,  VALPHA,  U1
-    xvfmadd.d D2,  D2,  VALPHA,  U2
-    xvfmadd.d D3,  D3,  VALPHA,  U3
+.macro KERNEL8x4x4
+.rept 4
+    KERNEL2x4x4
+.endr
+.endm
 
-    /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvld      U1,  C1,  0x20
-    xvld      U2,  C1,  0x40
-    xvld      U3,  C1,  0x60
-    xvfmadd.d D4,  D4,  VALPHA,  U0
-    xvfmadd.d D5,  D5,  VALPHA,  U1
-    xvfmadd.d D6,  D6,  VALPHA,  U2
-    xvfmadd.d D7,  D7,  VALPHA,  U3
+.macro KERNEL8x4x4_END
+.rept 3
+    KERNEL2x4x4
+.endr
+    KERNEL2x4x4_END
+.endm
 
-    /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvld      U1,  C2,  0x20
-    xvld      U2,  C2,  0x40
-    xvld      U3,  C2,  0x60
-    xvfmadd.d D8,  D8,  VALPHA,  U0
-    xvfmadd.d D9,  D9,  VALPHA,  U1
-    xvfmadd.d D10, D10, VALPHA,  U2
-    xvfmadd.d D11, D11, VALPHA,  U3
+.macro KERNEL2x2x4
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
 
-    /* Load C3  */
-    xvld      U0,  C3,  0x00
-    xvld      U1,  C3,  0x20
-    xvld      U2,  C3,  0x40
-    xvld      U3,  C3,  0x60
-    xvfmadd.d D12, D12, VALPHA,  U0
-    xvfmadd.d D13, D13, VALPHA,  U1
-    xvfmadd.d D14, D14, VALPHA,  U2
-    xvfmadd.d D15, D15, VALPHA,  U3
-#endif // #if defined(TRMMKERNEL)
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvfmadd.d   D1,     U9,     U12,    D1
 
-    /* Store C0 */
-    xvst      D0,  C0,  0x00
-    xvst      D1,  C0,  0x20
-    xvst      D2,  C0,  0x40
-    xvst      D3,  C0,  0x60
-    /* Store C1 */
-    xvst      D4,  C1,  0x00
-    xvst      D5,  C1,  0x20
-    xvst      D6,  C1,  0x40
-    xvst      D7,  C1,  0x60
-    /* Store C2 */
-    xvst      D8,  C2,  0x00
-    xvst      D9,  C2,  0x20
-    xvst      D10, C2,  0x40
-    xvst      D11, C2,  0x60
-    /* Store C3 */
-    xvst      D12, C3,  0x00
-    xvst      D13, C3,  0x20
-    xvst      D14, C3,  0x40
-    xvst      D15, C3,  0x60
+    xvld    U4,  B0, 0x00
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
 
-    /* Add stride for C */
-    addi.d    C0,  C0,  0x80
-    addi.d    C1,  C1,  0x80
-    addi.d    C2,  C2,  0x80
-    addi.d    C3,  C3,  0x80
+    xvldrepl.d     U8,   A0,    0x00
+    xvldrepl.d     U9,   A0,    0x08
 
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    sub.d     L,   K,   OFF
-#ifdef LEFT
-    /* number of values  in A */
-    addi.d    L,   L,   -16
-#else
-    /* number of values in B */
-    addi.d    L,   L,   -4
-#endif
-    slli.d    T0,  L,  0x07
-    add.d     A0,  A0, T0
-    slli.d    T0,  L,  0x05
-    add.d     B0,  B0, T0
-#endif
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
 
-#ifdef LEFT
-    addi.d    OFF, OFF, 0x10
-#endif
-#endif   // #if defined(TRMMKERNEL)
+    xvld       U12, B0, 0x00
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x20
+.endm
 
-    addi.d    I,   I,   -1  /* I-- */
-    blt       ZERO,I,   .L_I1
+.macro KERNEL2x2x4_END
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
 
-.L_M8:
-    /* We have done M & 16, considering M=8/4/2/1 */
-    andi      I,   M,   15
-    beq       ZERO,I,   .L_M0
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvfmadd.d   D1,     U9,     U12,    D1
 
-    andi      I,   M,   8
-    beq       ZERO,I,   .L_M4
+    xvld    U4,  B0, 0x00
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
 
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x06
-    add.d    A0,    A0,   T0
-    slli.d   T0,    OFF,  0x05
-    add.d    B0,    B,    T0
-#endif
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
+.endm
 
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  8
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  4
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif  // #if defined(TRMMKERNEL)
+.macro KERNEL8x2x4
+.rept 4
+    KERNEL2x2x4
+.endr
+.endm
 
-    /* Load 8 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+.macro KERNEL8x2x4_END
+.rept 3
+    KERNEL2x2x4
+.endr
+    KERNEL2x2x4_END
+.endm
 
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-    xvfmul.d  D1,  U1, U4
+.macro KERNEL2x1x4
+    xvldrepl.d  U0,     A0,     0x00
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvld        U4,     B0,     0x00
 
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-    xvfmul.d  D5,  U1, U4
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
 
-    xvldrepl.d     U4, B0, 0x10
-    /* line 3 */
-    xvfmul.d  D8,  U0, U4
-    xvfmul.d  D9,  U1, U4
+    xvldrepl.d  U8,     A0,     0x00
+    xvfmadd.d   D0,     U0,     U4,     D0
+    xvld        U12,    B0,     0x00
 
-    xvldrepl.d     U4, B0, 0x18
-    /* line 4 */
-    xvfmul.d  D12, U0, U4
-    xvfmul.d  D13, U1, U4
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+.endm
 
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x40
-    addi.d    B0,  B0, 0x20
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_M8_L7 */
-    beq       ZERO,TL, .L_M8_L7
+.macro KERNEL2x1x4_END
+    xvldrepl.d  U0,     A0,     0x00
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvld        U4,     B0,     0x00
 
-.L_M8_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+
+    xvfmadd.d   D0,     U0,     U4,     D0
+.endm
+
+.macro KERNEL8x1x4
+.rept 4
+    KERNEL2x1x4
+.endr
+.endm
+
+.macro KERNEL8x1x4_END
+.rept 3
+    KERNEL2x1x4
+.endr
+    KERNEL2x1x4_END
+.endm
+
+.macro KERNEL2x16x2
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvld     U10,   A0,    0x40
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvld     U11,   A0,    0x60
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+.endm
 
-           /***8-2***/
+.macro KERNEL2x16x2_END
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+.endm
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x16x2
+.rept 4
+    KERNEL2x16x2
+.endr
+.endm
+
+.macro KERNEL8x16x2_END
+.rept 3
+    KERNEL2x16x2
+.endr
+    KERNEL2x16x2_END
+.endm
 
-           /***8-3***/
+.macro KERNEL2x8x2
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
 
     addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    addi.d     B0,  B0, 0x10
+.endm
 
-           /***8-4***/
+.macro KERNEL2x8x2_END
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+.endm
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+.macro KERNEL8x8x2
+.rept 4
+    KERNEL2x8x2
+.endr
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+.macro KERNEL8x8x2_END
+.rept 3
+    KERNEL2x8x2
+ .endr
+    KERNEL2x8x2_END
+.endm
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
-
-           /***8-5***/
+.macro KERNEL2x4x2
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
 
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvldrepl.d U5,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+.endm
 
-           /***8-6***/
+.macro KERNEL2x4x2_END
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
 
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvldrepl.d U5,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+.macro KERNEL8x4x2
+.rept 4
+    KERNEL2x4x2
+.endr
+.endm
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x4x2_END
+.rept 3
+    KERNEL2x4x2
+.endr
+    KERNEL2x4x2_END
+.endm
 
-           /***8-7***/
+.macro KERNEL2x2x2
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
     xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D4,  U0, U5, D4
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+.macro KERNEL2x2x2_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
 
-           /***8-8***/
+.macro KERNEL8x2x2
+.rept 4
+    KERNEL2x2x2
+.endr
+.endm
+
+.macro KERNEL8x2x2_END
+.rept 3
+    KERNEL2x2x2
+.endr
+    KERNEL2x2x2_END
+.endm
+
+.macro KERNEL2x1x2
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
     xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D4,  U0, U5, D4
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+.macro KERNEL2x1x2_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
 
-    addi.d    TL,  TL, -1 /* TL-- */
-    blt       ZERO,TL, .L_M8_TL1
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
 
-.L_M8_L7:
-    /* if (!(L & 7)) goto L_M8_L0 */
-    andi      TL,  L,   7
-    beq       TL,  ZERO,.L_M8_L0
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
 
-.L_M8_L71:
+.macro KERNEL8x1x2
+.rept 4
+    KERNEL2x1x2
+.endr
+.endm
+
+.macro KERNEL8x1x2_END
+.rept 3
+    KERNEL2x1x2
+.endr
+    KERNEL2x1x2_END
+.endm
+
+.macro KERNEL2x16x1
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
 
     xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+
+    xvld     U8,   A0,    0x00
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvld     U10,   A0,    0x40
+    xvld     U11,   A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvldrepl.d U12,  B0, 0x00
 
-    /* Add stride for A0, B0 */
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+.endm
 
-    addi.d     TL,  TL, -1
-    blt        ZERO,TL, .L_M8_L71
+.macro KERNEL2x16x1_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
 
-.L_M8_L0:
-#if defined(TRMMKERNEL)
-    xvfmul.d  D0,   D0,  VALPHA
-    xvfmul.d  D1,   D1,  VALPHA
-    xvfmul.d  D4,   D4,  VALPHA
-    xvfmul.d  D5,   D5,  VALPHA
-    xvfmul.d  D8,   D8,  VALPHA
-    xvfmul.d  D9,   D9,  VALPHA
-    xvfmul.d  D12,  D12, VALPHA
-    xvfmul.d  D13,  D13, VALPHA
-#else
-    /* Load C0  */
-    xvld      U0,  C0,  0x00
-    xvld      U1,  C0,  0x20
-    xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
-    xvfmadd.d D1,  D1,  VALPHA,  U1
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
 
-    /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvld      U1,  C1,  0x20
-    xvfmadd.d D4,  D4,  VALPHA,  U0
-    xvfmadd.d D5,  D5,  VALPHA,  U1
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
 
-    /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvld      U1,  C2,  0x20
-    xvfmadd.d D8,  D8,  VALPHA,  U0
-    xvfmadd.d D9,  D9,  VALPHA,  U1
+    xvldrepl.d U4,  B0, 0x00
 
-    /* Load C3  */
-    xvld      U0,  C3,  0x00
-    xvld      U1,  C3,  0x20
-    xvfmadd.d D12, D12, VALPHA,  U0
-    xvfmadd.d D13, D13, VALPHA,  U1
-#endif   // #if defined(TRMMKERNEL)
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
 
-    /* Store C0 */
-    xvst      D0,  C0,  0x00
-    xvst      D1,  C0,  0x20
-    /* Store C1 */
-    xvst      D4,  C1,  0x00
-    xvst      D5,  C1,  0x20
-    /* Store C2 */
-    xvst      D8,  C2,  0x00
-    xvst      D9,  C2,  0x20
-    /* Store C3 */
-    xvst      D12, C3,  0x00
-    xvst      D13, C3,  0x20
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
 
-    /* Add stride for C */
-    addi.d    C0,  C0,  0x40
-    addi.d    C1,  C1,  0x40
-    addi.d    C2,  C2,  0x40
-    addi.d    C3,  C3,  0x40
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+.endm
 
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    sub.d     L,   K,   OFF
-#ifdef LEFT
-    /* number of values in A */
-    addi.d    L,   L,   -8
-#else
-    /* number of values in B */
-    addi.d    L,   L,   -4
-#endif
-    slli.d    T0,  L,  0x06
-    add.d     A0,  A0, T0
-    slli.d    T0,  L,  0x05
-    add.d     B0,  B0, T0
-#endif
+.macro KERNEL8x16x1
+.rept 4
+    KERNEL2x16x1
+.endr
+.endm
 
-#ifdef LEFT
-    /* number of values in A */
-    addi.d    OFF,   OFF,  0x08
-#endif
-#endif   // #if defined(TRMMKERNEL)
+.macro KERNEL8x16x1_END
+.rept 3
+    KERNEL2x16x1
+.endr
+    KERNEL2x16x1_END
+.endm
 
-/********LOOP (if(N >> 2 ) && (M & 8)) End************/
+.macro KERNEL2x8x1
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+    xvld     U1,   A0,    0x20
+    xvldrepl.d U4,  B0, 0x00
 
-.L_M4:
-    andi      I,   M,   4
-    beq       ZERO,I,   .L_M2
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
 
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x05
-    add.d    A0,    A0,   T0
-    add.d    B0,    B,    T0
-#endif
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvld     U9,   A0,    0x20
+    xvldrepl.d U12,  B0, 0x00
 
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  4
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  4
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+.endm
 
-    /* Load 4 * 64 from A0 */
+.macro KERNEL2x8x1_END
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+    xvld     U1,   A0,    0x20
+    xvldrepl.d U4,  B0, 0x00
 
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
 
-    xvldrepl.d     U4, B0, 0x10
-    /* line 3 */
-    xvfmul.d  D8,  U0, U4
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+.endm
 
-    xvldrepl.d     U4, B0, 0x18
-    /* line 4 */
-    xvfmul.d  D12, U0, U4
+.macro KERNEL8x8x1
+.rept 4
+    KERNEL2x8x1
+.endr
+.endm
 
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x20
-    addi.d    B0,  B0, 0x20
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_M4_L7 */
-    beq       ZERO,TL, .L_M4_L7
+.macro KERNEL8x8x1_END
+.rept 3
+    KERNEL2x8x1
+.endr
+    KERNEL2x8x1_END
+.endm
 
-.L_M4_TL1: /* TL-- */
-           /***8-1***/
+.macro KERNEL2x4x1
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
 
     addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+    addi.d     B0,  B0, 0x08
+.endm
 
-           /***8-2***/
+.macro KERNEL2x4x1_END
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvfmadd.d  D0,  U0, U4, D0
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.macro KERNEL8x4x1
+.rept 4
+    KERNEL2x4x1
+.endr
+.endm
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x4x1_END
+.rept 3
+    KERNEL2x4x1
+.endr
+    KERNEL2x4x1_END
+.endm
 
-           /***8-3***/
+.macro KERNEL2x2x1
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+.endm
 
-           /***8-4***/
+.macro KERNEL2x2x1_END
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvfmadd.d  D0,  U0, U4, D0
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.macro KERNEL8x2x1
+.rept 4
+    KERNEL2x2x1
+.endr
+.endm
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x2x1_END
+.rept 3
+    KERNEL2x2x1
+.endr
+    KERNEL2x2x1_END
+.endm
 
-           /***8-5***/
+.macro KERNEL2x1x1
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+.endm
 
-           /***8-6***/
+.macro KERNEL2x1x1_END
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0
     xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
     xvfmadd.d  D0,  U0, U4, D0
+.endm
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+.macro KERNEL8x1x1
+.rept 4
+    KERNEL2x1x1
+.endr
+.endm
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+.macro KERNEL8x1x1_END
+.rept 3
+    KERNEL2x1x1
+.endr
+    KERNEL2x1x1_END
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+    PROLOGUE
 
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    addi.d   $sp,   $sp,   -120
+    /* Store regs */
+    SDARG    $r23,  $sp,   0
+    SDARG    $r24,  $sp,   8
+    SDARG    $r25,  $sp,   16
+    SDARG    $r26,  $sp,   24
+    SDARG    $r27,  $sp,   32
+    ST       $f23,  $sp,   40
+    ST       $f24,  $sp,   48
+    ST       $f25,  $sp,   56
+    ST       $f26,  $sp,   64
+    ST       $f27,  $sp,   72
+    ST       $f28,  $sp,   80
+    ST       $f29,  $sp,   88
+    ST       $f30,  $sp,   96
+    ST       $f31,  $sp,   104
+    ST       ALPHA, $sp,   112
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d   OFF,   ZERO,  OFFSET
+#else
+    xor     OFF,   OFF,   OFF
+#endif
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    /* if (!(N >> 2)) goto L_N3 */
+    srai.d   J,     N,     2     /* J = bn >> 2 */
+    andi     N,     N,     0x03
+    xvldrepl.d  VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */
+    beq      ZERO,  J,     .L_N3
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+.L_J1: /* J-- && This loop include Condition 1 */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! *************************
+*                                                   dgemm_core_16x4                                */
+    move     C0,    C
+    move     A0,    A
+    slli.d   T0,    LDC,   3
+    add.d    C1,    C0,    T0
+    addi.d   J,     J,     -1   /* J-- */
+    add.d    C2,    C1,    T0
+    add.d    C3,    C2,    T0
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 4)) goto L_M8 */
+    srai.d   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_M8
 
-           /***8-8***/
+.L_I1: /* I-- */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x07
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  16
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    /* Calculate the first set of D0~D15,
+     * avoidig set 0 operation
+     * Load 16 * 64 from A0
+     * U0 = {a3,  a2,  a1,  a0}
+     * U1 = {a7,  a6,  a5,  a4}
+     * U2 = {a11, a10, a9,  a8}
+     * U3 = {a15, a14, a13, a12}
+     */
     xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d     U4, B0, 0x00
+    preld          0,  C0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    preld     0,   C0, 0x40
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d     U5, B0, 0x08
+    preld          0,   C1,    0x00
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+    preld     0,   C1,    0x40
+    xvfmul.d  D6,  U2, U5
+    xvfmul.d  D7,  U3, U5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvldrepl.d     U6, B0, 0x10
+    preld          0,   C2,    0x00
+    /* line 3 */
+    xvfmul.d  D8,  U0, U6
+    xvfmul.d  D9,  U1, U6
+    preld     0,   C2,    0x40
+    xvfmul.d  D10, U2, U6
+    xvfmul.d  D11, U3, U6
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvldrepl.d     U7, B0, 0x18
+    preld          0,   C3,    0x00
+    /* line 4 */
+    xvfmul.d  D12, U0, U7
+    xvfmul.d  D13, U1, U7
+    preld     0,   C3,    0x40
+    xvfmul.d  D14, U2, U7
+    xvfmul.d  D15, U3, U7
 
-    addi.d     A0,  A0, 0x20
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x80
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_L7 */
+    beq       ZERO,TL, .L_L7
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
+
+    addi.d    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
 
+    beq    ZERO,    TL,  .L_TL1_END
+.L_TL1: /* TL-- */
+    KERNEL8x16x4
     addi.d    TL,  TL, -1 /* TL-- */
-    blt       ZERO,TL, .L_M4_TL1
+    blt       ZERO,TL, .L_TL1
 
-.L_M4_L7:
-    /* if (!(L & 7)) goto L_M4_L0 */
+.L_TL1_END:
+    KERNEL8x16x4_END
+
+   /* Maybe we need calculate the last
+    * 7 sets of D0~D15?
+    */
+.L_L7:
+    /* if (!(L & 7)) goto L_L0 */
     andi      TL,  L,   7
-    beq       TL,  ZERO,.L_M4_L0
+    beq       TL,  ZERO,.L_L0
 
-.L_M4_L71:
+.L_L71:
+    /* Load 16 * 64 from A0 */
     xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
 
+    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
 
     /* Add stride for A0, B0 */
-    addi.d     A0,  A0, 0x20
+    addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
 
     addi.d     TL,  TL, -1
-    blt        ZERO,TL, .L_M4_L71
+    blt        ZERO,TL, .L_L71
 
-.L_M4_L0:
+.L_L0:
+    xvldrepl.d  VALPHA, $sp, 112
 #if defined(TRMMKERNEL)
     xvfmul.d  D0,   D0,  VALPHA
+    xvfmul.d  D1,   D1,  VALPHA
+    xvfmul.d  D2,   D2,  VALPHA
+    xvfmul.d  D3,   D3,  VALPHA
     xvfmul.d  D4,   D4,  VALPHA
+    xvfmul.d  D5,   D5,  VALPHA
+    xvfmul.d  D6,   D6,  VALPHA
+    xvfmul.d  D7,   D7,  VALPHA
     xvfmul.d  D8,   D8,  VALPHA
+    xvfmul.d  D9,   D9,  VALPHA
+    xvfmul.d  D10,  D10, VALPHA
+    xvfmul.d  D11,  D11, VALPHA
     xvfmul.d  D12,  D12, VALPHA
+    xvfmul.d  D13,  D13, VALPHA
+    xvfmul.d  D14,  D14, VALPHA
+    xvfmul.d  D15,  D15, VALPHA
 #else
     /* Load C0  */
     xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    xvfmadd.d D1,  D1,  VALPHA,  U1
+    xvfmadd.d D2,  D2,  VALPHA,  U2
+    xvfmadd.d D3,  D3,  VALPHA,  U3
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+    xvfmadd.d D4,  D4,  VALPHA,  U4
+    xvfmadd.d D5,  D5,  VALPHA,  U5
+    xvfmadd.d D6,  D6,  VALPHA,  U6
+    xvfmadd.d D7,  D7,  VALPHA,  U7
 
     /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvfmadd.d D8,  D8,  VALPHA,  U0
+    xvld      U8,  C2,  0x00
+    xvld      U9,  C2,  0x20
+    xvld      U10, C2,  0x40
+    xvld      U11, C2,  0x60
+    xvfmadd.d D8,  D8,  VALPHA,  U8
+    xvfmadd.d D9,  D9,  VALPHA,  U9
+    xvfmadd.d D10, D10, VALPHA,  U10
+    xvfmadd.d D11, D11, VALPHA,  U11
 
     /* Load C3  */
     xvld      U0,  C3,  0x00
+    xvld      U1,  C3,  0x20
+    xvld      U2,  C3,  0x40
+    xvld      U3,  C3,  0x60
     xvfmadd.d D12, D12, VALPHA,  U0
-#endif   // #if defined(TRMMKERNEL)
+    xvfmadd.d D13, D13, VALPHA,  U1
+    xvfmadd.d D14, D14, VALPHA,  U2
+    xvfmadd.d D15, D15, VALPHA,  U3
+#endif // #if defined(TRMMKERNEL)
 
     /* Store C0 */
     xvst      D0,  C0,  0x00
+    xvst      D1,  C0,  0x20
+    xvst      D2,  C0,  0x40
+    xvst      D3,  C0,  0x60
     /* Store C1 */
     xvst      D4,  C1,  0x00
+    xvst      D5,  C1,  0x20
+    xvst      D6,  C1,  0x40
+    xvst      D7,  C1,  0x60
     /* Store C2 */
     xvst      D8,  C2,  0x00
+    xvst      D9,  C2,  0x20
+    xvst      D10, C2,  0x40
+    xvst      D11, C2,  0x60
     /* Store C3 */
     xvst      D12, C3,  0x00
+    xvst      D13, C3,  0x20
+    xvst      D14, C3,  0x40
+    xvst      D15, C3,  0x60
 
     /* Add stride for C */
-    addi.d    C0,  C0,  0x20
-    addi.d    C1,  C1,  0x20
-    addi.d    C2,  C2,  0x20
-    addi.d    C3,  C3,  0x20
+    addi.d    C0,  C0,  0x80
+    addi.d    C1,  C1,  0x80
+    addi.d    C2,  C2,  0x80
+    addi.d    C3,  C3,  0x80
 
 #if defined(TRMMKERNEL)
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
     sub.d     L,   K,   OFF
 #ifdef LEFT
-    /* number of values in A */
-    addi.d    L,   L,   -4
+    /* number of values  in A */
+    addi.d    L,   L,   -16
 #else
     /* number of values in B */
     addi.d    L,   L,   -4
 #endif
-    slli.d    T0,  L,  0x05
+    slli.d    T0,  L,  0x07
     add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
     add.d     B0,  B0, T0
 #endif
 
 #ifdef LEFT
-    /* number of values in A */
-    addi.d    OFF,   OFF,  0x04
+    addi.d    OFF, OFF, 0x10
 #endif
 #endif   // #if defined(TRMMKERNEL)
 
-/********LOOP (if(N >> 2 ) && (M & 4) ) End************/
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_I1
 
-.L_M2:
-    andi      I,   M,   2
-    beq       ZERO,I,   .L_M1
+.L_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_M4
 
 #if defined(TRMMKERNEL)
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
     move     B0,    B
 #else
-    slli.d   T0,    OFF,  0x04
+    slli.d   T0,    OFF,  0x06
     add.d    A0,    A0,   T0
     slli.d   T0,    OFF,  0x05
     add.d    B0,    B,    T0
@@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     sub.d    L,     K,    OFF
 #elif defined(LEFT)
     /* number of values in A */
-    addi.d   L,     OFF,  2
+    addi.d   L,     OFF,  8
 #else
     /* number of values in B */
     addi.d   L,     OFF,  4
@@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else   // #if !defined(TRMMKERNEL)
     move     B0,    B
     move     L,     K /* L = bk */
-#endif
+#endif  // #if defined(TRMMKERNEL)
 
-    /* Load 2 * 64 from A0 */
+    /* Load 8 * 64 from A0 */
     xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
 
     xvldrepl.d     U4, B0, 0x00
     /* line 1 */
     xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
 
-    xvldrepl.d     U4, B0, 0x08
+    xvldrepl.d     U5, B0, 0x08
     /* line 2 */
-    xvfmul.d  D4,  U0, U4
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
 
-    xvldrepl.d     U4, B0, 0x10
+    xvldrepl.d     U6, B0, 0x10
     /* line 3 */
-    xvfmul.d  D8,  U0, U4
+    xvfmul.d  D8,  U0, U6
+    xvfmul.d  D9,  U1, U6
 
-    xvldrepl.d     U4, B0, 0x18
+    xvldrepl.d     U7, B0, 0x18
     /* line 4 */
-    xvfmul.d  D12, U0, U4
+    xvfmul.d  D12, U0, U7
+    xvfmul.d  D13, U1, U7
 
     /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x10
+    addi.d    A0,  A0, 0x40
     addi.d    B0,  B0, 0x20
     /* Reduce L */
     addi.d    L,   L,  -1
     srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_M2_L7 */
-    beq       ZERO,TL, .L_M2_L7
-
-.L_M2_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 2 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    /* if (TL < 1) goto L_M8_L7 */
+    beq       ZERO,TL, .L_M8_L7
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    addi.d    TL,  TL,  -1
 
-    addi.d     A0,  A0, 0x10
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x20
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    beq    ZERO,    TL,  .L_M8_TL1_END
 
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
+.L_M8_TL1: /* TL-- */
+    KERNEL8x8x4
 
     addi.d    TL,  TL, -1 /* TL-- */
-    blt       ZERO,TL, .L_M2_TL1
+    blt       ZERO,TL, .L_M8_TL1
 
-.L_M2_L7:
-    /* if (!(L & 7)) goto L_M2_L0 */
+.L_M8_TL1_END:
+    KERNEL8x8x4_END
+
+.L_M8_L7:
+    /* if (!(L & 7)) goto L_M8_L0 */
     andi      TL,  L,   7
-    beq       TL,  ZERO,.L_M2_L0
+    beq       TL,  ZERO,.L_M8_L0
 
-.L_M2_L71:
+.L_M8_L71:
     xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
 
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
 
     /* Add stride for A0, B0 */
-    addi.d     A0,  A0, 0x10
+    addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x20
 
     addi.d     TL,  TL, -1
-    blt        ZERO,TL, .L_M2_L71
+    blt        ZERO,TL, .L_M8_L71
 
-.L_M2_L0:
+.L_M8_L0:
+    xvldrepl.d  VALPHA, $sp, 112
 #if defined(TRMMKERNEL)
     xvfmul.d  D0,   D0,  VALPHA
+    xvfmul.d  D1,   D1,  VALPHA
     xvfmul.d  D4,   D4,  VALPHA
+    xvfmul.d  D5,   D5,  VALPHA
     xvfmul.d  D8,   D8,  VALPHA
+    xvfmul.d  D9,   D9,  VALPHA
     xvfmul.d  D12,  D12, VALPHA
+    xvfmul.d  D13,  D13, VALPHA
 #else
     /* Load C0  */
     xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    xvfmadd.d D1,  D1,  VALPHA,  U1
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+    xvfmadd.d D4,  D4,  VALPHA,  U2
+    xvfmadd.d D5,  D5,  VALPHA,  U3
 
     /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvfmadd.d D8,  D8,  VALPHA,  U0
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+    xvfmadd.d D8,  D8,  VALPHA,  U4
+    xvfmadd.d D9,  D9,  VALPHA,  U5
 
     /* Load C3  */
-    xvld      U0,  C3,  0x00
-    xvfmadd.d D12, D12, VALPHA,  U0
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+    xvfmadd.d D12, D12, VALPHA,  U6
+    xvfmadd.d D13, D13, VALPHA,  U7
 #endif   // #if defined(TRMMKERNEL)
 
-    xvstelm.d D0,  C0,  0x00,    0x00
-    xvstelm.d D4,  C1,  0x00,    0x00
-    xvstelm.d D8,  C2,  0x00,    0x00
-    xvstelm.d D12, C3,  0x00,    0x00
-    xvstelm.d D0,  C0,  0x08,    0x01
-    xvstelm.d D4,  C1,  0x08,    0x01
-    xvstelm.d D8,  C2,  0x08,    0x01
-    xvstelm.d D12, C3,  0x08,    0x01
+    /* Store C0 */
+    xvst      D0,  C0,  0x00
+    xvst      D1,  C0,  0x20
+    /* Store C1 */
+    xvst      D4,  C1,  0x00
+    xvst      D5,  C1,  0x20
+    /* Store C2 */
+    xvst      D8,  C2,  0x00
+    xvst      D9,  C2,  0x20
+    /* Store C3 */
+    xvst      D12, C3,  0x00
+    xvst      D13, C3,  0x20
 
     /* Add stride for C */
-    addi.d    C0,  C0,  0x10
-    addi.d    C1,  C1,  0x10
-    addi.d    C2,  C2,  0x10
-    addi.d    C3,  C3,  0x10
+    addi.d    C0,  C0,  0x40
+    addi.d    C1,  C1,  0x40
+    addi.d    C2,  C2,  0x40
+    addi.d    C3,  C3,  0x40
 
 #if defined(TRMMKERNEL)
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
     sub.d     L,   K,   OFF
 #ifdef LEFT
     /* number of values in A */
-    addi.d    L,   L,   -2
+    addi.d    L,   L,   -8
 #else
     /* number of values in B */
     addi.d    L,   L,   -4
 #endif
-    slli.d    T0,  L,  0x04
+    slli.d    T0,  L,  0x06
     add.d     A0,  A0, T0
     slli.d    T0,  L,  0x05
     add.d     B0,  B0, T0
@@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifdef LEFT
     /* number of values in A */
-    addi.d    OFF,   OFF,  0x02
+    addi.d    OFF,   OFF,  0x08
 #endif
 #endif   // #if defined(TRMMKERNEL)
 
-/********LOOP (if(N >> 2 ) && (M & 2) ) End************/
+/********LOOP (if(N >> 2 ) && (M & 8)) End************/
 
-.L_M1:
-    andi      I,   M,   1
-    beq       ZERO,I,   .L_M0
+.L_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_M2
 
 #if defined(TRMMKERNEL)
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
     move     B0,    B
 #else
-    slli.d   T0,    OFF,  0x03
-    add.d    A0,    A0,   T0
     slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0
     add.d    B0,    B,    T0
 #endif
 
@@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     sub.d    L,     K,    OFF
 #elif defined(LEFT)
     /* number of values in A */
-    addi.d   L,     OFF,  1
+    addi.d   L,     OFF,  4
 #else
     /* number of values in B */
     addi.d   L,     OFF,  4
@@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     move     L,     K /* L = bk */
 #endif
 
-    /* Load 1 * 64 from A0 */
+    /* Load 4 * 64 from A0 */
     xvld     U0,   A0,    0x00
 
     xvldrepl.d     U4, B0, 0x00
     /* line 1 */
     xvfmul.d  D0,  U0, U4
 
-    xvldrepl.d     U4, B0, 0x08
+    xvldrepl.d     U5, B0, 0x08
     /* line 2 */
-    xvfmul.d  D4,  U0, U4
+    xvfmul.d  D4,  U0, U5
 
-    xvldrepl.d     U4, B0, 0x10
+    xvldrepl.d     U6, B0, 0x10
     /* line 3 */
-    xvfmul.d  D8,  U0, U4
+    xvfmul.d  D8,  U0, U6
 
-    xvldrepl.d     U4, B0, 0x18
+    xvldrepl.d     U7, B0, 0x18
     /* line 4 */
-    xvfmul.d  D12, U0, U4
+    xvfmul.d  D12, U0, U7
 
     /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x08
+    addi.d    A0,  A0, 0x20
     addi.d    B0,  B0, 0x20
     /* Reduce L */
     addi.d    L,   L,  -1
     srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_M1_L7 */
-    beq       ZERO,TL, .L_M1_L7
+    /* if (TL < 1) goto L_M4_L7 */
+    beq       ZERO,TL, .L_M4_L7
 
-.L_M1_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 1 * 64 from A0 */
-    xvld     U0,   A0,    0x00
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL,  -1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    beq    ZERO,    TL,  .L_M4_TL1_END
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.L_M4_TL1: /* TL-- */
+    KERNEL8x4x4
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_TL1
+
+.L_M4_TL1_END:
+    KERNEL8x4x4_END
+
+.L_M4_L7:
+    /* if (!(L & 7)) goto L_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M4_L0
 
-           /***8-2***/
+.L_M4_L71:
     xvld     U0,   A0,    0x00
 
     xvldrepl.d U4,  B0, 0x00
@@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d U4,  B0, 0x18
     xvfmadd.d  D12, U0, U4, D12
 
-    addi.d     A0,  A0, 0x08
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x20
 
-           /***8-3***/
-    xvld     U0,   A0,    0x00
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_L71
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+.L_M4_L0:
+    xvldrepl.d  VALPHA, $sp, 112
+#if defined(TRMMKERNEL)
+    xvfmul.d  D0,   D0,  VALPHA
+    xvfmul.d  D4,   D4,  VALPHA
+    xvfmul.d  D8,   D8,  VALPHA
+    xvfmul.d  D12,  D12, VALPHA
+#else
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    xvfmadd.d D4,  D4,  VALPHA,  U1
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    xvfmadd.d D8,  D8,  VALPHA,  U2
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+    xvfmadd.d D12, D12, VALPHA,  U3
+#endif   // #if defined(TRMMKERNEL)
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+    /* Store C0 */
+    xvst      D0,  C0,  0x00
+    /* Store C1 */
+    xvst      D4,  C1,  0x00
+    /* Store C2 */
+    xvst      D8,  C2,  0x00
+    /* Store C3 */
+    xvst      D12, C3,  0x00
 
-           /***8-4***/
-    xvld     U0,   A0,    0x00
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x20
+    addi.d    C1,  C1,  0x20
+    addi.d    C2,  C2,  0x20
+    addi.d    C3,  C3,  0x20
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -4
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x05
+    add.d     A0,  A0, T0
+    add.d     B0,  B0, T0
+#endif
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+/********LOOP (if(N >> 2 ) && (M & 4) ) End************/
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.L_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_M1
 
-    addi.d     A0,  A0, 0x08
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x04
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  2
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 2 * 64 from A0 */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+
+    xvld    U4,     B0,     0x00
+
+    xvfmul.d    D0,     U0,     U4
+    xvfmul.d    D1,     U1,     U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M2_L7 */
+    beq       ZERO,TL, .L_M2_L7
+
+    xvldrepl.d     U8,   A0,    0x00
+    xvldrepl.d     U9,   A0,    0x08
+
+    addi.d    TL,  TL,  -1
+
+    xvld       U12, B0, 0x00
+    addi.d     A0,  A0, 0x10
     addi.d     B0,  B0, 0x20
 
-           /***8-5***/
-    xvld     U0,   A0,    0x00
+    beq    ZERO,    TL,  .L_M2_TL1_END
+.L_M2_TL1: /* TL-- */
+    KERNEL8x2x4
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M2_TL1
+.L_M2_TL1_END:
+    KERNEL8x2x4_END
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+.L_M2_L7:
+    /* if (!(L & 7)) goto L_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M2_L0
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+.L_M2_L71:
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvld    U4,  B0, 0x00
 
-    addi.d     A0,  A0, 0x08
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x10
     addi.d     B0,  B0, 0x20
 
-           /***8-6***/
-    xvld     U0,   A0,    0x00
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M2_L71
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+.L_M2_L0:
+    xvldrepl.d  VALPHA, $sp, 112
+#if defined(TRMMKERNEL)
+    xvfmul.d  D0,   D0,  VALPHA
+    xvfmul.d  D1,   D1,  VALPHA
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvstelm.d D0,  C0,  0x00,    0x00
+    xvstelm.d D0,  C1,  0x00,    0x01
+    xvstelm.d D0,  C2,  0x00,    0x02
+    xvstelm.d D0,  C3,  0x00,    0x03
+    xvstelm.d D1,  C0,  0x08,    0x00
+    xvstelm.d D1,  C1,  0x08,    0x01
+    xvstelm.d D1,  C2,  0x08,    0x02
+    xvstelm.d D1,  C3,  0x08,    0x03
+#else
+    xvpackev.d  D4,     D1,     D0
+    xvpackod.d  D5,     D1,     D0
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvpermi.q   U2, U0, 0x20
+    xvpermi.q   U3, U1, 0x20
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvfmadd.d   D0, D4, VALPHA, U2
+    xvfmadd.d   D1, D5, VALPHA, U3
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+    vst       $vr16,    C0,      0x00
+    vst       $vr17,    C1,      0x00
+    xvstelm.d D0,  C2,  0x00,    0x02
+    xvstelm.d D1,  C3,  0x00,    0x02
+    xvstelm.d D0,  C2,  0x08,    0x03
+    xvstelm.d D1,  C3,  0x08,    0x03
+#endif   // #if defined(TRMMKERNEL)
 
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x10
+    addi.d    C1,  C1,  0x10
+    addi.d    C2,  C2,  0x10
+    addi.d    C3,  C3,  0x10
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -2
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x04
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
+    add.d     B0,  B0, T0
+#endif
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+/********LOOP (if(N >> 2 ) && (M & 2) ) End************/
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.L_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_M0
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x03
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  1
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d  U0,     A0,     0x00
+    xvld        U4,     B0,     0x00
+    xvfmul.d    D0,     U0,     U4
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M1_L7 */
+    beq       ZERO,TL, .L_M1_L7
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvldrepl.d  U8,     A0,     0x00
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    addi.d     TL,  TL,  -1
+    xvld       U12, B0,  0x00
+    addi.d     A0,  A0,  0x08
+    addi.d     B0,  B0,  0x20
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+    beq    ZERO,    TL,  .L_M1_TL1_END
+
+.L_M1_TL1: /* TL-- */
+    KERNEL8x1x4
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_M1_TL1
+.L_M1_TL1_END:
+    KERNEL8x1x4_END
 
 .L_M1_L7:
     /* if (!(L & 7)) goto L_M1_L0 */
@@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     beq       TL,  ZERO,.L_M1_L0
 
 .L_M1_L71:
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvldrepl.d  U0,     A0,     0x00
+    xvld        U4,     B0,     0x00
+    xvfmadd.d   D0,     U0,     U4,    D0
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x08
@@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     blt        ZERO,TL, .L_M1_L71
 
 .L_M1_L0:
+    xvldrepl.d  VALPHA, $sp, 112
 #if defined(TRMMKERNEL)
     xvfmul.d  D0,   D0,  VALPHA
-    xvfmul.d  D4,   D4,  VALPHA
-    xvfmul.d  D8,   D8,  VALPHA
-    xvfmul.d  D12,  D12, VALPHA
+
+    xvstelm.d D0,  C0,  0x00,    0x00
+    xvstelm.d D0,  C1,  0x00,    0x01
+    xvstelm.d D0,  C2,  0x00,    0x02
+    xvstelm.d D0,  C3,  0x00,    0x03
 #else
     /* Load C0  */
-    xvld      U0,  C0,  0x00
-    xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    xvldrepl.d     U0,  C0,  0x00
+    xvfmadd.d D4,  D0,  VALPHA,  U0
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvldrepl.d     U1,  C1,  0x00
+    xvfmadd.d D5,  D0,  VALPHA,  U1
 
     /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvfmadd.d D8,  D8,  VALPHA,  U0
+    xvldrepl.d     U2,  C2,  0x00
+    xvfmadd.d D6,  D0,  VALPHA,  U2
 
     /* Load C3  */
-    xvld      U0,  C3,  0x00
-    xvfmadd.d D12, D12, VALPHA,  U0
-#endif   // #if defined(TRMMKERNEL)
+    xvldrepl.d     U3,  C3,  0x00
+    xvfmadd.d D7,  D0,  VALPHA,  U3
 
-    xvstelm.d D0,  C0,  0x00,    0x00
-    xvstelm.d D4,  C1,  0x00,    0x00
-    xvstelm.d D8,  C2,  0x00,    0x00
-    xvstelm.d D12, C3,  0x00,    0x00
+    xvstelm.d D4,  C0,  0x00,    0x00
+    xvstelm.d D5,  C1,  0x00,    0x01
+    xvstelm.d D6,  C2,  0x00,    0x02
+    xvstelm.d D7,  C3,  0x00,    0x03
+#endif   // #if defined(TRMMKERNEL)
 
     /* Add stride for C */
     addi.d    C0,  C0,  0x08
@@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /////////////////////////////////////////////////
 /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/
 
+    xvldrepl.d  VALPHA, $sp, 112
+
 .L_N3:
     andi     J,    N,   2
     beq      ZERO, J,   .L_N1
@@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d   L,     OFF,  2
 #endif
 #else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-
-    /* Load 16 * 64 from A0
-     * U0 = {a3,  a2,  a1,  a0}
-     * U1 = {a7,  a6,  a5,  a4}
-     * U2 = {a11, a10, a9,  a8}
-     * U3 = {a15, a14, a13, a12}
-     */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-    xvfmul.d  D1,  U1, U4
-    xvfmul.d  D2,  U2, U4
-    xvfmul.d  D3,  U3, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-    xvfmul.d  D5,  U1, U4
-    xvfmul.d  D6,  U2, U4
-    xvfmul.d  D7,  U3, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x80
-    addi.d    B0,  B0, 0x10
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_N3_L7 */
-    beq       ZERO,TL, .L_N3_L7
-
-.L_N3_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-5***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-6***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-           /***8-7***/
-    /* Load 16 * 64 from A0 */
+    /* Load 16 * 64 from A0
+     * U0 = {a3,  a2,  a1,  a0}
+     * U1 = {a7,  a6,  a5,  a4}
+     * U2 = {a11, a10, a9,  a8}
+     * U3 = {a15, a14, a13, a12}
+     */
     xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
     xvld     U2,   A0,    0x40
     xvld     U3,   A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
 
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+    xvfmul.d  D6,  U2, U5
+    xvfmul.d  D7,  U3, U5
 
-           /***8-8***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x80
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_L7 */
+    beq       ZERO,TL, .L_N3_L7
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x10
 
+    beq    ZERO,    TL,  .L_N3_TL1_END
+
+.L_N3_TL1: /* TL-- */
+    KERNEL8x16x2
+
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_TL1
+.L_N3_TL1_END:
+    KERNEL8x16x2_END
 
 .L_N3_L7:
     /* if (!(L & 7)) goto L_N3_L0 */
@@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d  D2,  U2, U4, D2
     xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x10
@@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D3,  D3,  VALPHA,  U3
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvld      U1,  C1,  0x20
-    xvld      U2,  C1,  0x40
-    xvld      U3,  C1,  0x60
-    xvfmadd.d D4,  D4,  VALPHA,  U0
-    xvfmadd.d D5,  D5,  VALPHA,  U1
-    xvfmadd.d D6,  D6,  VALPHA,  U2
-    xvfmadd.d D7,  D7,  VALPHA,  U3
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+    xvfmadd.d D4,  D4,  VALPHA,  U4
+    xvfmadd.d D5,  D5,  VALPHA,  U5
+    xvfmadd.d D6,  D6,  VALPHA,  U6
+    xvfmadd.d D7,  D7,  VALPHA,  U7
 #endif // #if defined(TRMMKERNEL)
 
     /* Store C0 */
@@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmul.d  D0,  U0, U4
     xvfmul.d  D1,  U1, U4
 
-    xvldrepl.d     U4, B0, 0x08
+    xvldrepl.d     U5, B0, 0x08
     /* line 2 */
-    xvfmul.d  D4,  U0, U4
-    xvfmul.d  D5,  U1, U4
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
 
     /* Add stride for A0 and B0 */
     addi.d    A0,  A0, 0x40
@@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N3_M8_L7 */
     beq       ZERO,TL, .L_N3_M8_L7
 
-.L_N3_M8_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x10
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    beq    ZERO,    TL,  .L_N3_M8_TL1_END
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
+.L_N3_M8_TL1: /* TL-- */
+    KERNEL8x8x2
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_M8_TL1
+.L_N3_M8_TL1_END:
+    KERNEL8x8x2_END
 
 .L_N3_M8_L7:
     /* if (!(L & 7)) goto L_N3_M8_L0 */
@@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x40
@@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D1,  D1,  VALPHA,  U1
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvld      U1,  C1,  0x20
-    xvfmadd.d D4,  D4,  VALPHA,  U0
-    xvfmadd.d D5,  D5,  VALPHA,  U1
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+    xvfmadd.d D4,  D4,  VALPHA,  U2
+    xvfmadd.d D5,  D5,  VALPHA,  U3
 #endif // #if defined(TRMMKERNEL)
 
     /* Store C0 */
@@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add.d     B0,  B0, T0
 #endif
 
-#ifdef LEFT
-    addi.d    OFF,   OFF,  0x08
-#endif
-#endif   // #if defined(TRMMKERNEL)
-
-/********LOOP (if(N & 2) && (M & 8) ) End************/
-
-.L_N3_M4:
-    andi      I,   M,   4
-    beq       ZERO,I,   .L_N3_M2
-
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x05
-    add.d    A0,    A0,   T0
-    slli.d   T0,    OFF,  0x04
-    add.d    B0,    B,    T0
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  4
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  2
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-
-    /* Load 4 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x20
-    addi.d    B0,  B0, 0x10
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_N3_M4_L7 */
-    beq       ZERO,TL, .L_N3_M4_L7
-
-.L_N3_M4_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 8 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
 
-           /***8-5***/
-    xvld     U0,   A0,    0x00
+/********LOOP (if(N & 2) && (M & 8) ) End************/
 
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+.L_N3_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N3_M2
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x04
+    add.d    B0,    B,    T0
+#endif
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  4
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-           /***8-6***/
+    /* Load 4 * 64 from A0 */
     xvld     U0,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
 
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x20
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M4_L7 */
+    beq       ZERO,TL, .L_N3_M4_L7
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x10
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    beq    ZERO,    TL,  .L_N3_M4_TL1_END
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
+.L_N3_M4_TL1: /* TL-- */
+    KERNEL8x4x2
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_M4_TL1
+.L_N3_M4_TL1_END:
+    KERNEL8x4x2_END
 
 .L_N3_M4_L7:
     /* if (!(L & 7)) goto L_N3_M4_L0 */
@@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x20
@@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U1,  C1,  0x00
+    xvfmadd.d D4,  D4,  VALPHA,  U1
 #endif // #if defined(TRMMKERNEL)
 
     /* Store C0 */
@@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N3_M2_L7 */
     beq       ZERO,TL, .L_N3_M2_L7
 
-.L_N3_M2_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 2 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
+    xvld     U8,   A0,    0x00
 
-           /***8-7***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x10
     addi.d     B0,  B0, 0x10
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    beq    ZERO,    TL,  .L_N3_M2_TL1_END
 
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
+.L_N3_M2_TL1: /* TL-- */
+    KERNEL8x2x2
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_M2_TL1
+.L_N3_M2_TL1_END:
+    KERNEL8x2x2_END
 
 .L_N3_M2_L7:
     /* if (!(L & 7)) goto L_N3_M2_L0 */
@@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x10
@@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U1,  C1,  0x00
+    xvfmadd.d D4,  D4,  VALPHA,  U1
 #endif // #if defined(TRMMKERNEL)
 
     xvstelm.d D0,  C0,  0x00,    0x00
@@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
     /* number of values in B */
     addi.d   L,     OFF,  2
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-
-    /* Load 1 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x08
-    addi.d    B0,  B0, 0x10
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_N3_M1_L7 */
-    beq       ZERO,TL, .L_N3_M1_L7
-
-.L_N3_M1_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 1 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-           /***8-7***/
+    /* Load 1 * 64 from A0 */
     xvld     U0,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
+    xvldrepl.d     U4, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U4
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M1_L7 */
+    beq       ZERO,TL, .L_N3_M1_L7
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x08
     addi.d     B0,  B0, 0x10
 
+    beq    ZERO,    TL,  .L_N3_M1_TL1_END
+
+.L_N3_M1_TL1: /* TL-- */
+    KERNEL8x1x2
+
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_M1_TL1
+.L_N3_M1_TL1_END:
+    KERNEL8x1x2_END
 
 .L_N3_M1_L7:
     /* if (!(L & 7)) goto L_N3_M1_L0 */
@@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x08
@@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U1,  C1,  0x00
+    xvfmadd.d D4,  D4,  VALPHA,  U1
 #endif // #if defined(TRMMKERNEL)
 
     xvstelm.d D0,  C0,  0x00,    0x00
@@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N1_L7 */
     beq       ZERO,TL, .L_N1_L7
 
-.L_N1_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-5***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-6***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-7***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x08
 
-           /***8-8***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
+    beq    ZERO,    TL,  .L_N1_TL1_END
+.L_N1_TL1: /* TL-- */
+    KERNEL8x16x1
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_TL1
+.L_N1_TL1_END:
+    KERNEL8x16x1_END
 
 .L_N1_L7:
     /* if (!(L & 7)) goto L_N1_L0 */
@@ -3494,161 +3006,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     slli.d    T0,  L,  0x07
     add.d     A0,  A0, T0
-    slli.d    T0,  L,  0x03
-    add.d     B0,  B0, T0
-#endif
-
-#ifdef LEFT
-    addi.d    OFF,   OFF,  0x10
-#endif
-#endif   // #if defined(TRMMKERNEL)
-
-    addi.d    I,   I,   -1  /* I-- */
-    blt       ZERO,I,   .L_N1_I1
-
-.L_N1_M8:
-    /* We have done M & 16, considering M=8/4/2/1 */
-    andi      I,   M,   15
-    beq       ZERO,I,   .L_N1_M0
-
-    andi      I,   M,   8
-    beq       ZERO,I,   .L_N1_M4
-
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x06
-    add.d    A0,    A0,   T0
-    slli.d   T0,    OFF,  0x03
-    add.d    B0,    B,    T0
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  8
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  1
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-
-    /* Load 8 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-    xvfmul.d  D1,  U1, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x40
-    addi.d    B0,  B0, 0x08
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_N1_M8_L7 */
-    beq       ZERO,TL, .L_N1_M8_L7
-
-.L_N1_M8_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
+    slli.d    T0,  L,  0x03
+    add.d     B0,  B0, T0
+#endif
 
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x10
+#endif
+#endif   // #if defined(TRMMKERNEL)
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N1_I1
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
+.L_N1_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_N1_M0
 
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_N1_M4
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x06
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x03
+    add.d    B0,    B,    T0
+#endif
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  8
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-           /***8-7***/
+    /* Load 8 * 64 from A0 */
     xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x40
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M8_L7 */
+    beq       ZERO,TL, .L_N1_M8_L7
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x08
 
+    beq    ZERO,    TL,  .L_N1_M8_TL1_END
+.L_N1_M8_TL1: /* TL-- */
+    KERNEL8x8x1
+
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_M8_TL1
 
+.L_N1_M8_TL1_END:
+    KERNEL8x8x1_END
+
 .L_N1_M8_L7:
     /* if (!(L & 7)) goto L_N1_M8_L0 */
     andi      TL,  L,   7
@@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N1_M4_L7 */
     beq       ZERO,TL, .L_N1_M4_L7
 
-.L_N1_M4_TL1: /* TL-- */
-           /***8-1***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x08
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    beq    ZERO,    TL,  .L_N1_M4_TL1_END
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
+.L_N1_M4_TL1: /* TL-- */
+    KERNEL8x4x1
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_M4_TL1
+.L_N1_M4_TL1_END:
+    KERNEL8x4x1_END
 
 .L_N1_M4_L7:
     /* if (!(L & 7)) goto L_N1_M4_L0 */
@@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N1_M2_L7 */
     beq       ZERO,TL, .L_N1_M2_L7
 
-.L_N1_M2_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 2 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x10
     addi.d     B0,  B0, 0x08
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    beq    ZERO,    TL,  .L_N1_M2_TL1_END
 
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
+.L_N1_M2_TL1: /* TL-- */
+    KERNEL8x2x1
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_M2_TL1
+.L_N1_M2_TL1_END:
+    KERNEL8x2x1_END
 
 .L_N1_M2_L7:
     /* if (!(L & 7)) goto L_N1_M2_L0 */
@@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N1_M1_L7 */
     beq       ZERO,TL, .L_N1_M1_L7
 
-.L_N1_M1_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 1 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x08
     addi.d     B0,  B0, 0x08
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    beq    ZERO,    TL,  .L_N1_M1_TL1_END
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
+.L_N1_M1_TL1: /* TL-- */
+    KERNEL8x1x1
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_M1_TL1
+.L_N1_M1_TL1_END:
+    KERNEL8x1x1_END
 
 .L_N1_M1_L7:
     /* if (!(L & 7)) goto L_N1_M1_L0 */
@@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LDARG    $r26,  $sp,   24
     LDARG    $r27,  $sp,   32
     LD       $f23,  $sp,   40
-    addi.d   $sp,   $sp,   56
+    LD       $f24,  $sp,   48
+    LD       $f25,  $sp,   56
+    LD       $f26,  $sp,   64
+    LD       $f27,  $sp,   72
+    LD       $f28,  $sp,   80
+    LD       $f29,  $sp,   88
+    LD       $f30,  $sp,   96
+    LD       $f31,  $sp,   104
+    addi.d   $sp,   $sp,   120
 
     jirl    $r0, $r1, 0x0
 

From e8b571d245d57fb143c25fadda18fcc01a84b3f5 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 11 Jul 2023 10:01:12 +0800
Subject: [PATCH 258/718] LoongArch64: Add dgemv_t_8_lasx.S and
 dgemv_n_8_lasx.S V2

---
 kernel/loongarch64/KERNEL.LOONGSON3R5 |   3 +
 kernel/loongarch64/KERNEL.generic     |   4 +
 kernel/loongarch64/dgemv_n_8_lasx.S   | 546 ++++++++++++++++++++++++++
 kernel/loongarch64/dgemv_t_8_lasx.S   | 468 ++++++++++++++++++++++
 kernel/loongarch64/loongarch64_asm.S  | 313 +++++++++++++++
 5 files changed, 1334 insertions(+)
 create mode 100644 kernel/loongarch64/dgemv_n_8_lasx.S
 create mode 100644 kernel/loongarch64/dgemv_t_8_lasx.S
 create mode 100644 kernel/loongarch64/loongarch64_asm.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 253aa2464..9d858584c 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -8,6 +8,9 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
 DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DGEMVNKERNEL = dgemv_n_8_lasx.S
+DGEMVTKERNEL = dgemv_t_8_lasx.S
 endif
 
 DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic
index b772a6f82..213add9ee 100644
--- a/kernel/loongarch64/KERNEL.generic
+++ b/kernel/loongarch64/KERNEL.generic
@@ -132,12 +132,16 @@ CSWAPKERNEL  = ../arm/zswap.c
 ZSWAPKERNEL  = ../arm/zswap.c
 
 SGEMVNKERNEL = ../arm/gemv_n.c
+ifndef DGEMVNKERNEL
 DGEMVNKERNEL = ../arm/gemv_n.c
+endif
 CGEMVNKERNEL = ../arm/zgemv_n.c
 ZGEMVNKERNEL = ../arm/zgemv_n.c
 
 SGEMVTKERNEL = ../arm/gemv_t.c
+ifndef DGEMVTKERNEL
 DGEMVTKERNEL = ../arm/gemv_t.c
+endif
 CGEMVTKERNEL = ../arm/zgemv_t.c
 ZGEMVTKERNEL = ../arm/zgemv_t.c
 
diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S
new file mode 100644
index 000000000..940d27569
--- /dev/null
+++ b/kernel/loongarch64/dgemv_n_8_lasx.S
@@ -0,0 +1,546 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/07/14 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA   $f0
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M8      $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $xr1
+#define X0      $xr2
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10
+#define Y1      $xr11
+#define A0      $xr12
+#define A1      $xr13
+#define A2      $xr14
+#define A3      $xr15
+#define A4      $xr16
+#define A5      $xr17
+#define A6      $xr18
+#define A7      $xr19
+#define A8      $xr20
+#define A9      $xr21
+#define A10     $xr22
+#define A11     $xr23
+#define A12     $xr24
+#define A13     $xr25
+#define A14     $xr26
+#define A15     $xr27
+
+.macro DLOAD_X_8
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
+                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro DLOAD_X_4
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro DLOAD_X_2
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro DLOAD_X_1
+    GLDREPL xv, d, X0, X, 0x00
+    GMUL xvf, d, X0, X0, VALPHA
+.endm
+
+.macro DLOAD_Y_8
+    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro DLOAD_Y_4
+    GLD xv, , Y0, Y, 0
+.endm
+
+.macro DLOAD_Y_1
+    fld.d  $f10,   Y,  0
+.endm
+
+.macro DSTORE_Y_8
+    GST xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro DSTORE_Y_4
+    GST xv, , Y0, Y, 0
+.endm
+
+.macro DSTORE_Y_1
+    fst.d   $f10,   Y,  0
+.endm
+
+// Unable to use vector load/store ins
+.macro DLOAD_Y_8_GAP
+    fld.d   $f10,   Y,  0
+    fldx.d  $f13,   Y,  INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1
+    fld.d   $f14,   T0, 0
+    fldx.d  $f15,   T0, INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  2
+    fld.d   $f11,   T0, 0
+    fldx.d  $f17,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    fld.d   $f18,   T0, 0
+    fldx.d  $f19,   T0, INC_Y
+    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
+.endm
+
+.macro DLOAD_Y_4_GAP
+    fld.d   $f10,   Y,  0
+    fldx.d  $f13,   Y,  INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1
+    fld.d   $f14,   T0, 0
+    fldx.d  $f15,   T0, INC_Y
+    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3
+.endm
+
+.macro DSTORE_Y_8_GAP
+    xvstelm.d   Y0,     Y,      0,      0
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      3
+
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      0
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      3
+.endm
+
+.macro DSTORE_Y_4_GAP
+    xvstelm.d   Y0,     Y,      0,      0
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      3
+.endm
+
+.macro DLOAD_X_8_GAP
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X3,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X4,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X5,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X6,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X7,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro DLOAD_X_4_GAP
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X3,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro DLOAD_X_2_GAP
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro DGEMV_N_8x8
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
+                  Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
+                  Y0, A6,  X3, Y0, Y1, A7,  X3, Y1, \
+                  Y0, A8,  X4, Y0, Y1, A9,  X4, Y1, \
+                  Y0, A10, X5, Y0, Y1, A11, X5, Y1, \
+                  Y0, A12, X6, Y0, Y1, A13, X6, Y1, \
+                  Y0, A14, X7, Y0, Y1, A15, X7, Y1
+.endm
+
+.macro DGEMV_N_4x8
+    GLD_INC xv, , 0x20, A0,  PA0, 0, \
+                        A2,  PA1, 0, \
+                        A4,  PA2, 0, \
+                        A6,  PA3, 0, \
+                        A8,  PA4, 0, \
+                        A10, PA5, 0, \
+                        A12, PA6, 0, \
+                        A14, PA7, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, \
+                  Y0, A2,  X1, Y0, \
+                  Y0, A4,  X2, Y0, \
+                  Y0, A6,  X3, Y0, \
+                  Y0, A8,  X4, Y0, \
+                  Y0, A10, X5, Y0, \
+                  Y0, A12, X6, Y0, \
+                  Y0, A14, X7, Y0
+.endm
+
+.macro DGEMV_N_1x8
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
+                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, \
+                $f10, $f14, $f3, $f10, \
+                $f10, $f16, $f4, $f10, \
+                $f10, $f18, $f5, $f10, \
+                $f10, $f20, $f6, $f10, \
+                $f10, $f22, $f7, $f10, \
+                $f10, $f24, $f8, $f10, \
+                $f10, $f26, $f9, $f10,
+.endm
+
+.macro DGEMV_N_8x4
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
+                  Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
+                  Y0, A6,  X3, Y0, Y1, A7,  X3, Y1
+.endm
+
+.macro DGEMV_N_4x4
+    GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y0, A2,  X1, Y0, \
+                  Y0, A4,  X2, Y0, Y0, A6,  X3, Y0
+.endm
+
+.macro DGEMV_N_1x4
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \
+                $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10
+.endm
+
+.macro DGEMV_N_8x2
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1
+.endm
+
+.macro DGEMV_N_4x2
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0
+    GMADD xvf, d, Y0, A0,  X0, Y0, \
+                  Y0, A2,  X1, Y0
+.endm
+
+.macro DGEMV_N_1x2
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, \
+                $f10, $f14, $f3, $f10
+.endm
+
+.macro DGEMV_N_1x1
+    fld.d   $f12,    PA0,    0
+    PTR_ADDI PA0,   PA0,    0x08
+    fmadd.d $f10,   $f12,   $f2,    $f10
+.endm
+
+.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
+    PTR_SRLI  J,      N,      3
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8
+.L_\XW\()_N_L8:
+    DLOAD_\X_8
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x8
+    DSTORE_\Y_8
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    PTR_ADDI    K,      K,      8
+    bnez        I,      .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      4
+    beqz        I,      .L_\XW\()_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x8
+    DSTORE_\Y_4
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+    PTR_ADDI    K,      K,      4
+.L_\XW\()_M_3:
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x8
+    DSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      4
+    beqz        J,      .L_\XW\()_N_3
+    DLOAD_\X_4
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x4
+    DSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_4_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x4
+    DSTORE_\Y_4
+    PTR_ALSL  Y,    INC_Y,  Y,  2
+    PTR_ADDI  K,    K,      4
+.L_\XW\()_N_4_M_3:
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x4
+    DSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M8
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2
+.L_\XW\()_N_3:
+    andi        J,      N,      2
+    beqz        J,      .L_\XW\()_N_1
+    DLOAD_\X_2
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+    PTR_SRLI    I,      M,       3
+    beqz    I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x2
+    DSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_2_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x2
+    DSTORE_\Y_4
+    PTR_ALSL  Y,        INC_Y,      Y,      2
+    PTR_ADDI  K,        K,          4
+.L_\XW\()_N_2_M_3:
+    andi    I,      M,      3
+    beqz    I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x2
+    DSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M8
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     PA1,    PA1,    K_LDA
+    PTR_ALSL    X,      INC_X,  X,  1
+.L_\XW\()_N_1:
+    andi    J,      N,      1
+    beqz    J,      .L_END
+    DLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x1
+    DSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 7, 24 + 4
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    xvreplve0.d     VALPHA, $xr0
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 24 + 4
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S
new file mode 100644
index 000000000..be90cb1af
--- /dev/null
+++ b/kernel/loongarch64/dgemv_t_8_lasx.S
@@ -0,0 +1,468 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/07/17 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA   $f0
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M8      $r30
+
+#define VALPHA  $xr0
+#define X0      $xr1
+#define X1      $xr2
+#define A0      $xr3
+#define A1      $xr4
+#define A2      $xr5
+#define A3      $xr6
+#define A4      $xr7
+#define A5      $xr8
+#define A6      $xr9
+#define A7      $xr10
+#define A8      $xr11
+#define A9      $xr12
+#define A10     $xr13
+#define A11     $xr14
+#define A12     $xr15
+#define A13     $xr16
+#define A14     $xr17
+#define A15     $xr18
+#define TP0     $xr19
+#define TP1     $xr20
+#define TP2     $xr21
+#define TP3     $xr22
+#define TP4     $xr23
+#define TP5     $xr24
+#define TP6     $xr25
+#define TP7     $xr26
+#define Y0      $xr3
+#define Y1      $xr4
+#define Y2      $xr5
+#define Y3      $xr6
+#define Y4      $xr7
+#define Y5      $xr8
+#define Y6      $xr9
+#define Y7      $xr10
+
+.macro ZERO_Y8
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
+                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
+.endm
+
+.macro ZERO_Y4
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+
+.macro ZERO_Y2
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
+.endm
+
+.macro ZERO_Y1
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+.macro DLOAD_X8
+    GLD xv, , X0, X, 0x00, X1, X, 0x20
+.endm
+
+.macro DLOAD_X4
+    GLD xv, , X0, X, 0x00
+.endm
+
+.macro DLOAD_X8_GAP
+    fld.d       $f1,    X,    0x00
+    fldx.d      $f2,    X,    INC_X
+    PTR_ALSL    T0,     INC_X,      X,      1
+    fld.d       $f3,    T0,   0x00
+    fldx.d      $f4,    T0,   INC_X
+    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
+    PTR_ALSL    T0,     INC_X,      X,      2
+    fld.d       $f2,    T0,   0x00
+    fldx.d      $f3,    T0,   INC_X
+    PTR_ALSL    T0,     INC_X,      T0,     1
+    fld.d       $f4,    T0,   0x00
+    fldx.d      $f5,    T0,   INC_X
+    GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
+.endm
+
+.macro DLOAD_X4_GAP
+    fld.d       $f1,    X,    0x00
+    fldx.d      $f2,    X,    INC_X
+    PTR_ALSL    T0,     INC_X,      X,      1
+    fld.d       $f3,    T0,   0x00
+    fldx.d      $f4,    T0,   INC_X
+    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
+.endm
+
+.macro DGEMV_T_8x8
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
+                  TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
+                  TP3, A6,  X0, TP3, TP3, A7,  X1, TP3, \
+                  TP4, A8,  X0, TP4, TP4, A9,  X1, TP4, \
+                  TP5, A10, X0, TP5, TP5, A11, X1, TP5, \
+                  TP6, A12, X0, TP6, TP6, A13, X1, TP6, \
+                  TP7, A14, X0, TP7, TP7, A15, X1, TP7
+.endm
+
+.macro DGEMV_T_8x4
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0, \
+                        A8,  PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
+                  TP2, A4,  X0, TP2, TP3, A6,  X0, TP3, \
+                  TP4, A8,  X0, TP4, TP5, A10, X0, TP5, \
+                  TP6, A12, X0, TP6, TP7, A14, X0, TP7,
+.endm
+
+.macro DGEMV_T_4x8
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
+                  TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
+                  TP3, A6,  X0, TP3, TP3, A7,  X1, TP3
+.endm
+
+.macro DGEMV_T_4x4
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
+                  TP2, A4,  X0, TP2, TP3, A6,  X0, TP3
+.endm
+
+.macro DGEMV_T_2x8
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A1,  PA0, 0, A2, PA1, 0, A3,  PA1, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1
+.endm
+
+.macro DGEMV_T_2x4
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0
+
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1
+.endm
+
+.macro DGEMV_T XW:req X8:req, X4:req
+    PTR_SRLI  J,      N,      3
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8
+.L_\XW\()_N_L8:
+    ZERO_Y8
+    move      X,      X_ORG
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    DLOAD_\X8
+    DGEMV_T_8x8
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  3
+    bnez        I,  .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      4
+    beqz        I,      .L_\XW\()_M_3
+    DLOAD_\X4
+    DGEMV_T_8x4
+    PTR_ALSL    X,  INC_X,  X,  2
+.L_\XW\()_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
+                 Y5, TP5, Y6, TP6, Y7, TP7
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.d   $f1,    X,      0x00
+    fld.d   $f11,   PA0,    0x00
+    fld.d   $f12,   PA1,    0x00
+    fld.d   $f13,   PA2,    0x00
+    fld.d   $f14,   PA3,    0x00
+    fld.d   $f15,   PA4,    0x00
+    fld.d   $f16,   PA5,    0x00
+    fld.d   $f17,   PA6,    0x00
+    fld.d   $f18,   PA7,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#else
+    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#endif
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
+                $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    PY1, INC_Y,  Y,  2
+    fld.d   $f15,   PY1,    0x00
+    fldx.d  $f16,   PY1,    INC_Y
+    PTR_ALSL    PY2, INC_Y,  PY1, 1
+    fld.d   $f17,   PY2,    0x00
+    fldx.d  $f18,   PY2,    INC_Y
+
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \
+                $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    fst.d   $f15,   PY1,    0x00
+    fstx.d  $f16,   PY1,    INC_Y
+    fst.d   $f17,   PY2,    0x00
+    fstx.d  $f18,   PY2,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      4
+    beqz        J,      .L_\XW\()_N_3
+    ZERO_Y4
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    DLOAD_\X8
+    DGEMV_T_4x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3
+    bnez      I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_4_M_3
+    DLOAD_\X4
+    DGEMV_T_4x4
+    PTR_ALSL  X,      INC_X,  X,  2
+.L_\XW\()_N_4_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    fld.d   $f1,    X,      0x00
+    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14
+
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M8
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+.L_\XW\()_N_3:
+    andi        J,      N,      2
+    beqz        J,      .L_\XW\()_N_1
+    ZERO_Y2
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    DLOAD_\X8
+    DGEMV_T_2x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3
+    bnez      I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_2_M_3
+    DLOAD_\X4
+    DGEMV_T_2x4
+    PTR_ALSL  X,        INC_X,      X,      2
+.L_\XW\()_N_2_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    fld.d   $f1,    X,      0x00
+    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12
+
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M8
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    PTR_ALSL        Y,      INC_Y,  Y,  1
+.L_\XW\()_N_1:
+    andi    J,      N,      1
+    beqz    J,      .L_END
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.d   $f3,    PA0,    0x00
+    fld.d   $f1,    X,      0x00
+    fmadd.d $f19,   $f3,    $f1,    $f19
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x08
+    bnez      I,      .L_\XW\()_N_1_M_L1
+    fld.d     $f3,    Y,      0x00
+    fmadd.d   $f3,    ALPHA,  $f19,  $f3
+    fst.d     $f3,    Y,      0x00
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 8, 24 + 3
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    xvreplve0.d     VALPHA, $xr0
+    move     X_ORG,  X
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    DGEMV_T GAP_0, X8, X4
+.L_GAP_1: /* if (incx != 1) */
+    DGEMV_T GAP_1, X8_GAP, X4_GAP
+.L_END:
+    pop_if_used 17 + 8, 24 + 3
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S
new file mode 100644
index 000000000..8876cbed9
--- /dev/null
+++ b/kernel/loongarch64/loongarch64_asm.S
@@ -0,0 +1,313 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#if __loongarch_grlen == 64
+#define LA_REG    int64_t
+#define REG_SIZE  8
+#define REG_LOG   3
+#define PTR_ADDI  addi.d
+#define PTR_ADD   add.d
+#define PTR_SUB   sub.d
+#define PTR_LD    ld.d
+#define PTR_ST    st.d
+#define PTR_SLLI  slli.d
+#define PTR_SRLI  srli.d
+#define PTR_ALSL  alsl.d
+#else
+#define LA_REG    int32_t
+#define REG_SIZE  4
+#define REG_LOG   2
+#define PTR_ADDI  addi.w
+#define PTR_ADD   add.w
+#define PTR_SUB   sub.w
+#define PTR_LD    ld.w
+#define PTR_ST    st.w
+#define PTR_SLLI  slli.w
+#define PTR_SRLI  srli.w
+#define PTR_ALSL  alsl.w
+#endif
+
+#if __loongarch_frlen == 64
+#define FREG_SIZE 8
+#define FREG_LOG  3
+#define PTR_FLD   fld.d
+#define PTR_FST   fst.d
+#else
+#define FREG_SIZE 4
+#define FREG_LOG  2
+#define PTR_FLD   fld.s
+#define PTR_FST   fst.s
+#endif
+
+// The max registers available to the user which
+// do not need to be preserved across calls.
+// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
+#define MAX_INT_CALLER_SAVED 17
+#define MAX_FP_CALLER_SAVED  24
+
+.altmacro // Enable alternate macro mode
+
+.macro push_if_used regs, fregs
+.if \regs > MAX_INT_CALLER_SAVED
+    PTR_ADDI      $sp,    $sp,    -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
+    push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
+.endif
+.if \fregs > MAX_FP_CALLER_SAVED
+    PTR_ADDI      $sp,    $sp,    -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
+    push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
+.endif
+.endm // End push_if_used
+.macro pop_if_used regs, fregs
+.if \fregs > MAX_FP_CALLER_SAVED
+    pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
+    PTR_ADDI      $sp,    $sp,    (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
+.endif
+.if \regs > MAX_INT_CALLER_SAVED
+    pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
+    PTR_ADDI      $sp,    $sp,    (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
+.endif
+.endm // End pop_if_used
+.macro push_regs from, to
+    PTR_ST    $s\()\from,     $sp,    \from << REG_LOG
+.if \to - \from
+    push_regs %from + 1, \to
+.endif
+.endm // End push_regs
+.macro pop_regs from, to
+    PTR_LD    $s\()\from,     $sp,    \from << REG_LOG
+.if \to - \from
+    pop_regs %from + 1, \to
+.endif
+.endm // End pop_regs
+.macro push_fregs from, to
+    PTR_FST   $fs\()\from,    $sp,    \from << FREG_LOG
+.if \to - \from
+    push_fregs %from + 1, \to
+.endif
+.endm // End push_fregs
+.macro pop_fregs from, to
+    PTR_FLD   $fs\()\from,    $sp,    \from << FREG_LOG
+.if \to - \from
+    pop_fregs %from + 1, \to
+.endif
+.endm // End pop_fregs
+
+//
+// Instruction Related Macros
+//
+// GLD
+//
+.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ld    \out,   \src,   \offset
+.else
+    \pre_op\()ld.\suf_op    \out,   \src,   \offset
+.endif
+.ifnb \more
+    GLD \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// GLD_INC
+//
+.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ld    \out,   \src,   \offset
+.else
+    \pre_op\()ld.\suf_op    \out,   \src,   \offset
+.endif
+    PTR_ADDI  \src,   \src,   \inc
+.ifnb \more
+    GLD_INC \pre_op, \suf_op, \inc, \more
+.endif
+.endm
+//
+// GLDX is same as GLD except the stride is a register
+//
+.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ldx    \out,   \src,   \offset
+.else
+    \pre_op\()ldx.\suf_op    \out,   \src,   \offset
+.endif
+.ifnb \more
+    GLDX \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GLDREPL
+//
+.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
+    \pre_op\()ldrepl.\suf_op    \out,   \src,   \offset
+.ifnb \more
+    GLDREPL  \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GST
+//
+.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()st    \src,   \dst,   \offset
+.else
+    \pre_op\()st.\suf_op \src,  \dst, \offset
+.endif
+.ifnb \more
+    GST \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GMUL
+//
+.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()mul.\suf_op   \out,   \in0,   \in1
+.ifnb \more
+    GMUL \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GMADD
+//
+.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
+    \pre_op\()madd.\suf_op \out, \in0, \in1, \in2
+.ifnb \more
+    GMADD \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GADD
+//
+.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()add.\suf_op \out, \in0, \in1
+.ifnb \more
+    GADD \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GADDI
+//
+.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()addi.\suf_op  \out,   \in0,   \in1
+.ifnb \more
+    GADDI \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GSLLI
+//
+.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()slli.\suf_op  \out,   \in0,   \in1
+.ifnb \more
+    GSLLI \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GINSVE0
+//
+.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()insve0.\suf_op    \out,   \in0,   \in1
+.ifnb \more
+    GINSVE0 \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GXOR
+//
+.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()xor.\suf_op    \out,   \in0,   \in1
+.ifnb \more
+    GXOR \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// Compound instructions
+//
+// GACC: Accumulate the values of vector registers
+//
+.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvpermi.q              \out,   \in,    0x01
+    \pre_op\()add.\suf_op  \in,    \out,   \in
+    xvpackod.d             \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifeqs "\suf_op", "s"
+    xvpackod.w             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+
+.ifeqs "\pre_op", "vf"
+    vpackod.d              \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifeqs "\suf_op", "s"
+    vpackod.w              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+
+.ifeqs "\pre_op", "xv"
+    xvpermi.q              \out,   \in,    0x01
+    \pre_op\()add.\suf_op  \in,    \out,   \in
+    xvpackod.d             \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "d"
+    xvpackod.w             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "w"
+    xvpackod.h             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "h"
+    xvpackod.b             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+.endif
+.endif
+
+.ifeqs "\pre_op", "v"
+    vpackod.d              \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "d"
+    vpackod.w              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "w"
+    vpackod.h              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "h"
+    vpackod.b              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+.endif
+.endif
+
+.ifnb \more
+    GACC \pre_op, \suf_op, \more
+.endif
+.endm

From 4d0f000db63fedbe62ca318bb7bcd5d3152fc4a2 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Mon, 7 Aug 2023 16:55:59 +0800
Subject: [PATCH 259/718] MIPS: Enable MSA

---
 kernel/mips/KERNEL.P5600         | 34 ++++++++++-----------
 kernel/mips64/KERNEL.LOONGSON3R4 | 32 ++++++++++----------
 param.h                          | 52 ++++++++++++++++----------------
 3 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600
index 9a6e06d67..2e1e3390d 100644
--- a/kernel/mips/KERNEL.P5600
+++ b/kernel/mips/KERNEL.P5600
@@ -35,7 +35,7 @@ DSUMKERNEL  = ../mips/sum.c
 CSUMKERNEL  = ../mips/zsum.c
 ZSUMKERNEL  = ../mips/zsum.c
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SASUMKERNEL  = ../mips/sasum_msa.c
 DASUMKERNEL  = ../mips/dasum_msa.c
 CASUMKERNEL  = ../mips/casum_msa.c
@@ -47,7 +47,7 @@ CASUMKERNEL  = ../mips/zasum.c
 ZASUMKERNEL  = ../mips/zasum.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SAXPYKERNEL  = ../mips/saxpy_msa.c
 DAXPYKERNEL  = ../mips/daxpy_msa.c
 CAXPYKERNEL  = ../mips/caxpy_msa.c
@@ -59,7 +59,7 @@ CAXPYKERNEL  = ../mips/zaxpy.c
 ZAXPYKERNEL  = ../mips/zaxpy.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SCOPYKERNEL  = ../mips/scopy_msa.c
 DCOPYKERNEL  = ../mips/dcopy_msa.c
 CCOPYKERNEL  = ../mips/ccopy_msa.c
@@ -71,7 +71,7 @@ CCOPYKERNEL  = ../mips/zcopy.c
 ZCOPYKERNEL  = ../mips/zcopy.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SDOTKERNEL   = ../mips/sdot_msa.c
 DDOTKERNEL   = ../mips/ddot_msa.c
 CDOTKERNEL   = ../mips/cdot_msa.c
@@ -88,7 +88,7 @@ DNRM2KERNEL  = ../mips/nrm2.c
 CNRM2KERNEL  = ../mips/znrm2.c
 ZNRM2KERNEL  = ../mips/znrm2.c
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SROTKERNEL   = ../mips/srot_msa.c
 DROTKERNEL   = ../mips/drot_msa.c
 CROTKERNEL   = ../mips/crot_msa.c
@@ -100,7 +100,7 @@ CROTKERNEL   = ../mips/zrot.c
 ZROTKERNEL   = ../mips/zrot.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SSCALKERNEL  = ../mips/sscal_msa.c
 DSCALKERNEL  = ../mips/dscal_msa.c
 CSCALKERNEL  = ../mips/cscal_msa.c
@@ -112,7 +112,7 @@ CSCALKERNEL  = ../mips/zscal.c
 ZSCALKERNEL  = ../mips/zscal.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SSWAPKERNEL  = ../mips/sswap_msa.c
 DSWAPKERNEL  = ../mips/dswap_msa.c
 CSWAPKERNEL  = ../mips/cswap_msa.c
@@ -124,7 +124,7 @@ CSWAPKERNEL  = ../mips/zswap.c
 ZSWAPKERNEL  = ../mips/zswap.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMVNKERNEL = ../mips/sgemv_n_msa.c
 DGEMVNKERNEL = ../mips/dgemv_n_msa.c
 CGEMVNKERNEL = ../mips/cgemv_n_msa.c
@@ -136,7 +136,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c
 ZGEMVNKERNEL = ../mips/zgemv_n.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMVTKERNEL = ../mips/sgemv_t_msa.c
 DGEMVTKERNEL = ../mips/dgemv_t_msa.c
 CGEMVTKERNEL = ../mips/cgemv_t_msa.c
@@ -148,7 +148,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c
 ZGEMVTKERNEL = ../mips/zgemv_t.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMMKERNEL    = ../mips/sgemm_kernel_8x8_msa.c
 SGEMMONCOPY    = ../mips/sgemm_ncopy_8_msa.c
 SGEMMOTCOPY    = ../mips/sgemm_tcopy_8_msa.c
@@ -162,7 +162,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o
 SGEMMOTCOPYOBJ = sgemm_otcopy.o
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 DGEMMKERNEL    = ../mips/dgemm_kernel_8x4_msa.c
 DGEMMINCOPY    = ../mips/dgemm_ncopy_8_msa.c
 DGEMMITCOPY    = ../mips/dgemm_tcopy_8_msa.c
@@ -180,7 +180,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 CGEMMKERNEL    = ../mips/cgemm_kernel_8x4_msa.c
 CGEMMINCOPY    = ../mips/cgemm_ncopy_8_msa.c
 CGEMMITCOPY    = ../mips/cgemm_tcopy_8_msa.c
@@ -198,7 +198,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o
 CGEMMOTCOPYOBJ = cgemm_otcopy.o
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 ZGEMMKERNEL    = ../mips/zgemm_kernel_4x4_msa.c
 ZGEMMONCOPY    = ../mips/zgemm_ncopy_4_msa.c
 ZGEMMOTCOPY    = ../mips/zgemm_tcopy_4_msa.c
@@ -212,7 +212,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o
 ZGEMMOTCOPYOBJ = zgemm_otcopy.o
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
 STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
 STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
@@ -224,7 +224,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
 DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
 DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
@@ -236,7 +236,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@@ -248,7 +248,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4
index b81e5441d..1149d97f1 100644
--- a/kernel/mips64/KERNEL.LOONGSON3R4
+++ b/kernel/mips64/KERNEL.LOONGSON3R4
@@ -1,4 +1,4 @@
-ifdef HAVE_MSA
+ifndef NO_MSA
 SAXPYKERNEL = ../mips/saxpy_msa.c
 DAXPYKERNEL = ../mips/daxpy_msa.c
 CAXPYKERNEL = ../mips/caxpy_msa.c
@@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S
 DAXPYKERNEL = daxpy_loongson3a_simd.S
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SCOPYKERNEL  = ../mips/scopy_msa.c
 DCOPYKERNEL  = ../mips/dcopy_msa.c
 CCOPYKERNEL  = ../mips/ccopy_msa.c
 ZCOPYKERNEL  = ../mips/zcopy_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SDOTKERNEL   = ../mips/sdot_msa.c
 DDOTKERNEL   = ../mips/ddot_msa.c
 CDOTKERNEL   = ../mips/cdot_msa.c
@@ -23,21 +23,21 @@ ZDOTKERNEL   = ../mips/zdot_msa.c
 endif
 DSDOTKERNEL  = ../mips/dot.c
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SROTKERNEL   = ../mips/srot_msa.c
 DROTKERNEL   = ../mips/drot_msa.c
 CROTKERNEL   = ../mips/crot_msa.c
 ZROTKERNEL   = ../mips/zrot_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SSCALKERNEL  = ../mips/sscal_msa.c
 DSCALKERNEL  = ../mips/dscal_msa.c
 CSCALKERNEL  = ../mips/cscal_msa.c
 ZSCALKERNEL  = ../mips/zscal_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMVNKERNEL = ../mips/sgemv_n_msa.c
 DGEMVNKERNEL = ../mips/dgemv_n_msa.c
 SGEMVTKERNEL = ../mips/sgemv_t_msa.c
@@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c
 ZGEMVTKERNEL = zgemv_t_loongson3a.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SASUMKERNEL  = ../mips/sasum_msa.c
 DASUMKERNEL  = ../mips/dasum_msa.c
 CASUMKERNEL  = ../mips/casum_msa.c
 ZASUMKERNEL  = ../mips/zasum_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SSWAPKERNEL  = ../mips/sswap_msa.c
 DSWAPKERNEL  = ../mips/dswap_msa.c
 CSWAPKERNEL  = ../mips/cswap_msa.c
 ZSWAPKERNEL  = ../mips/zswap_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMMKERNEL    = ../mips/sgemm_kernel_8x8_msa.c
 SGEMMONCOPY    = ../mips/sgemm_ncopy_8_msa.c
 SGEMMOTCOPY    = ../mips/sgemm_tcopy_8_msa.c
@@ -89,7 +89,7 @@ SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 DGEMMKERNEL    = ../mips/dgemm_kernel_8x4_msa.c
 DGEMMINCOPY    = ../mips/dgemm_ncopy_8_msa.c
 DGEMMITCOPY    = ../mips/dgemm_tcopy_8_msa.c
@@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 CGEMMKERNEL    = ../mips/cgemm_kernel_8x4_msa.c
 CGEMMINCOPY    = ../mips/cgemm_ncopy_8_msa.c
 CGEMMITCOPY    = ../mips/cgemm_tcopy_8_msa.c
@@ -129,7 +129,7 @@ CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 ZGEMMKERNEL    = ../mips/zgemm_kernel_4x4_msa.c
 ZGEMMONCOPY    = ../mips/zgemm_ncopy_4_msa.c
 ZGEMMOTCOPY    = ../mips/zgemm_tcopy_4_msa.c
@@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
 STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
 STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
@@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
 DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
 DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
@@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
diff --git a/param.h b/param.h
index 547463b2f..a4672ce07 100644
--- a/param.h
+++ b/param.h
@@ -2748,19 +2748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
 
-#ifdef HAVE_MSA
-#define SGEMM_DEFAULT_UNROLL_M  8
-#define SGEMM_DEFAULT_UNROLL_N  8
-
-#define DGEMM_DEFAULT_UNROLL_M  8
-#define DGEMM_DEFAULT_UNROLL_N  4
-
-#define CGEMM_DEFAULT_UNROLL_M  8
-#define CGEMM_DEFAULT_UNROLL_N  4
-
-#define ZGEMM_DEFAULT_UNROLL_M  4
-#define ZGEMM_DEFAULT_UNROLL_N  4
-#else
+#if defined(NO_MSA)
 #define SGEMM_DEFAULT_UNROLL_M  8
 #define SGEMM_DEFAULT_UNROLL_N  4
 
@@ -2772,6 +2760,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define ZGEMM_DEFAULT_UNROLL_M  2
 #define ZGEMM_DEFAULT_UNROLL_N  2
+#else
+#define SGEMM_DEFAULT_UNROLL_M  8
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
+
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
+
+#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_N  4
 #endif
 
 #define SGEMM_DEFAULT_P	64
@@ -2961,19 +2961,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B  0
 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL
 
-#if defined(HAVE_MSA)
-#define SGEMM_DEFAULT_UNROLL_M  8
-#define SGEMM_DEFAULT_UNROLL_N  8
-
-#define DGEMM_DEFAULT_UNROLL_M  8
-#define DGEMM_DEFAULT_UNROLL_N  4
-
-#define CGEMM_DEFAULT_UNROLL_M  8
-#define CGEMM_DEFAULT_UNROLL_N  4
-
-#define ZGEMM_DEFAULT_UNROLL_M  4
-#define ZGEMM_DEFAULT_UNROLL_N  4
-#else
+#if defined(NO_MSA)
 #define SGEMM_DEFAULT_UNROLL_M  2
 #define SGEMM_DEFAULT_UNROLL_N  2
 
@@ -2985,6 +2973,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define ZGEMM_DEFAULT_UNROLL_M  2
 #define ZGEMM_DEFAULT_UNROLL_N  2
+#else
+#define SGEMM_DEFAULT_UNROLL_M  8
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
+
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
+
+#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_N  4
 #endif
 
 #define SGEMM_DEFAULT_P  128

From 4cc804c75434ffba0dd6cdd4f5ed4b332f525d77 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 9 Aug 2023 16:13:23 +0200
Subject: [PATCH 260/718] Prepare for INCX < 0 in new NRM2 implementation from
 BLAS 3.10

---
 interface/nrm2.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/interface/nrm2.c b/interface/nrm2.c
index dc8c08e9a..331ebc3d0 100644
--- a/interface/nrm2.c
+++ b/interface/nrm2.c
@@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
 
   if (n <= 0) return 0.;
 
+#ifndef COMPLEX
+  if (n == 1)
+#ifdef DOUBLE
+    return fabs(x[0]);
+#else
+    return fabsf(x[0]);
+#endif
+#endif
+
+  if (incx < 0) 
+#ifdef COMPLEX    
+    x -= (n - 1) * incx * 2;
+#else
+    x -= (n - 1) * incx;
+#endif
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
@@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
 
   if (n <= 0) return 0.;
 
+  #ifndef COMPLEX
+  if (n == 1)
+#ifdef DOUBLE
+    return fabs(x[0]);
+#else
+    return fabsf(x[0]);
+#endif
+#endif
+
+  if (incx < 0) 
+#ifdef COMPLEX    
+    x -= (n - 1) * incx * 2;
+#else
+    x -= (n - 1) * incx;
+#endif
+  
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();

From a2d867f4d1257c4c47b5b7afacbbc554182885bf Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 16:49:05 +0200
Subject: [PATCH 261/718] Allow negative iNCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/arm/nrm2.c  | 2 +-
 kernel/arm/znrm2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c
index fcff09337..8cc189fe3 100644
--- a/kernel/arm/nrm2.c
+++ b/kernel/arm/nrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT absxi = 0.0;
 
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 	if ( n == 1 ) return( ABS(x[0]) );
 
 	n *= inc_x;
diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c
index fc1c8b54a..28bb0eda5 100644
--- a/kernel/arm/znrm2.c
+++ b/kernel/arm/znrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG inc_x2;
 	FLOAT temp;
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 
 	inc_x2 = 2 * inc_x;
 

From d15ffb7fdfce1609c0fddd813bc6c4236150c445 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 16:50:44 +0200
Subject: [PATCH 262/718] Allow negative INCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/loongarch64/cnrm2.S | 2 +-
 kernel/loongarch64/dnrm2.S | 2 +-
 kernel/loongarch64/snrm2.S | 2 +-
 kernel/loongarch64/znrm2.S | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S
index 9d27987e1..41667485a 100644
--- a/kernel/loongarch64/cnrm2.S
+++ b/kernel/loongarch64/cnrm2.S
@@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    fmov.d s2, s1
    bge $r0,    N, .L999
    slli.d INCX, INCX, ZBASE_SHIFT
-   bge $r0,    INCX, .L999
+   beq $r0,    INCX, .L999
    srai.d  I, N, 2
    bge $r0,    I, .L25
    LD a1,  X,   0 * SIZE
diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S
index ff937ae53..2160b93a6 100644
--- a/kernel/loongarch64/dnrm2.S
+++ b/kernel/loongarch64/dnrm2.S
@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    MTC  s1, $r0
    bge $r0,    N, .L999
    slli.d INCX, INCX, BASE_SHIFT
-   bge $r0,    INCX, .L999
+   beq $r0,    INCX, .L999
    move    XX, X
    NOP
    LD a1,  X,   0 * SIZE
diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S
index 57c21a017..8c5c91ade 100644
--- a/kernel/loongarch64/snrm2.S
+++ b/kernel/loongarch64/snrm2.S
@@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    fmov.d s2, s1
    bge $r0,    N, .L999
    slli.d INCX, INCX, BASE_SHIFT
-   bge $r0,    INCX, .L999
+   beq $r0,    INCX, .L999
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0,    I, .L15
diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S
index 49f640268..8e2165ab7 100644
--- a/kernel/loongarch64/znrm2.S
+++ b/kernel/loongarch64/znrm2.S
@@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    MTC  s1, $r0
    bge $r0,    N, .L999
    slli.d INCX, INCX, ZBASE_SHIFT
-   bge $r0,    INCX, .L999
+   beq $r0,    INCX, .L999
    move    XX, X
    MOV s2, s1
    srai.d  I, N, 2

From f6921787921862a744e4366b7a42ffcab1993177 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 16:52:09 +0200
Subject: [PATCH 263/718] Allow negative INCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/mips/nrm2.c  | 2 +-
 kernel/mips/znrm2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c
index fcff09337..8cc189fe3 100644
--- a/kernel/mips/nrm2.c
+++ b/kernel/mips/nrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT absxi = 0.0;
 
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 	if ( n == 1 ) return( ABS(x[0]) );
 
 	n *= inc_x;
diff --git a/kernel/mips/znrm2.c b/kernel/mips/znrm2.c
index 85be39cd1..d11a6bd4a 100644
--- a/kernel/mips/znrm2.c
+++ b/kernel/mips/znrm2.c
@@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG inc_x2;
 	FLOAT temp;
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 
 	inc_x2 = 2 * inc_x;
 

From 7dd441d5db84c21723256976f275e2b3bf59533c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 16:53:33 +0200
Subject: [PATCH 264/718] Allow negative INCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/mips64/cnrm2.S | 2 +-
 kernel/mips64/dnrm2.S | 2 +-
 kernel/mips64/snrm2.S | 2 +-
 kernel/mips64/znrm2.S | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S
index 76fa9c295..159f9bea9 100644
--- a/kernel/mips64/cnrm2.S
+++ b/kernel/mips64/cnrm2.S
@@ -77,7 +77,7 @@
 	blez	N, .L999
 	mov.d	s2, s1
 
-	blez	INCX, .L999
+	beqz	INCX, .L999
 	dsll	INCX, INCX, ZBASE_SHIFT
 
 	dsra	I, N, 2
diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S
index cd40414a2..1b55d9fc3 100644
--- a/kernel/mips64/dnrm2.S
+++ b/kernel/mips64/dnrm2.S
@@ -81,7 +81,7 @@
 	blez	N, .L999
 	MTC	$0,  s1
 
-	blez	INCX, .L999
+	beqz	INCX, .L999
 	dsll	INCX, INCX, BASE_SHIFT
 
 	move	XX, X
diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S
index 1ba061a7d..f18151b5c 100644
--- a/kernel/mips64/snrm2.S
+++ b/kernel/mips64/snrm2.S
@@ -77,7 +77,7 @@
 	blez	N, .L999
 	mov.d	s2, s1
 
-	blez	INCX, .L999
+	beqz	INCX, .L999
 	dsll	INCX, INCX, BASE_SHIFT
 
 	bne	INCX, TEMP, .L20
diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S
index 1c247bca9..d33284a47 100644
--- a/kernel/mips64/znrm2.S
+++ b/kernel/mips64/znrm2.S
@@ -80,7 +80,7 @@
 	blez	N, .L999
 	MTC	$0,  s1
 
-	blez	INCX, .L999
+	beqz	INCX, .L999
 	dsll	INCX, INCX, ZBASE_SHIFT
 
 	move	XX, X

From 54d3246fc6f8282539bb9dd2be996f65ebb5a51e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 16:55:17 +0200
Subject: [PATCH 265/718] Allow negative INCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/power/cnrm2.S        | 2 +-
 kernel/power/cnrm2_hummer.S | 2 +-
 kernel/power/cnrm2_ppc440.S | 2 +-
 kernel/power/dnrm2_hummer.S | 2 +-
 kernel/power/dnrm2_ppc440.S | 2 +-
 kernel/power/nrm2.S         | 2 +-
 kernel/power/snrm2.S        | 2 +-
 kernel/power/snrm2_hummer.S | 2 +-
 kernel/power/snrm2_ppc440.S | 2 +-
 kernel/power/znrm2.S        | 2 +-
 kernel/power/znrm2_hummer.S | 2 +-
 kernel/power/znrm2_ppc440.S | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S
index c115650fd..74117a831 100644
--- a/kernel/power/cnrm2.S
+++ b/kernel/power/cnrm2.S
@@ -99,7 +99,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(9999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(9999)
+	beq-	LL(9999)
 
 	fmr	f0,  f1
 	fmr	f2,  f1
diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S
index 46c29c654..0d036b32f 100644
--- a/kernel/power/cnrm2_hummer.S
+++ b/kernel/power/cnrm2_hummer.S
@@ -119,7 +119,7 @@
 	cmpwi	cr0, N, 0
 	ble	LL(99)
 	cmpwi	cr0, INCX, 0
-	ble	LL(99)
+	beq	LL(99)
 
 	andi.	r0, X, 2 * SIZE - 1
 	bne	LL(100)
diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S
index c71c34b7c..8e3abf9f9 100644
--- a/kernel/power/cnrm2_ppc440.S
+++ b/kernel/power/cnrm2_ppc440.S
@@ -104,7 +104,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(999)
+	beq-	LL(999)
 
 	fmr	f0,  f1
 	sub	X, X, INCX
diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S
index 4931f5ab1..8638ca424 100644
--- a/kernel/power/dnrm2_hummer.S
+++ b/kernel/power/dnrm2_hummer.S
@@ -134,7 +134,7 @@
 	cmpwi	cr0, N, 0
 	ble	LL(99)
 	cmpwi	cr0, INCX, 0
-	ble	LL(99)
+	beq	LL(99)
 
 	mr	XX, X
 
diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S
index 849ca1f35..529f6adf0 100644
--- a/kernel/power/dnrm2_ppc440.S
+++ b/kernel/power/dnrm2_ppc440.S
@@ -111,7 +111,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(999)
+	beq-	LL(999)
 
 	mr	NN, N
 	mr	XX, X
diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S
index d9e1f4e9a..880b5d1b4 100644
--- a/kernel/power/nrm2.S
+++ b/kernel/power/nrm2.S
@@ -113,7 +113,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(9999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(9999)
+	beq-	LL(9999)
 
 	mr	NN, N
 	mr	XX, X
diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S
index be974cc48..696d404bb 100644
--- a/kernel/power/snrm2.S
+++ b/kernel/power/snrm2.S
@@ -97,7 +97,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(9999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(9999)
+	beq-	LL(9999)
 
 	fmr	f0,  f1
 	fmr	f2,  f1
diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S
index a0ff3d1b2..a4292af78 100644
--- a/kernel/power/snrm2_hummer.S
+++ b/kernel/power/snrm2_hummer.S
@@ -119,7 +119,7 @@
 	cmpwi	cr0, N, 0
 	ble	LL(99)
 	cmpwi	cr0, INCX, 0
-	ble	LL(99)
+	beq	LL(99)
 
 	cmpwi	cr0, INCX, SIZE
 	bne	LL(100)
diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S
index 0a80d1224..3547d7f47 100644
--- a/kernel/power/snrm2_ppc440.S
+++ b/kernel/power/snrm2_ppc440.S
@@ -105,7 +105,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(999)
+	beq-	LL(999)
 
 	fmr	f0,  f1
 	fmr	f2,  f1
diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S
index 60f379d25..3048e3480 100644
--- a/kernel/power/znrm2.S
+++ b/kernel/power/znrm2.S
@@ -105,7 +105,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(9999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(9999)
+	beq-	LL(9999)
 
 	mr	NN, N
 	mr	XX, X
diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S
index 1d0c598f8..4ef2212df 100644
--- a/kernel/power/znrm2_hummer.S
+++ b/kernel/power/znrm2_hummer.S
@@ -134,7 +134,7 @@
 	cmpwi	cr0, N, 0
 	ble	LL(99)
 	cmpwi	cr0, INCX, 0
-	ble	LL(99)
+	beq	LL(99)
 
 	mr	XX, X
 
diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S
index 778b805de..f775c3e62 100644
--- a/kernel/power/znrm2_ppc440.S
+++ b/kernel/power/znrm2_ppc440.S
@@ -112,7 +112,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(999)
+	beq-	LL(999)
 
 	mr	NN, N
 	mr	XX, X

From a34a0a7abceb6682c841c1dbd43728005ff7b940 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 16:56:52 +0200
Subject: [PATCH 266/718] Allow negative INCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/riscv64/nrm2.c  | 2 +-
 kernel/riscv64/znrm2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/riscv64/nrm2.c b/kernel/riscv64/nrm2.c
index fcff09337..8cc189fe3 100644
--- a/kernel/riscv64/nrm2.c
+++ b/kernel/riscv64/nrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT absxi = 0.0;
 
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 	if ( n == 1 ) return( ABS(x[0]) );
 
 	n *= inc_x;
diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c
index fc1c8b54a..28bb0eda5 100644
--- a/kernel/riscv64/znrm2.c
+++ b/kernel/riscv64/znrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG inc_x2;
 	FLOAT temp;
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 
 	inc_x2 = 2 * inc_x;
 

From c211da0688ddead349a14dfb26a760c02280b433 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 16:58:57 +0200
Subject: [PATCH 267/718] Allow negative INCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/sparc/cnrm2.S | 2 +-
 kernel/sparc/dnrm2.S | 2 +-
 kernel/sparc/snrm2.S | 2 +-
 kernel/sparc/znrm2.S | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sparc/cnrm2.S b/kernel/sparc/cnrm2.S
index 8dc4b56b6..0840c8848 100644
--- a/kernel/sparc/cnrm2.S
+++ b/kernel/sparc/cnrm2.S
@@ -76,7 +76,7 @@
 	FMOV	c1, t4
 
 	cmp	INCX, 0
-	ble	.LL20
+	beq	.LL20
 	sll	INCX, ZBASE_SHIFT, INCX
 
 	cmp	N, 0
diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S
index cf7522953..41e993440 100644
--- a/kernel/sparc/dnrm2.S
+++ b/kernel/sparc/dnrm2.S
@@ -107,7 +107,7 @@
 	FMOV	fzero, c1
 
 	cmp	INCX, 0
-	ble	.LL99
+	beq	.LL99
 	sll	INCX, BASE_SHIFT, INCX
 
 	add	%sp, -8, %sp
diff --git a/kernel/sparc/snrm2.S b/kernel/sparc/snrm2.S
index a80247259..a7405b6e1 100644
--- a/kernel/sparc/snrm2.S
+++ b/kernel/sparc/snrm2.S
@@ -76,7 +76,7 @@
 	FMOV	c1, t4
 
 	cmp	INCX, 0
-	ble	.LL20
+	beq	.LL20
 	sll	INCX, BASE_SHIFT, INCX
 
 	cmp	N, 0
diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S
index 065d22784..dae53ffe7 100644
--- a/kernel/sparc/znrm2.S
+++ b/kernel/sparc/znrm2.S
@@ -107,7 +107,7 @@
 	FMOV	fzero, c1
 
 	cmp	INCX, 0
-	ble	.LL99
+	beq	.LL99
 	sll	INCX, ZBASE_SHIFT, INCX
 
 	add	%sp, -8, %sp

From 07e32c4cb8044219c588dfd32c33cef28b6cfdd4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 17:00:18 +0200
Subject: [PATCH 268/718] Allow negative INCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/x86/nrm2.S      | 2 +-
 kernel/x86/nrm2_sse.S  | 2 +-
 kernel/x86/znrm2.S     | 2 +-
 kernel/x86/znrm2_sse.S | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S
index 7a14da862..3a6417462 100644
--- a/kernel/x86/nrm2.S
+++ b/kernel/x86/nrm2.S
@@ -78,7 +78,7 @@
 	testl	M, M
 	jle	.L999
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	sall	$BASE_SHIFT, INCX
 	fldz
diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S
index 0f174c408..129b41a03 100644
--- a/kernel/x86/nrm2_sse.S
+++ b/kernel/x86/nrm2_sse.S
@@ -69,7 +69,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	leal	(, INCX, SIZE), INCX
 	cmpl	$SIZE, INCX
diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S
index 263612e9a..7a65df77a 100644
--- a/kernel/x86/znrm2.S
+++ b/kernel/x86/znrm2.S
@@ -78,7 +78,7 @@
 	testl	M, M
 	jle	.L999
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	sall	$ZBASE_SHIFT, INCX
 	fldz
diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S
index bbc3677ae..4ad326120 100644
--- a/kernel/x86/znrm2_sse.S
+++ b/kernel/x86/znrm2_sse.S
@@ -69,7 +69,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	sall	$ZBASE_SHIFT, INCX
 

From 34da1a067dabf10851f9c5838df64c0ed6c6a6fa Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 10 Aug 2023 17:01:50 +0200
Subject: [PATCH 269/718] Allow negative INCX (API change from version 3.10 of
 the reference implementation)

---
 kernel/x86_64/nrm2.S      | 2 +-
 kernel/x86_64/nrm2_sse.S  | 2 +-
 kernel/x86_64/znrm2.S     | 2 +-
 kernel/x86_64/znrm2_sse.S | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S
index b79ac2adb..61cf8c452 100644
--- a/kernel/x86_64/nrm2.S
+++ b/kernel/x86_64/nrm2.S
@@ -58,7 +58,7 @@
 	testq	M, M
 	jle	.L999
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	salq	$BASE_SHIFT, INCX
 
diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S
index 33b1ee496..c1f3a45fc 100644
--- a/kernel/x86_64/nrm2_sse.S
+++ b/kernel/x86_64/nrm2_sse.S
@@ -57,7 +57,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	pxor	%xmm2, %xmm2
 	leaq	(, INCX, SIZE), INCX
diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S
index 0d2aa3480..748fde310 100644
--- a/kernel/x86_64/znrm2.S
+++ b/kernel/x86_64/znrm2.S
@@ -58,7 +58,7 @@
 	testq	M, M
 	jle	.L999
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	salq	$ZBASE_SHIFT, INCX
 
diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S
index f78b83f7e..2274f2e98 100644
--- a/kernel/x86_64/znrm2_sse.S
+++ b/kernel/x86_64/znrm2_sse.S
@@ -58,7 +58,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	xorq	FLAG, FLAG
 

From 8a8a8479be704a413233f27c7f13b2abc489f5b5 Mon Sep 17 00:00:00 2001
From: Ian McInerney <i.mcinerney17@imperial.ac.uk>
Date: Mon, 14 Aug 2023 15:41:28 +0100
Subject: [PATCH 270/718] Fix cooperlake and sapphire rapids march flags on
 clang

The march=cooperlake and march=sapphirerapids flags were never getting
added when building with Clang targetting those architectures. Instead
it was falling back to the skylake AVX512 implementation.

Clang added support for these two architectures in Clang 9 and Clang 12,
so introduce new checks for those versions to enable the appropriate
march flag, and fallback to skylake otherwise.
---
 Makefile.system |  5 ++++
 Makefile.x86_64 | 74 +++++++++++++++++++++++++++++++++----------------
 kernel/Makefile |  4 +--
 3 files changed, 57 insertions(+), 26 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 3c1648dc7..b3968d739 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
 endif
 
+ifeq ($(C_COMPILER), CLANG)
+CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
+CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
+endif
+
 #
 #  OS dependent settings
 #
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 7ab331b1f..702447ace 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -75,18 +75,31 @@ endif
 ifeq ($(CORE), COOPERLAKE)
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
-# cooperlake support was added in 10.1
-ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
-CCOMMON_OPT += -march=cooperlake
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=cooperlake
-endif
-else  # gcc not support, fallback to avx512
-CCOMMON_OPT += -march=skylake-avx512
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=skylake-avx512
-endif
-endif
+ # cooperlake support was added in 10.1
+ ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
+  CCOMMON_OPT += -march=cooperlake
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=cooperlake
+  endif
+ else  # gcc not support, fallback to avx512
+  CCOMMON_OPT += -march=skylake-avx512
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=skylake-avx512
+  endif
+ endif
+else ifeq ($(C_COMPILER), CLANG)
+ # cooperlake support was added in clang 9
+ ifeq ($(CLANGVERSIONGTEQ9), 1)
+  CCOMMON_OPT += -march=cooperlake
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=cooperlake
+  endif
+ else  # not supported in clang, fallback to avx512
+  CCOMMON_OPT += -march=skylake-avx512
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=skylake-avx512
+  endif
+ endif
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 CCOMMON_OPT += -fno-asynchronous-unwind-tables
@@ -104,18 +117,31 @@ endif
 ifeq ($(CORE), SAPPHIRERAPIDS)
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
-# sapphire rapids support was added in 11
-ifeq ($(GCCVERSIONGTEQ11), 1)
-CCOMMON_OPT += -march=sapphirerapids
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=sapphirerapids
-endif
-else  # gcc not support, fallback to avx512
-CCOMMON_OPT += -march=skylake-avx512
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=skylake-avx512
-endif
-endif
+ # sapphire rapids support was added in 11
+ ifeq ($(GCCVERSIONGTEQ11), 1)
+  CCOMMON_OPT += -march=sapphirerapids
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=sapphirerapids
+  endif
+ else  # gcc not support, fallback to avx512
+  CCOMMON_OPT += -march=skylake-avx512
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=skylake-avx512
+  endif
+ endif
+else ifeq ($(C_COMPILER), CLANG)
+ # cooperlake support was added in clang 12
+ ifeq ($(CLANGVERSIONGTEQ12), 1)
+  CCOMMON_OPT += -march=cooperlake
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=cooperlake
+  endif
+ else  # not supported in clang, fallback to avx512
+  CCOMMON_OPT += -march=skylake-avx512
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=skylake-avx512
+  endif
+ endif
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 CCOMMON_OPT += -fno-asynchronous-unwind-tables
diff --git a/kernel/Makefile b/kernel/Makefile
index d426a1bdb..795f25eec 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -33,7 +33,7 @@ endif
 ifdef TARGET_CORE
 ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
- ifeq ($(GCCVERSIONGTEQ11), 1) 
+ ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))
   override CFLAGS += -march=sapphirerapids
  else 
   override CFLAGS += -march=skylake-avx512 -mavx512f
@@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
  endif
 else ifeq ($(TARGET_CORE), COOPERLAKE)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
- ifeq ($(GCCVERSIONGTEQ10), 1) 
+ ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9)))
   override CFLAGS += -march=cooperlake
  else 
   override CFLAGS += -march=skylake-avx512 -mavx512f

From 0d30daa77275023fe3fe729349eaa85427a5416b Mon Sep 17 00:00:00 2001
From: TiborGY <tibor.gyori@chem.u-szeged.hu>
Date: Wed, 16 Aug 2023 00:07:17 +0200
Subject: [PATCH 271/718] Add junk from BF16 test to .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 1195bc9b8..8b27325db 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,6 +72,7 @@ test/SBLAT3.SUMM
 test/ZBLAT2.SUMM
 test/ZBLAT3.SUMM
 test/SHBLAT3.SUMM
+test/SBBLAT3.SUMM
 test/cblat1
 test/cblat2
 test/cblat3
@@ -82,6 +83,7 @@ test/sblat1
 test/sblat2
 test/sblat3
 test/test_shgemm
+test/test_sbgemm
 test/zblat1
 test/zblat2
 test/zblat3

From b5ba95a6c0a7043f4bc9e68e9da7063a2aedfbc5 Mon Sep 17 00:00:00 2001
From: TGY <tibor.gyori@chem.u-szeged.hu>
Date: Wed, 16 Aug 2023 00:48:40 +0200
Subject: [PATCH 272/718] Modernize obsolete inline order

---
 common.h            | 2 +-
 common_alpha.h      | 2 +-
 common_arm.h        | 2 +-
 common_arm64.h      | 2 +-
 common_power.h      | 2 +-
 common_sparc.h      | 2 +-
 common_x86.h        | 2 +-
 common_x86_64.h     | 2 +-
 common_zarch.h      | 2 +-
 kernel/power/lock.c | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/common.h b/common.h
index 4eeeb8d55..4074df069 100644
--- a/common.h
+++ b/common.h
@@ -525,7 +525,7 @@ static inline unsigned long long rpcc(void){
 #endif // !RPCC_DEFINED
 
 #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   do {
     while (*address) {YIELDING;};
diff --git a/common_alpha.h b/common_alpha.h
index 021eb93ae..e5380454a 100644
--- a/common_alpha.h
+++ b/common_alpha.h
@@ -45,7 +45,7 @@
 #define WMB asm("wmb")
 #define RMB asm("mb")
 
-static void __inline blas_lock(unsigned long *address){
+static __inline void blas_lock(unsigned long *address){
 #ifndef __DECC
   unsigned long tmp1, tmp2;
   asm volatile(
diff --git a/common_arm.h b/common_arm.h
index 682315de5..a3db9953c 100644
--- a/common_arm.h
+++ b/common_arm.h
@@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(ARMV6) || defined(ARMV7) || defined(ARMV8)
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   int register ret;
 
diff --git a/common_arm64.h b/common_arm64.h
index 6a18a294c..436ccb8f5 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef ASSEMBLER
 
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   BLASULONG ret;
 
diff --git a/common_power.h b/common_power.h
index a49197fd7..3fe776f23 100644
--- a/common_power.h
+++ b/common_power.h
@@ -91,7 +91,7 @@
 
 void *qalloc(int flags, size_t bytes);
 
-static void INLINE blas_lock(volatile unsigned long *address){
+static INLINE void blas_lock(volatile unsigned long *address){
 
   long int ret, val = 1;
 
diff --git a/common_sparc.h b/common_sparc.h
index 90a24ebf1..4b9e7840a 100644
--- a/common_sparc.h
+++ b/common_sparc.h
@@ -45,7 +45,7 @@
 
 #ifndef ASSEMBLER
 
-static void __inline blas_lock(volatile unsigned long *address){
+static __inline void blas_lock(volatile unsigned long *address){
 
   long int ret = 1;
 
diff --git a/common_x86.h b/common_x86.h
index bc77eca58..65fb9a460 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -54,7 +54,7 @@
 #define	__volatile__
 #endif
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   int ret;
 
diff --git a/common_x86_64.h b/common_x86_64.h
index 729a055ce..dda168d6c 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -70,7 +70,7 @@
 #define RMB
 #endif
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
 	
 #ifndef C_MSVC
diff --git a/common_zarch.h b/common_zarch.h
index 442bae821..80609251b 100644
--- a/common_zarch.h
+++ b/common_zarch.h
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef ASSEMBLER
 
   /*
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   BLASULONG ret;
 
diff --git a/kernel/power/lock.c b/kernel/power/lock.c
index 1c1b006b0..de28680d8 100644
--- a/kernel/power/lock.c
+++ b/kernel/power/lock.c
@@ -36,7 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
 #ifdef __GNUC__
 

From 79c15db348b7a9365a6abf3c32ebed3f4f1bbf62 Mon Sep 17 00:00:00 2001
From: Ian McInerney <i.mcinerney17@imperial.ac.uk>
Date: Mon, 14 Aug 2023 21:36:35 +0100
Subject: [PATCH 273/718] Fix power10 gcc intrinsic check

__builtin_vsx_assemble_pair was only in GCC 10-11.2 and was replaced by
__builtin_vsx_build_pair thereafter.
---
 kernel/power/dgemm_small_kernel_nn_power10.c | 2 +-
 kernel/power/dgemm_small_kernel_tn_power10.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c
index ecdc3e5c6..73f6d5b99 100644
--- a/kernel/power/dgemm_small_kernel_nn_power10.c
+++ b/kernel/power/dgemm_small_kernel_nn_power10.c
@@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t;
 
 #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
 
-#if (defined(__GNUC__) && (__GNUC__ == 10))
+#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)))
 #if defined(_AIX)
 #define LOAD_PAIR(pair, v0, v1)                             \
   __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c
index 93a942b02..426948185 100644
--- a/kernel/power/dgemm_small_kernel_tn_power10.c
+++ b/kernel/power/dgemm_small_kernel_tn_power10.c
@@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t;
 
 #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
 
-#if (defined(__GNUC__) && (__GNUC__ == 10))
+#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)))
 #if defined(_AIX)
 #define LOAD_PAIR(pair, v0, v1)                             \
   __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);

From 214be14c1d21b01713f3d89217de6a67dc23f369 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 18 Aug 2023 22:48:30 +0200
Subject: [PATCH 274/718] Correct INFO returned for lda in non-CBLAS s/dgeadd

---
 interface/geadd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/geadd.c b/interface/geadd.c
index f0befa14a..3a0ea015d 100644
--- a/interface/geadd.c
+++ b/interface/geadd.c
@@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
   info = 0;
 
 
-  if (lda < MAX(1, m))	info = 6;
+  if (lda < MAX(1, m))	info = 5;
   if (ldc < MAX(1, m))	info = 8;
 
   if (n < 0)		info = 2;

From 553cc1372fa4fca3b5f71a92fa6218478af3c714 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Fri, 18 Aug 2023 17:39:44 +0800
Subject: [PATCH 275/718] LoongArch64: Add sgemm_kernel

---
 kernel/loongarch64/KERNEL.LOONGSON3R5       |   15 +
 kernel/loongarch64/loongarch64_asm.S        |   94 +
 kernel/loongarch64/sgemm_kernel_16x8_lasx.S | 2325 +++++++++++++++++++
 kernel/loongarch64/sgemm_ncopy_16_lasx.S    |  463 ++++
 kernel/loongarch64/sgemm_ncopy_8_lasx.S     |  298 +++
 kernel/loongarch64/sgemm_tcopy_16_lasx.S    |  526 +++++
 kernel/loongarch64/sgemm_tcopy_8_lasx.S     |  406 ++++
 param.h                                     |   12 +-
 8 files changed, 4134 insertions(+), 5 deletions(-)
 create mode 100644 kernel/loongarch64/sgemm_kernel_16x8_lasx.S
 create mode 100644 kernel/loongarch64/sgemm_ncopy_16_lasx.S
 create mode 100644 kernel/loongarch64/sgemm_ncopy_8_lasx.S
 create mode 100644 kernel/loongarch64/sgemm_tcopy_16_lasx.S
 create mode 100644 kernel/loongarch64/sgemm_tcopy_8_lasx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 9d858584c..67d1fd11c 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -11,9 +11,24 @@ DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 DGEMVNKERNEL = dgemv_n_8_lasx.S
 DGEMVTKERNEL = dgemv_t_8_lasx.S
+
+SGEMMKERNEL  = sgemm_kernel_16x8_lasx.S
+SGEMMINCOPY  = sgemm_ncopy_16_lasx.S
+SGEMMITCOPY  = sgemm_tcopy_16_lasx.S
+SGEMMONCOPY  = sgemm_ncopy_8_lasx.S
+SGEMMOTCOPY  = sgemm_tcopy_8_lasx.S
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
 DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+STRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S
index 8876cbed9..89243c620 100644
--- a/kernel/loongarch64/loongarch64_asm.S
+++ b/kernel/loongarch64/loongarch64_asm.S
@@ -36,6 +36,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define PTR_ST    st.d
 #define PTR_SLLI  slli.d
 #define PTR_SRLI  srli.d
+#define PTR_SRAI  srai.d
+#define PTR_MUL   mul.d
 #define PTR_ALSL  alsl.d
 #else
 #define LA_REG    int32_t
@@ -48,6 +50,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define PTR_ST    st.w
 #define PTR_SLLI  slli.w
 #define PTR_SRLI  srli.w
+#define PTR_SRAI  srai.w
+#define PTR_MUL   mul.w
 #define PTR_ALSL  alsl.w
 #endif
 
@@ -218,6 +222,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endif
 .endm
 //
+// GSUB
+//
+.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()sub.\suf_op \out, \in0, \in1
+.ifnb \more
+    GSUB \pre_op, \suf_op, \more
+.endif
+.endm
+//
 // GSLLI
 //
 .macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
@@ -244,6 +257,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GXOR \pre_op, \suf_op, \more
 .endif
 .endm
+//
+// GPERMI
+//
+.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()permi.\suf_op     \out,   \in0,   \in1
+.ifnb \more
+    GPERMI \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GNMSUB
+//
+.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
+    \pre_op\()nmsub.\suf_op     \out,   \in0,   \in1,   \in2
+.ifnb \more
+    GNMSUB \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GPRELD
+//
+.macro GPRELD in0:req, in1:req, in2:req, more:vararg
+    preld \in0, \in1, \in2
+.ifnb \more
+    GPRELD \more
+.endif
+.endm
 
 //
 // Compound instructions
@@ -311,3 +351,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GACC \pre_op, \suf_op, \more
 .endif
 .endm
+//
+// GMOV
+//
+.macro GMOV pre_op:req, out:req, in:req, more:vararg
+    \pre_op\()or.v  \out, \in, \in
+.ifnb \more
+    GMOV \pre_op, \more
+.endif
+.endm
+
+//
+// Media Related Macros
+//
+.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1
+    \pre_op\()ilvl.\suf_op \out0, \in0, \in1
+    \pre_op\()ilvh.\suf_op \out1, \in0, \in1
+.endm
+.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1
+    \pre_op\()pickev.\suf_op \out0, \in0, \in1
+    \pre_op\()pickod.\suf_op \out1, \in0, \in1
+.endm
+
+//
+// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors,
+// has no pre_op param. 128-bit vector instructions are not supported.
+//
+.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
+                       vt0, vt1
+    GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0
+    GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2
+    GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3
+    GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02
+.endm
+
+.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \
+                       in0, in1, in2, in3, in4, in5, in6, in7, \
+                       tmp0, tmp1, tmp2, tmp3
+    GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0
+    GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1
+    GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0
+    GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2
+
+    GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4
+    GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5
+    GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0
+    GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2
+
+    GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3
+
+    GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \
+                  \out2, \out6, 0x02, \out3, \out7, 0x02, \
+                  \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \
+                  \out6, \tmp2, 0x31, \out7, \tmp3, 0x31
+.endm
diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
new file mode 100644
index 000000000..254dbe052
--- /dev/null
+++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
@@ -0,0 +1,2325 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+* 2023/08/23 guxiwei
+* Parameter:
+*       SGEMM_DEFAULT_UNROLL_N  8
+*       SGEMM_DEFAULT_UNROLL_M  16
+*       SGEMM_DEFAULT_P         256
+*       SGEMM_DEFAULT_Q         256
+*       SGEMM_DEFAULT_R         1024
+*       A_PRE                   1024
+*       B_PRE                   256    // Enable prefetching for B results in a performance decrease, temporarily disabled.
+*
+*
+* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000:
+*       1 thread:       71.7 GFLOPS
+*       2 threads:     142.6 GFLOPS
+*       3 threads:     211.5 GFLOPS
+*       4 threads:     265.0 GFLOPS
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA  $f0   // param 4: alpha
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+
+#ifdef TRMMKERNEL
+#define OFFSET $r11  // param 9: offset
+#endif
+#define OFF    $r12
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define C4     $r25
+#define C5     $r26
+#define C6     $r27
+#define C7     $r28
+#define T0     $r29
+#define T1     $r30
+#undef  ZERO
+#define ZERO   $r0
+
+/* LASX Vectors
+ * Store 16 sets of 32-bit data in A using UO and U1, with each register holding 8 data.
+ * Use X0 through X7 to store 8 sets of 32-bit data in B, with each register holding a broadcast value of a single data.
+ * Use D0 to D15 to store intermediate values of the computation.
+ * Use VALPHA to store the broadcast value of alpha
+ */
+#define U0     $xr0
+#define U1     $xr1
+#define X0     $xr2
+#define X1     $xr3
+#define X2     $xr4
+#define X3     $xr5
+#define X4     $xr6
+#define X5     $xr7
+#define X6     $xr8
+#define X7     $xr9
+#define D0     $xr10
+#define D1     $xr11
+#define D2     $xr12
+#define D3     $xr13
+#define D4     $xr14
+#define D5     $xr15
+#define D6     $xr16
+#define D7     $xr17
+#define D8     $xr18
+#define D9     $xr19
+#define D10    $xr20
+#define D11    $xr21
+#define D12    $xr22
+#define D13    $xr23
+#define D14    $xr24
+#define D15    $xr25
+#define VALPHA $xr26
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+// Loops outline:
+// .L_N8  <--------------------------------------------------------------------------------------------    /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */
+// |    .L_M16 <---------------------                                                                  |   /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16. */
+// |    |  .L_M16_TL1               |                                                                  |
+// |    |  .L_M16_L7                | The entire core loop of the function, KERNEK16x8                 |
+// |    |  .L_M16_L71               |                                                                  |
+// |    |  .L_M16_L0 ----------------                                                                  |
+// |    .L_M8                                                                                          |
+// |    |  .L_M8_TL1                |                                                                  |
+// |    |  .L_M8_L7                 | KERNEK8x8                                                        |
+// |    |  .L_M8_L71                |                                                                  |
+// |    |  .L_M8_L0                 |                                                                  |
+// |    .L_M4                                                                                          |
+// |    |  .L_M4_TL1                |                                                                  |
+// |    |  .L_M4_L7                 | KERNEK4x8                                                        |
+// |    |  .L_M4_L71                |                                                                  |
+// |    |  .L_M4_L0                 |                                                                  |
+// |    .L_M2                                                                                          |
+// |    |  .L_M2_TL1                |                                                                  |
+// |    |  .L_M2_L7                 | KERNEK2x8                                                        |
+// |    |  .L_M2_L71                |                                                                  |
+// |    |  .L_M2_L0                 |                                                                  |
+// |    .L_M1                                                                                          |
+// |    |  .L_M1_TL1                |                                                                  |
+// |    |  .L_M1_L7                 | KERNEK1x8                                                        |
+// |    |  .L_M1_L71                |                                                                  |
+// |    |  .L_M1_L0                 |                                                                  |
+// |    .L_M0------------------------------------------------------------------------------------------
+// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */
+// .L_N4
+// |    .L_N4_M16 <---------------------
+// |    |  .L_N4_M16_TL1               |
+// |    |  .L_N4_M16_L7                |  KERNEL16x4
+// |    |  .L_N4_M16_L71               |
+// |    |  .L_N4_M16_L0 ----------------
+// |    .L_N4_M8
+// |    |  .L_N4_M8_TL1                |
+// |    |  .L_N4_M8_L7                 | KERNEL8x4
+// |    |  .L_N4_M8_L71                |
+// |    |  .L_N4_M8_L0                 |
+// |    .L_N4_M4
+// |    |  .L_N4_M4_TL1                |
+// |    |  .L_N4_M4_L7                 | KERNEL4x4
+// |    |  .L_N4_M4_L71                |
+// |    |  .L_N4_M4_L0                 |
+// |    .L_N4_M2
+// |    |  .L_N4_M2_TL1                |
+// |    |  .L_N4_M2_L7                 | KERNEL2x4
+// |    |  .L_N4_M2_L71                |
+// |    |  .L_N4_M2_L0                 |
+// |    .L_N4_M1
+// |    |  .L_N4_M1_TL1                |
+// |    |  .L_N4_M1_L7                 | KERNEL1x4
+// |    |  .L_N4_M1_L71                |
+// |    |  .L_N4_M1_L0                 |
+// |    .L_N4_M0
+// .L_N3     /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */
+// .L_N2
+// |    .L_N2_M16 <---------------------
+// |    |  .L_N2_M16_TL1               |
+// |    |  .L_N2_M16_L7                | KERNEL16x2
+// |    |  .L_N2_M16_L71               |
+// |    |  .L_N2_M16_L0 ----------------
+// |    .L_N2_M8
+// |    |  .L_N2_M8_TL1                 |
+// |    |  .L_N2_M8_L7                  | KERNEL8x2
+// |    |  .L_N2_M8_L71                 |
+// |    |  .L_N2_M8_L0                  |
+// |    .L_N2_M4
+// |    |  .L_N2_M4_TL1                 |
+// |    |  .L_N2_M4_L7                  | KERNEL4x2
+// |    |  .L_N2_M4_L71                 |
+// |    |  .L_N2_M4_L0                  |
+// |    .L_N2_M2
+// |    |  .L_N2_M2_TL1                 |
+// |    |  .L_N2_M2_L7                  | KERNEL2x2
+// |    |  .L_N2_M2_L71                 |
+// |    |  .L_N2_M2_L0                  |
+// |    .L_N2_M1
+// |    |  .L_N2_M1_TL1                 |
+// |    |  .L_N2_M1_L7                  | KERNEL1x2
+// |    |  .L_N2_M1_L71                 |
+// |    |  .L_N2_M1_L0                  |
+// |    .L_N2_M0
+// .L_N1
+// |    .L_N1_M16 <---------------------
+// |    |  .L_N1_M16_TL1               |
+// |    |  .L_N1_M16_L7                | KERNEL16x1
+// |    |  .L_N1_M16_L71               |
+// |    |  .L_N1_M16_L0 ----------------
+// |    .L_N1_M8
+// |    |  .L_N1_M8_TL1                |
+// |    |  .L_N1_M8_L7                 | KERNEL8x1
+// |    |  .L_N1_M8_L71                |
+// |    |  .L_N1_M8_L0                 |
+// |    .L_N1_M4
+// |    |  .L_N1_M4_TL1                |
+// |    |  .L_N1_M4_L7                 | KERNEL4x1
+// |    |  .L_N1_M4_L71                |
+// |    |  .L_N1_M4_L0                 |
+// |    .L_N1_M2
+// |    |  .L_N1_M2_TL1                |
+// |    |  .L_N1_M2_L7                 | KERNEL2x1
+// |    |  .L_N1_M2_L71                |
+// |    |  .L_N1_M2_L0                 |
+// |    .L_N1_M1
+// |    |  .L_N1_M1_TL1                |
+// |    |  .L_N1_M1_L7                 | KERNEL1x1
+// |    |  .L_N1_M1_L71                |
+// |    |  .L_N1_M1_L0                 |
+// |    .L_N1_M0
+// .L_N0
+
+/*************** sgemm_kernel_macros ***************/
+.macro KERNEL1x16x8_START
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMUL xvf, s, D0, U0, X0, D1, U1, X0
+    preld   0,  C0, 0x00
+    GMUL xvf, s, D2, U0, X1, D3, U1, X1
+    preld   0,  C1, 0x00
+    GMUL xvf, s, D4, U0, X2, D5, U1, X2
+    preld   0,  C2, 0x00
+    GMUL xvf, s, D6, U0, X3, D7, U1, X3
+    preld   0,  C3, 0x00
+    GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+    GMUL xvf, s, D8,  U0, X4, D9,  U1, X4
+    preld   0,  C4, 0x00
+    GMUL xvf, s, D10, U0, X5, D11, U1, X5
+    preld   0,  C5, 0x00
+    GMUL xvf, s, D12, U0, X6, D13, U1, X6
+    preld   0,  C6, 0x00
+    GMUL xvf, s, D14, U0, X7, D15, U1, X7
+    preld   0,  C7, 0x00
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x20
+.endm
+
+.macro KERNEL1x16x8
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
+                  D2, U0, X1, D2, D3, U1, X1, D3
+    preld      0,   A0, A_PRE
+    GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \
+                  D6, U0, X3, D6, D7, U1, X3  D7
+    GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+    GMADD xvf, s, D8,  U0, X4, D8,  D9,  U1, X4, D9,  \
+                  D10, U0, X5, D10, D11, U1, X5, D11
+    //preld      0,   B0, B_PRE
+    GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \
+                  D14, U0, X7, D14, D15, U1, X7  D15
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x20
+.endm
+
+.macro KERNEL8x16x8
+.rept 8
+    KERNEL1x16x8
+.endr
+.endm
+
+.macro SAVE16x8
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D1,  D1,  VALPHA, D2,  D2,  VALPHA, D3,  D3,  VALPHA, \
+                 D4,  D4,  VALPHA, D5,  D5,  VALPHA, D6,  D6,  VALPHA, D7,  D7,  VALPHA, \
+                 D8,  D8,  VALPHA, D9,  D9,  VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \
+                 D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA
+#else
+    /* Load C0  */
+    GLD xv, , X0, C0, 0x00, X1, C0, 0x20
+    GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
+    /* Load C1  */
+    GLD xv, , X2, C1, 0x00, X3, C1, 0x20
+    GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
+    /* Load C2  */
+    GLD xv, , X4, C2, 0x00, X5, C2, 0x20
+    GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5
+    /* Load C3  */
+    GLD xv, , X6, C3, 0x00, X7, C3, 0x20
+    GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7
+    /* Load C4  */
+    GLD xv, , X0, C4, 0x00, X1, C4, 0x20
+    GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1
+    /* Load C5  */
+    GLD xv, , X2, C5, 0x00, X3, C5, 0x20
+    GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3
+    /* Load C6  */
+    GLD xv, , X4, C6, 0x00, X5, C6, 0x20
+    GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5
+    /* Load C7  */
+    GLD xv, , X6, C7, 0x00, X7, C7, 0x20
+    GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7
+#endif // #if defined(TRMMKERNEL)
+    GST xv, , D0,  C0, 0x00, D1,  C0, 0x20, \
+              D2,  C1, 0x00, D3,  C1, 0x20, \
+              D4,  C2, 0x00, D5,  C2, 0x20, \
+              D6,  C3, 0x00, D7,  C3, 0x20, \
+              D8,  C4, 0x00, D9,  C4, 0x20, \
+              D10, C5, 0x00, D11, C5, 0x20, \
+              D12, C6, 0x00, D13, C6, 0x20, \
+              D14, C7, 0x00, D15, C7, 0x20
+#if __loongarch_grlen == 64
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
+               C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
+#else
+    GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
+               C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
+#endif
+.endm
+
+// m = 8, 4, 2, 1
+// stride = 0x20, 0x10, 0x08, 0x04
+.macro KERNEL1xMx8_START m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMUL xvf, s, D0, U0, X0, D2, U0, X1, \
+                 D4, U0, X2, D6, U0, X3
+    GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+    GMUL xvf, s, D8,  U0, X4, D10, U0, X5, \
+                 D12, U0, X6, D14, U0, X7
+   PTR_ADDI   A0,     A0,     \stride
+   PTR_ADDI   B0,     B0,     0x20
+.endm
+
+.macro KERNEL1xMx8 m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1
+    GLD f, s, $f0, A0, 0x00
+.endif
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \
+                  D4, U0, X2, D4, D6, U0, X3, D6
+    GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+    GMADD xvf, s, D8,  U0, X4, D8,  D10, U0, X5, D10, \
+                  D12, U0, X6, D12, D14, U0, X7, D14
+   PTR_ADDI   A0,     A0,     \stride
+   PTR_ADDI   B0,     B0,     0x20
+.endm
+
+.macro KERNEL8xMx8 m, stride
+.rept 8
+    KERNEL1xMx8 \m, \stride
+.endr
+.endm
+
+.macro SAVEMx8 m, stride
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D2,  D2,  VALPHA, \
+                 D4,  D4,  VALPHA, D6,  D6,  VALPHA, \
+                 D8,  D8,  VALPHA, D10, D10, VALPHA, \
+                 D12, D12, VALPHA, D14, D14, VALPHA
+#else
+    /* Load C0, C1, C2, C3, C4, C5, C6, C7 */
+ .if \m == 8
+    GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00
+ .elseif \m == 4
+    GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
+ .endif
+    GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \
+                  D4, D4, VALPHA, X4, D6, D6, VALPHA, X6
+.if \m == 8
+    GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00
+.elseif \m == 4
+    GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00
+.endif
+    GMADD xvf, s, D8,  D8,  VALPHA, X0, D10, D10, VALPHA, X2, \
+                  D12, D12, VALPHA, X4, D14, D14, VALPHA, X6
+#endif // #if defined(TRMMKERNEL)
+.if \m == 8
+    GST xv, , D0,  C0, 0x00, D2,  C1, 0x00, \
+              D4,  C2, 0x00, D6,  C3, 0x00, \
+              D8,  C4, 0x00, D10, C5, 0x00, \
+              D12, C6, 0x00, D14, C7, 0x00
+.elseif \m == 4
+    GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \
+             $vr14, C2, 0x00, $vr16, C3, 0x00, \
+             $vr18, C4, 0x00, $vr20, C5, 0x00, \
+             $vr22, C6, 0x00, $vr24, C7, 0x00
+.elseif \m == 2
+    GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \
+              $f14, C2, 0x00, $f16, C3, 0x00, \
+              $f18, C4, 0x00, $f20, C5, 0x00, \
+              $f22, C6, 0x00, $f24, C7, 0x00
+.elseif \m == 1
+    GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \
+              $f14, C2, 0x00, $f16, C3, 0x00, \
+              $f18, C4, 0x00, $f20, C5, 0x00, \
+              $f22, C6, 0x00, $f24, C7, 0x00
+.endif
+#if __loongarch_grlen == 64
+    GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
+               C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
+#else
+    GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
+               C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
+#endif
+.endm
+
+.macro KERNEL1x16x4_START
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMUL xvf, s, D0, U0, X0, D1, U1, X0, \
+                 D2, U0, X1, D3, U1, X1, \
+                 D4, U0, X2, D5, U1, X2, \
+                 D6, U0, X3, D7, U1, X3
+   PTR_ADDI   A0,     A0,     0x40
+   PTR_ADDI   B0,     B0,     0x10
+.endm
+
+.macro KERNEL1x16x4
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
+                  D2, U0, X1, D2, D3, U1, X1, D3, \
+                  D4, U0, X2, D4, D5, U1, X2, D5, \
+                  D6, U0, X3, D6, D7, U1, X3  D7
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x10
+.endm
+
+.macro KERNEL8x16x4
+.rept 8
+    KERNEL1x16x4
+.endr
+.endm
+
+.macro SAVE16x4
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D1,  D1,  VALPHA, D2,  D2,  VALPHA, D3,  D3,  VALPHA, \
+                 D4,  D4,  VALPHA, D5,  D5,  VALPHA, D6,  D6,  VALPHA, D7,  D7,  VALPHA
+#else
+    /* Load C0  */
+    GLD xv, , X0, C0, 0x00, X1, C0, 0x20
+    GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
+    /* Load C1  */
+    GLD xv, , X2, C1, 0x00, X3, C1, 0x20
+    GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
+    /* Load C2  */
+    GLD xv, , X4, C2, 0x00, X5, C2, 0x20
+    GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5
+    /* Load C3  */
+    GLD xv, , X6, C3, 0x00, X7, C3, 0x20
+    GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7
+#endif // #if defined(TRMMKERNEL)
+    GST xv, , D0,  C0, 0x00, D1,  C0, 0x20, \
+              D2,  C1, 0x00, D3,  C1, 0x20, \
+              D4,  C2, 0x00, D5,  C2, 0x20, \
+              D6,  C3, 0x00, D7,  C3, 0x20
+#if __loongarch_grlen == 64
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
+#else
+    GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
+#endif
+.endm
+
+// m = 8, 4, 2, 1
+// stride = 0x20, 0x10, 0x08, 0x04
+.macro KERNEL1xMx4_START m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMUL xvf, s, D0, U0, X0, D2, U0, X1, \
+                 D4, U0, X2, D6, U0, X3
+   PTR_ADDI   A0,     A0,     \stride
+   PTR_ADDI   B0,     B0,     0x10
+.endm
+
+.macro KERNEL1xMx4 m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \
+                  D4, U0, X2, D4, D6, U0, X3, D6
+   PTR_ADDI   A0,     A0,     \stride
+   PTR_ADDI   B0,     B0,     0x10
+.endm
+
+.macro KERNEL8xMx4 m, stride
+.rept 8
+    KERNEL1xMx4 \m, \stride
+.endr
+.endm
+
+.macro SAVEMx4 m, stride
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D2,  D2,  VALPHA, \
+                 D4,  D4,  VALPHA, D6,  D6,  VALPHA
+#else
+    /* Load C0, C1, C2, C3 */
+ .if \m == 8
+    GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00
+ .elseif \m == 4
+    GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
+ .endif
+    GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \
+                  D4, D4, VALPHA, X4, D6, D6, VALPHA, X6
+#endif // #if defined(TRMMKERNEL)
+.if \m == 8
+    GST xv, , D0,  C0, 0x00, D2,  C1, 0x00, \
+              D4,  C2, 0x00, D6,  C3, 0x00
+.elseif \m == 4
+    GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \
+             $vr14, C2, 0x00, $vr16, C3, 0x00
+.elseif \m == 2
+    GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \
+              $f14, C2, 0x00, $f16, C3, 0x00
+.elseif \m == 1
+    GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \
+              $f14, C2, 0x00, $f16, C3, 0x00
+.endif
+#if __loongarch_grlen == 64
+    GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
+#else
+    GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
+#endif
+.endm
+
+.macro KERNEL1x16x2_START
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
+    GMUL xvf, s, D0, U0, X0, D1, U1, X0, \
+                 D2, U0, X1, D3, U1, X1
+   PTR_ADDI   A0,     A0,     0x40
+   PTR_ADDI   B0,     B0,     0x08
+.endm
+
+.macro KERNEL1x16x2
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
+    GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
+                  D2, U0, X1, D2, D3, U1, X1, D3
+   PTR_ADDI   A0,     A0,     0x40
+   PTR_ADDI   B0,     B0,     0x08
+.endm
+
+.macro KERNEL8x16x2
+.rept 8
+    KERNEL1x16x2
+.endr
+.endm
+
+.macro SAVE16x2
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D1,  D1,  VALPHA, D2,  D2,  VALPHA, D3,  D3,  VALPHA
+#else
+    /* Load C0  */
+    GLD xv, , X0, C0, 0x00, X1, C0, 0x20
+    GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
+    /* Load C1  */
+    GLD xv, , X2, C1, 0x00, X3, C1, 0x20
+    GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
+#endif // #if defined(TRMMKERNEL)
+    GST xv, , D0,  C0, 0x00, D1,  C0, 0x20, \
+              D2,  C1, 0x00, D3,  C1, 0x20
+#if __loongarch_grlen == 64
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40
+#else
+    GADDI , w, C0, C0, 0x40, C1, C1, 0x40
+#endif
+.endm
+
+// m = 8, 4, 2, 1
+// stride = 0x20, 0x10, 0x08, 0x04
+.macro KERNEL1xMx2_START m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
+    GMUL xvf, s, D0, U0, X0, D2, U0, X1
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x08
+.endm
+
+.macro KERNEL1xMx2 m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
+    GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x08
+.endm
+
+.macro KERNEL8xMx2 m, stride
+.rept 8
+    KERNEL1xMx2 \m, \stride
+.endr
+.endm
+
+.macro SAVEMx2 m, stride
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D2,  D2,  VALPHA
+#else
+    /* Load C0, C1 */
+ .if \m == 8
+    GLD xv, , X0, C0, 0x00, X2, C1, 0x00
+ .elseif \m == 4
+    GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00
+ .endif
+    GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2
+#endif // #if defined(TRMMKERNEL)
+.if \m == 8
+    GST xv, , D0,  C0, 0x00, D2,  C1, 0x00
+.elseif \m == 4
+    GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00
+.elseif \m == 2
+    GST f, d, $f10, C0, 0x00, $f12, C1, 0x00
+.elseif \m == 1
+    GST f, s, $f10, C0, 0x00, $f12, C1, 0x00
+.endif
+#if __loongarch_grlen == 64
+    GADDI , d, C0, C0, \stride, C1, C1, \stride
+#else
+    GADDI , w, C0, C0, \stride, C1, C1, \stride
+#endif
+.endm
+
+.macro KERNEL1x16x1_START
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+    GLDREPL xv, w, X0, B0, 0x00
+    GMUL xvf, s, D0, U0, X0, D1, U1, X0
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x04
+.endm
+
+.macro KERNEL1x16x1
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+    GLDREPL xv, w, X0, B0, 0x00
+    GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x04
+.endm
+
+.macro KERNEL8x16x1
+.rept 8
+    KERNEL1x16x1
+.endr
+.endm
+
+.macro SAVE16x1
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D1,  D1,  VALPHA
+#else
+    /* Load C0  */
+    GLD xv, , X0, C0, 0x00, X1, C0, 0x20
+    GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
+#endif // #if defined(TRMMKERNEL)
+    GST xv, , D0,  C0, 0x00, D1,  C0, 0x20
+#if __loongarch_grlen == 64
+    GADDI , d, C0, C0, 0x40
+#else
+    GADDI , w, C0, C0, 0x40
+#endif
+.endm
+
+// m = 8, 4, 2, 1
+// stride = 0x20, 0x10, 0x08, 0x04
+.macro KERNEL1xMx1_START m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00
+    GMUL xvf, s, D0, U0, X0
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x04
+.endm
+
+.macro KERNEL1xMx1 m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00
+    GMADD xvf, s, D0, U0, X0, D0
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x04
+.endm
+
+.macro KERNEL8xMx1 m, stride
+.rept 8
+    KERNEL1xMx1 \m, \stride
+.endr
+.endm
+
+.macro SAVEMx1 m, stride
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA
+#else
+    /* Load C0, C1 */
+ .if \m == 8
+    GLD xv, , X0, C0, 0x00
+ .elseif \m == 4
+    GLD v, , $vr2, C0, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C0, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C0, 0x00
+ .endif
+    GMADD xvf, s, D0, D0, VALPHA, X0
+#endif // #if defined(TRMMKERNEL)
+.if \m == 8
+    GST xv, , D0,  C0, 0x00
+.elseif \m == 4
+    GST v, , $vr10, C0, 0x00
+.elseif \m == 2
+    GST f, d, $f10, C0, 0x00
+.elseif \m == 1
+    GST f, s, $f10, C0, 0x00
+.endif
+#if __loongarch_grlen == 64
+    GADDI , d, C0, C0, \stride
+#else
+    GADDI , w, C0, C0, \stride
+#endif
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    xvreplve0.w   VALPHA,     $xr0
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    PTR_SUB   OFF,   ZERO,  OFFSET
+#else
+    xor     OFF,   OFF,   OFF
+#endif
+    /* if (!(N >> 3)) goto L_N7 */
+    PTR_SRAI   J,     N,     3     /* J = bn >> 3 */
+    andi     N,     N,     0x07
+    beq      ZERO,  J,     .L_N7
+.L_N8: /* J -- */
+    move     C0,    C
+    move     A0,    A
+    PTR_SLLI   T0,    LDC,   2
+    PTR_ADDI   J,     J,     -1   /* J-- */
+#if __loongarch_grlen == 64
+    GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
+              C6, C5, T0, C7, C6, T0
+#else
+    GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
+              C6, C5, T0, C7, C6, T0
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+    /* if (!(M >> 4)) goto L_M8 */
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_M8
+.align 5
+.L_M16: /* I-- */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x06
+    PTR_ADD    A0,    A0,   T0 /* A0 += 16 * OFF */
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0 /* B0 = B + 8 * OFF */
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  16
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1x16x8_START
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M16_L7 */
+    beq       ZERO,TL, .L_M16_L7
+.align 5
+.L_M16_TL1:
+    KERNEL8x16x8
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M16_TL1
+.L_M16_L7:
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M16_L0
+.align 5
+.L_M16_L71:
+    KERNEL1x16x8
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M16_L71
+.L_M16_L0:
+    SAVE16x8
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values  in A */
+    PTR_ADDI    L,   L,   -16
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x06
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF, OFF, 0x10 /* number of values in A */
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    PTR_ADDI    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_M16
+.L_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_M4
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    A0,    A0,   T0 /* A0 += 8 * OFF */
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0 /* B0 = B + 8 * OFF */
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  8
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif  // #if defined(TRMMKERNEL)
+    KERNEL1xMx8_START 8, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M8_L7 */
+    beq       ZERO,TL, .L_M8_L7
+.align 5
+.L_M8_TL1:
+    KERNEL8xMx8 8, 0x20
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M8_TL1
+.L_M8_L7:
+    /* if (!(L & 7)) goto L_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M8_L0
+.align 5
+.L_M8_L71:
+    KERNEL1xMx8 8, 0x20
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M8_L71
+.L_M8_L0:
+    SAVEMx8 8, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    L,   L,   -8
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_M2
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    A0,    A0,   T0 /* A0 += 4 * OFF */
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0 /* B0 = B + 8 * OFF */
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  4
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx8_START 4, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M4_L7 */
+    beq       ZERO,TL, .L_M4_L7
+.align 5
+.L_M4_TL1:
+    KERNEL8xMx8 4, 0x10
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_TL1
+.L_M4_L7:
+    /* if (!(L & 7)) goto L_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M4_L0
+.L_M4_L71:
+    KERNEL1xMx8 4, 0x10
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_L71
+.L_M4_L0:
+    SAVEMx8 4, 0x10
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    L,   L,   -4
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  2
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx8_START 2, 0x08
+
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M2_L7 */
+    beq       ZERO,TL, .L_M2_L7
+.align 5
+.L_M2_TL1:
+    KERNEL8xMx8 2, 0x08
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M2_TL1
+.L_M2_L7:
+    /* if (!(L & 7)) goto L_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M2_L0
+.align 5
+.L_M2_L71:
+    KERNEL1xMx8 2, 0x08
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M2_L71
+.L_M2_L0:
+    SAVEMx8 2, 0x08
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    L,   L,   -2
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  1
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx8_START 1, 0x04
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M1_L7 */
+    beq       ZERO,TL, .L_M1_L7
+.align 5
+.L_M1_TL1:
+    KERNEL8xMx8 1, 0x04
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M1_TL1
+.L_M1_L7:
+    /* if (!(L & 7)) goto L_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M1_L0
+.align 5
+.L_M1_L71:
+    KERNEL1xMx8 1, 0x04
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M1_L71
+.L_M1_L0:
+    SAVEMx8 1, 0x04
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    L,   L,   -1
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L_M0:
+    /* Add stride for B and C
+     * B += (K * 32)
+     * C += (LDC * 32)
+     */
+    PTR_SLLI    T0,   K,   5
+    PTR_SLLI    T1,   LDC, 5
+    PTR_ADD     B,    B,   T0
+    PTR_ADD     C,    C,   T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    PTR_ADDI    OFF,  OFF, 0x08 /* number of values in B */
+#endif
+    blt      ZERO,  J,   .L_N8
+
+.L_N7:
+    andi     J,    N,   4
+    beq      ZERO, J,   .L_N3
+.L_N4:
+    move     C0,    C
+    move     A0,    A
+    PTR_SLLI   T0,    LDC,   2
+#if __loongarch_grlen == 64
+    GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
+#else
+    GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 4)) goto L_N4_M8 */
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N4_M8
+.align 5
+.L_N4_M16:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x06
+    PTR_ADD    A0,    A0,   T0 /* A0 += 16 * OFF */
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0 /* B0 += 4 * OFF */
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  16
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1x16x4_START
+
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_L7 */
+    beq       ZERO,TL, .L_N4_M16_L7
+.align 5
+.L_N4_M16_TL1: /* TL-- */
+    KERNEL8x16x4
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M16_TL1
+.L_N4_M16_L7:
+    /* if (!(L & 7)) goto L_N4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M16_L0
+.align 5
+.L_N4_M16_L71:
+    KERNEL1x16x4
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M16_L71
+.L_N4_M16_L0:
+    SAVE16x4
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -16
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x06
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x10
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    PTR_ADDI    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N4_M16
+.L_N4_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_N4_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_N4_M4
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  8
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx4_START 8, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_M8_L7 */
+    beq       ZERO,TL, .L_N4_M8_L7
+.align 5
+.L_N4_M8_TL1: /* TL-- */
+    KERNEL8xMx4 8, 0x20
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M8_TL1
+.L_N4_M8_L7:
+    /* if (!(L & 7)) goto L_N4_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M8_L0
+.align 5
+.L_N4_M8_L71:
+    KERNEL1xMx4 8, 0x20
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M8_L71
+.L_N4_M8_L0:
+    SAVEMx4 8, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -8
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N4_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N4_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  4
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx4_START 4, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_M4_L7 */
+    beq       ZERO,TL, .L_N4_M4_L7
+.align 5
+.L_N4_M4_TL1: /* TL-- */
+    KERNEL8xMx4 4, 0x10
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M4_TL1
+.L_N4_M4_L7:
+    /* if (!(L & 7)) goto L_N4_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M4_L0
+.align 5
+.L_N4_M4_L71:
+    KERNEL1xMx4 4, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M4_L71
+.L_N4_M4_L0:
+    SAVEMx4 4, 0x10
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -4
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N4_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N4_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  2
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx4_START 2, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_M2_L7 */
+    beq       ZERO,TL, .L_N4_M2_L7
+.align 5
+.L_N4_M2_TL1: /* TL-- */
+    KERNEL8xMx4 2, 0x08
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M2_TL1
+.L_N4_M2_L7:
+    /* if (!(L & 7)) goto L_N4_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M2_L0
+.align 5
+.L_N4_M2_L71:
+    KERNEL1xMx4 2, 0x08
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M2_L71
+.L_N4_M2_L0:
+    SAVEMx4 2, 0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -2
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N4_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N4_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  1
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx4_START 1, 0x04
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_M1_L7 */
+    beq       ZERO,TL, .L_N4_M1_L7
+.align 5
+.L_N4_M1_TL1: /* TL-- */
+    KERNEL8xMx4 1, 0x04
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M1_TL1
+.L_N4_M1_L7:
+    /* if (!(L & 7)) goto L_N4_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M1_L0
+.align 5
+.L_N4_M1_L71:
+    KERNEL1xMx4 1, 0x04
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M1_L71
+.L_N4_M1_L0:
+    SAVEMx4 1, 0x04
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -1
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N4_M0:
+    /* Add stride for B and C
+     * B += 4 * K
+     * C += 4 * LDC
+     */
+    PTR_SLLI    T0,   K,   4
+    PTR_SLLI    T1,   LDC, 4
+    PTR_ADD     B,    B,   T0
+    PTR_ADD     C,    C,   T1
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    PTR_ADDI    OFF,  OFF, 0x04
+#endif
+    /* We must reinit I */
+    PTR_SRAI   I,     M,   4     /* I = bm >> 4 */
+.L_N3:
+    andi     J,    N,   2
+    beq      ZERO, J,   .L_N1
+
+.L_N2:
+    move     C0,    C
+    move     A0,    A
+    PTR_SLLI   T0,    LDC,   2
+    PTR_ADD    C1,    C0,    T0
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 4)) goto L_N2_M8 */
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N2_M8
+.align 5
+.L_N2_M16:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x06
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  16
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1x16x2_START
+
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M16_L7 */
+    beq       ZERO,TL, .L_N2_M16_L7
+.align 5
+.L_N2_M16_TL1: /* TL-- */
+    KERNEL8x16x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M16_TL1
+.L_N2_M16_L7:
+    /* if (!(L & 7)) goto L_N2_M16_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M16_L0
+.align 5
+.L_N2_M16_L71:
+    KERNEL1x16x2
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M16_L71
+.L_N2_M16_L0:
+    SAVE16x2
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -16
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x06
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x10
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    PTR_ADDI    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N2_M16
+.L_N2_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_N2_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_N2_M4
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  8
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx2_START 8, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M8_L7 */
+    beq       ZERO,TL, .L_N2_M8_L7
+.align 5
+.L_N2_M8_TL1: /* TL-- */
+    KERNEL8xMx2 8, 0x20
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M8_TL1
+.L_N2_M8_L7:
+    /* if (!(L & 7)) goto L_N2_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M8_L0
+.align 5
+.L_N2_M8_L71:
+    KERNEL1xMx2 8, 0x20
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M8_L71
+.L_N2_M8_L0:
+    SAVEMx2 8, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -8
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N2_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N2_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  4
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx2_START 4, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M4_L7 */
+    beq       ZERO,TL, .L_N2_M4_L7
+.align 5
+.L_N2_M4_TL1: /* TL-- */
+    KERNEL8xMx2 4, 0x10
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M4_TL1
+.L_N2_M4_L7:
+    /* if (!(L & 7)) goto L_N2_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M4_L0
+.align 5
+.L_N2_M4_L71:
+    KERNEL1xMx2 4, 0x10
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M4_L71
+.L_N2_M4_L0:
+    SAVEMx2 4, 0x10
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -4
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N2_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N2_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    A0,    A0,   T0
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  2
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx2_START 2, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M2_L7 */
+    beq       ZERO,TL, .L_N2_M2_L7
+.align 5
+.L_N2_M2_TL1: /* TL-- */
+    KERNEL8xMx2 2, 0x08
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M2_TL1
+.L_N2_M2_L7:
+    /* if (!(L & 7)) goto L_N2_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M2_L0
+.align 5
+.L_N2_M2_L71:
+    KERNEL1xMx2 2, 0x08
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M2_L71
+.L_N2_M2_L0:
+    SAVEMx2 2, 0x08
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -2
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     A0,  A0, T0
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N2_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N2_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  1
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx2_START 1, 0x04
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M1_L7 */
+    beq       ZERO,TL, .L_N2_M1_L7
+.align 5
+.L_N2_M1_TL1: /* TL-- */
+    KERNEL8xMx2 1, 0x04
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M1_TL1
+.L_N2_M1_L7:
+    /* if (!(L & 7)) goto L_N2_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M1_L0
+.align 5
+.L_N2_M1_L71:
+    KERNEL1xMx2 1, 0x04
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M1_L71
+.L_N2_M1_L0:
+    SAVEMx2 1, 0x04
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -1
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N2_M0:
+    /* Add stride for B and C
+     * B += 2 * K
+     * C += 2 * LDC
+     */
+    PTR_SLLI    T0,   K,   3
+    PTR_SLLI    T1,   LDC, 3
+    PTR_ADD     B,    B,   T0
+    PTR_ADD     C,    C,   T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    PTR_ADDI    OFF,  OFF, 0x02
+#endif
+    /* We must reinit I */
+    PTR_SRAI   I,     M,   4     /* I = bm >> 4 */
+.L_N1:
+    andi     J,    N,   1
+    beq      ZERO, J,   .L_N0
+    move     C0,    C
+    move     A0,    A
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+    /* if (!(M >> 4)) goto L_N1_M8 */
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N1_M8
+.L_N1_M16:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x06
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  16
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1x16x1_START
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M16_L7 */
+    beq       ZERO,TL, .L_N1_M16_L7
+.align 5
+.L_N1_M16_TL1: /* TL-- */
+    KERNEL8x16x1
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M16_TL1
+.L_N1_M16_L7:
+    /* if (!(L & 7)) goto L_N1_M16_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M16_L0
+.align 5
+.L_N1_M16_L71:
+    KERNEL1x16x1
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M16_L71
+.L_N1_M16_L0:
+    SAVE16x1
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -16
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x06
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x10
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    PTR_ADDI    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N1_M16
+.L_N1_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_N1_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_N1_M4
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  8
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx1_START 8, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M8_L7 */
+    beq       ZERO,TL, .L_N1_M8_L7
+.align 5
+.L_N1_M8_TL1: /* TL-- */
+    KERNEL8xMx1 8, 0x20
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M8_TL1
+.L_N1_M8_L7:
+    /* if (!(L & 7)) goto L_N1_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M8_L0
+.align 5
+.L_N1_M8_L71:
+    KERNEL1xMx1 8, 0x20
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M8_L71
+.L_N1_M8_L0:
+    SAVEMx1 8, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -8
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N1_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N1_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  4
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx1_START 4, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M4_L7 */
+    beq       ZERO,TL, .L_N1_M4_L7
+.align 5
+.L_N1_M4_TL1: /* TL-- */
+    KERNEL8xMx1 4, 0x10
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M4_TL1
+.L_N1_M4_L7:
+    /* if (!(L & 7)) goto L_N1_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M4_L0
+.align 5
+.L_N1_M4_L71:
+    KERNEL1xMx1 4, 0x10
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M4_L71
+.L_N1_M4_L0:
+    SAVEMx1 4, 0x10
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -4
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N1_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N1_M1
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  2
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx1_START 2, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M2_L7 */
+    beq       ZERO,TL, .L_N1_M2_L7
+.align 5
+.L_N1_M2_TL1: /* TL-- */
+    KERNEL8xMx1 2, 0x08
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M2_TL1
+.L_N1_M2_L7:
+    /* if (!(L & 7)) goto L_N1_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M2_L0
+.align 5
+.L_N1_M2_L71:
+    KERNEL1xMx1 2, 0x08
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M2_L71
+.L_N1_M2_L0:
+    SAVEMx1 2, 0x08
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -2
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L_N1_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N1_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    A0,    A0,   T0
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  1
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx1_START 1, 0x04
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M1_L7 */
+    beq       ZERO,TL, .L_N1_M1_L7
+.align 5
+.L_N1_M1_TL1: /* TL-- */
+    KERNEL8xMx1 1, 0x04
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M1_TL1
+.L_N1_M1_L7:
+    /* if (!(L & 7)) goto L_N1_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M1_L0
+.align 5
+.L_N1_M1_L71:
+    KERNEL1xMx1 1, 0x04
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M1_L71
+.L_N1_M1_L0:
+    SAVEMx1 1, 0x04
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -1
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     A0,  A0, T0
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N1_M0:
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S
new file mode 100644
index 000000000..266c07c5c
--- /dev/null
+++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S
@@ -0,0 +1,463 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define S9     $r20
+#define S10    $r23
+#define S11    $r24
+#define S12    $r25
+#define S13    $r26
+#define S14    $r27
+#define S15    $r28
+#define S16    $r29
+#define TD     $r30
+#define TS     $r31
+#define TL     $r7
+#define T0     $r6
+#undef  ZERO
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+
+// Loops outline
+//.L_N16 <-------------------
+//|   .L_M8:                |
+//|   .L_M7:                | Main Loop
+//|   .L_M1:                |
+//|   .L_M0: ---------------
+//.L_N15:
+//.L_N8:
+//|   .L_N8_M8:
+//|   .L_N8_M7:
+//|   .L_N8_M1:
+//.L_N7:
+//.L_N4:
+//|   .L_N4_M4:
+//|   .L_N4_M3:
+//|   .L_N4_M1:
+//.L_N3:
+//.L_N2:
+//|   .L_N2_M2:
+//|   .L_N2_M1:
+//.L_N1:
+//|   .L_N1_M1:
+//.L_N0
+
+    PROLOGUE
+    push_if_used 26, 32
+
+    move       TD,   DST
+    move       TS,   SRC
+    PTR_SLLI   TL,   LDA,  0x02
+    PTR_SLLI   T0,   TL,   0x01
+    PTR_SRAI   J,    N,    0x04
+    beq        J,    ZERO, .L_N15
+.align 5
+.L_N16:
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x03
+    PTR_ADD      S3,   S2,   TL
+    PTR_ADDI     J,    J,    -1
+    PTR_ADD      S4,   S3,   TL
+    PTR_ADD      S5,   S3,   T0
+    PTR_ADD      S6,   S4,   T0
+    PTR_ADD      S7,   S5,   T0
+    PTR_ADD      S8,   S6,   T0
+    PTR_ADD      S9,   S7,   T0
+    PTR_ADD      S10,  S8,   T0
+    PTR_ADD      S11,  S9,   T0
+    PTR_ADD      S12,  S10,  T0
+    PTR_ADD      S13,  S11,  T0
+    PTR_ADD      S14,  S12,  T0
+    PTR_ADD      S15,  S13,  T0
+    PTR_ADD      S16,  S14,  T0
+    PTR_ADD      TS,   S15,  T0
+    beq          I,    ZERO, .L_M7
+.align 5
+.L_M8:
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+    xvld       U8,   S9,   0x00
+    xvld       U9,   S10,  0x00
+    xvld       U10,  S11,  0x00
+    xvld       U11,  S12,  0x00
+    xvld       U12,  S13,  0x00
+    xvld       U13,  S14,  0x00
+    xvld       U14,  S15,  0x00
+    xvld       U15,  S16,  0x00
+
+    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
+                    U0, U1, U2, U3, U4, U5, U6, U7, \
+                    D1, D3, D5, D7 // As tmp
+    GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \
+                    U8, U9, U10, U11, U12, U13, U14, U15, \
+                    U0, U1, U2, U3 // As tmp
+    GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \
+              D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0
+    PTR_ADDI  TD, TD, 0x100
+    GST xv, , D8,  TD, 0x00, D9,  TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \
+              D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0
+    PTR_ADDI  TD, TD, 0x100
+    PTR_ADDI     S1,   S1,   0x20
+    PTR_ADDI     S2,   S2,   0x20
+    PTR_ADDI     S3,   S3,   0x20
+    PTR_ADDI     S4,   S4,   0x20
+    PTR_ADDI     S5,   S5,   0x20
+    PTR_ADDI     S6,   S6,   0x20
+    PTR_ADDI     S7,   S7,   0x20
+    PTR_ADDI     S8,   S8,   0x20
+    PTR_ADDI     S9,   S9,   0x20
+    PTR_ADDI     S10,  S10,  0x20
+    PTR_ADDI     S11,  S11,  0x20
+    PTR_ADDI     S12,  S12,  0x20
+    PTR_ADDI     S13,  S13,  0x20
+    PTR_ADDI     S14,  S14,  0x20
+    PTR_ADDI     S15,  S15,  0x20
+    PTR_ADDI     S16,  S16,  0x20
+
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_M8
+.L_M7:
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_M0
+.align 5
+.L_M1:
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+    fld.s     F4,    S5,  0x00
+    fld.s     F5,    S6,  0x00
+    fld.s     F6,    S7,  0x00
+    fld.s     F7,    S8,  0x00
+
+    fst.s     F0,    TD,  0x00
+    fst.s     F1,    TD,  0x04
+    fst.s     F2,    TD,  0x08
+    fst.s     F3,    TD,  0x0C
+    fst.s     F4,    TD,  0x10
+    fst.s     F5,    TD,  0x14
+    fst.s     F6,    TD,  0x18
+    fst.s     F7,    TD,  0x1C
+
+    PTR_ADDI    S1,    S1,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    PTR_ADDI    S3,    S3,  0x04
+    PTR_ADDI    S4,    S4,  0x04
+    PTR_ADDI    S5,    S5,  0x04
+    PTR_ADDI    S6,    S6,  0x04
+    PTR_ADDI    S7,    S7,  0x04
+    PTR_ADDI    S8,    S8,  0x04
+    PTR_ADDI    TD,    TD,  0x20
+
+    fld.s     F0,    S9,  0x00
+    fld.s     F1,    S10, 0x00
+    fld.s     F2,    S11, 0x00
+    fld.s     F3,    S12, 0x00
+    fld.s     F4,    S13, 0x00
+    fld.s     F5,    S14, 0x00
+    fld.s     F6,    S15, 0x00
+    fld.s     F7,    S16, 0x00
+
+    fst.s     F0,    TD,  0x00
+    fst.s     F1,    TD,  0x04
+    fst.s     F2,    TD,  0x08
+    fst.s     F3,    TD,  0x0C
+    fst.s     F4,    TD,  0x10
+    fst.s     F5,    TD,  0x14
+    fst.s     F6,    TD,  0x18
+    fst.s     F7,    TD,  0x1C
+
+    PTR_ADDI    S9,    S9,  0x04
+    PTR_ADDI    S10,   S10, 0x04
+    PTR_ADDI    S11,   S11, 0x04
+    PTR_ADDI    S12,   S12, 0x04
+    PTR_ADDI    S13,   S13, 0x04
+    PTR_ADDI    S14,   S14, 0x04
+    PTR_ADDI    S15,   S15, 0x04
+    PTR_ADDI    S16,   S16, 0x04
+    PTR_ADDI    TD,    TD,  0x20
+
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_M1
+.L_M0:
+    blt       ZERO,  J,   .L_N16
+.L_N15:
+    andi      J,     N,   0x0f
+    beq       ZERO,  J,   .L_N0
+
+    andi      J,     N,   0x08
+    beq       ZERO,  J,   .L_N7
+.L_N8:
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x03
+    PTR_ADD      S3,   S2,   TL
+    PTR_ADD      S4,   S2,   T0
+    PTR_ADD      S5,   S3,   T0
+    PTR_ADD      S6,   S4,   T0
+    PTR_ADD      S7,   S5,   T0
+    PTR_ADD      S8,   S6,   T0
+    PTR_ADD      TS,   S7,   T0
+    beq          I,    ZERO, .L_N8_M7
+.align 5
+.L_N8_M8:
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+
+    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
+                    U0, U1, U2, U3, U4, U5, U6, U7, \
+                    D1, D3, D5, D7 // As tmp
+    GST xv, , D0, TD, 0x00, D2,  TD, 0x20, D4,  TD, 0x40, D6,  TD, 0x60, \
+              D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
+    PTR_ADDI  TD, TD, 0x100
+    PTR_ADDI     S1,   S1,   0x20
+    PTR_ADDI     S2,   S2,   0x20
+    PTR_ADDI     S3,   S3,   0x20
+    PTR_ADDI     S4,   S4,   0x20
+    PTR_ADDI     S5,   S5,   0x20
+    PTR_ADDI     S6,   S6,   0x20
+    PTR_ADDI     S7,   S7,   0x20
+    PTR_ADDI     S8,   S8,   0x20
+
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N8_M8
+.L_N8_M7:
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_N7
+.align 5
+.L_N8_M1:
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+    fld.s     F4,    S5,  0x00
+    fld.s     F5,    S6,  0x00
+    fld.s     F6,    S7,  0x00
+    fld.s     F7,    S8,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    fst.s       F2,    TD,  0x08
+    PTR_ADDI    S3,    S3,  0x04
+    fst.s       F3,    TD,  0x0C
+    PTR_ADDI    S4,    S4,  0x04
+    fst.s       F4,    TD,  0x10
+    PTR_ADDI    S5,    S5,  0x04
+    fst.s       F5,    TD,  0x14
+    PTR_ADDI    S6,    S6,  0x04
+    fst.s       F6,    TD,  0x18
+    PTR_ADDI    S7,    S7,  0x04
+    fst.s       F7,    TD,  0x1C
+    PTR_ADDI    S8,    S8,  0x04
+
+    PTR_ADDI    TD,    TD,  0x20
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_N8_M1
+.L_N7:
+    andi      J,     N,   0x07
+    beq       ZERO,  J,   .L_N0
+
+    andi      J,     N,   0x04
+    beq       ZERO,  J,   .L_N3
+.L_N4:
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x02
+    PTR_ADD      S3,   S2,   TL
+    PTR_ADD      S4,   S2,   T0
+    PTR_ADD      TS,   S3,   T0
+    beq          I,    ZERO, .L_N4_M3
+.align 5
+.L_N4_M4:
+    GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
+    GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
+    GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
+    GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
+    GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
+    GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
+    PTR_ADDI     S1,   S1,   0x10
+    PTR_ADDI     S2,   S2,   0x10
+    PTR_ADDI     S3,   S3,   0x10
+    PTR_ADDI     S4,   S4,   0x10
+    PTR_ADDI     TD,   TD,   0x40
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N4_M4
+.L_N4_M3:
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_N3
+.align 5
+.L_N4_M1:
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    fst.s       F2,    TD,  0x08
+    PTR_ADDI    S3,    S3,  0x04
+    fst.s       F3,    TD,  0x0C
+    PTR_ADDI    S4,    S4,  0x04
+
+    PTR_ADDI    TD,    TD,  0x10
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_N4_M1
+.L_N3:
+    andi      J,     N,   0x03
+    beq       ZERO,  J,   .L_N0
+
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+.L_N2:
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x01
+    PTR_ADD      TS,   S2,   TL
+    beq          I,    ZERO, .L_N2_M1
+.align 5
+.L_N2_M2:
+    GLD f, d, F0, S1, 0x00, F1, S2, 0x00
+    vilvl.w $vr0, $vr1, $vr0
+    GST v, , $vr0, TD, 0x00
+    PTR_ADDI     S1,   S1,   0x08
+    PTR_ADDI     S2,   S2,   0x08
+    PTR_ADDI     TD,   TD,   0x10
+
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N2_M2
+.L_N2_M1:
+    andi      I,     M,    0x01
+    beq       I,     ZERO, .L_N1
+
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    PTR_ADDI    TD,    TD,  0x08
+.align 5
+.L_N1:
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0
+.L_N1_M1:
+    fld.s       F0,    S1,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    TD,    TD,  0x04
+    PTR_ADDI    M,     M,   -1
+    blt         ZERO,  M,   .L_N1_M1
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S
new file mode 100644
index 000000000..5c173568b
--- /dev/null
+++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S
@@ -0,0 +1,298 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r6
+#undef  ZERO
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+#define D10    $xr17
+#define D12    $xr18
+#define D14    $xr19
+
+// Loops outline
+//.L_N8: <----------------
+//|   .L_M8:              |
+//|   .L_M7:              | Main Loop
+//|   .L_M1:              |
+//|   .L_M0:--------------
+//.L_N7:
+//.L_N4:
+//|   .L_N4_M4:
+//|   .L_N4_M3:
+//|   .L_N4_M1:
+//.L_N3:
+//.L_N2:
+//|   .L_N2_M2:
+//|   .L_N2_M1:
+//.L_N1:
+//|   .L_N1_M1:
+//.L_N0
+
+    PROLOGUE
+    push_if_used 17, 20
+
+    move       TD,   DST
+    move       TS,   SRC
+    PTR_SLLI   TL,   LDA,  0x02
+    PTR_SLLI   T0,   TL,   0x01
+    PTR_SRAI   J,    N,    0x03
+    beq        J,    ZERO, .L_N7
+.align 5
+.L_N8:
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x03
+    PTR_ADD      S3,   S2,   TL
+    PTR_ADDI     J,    J,    -1
+    PTR_ADD      S4,   S2,   T0
+    PTR_ADD      S5,   S3,   T0
+    PTR_ADD      S6,   S4,   T0
+    PTR_ADD      S7,   S5,   T0
+    PTR_ADD      S8,   S6,   T0
+    PTR_ADD      TS,   S7,   T0
+    beq          I,    ZERO, .L_M7
+.align 5
+.L_M8:
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+
+    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
+                    U0, U1, U2, U3, U4, U5, U6, U7, \
+                    D1, D3, D5, D7 // As tmp
+    GST xv, , D0, TD, 0x00, D2,  TD, 0x20, D4,  TD, 0x40, D6,  TD, 0x60, \
+              D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
+    PTR_ADDI  TD, TD, 0x100
+    PTR_ADDI     S1,   S1,   0x20
+    PTR_ADDI     S2,   S2,   0x20
+    PTR_ADDI     S3,   S3,   0x20
+    PTR_ADDI     S4,   S4,   0x20
+    PTR_ADDI     S5,   S5,   0x20
+    PTR_ADDI     S6,   S6,   0x20
+    PTR_ADDI     S7,   S7,   0x20
+    PTR_ADDI     S8,   S8,   0x20
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_M8
+.L_M7:
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_M0
+.align 5
+.L_M1:
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+    fld.s     F4,    S5,  0x00
+    fld.s     F5,    S6,  0x00
+    fld.s     F6,    S7,  0x00
+    fld.s     F7,    S8,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    fst.s       F2,    TD,  0x08
+    PTR_ADDI    S3,    S3,  0x04
+    fst.s       F3,    TD,  0x0C
+    PTR_ADDI    S4,    S4,  0x04
+    fst.s       F4,    TD,  0x10
+    PTR_ADDI    S5,    S5,  0x04
+    fst.s       F5,    TD,  0x14
+    PTR_ADDI    S6,    S6,  0x04
+    fst.s       F6,    TD,  0x18
+    PTR_ADDI    S7,    S7,  0x04
+    fst.s       F7,    TD,  0x1C
+    PTR_ADDI    S8,    S8,  0x04
+
+    PTR_ADDI    TD,    TD,  0x20
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_M1
+.L_M0:
+    blt       ZERO,  J,   .L_N8
+.L_N7:
+    andi      J,     N,   0x07
+    beq       ZERO,  J,   .L_N0
+
+    andi      J,     N,   0x04
+    beq       ZERO,  J,   .L_N3
+.L_N4:
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x02
+    PTR_ADD      S3,   S2,   TL
+    PTR_ADD      S4,   S2,   T0
+    PTR_ADD      TS,   S3,   T0
+    beq          I,    ZERO, .L_N4_M3
+.align 5
+.L_N4_M4:
+    GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
+    GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
+    GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
+    GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
+    GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
+    GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
+    PTR_ADDI     S1,   S1,   0x10
+    PTR_ADDI     S2,   S2,   0x10
+    PTR_ADDI     S3,   S3,   0x10
+    PTR_ADDI     S4,   S4,   0x10
+    PTR_ADDI     TD,   TD,   0x40
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N4_M4
+.L_N4_M3:
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_N3
+.align 5
+.L_N4_M1:
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    fst.s       F2,    TD,  0x08
+    PTR_ADDI    S3,    S3,  0x04
+    fst.s       F3,    TD,  0x0C
+    PTR_ADDI    S4,    S4,  0x04
+
+    PTR_ADDI    TD,    TD,  0x10
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_N4_M1
+.L_N3:
+    andi      J,     N,   0x03
+    beq       ZERO,  J,   .L_N0
+
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+.L_N2:
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x01
+    PTR_ADD      TS,   S2,   TL
+    beq          I,    ZERO, .L_N2_M1
+.align 5
+.L_N2_M2:
+    GLD f, d, F0, S1, 0x00, F1, S2, 0x00
+    vilvl.w $vr0, $vr1, $vr0
+    GST v, , $vr0, TD, 0x00
+    PTR_ADDI     S1,   S1,   0x08
+    PTR_ADDI     S2,   S2,   0x08
+    PTR_ADDI     TD,   TD,   0x10
+
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N2_M2
+.L_N2_M1:
+    andi      I,     M,    0x01
+    beq       I,     ZERO, .L_N1
+
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    PTR_ADDI    TD,    TD,  0x08
+.align 5
+.L_N1:
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0
+.L_N1_M1:
+    fld.s       F0,    S1,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    TD,    TD,  0x04
+    PTR_ADDI    M,     M,   -1
+    blt         ZERO,  M,   .L_N1_M1
+.L_N0:
+    pop_if_used 17, 20
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S
new file mode 100644
index 000000000..d9789bdcd
--- /dev/null
+++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S
@@ -0,0 +1,526 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S0     $r11
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define P0     $r20
+#define P1     $r23
+#define P2     $r24
+#define P3     $r25
+#define P4     $r26
+#define P5     $r27
+#define T0     $r28
+#define T1     $r29
+#define TL     $r7
+#define ZERO   $r0
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+
+// Loops outline
+//.L_M8 <-------------------
+//|   .L_N16:               |
+//|   .L_N15:               |
+//|   .L_N8:                |
+//|   .L_N7:                | Main Loop
+//|   .L_N4:                |
+//|   .L_N3:                |
+//|   .L_N2:                |
+//|   .L_N1:                |
+//|   .L_N0: ---------------
+//.L_M7
+//.L_M4
+//|   .L_M4_N16:
+//|   .L_M4_N15:
+//|   .L_M4_N8:
+//|   .L_M4_N7:
+//|   .L_M4_N4:
+//|   .L_M4_N3:
+//|   .L_M4_N2:
+//|   .L_M4_N1:
+//.L_M3
+//.L_M2
+//|   .L_M2_N16:
+//|   .L_M2_N15:
+//|   .L_M2_N8:
+//|   .L_M2_N7:
+//|   .L_M2_N4:
+//|   .L_M2_N3:
+//|   .L_M2_N2:
+//|   .L_M2_N1:
+//.L_M1
+//|   .L_M1_N16:
+//|   .L_M1_N15:
+//|   .L_M1_N8:
+//|   .L_M1_N7:
+//|   .L_M1_N4:
+//|   .L_M1_N3:
+//|   .L_M1_N2:
+//|   .L_M1_N1:
+//.L_M0
+
+    PROLOGUE
+    push_if_used 24, 8
+
+    move       S0,     SRC
+    move       P0,     DST
+
+    PTR_SRAI     T0,     N,     0x04
+    PTR_SRAI     T1,     N,     0x03
+    PTR_SLLI     T0,     T0,    0x04
+    PTR_SLLI     T1,     T1,    0x03
+
+    PTR_MUL      P2,     M,     T0
+    PTR_MUL      P3,     M,     T1
+    PTR_SLLI     P2,     P2,    0x02
+    PTR_SLLI     P3,     P3,    0x02
+    PTR_ADD      P2,     DST,   P2
+    PTR_ADD      P3,     DST,   P3
+
+    PTR_SRAI     T0,     N,     0x02
+    PTR_SRAI     T1,     N,     0x01
+    PTR_SLLI     T0,     T0,    0x02
+    PTR_SLLI     T1,     T1,    0x01
+    PTR_MUL      P4,     M,     T0
+    PTR_MUL      P5,     M,     T1
+    PTR_SLLI     P4,     P4,    0x02
+    PTR_SLLI     P5,     P5,    0x02
+    PTR_ADD      P4,     DST,   P4
+    PTR_ADD      P5,     DST,   P5
+
+    PTR_SLLI     TL,     LDA,   0x02
+    PTR_SRAI     J,      M,     0x03
+    PTR_SLLI     T0,     TL,    0x01
+    PTR_SLLI     T1,     M,     0x06
+    beq          ZERO,   J,     .L_M7
+.align 5
+.L_M8:
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S3,     S1,    T0
+    PTR_ADD      S4,     S2,    T0
+    PTR_ADD      S5,     S3,    T0
+    PTR_ADD      S6,     S4,    T0
+    PTR_ADD      S7,     S5,    T0
+    PTR_ADD      S8,     S6,    T0
+    PTR_ADD      S0,     S7,    T0
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x200
+
+    PTR_SRAI     I,      N,     0x04
+    PTR_ADDI     J,      J,     -1
+    beq          ZERO,   I,     .L_N15
+.L_N16:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+
+    xvld       U4,     S3,    0x00
+    xvld       U5,     S3,    0x20
+    xvld       U6,     S4,    0x00
+    xvld       U7,     S4,    0x20
+
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    xvld       U0,     S5,    0x00
+    xvld       U1,     S5,    0x20
+    xvld       U2,     S6,    0x00
+    xvld       U3,     S6,    0x20
+
+    xvst       U0,     P1,    0x100
+    xvst       U1,     P1,    0x120
+    xvst       U2,     P1,    0x140
+    xvst       U3,     P1,    0x160
+
+    xvld       U4,     S7,    0x00
+    xvld       U5,     S7,    0x20
+    xvld       U6,     S8,    0x00
+    xvld       U7,     S8,    0x20
+
+    xvst       U4,     P1,    0x180
+    xvst       U5,     P1,    0x1A0
+    xvst       U6,     P1,    0x1C0
+    xvst       U7,     P1,    0x1E0
+
+    PTR_ADDI     S1,     S1,    0x40
+    PTR_ADDI     S2,     S2,    0x40
+    PTR_ADDI     S3,     S3,    0x40
+    PTR_ADDI     S4,     S4,    0x40
+    PTR_ADDI     S5,     S5,    0x40
+    PTR_ADDI     S6,     S6,    0x40
+    PTR_ADDI     S7,     S7,    0x40
+    PTR_ADDI     S8,     S8,    0x40
+
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_N16
+.L_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_N7
+.L_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+    xvld       U4,     S5,    0x00
+    xvld       U5,     S6,    0x00
+    xvld       U6,     S7,    0x00
+    xvld       U7,     S8,    0x00
+
+    GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \
+              U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     S3,     S3,    0x20
+    PTR_ADDI     S4,     S4,    0x20
+    PTR_ADDI     S5,     S5,    0x20
+    PTR_ADDI     S6,     S6,    0x20
+    PTR_ADDI     S7,     S7,    0x20
+    PTR_ADDI     S8,     S8,    0x20
+    PTR_ADDI     P2,     P2,    0x100
+.L_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N3
+.L_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
+             $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
+    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \
+             $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     S3,     S3,    0x10
+    PTR_ADDI     S4,     S4,    0x10
+    PTR_ADDI     S5,     S5,    0x10
+    PTR_ADDI     S6,     S6,    0x10
+    PTR_ADDI     S7,     S7,    0x10
+    PTR_ADDI     S8,     S8,    0x10
+    PTR_ADDI     P3,     P3,    0x80
+.L_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N1
+.L_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
+              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
+    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \
+              $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     S3,     S3,    0x08
+    PTR_ADDI     S4,     S4,    0x08
+    PTR_ADDI     S5,     S5,    0x08
+    PTR_ADDI     S6,     S6,    0x08
+    PTR_ADDI     S7,     S7,    0x08
+    PTR_ADDI     S8,     S8,    0x08
+    PTR_ADDI     P4,     P4,    0x40
+.L_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
+              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
+    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \
+              $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     S3,     S3,    0x04
+    PTR_ADDI     S4,     S4,    0x04
+    PTR_ADDI     S5,     S5,    0x04
+    PTR_ADDI     S6,     S6,    0x04
+    PTR_ADDI     S7,     S7,    0x04
+    PTR_ADDI     S8,     S8,    0x04
+    PTR_ADDI     P5,     P5,    0x20
+.L_N0:
+    blt        ZERO,   J,     .L_M8
+.L_M7:
+    andi       J,      M,     0x04
+    beq        ZERO,   J,     .L_M3
+.L_M4:
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S3,     S1,    T0
+    PTR_ADD      S4,     S2,    T0
+    PTR_ADD      S0,     S3,    T0
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x100
+
+    PTR_SRAI     I,      N,     0x04
+    beq          ZERO,   I,     .L_M4_N15
+.align 5
+.L_M4_N16:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+
+    xvld       U4,     S3,    0x00
+    xvld       U5,     S3,    0x20
+    xvld       U6,     S4,    0x00
+    xvld       U7,     S4,    0x20
+
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    PTR_ADDI     S1,     S1,    0x40
+    PTR_ADDI     S2,     S2,    0x40
+    PTR_ADDI     S3,     S3,    0x40
+    PTR_ADDI     S4,     S4,    0x40
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M4_N16
+.L_M4_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_M4_N7
+.L_M4_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+
+    GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     S3,     S3,    0x20
+    PTR_ADDI     S4,     S4,    0x20
+    PTR_ADDI     P2,     P2,    0x80
+.L_M4_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M4_N3
+.L_M4_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
+    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     S3,     S3,    0x10
+    PTR_ADDI     S4,     S4,    0x10
+    PTR_ADDI     P3,     P3,    0x40
+.L_M4_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M4_N1
+.L_M4_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
+    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     S3,     S3,    0x08
+    PTR_ADDI     S4,     S4,    0x08
+    PTR_ADDI     P4,     P4,    0x20
+.L_M4_N1:
+    andi        I,      N,     0x01
+    beq         ZERO,   I,     .L_M3
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
+    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     S3,     S3,    0x04
+    PTR_ADDI     S4,     S4,    0x04
+    PTR_ADDI     P5,     P5,    0x10
+.L_M3:
+    andi       J,      M,     0x02
+    beq        ZERO,   J,     .L_M1
+.L_M2:
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S0,     S0,    T0
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x80
+
+    PTR_SRAI     I,      N,     0x04
+    beq          ZERO,   I,     .L_M2_N15
+.align 5
+.L_M2_N16:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+
+    PTR_ADDI     S1,     S1,    0x40
+    PTR_ADDI     S2,     S2,    0x40
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M2_N16
+.L_M2_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_M2_N7
+.L_M2_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    GST xv, , U0, P2, 0x00, U1, P2, 0x20
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     P2,     P2,    0x40
+.L_M2_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M2_N3
+.L_M2_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
+    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     P3,     P3,    0x20
+.L_M2_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M2_N1
+.L_M2_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
+    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     P4,     P4,    0x10
+.L_M2_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M1
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
+    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     P5,     P5,    0x08
+.L_M1:
+    andi       J,      M,     0x01
+    beq        ZERO,   J,     .L_M0
+
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x40
+
+    PTR_SRAI     I,      N,     0x04
+    beq          ZERO,   I,     .L_M1_N15
+.align 5
+.L_M1_N16:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+
+    PTR_ADDI     S1,     S1,    0x40
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M1_N16
+.L_M1_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_M1_N7
+.L_M1_N8:
+    xvld       U0,     S1,    0x00
+
+    GST xv, , U0, P2, 0x00
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     P2,     P2,    0x20
+.L_M1_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M1_N3
+.L_M1_N4:
+    GLD v, , $vr0, S1, 0x00
+    GST v, , $vr0, P3, 0x00
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     P3,     P3,    0x10
+.L_M1_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M1_N1
+.L_M1_N2:
+    GLD f, d, $f0, S1, 0x00
+    GST f, d, $f0, P4, 0x00
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     P4,     P4,    0x08
+.L_M1_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M0
+
+    GLD f, s, $f0, S1, 0x00
+    GST f, s, $f0, P5, 0x00
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     P5,     P5,    0x04
+.L_M0:
+    pop_if_used 24, 8
+    jirl       $r0,    $r1,   0x00
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S
new file mode 100644
index 000000000..725a47a60
--- /dev/null
+++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S
@@ -0,0 +1,406 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S0     $r11
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define P0     $r20
+#define P1     $r23
+#define P2     $r24
+#define P3     $r25
+#define P4     $r26
+#define T0     $r27
+#define T1     $r28
+#define TL     $r7
+#undef  ZERO
+#define ZERO   $r0
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+
+// Loops outline
+//.L_M8 <-------------------
+//|   .L_N8:                |
+//|   .L_N7:                | Main Loop
+//|   .L_N4:                |
+//|   .L_N3:                |
+//|   .L_N2:                |
+//|   .L_N1:                |
+//|   .L_N0: ---------------
+//.L_M7
+//.L_M4
+//|   .L_M4_N8:
+//|   .L_M4_N7:
+//|   .L_M4_N4:
+//|   .L_M4_N3:
+//|   .L_M4_N2:
+//|   .L_M4_N1:
+//.L_M3
+//.L_M2
+//|   .L_M2_N8:
+//|   .L_M2_N7:
+//|   .L_M2_N4:
+//|   .L_M2_N3:
+//|   .L_M2_N2:
+//|   .L_M2_N1:
+//.L_M1
+//|   .L_M1_N8:
+//|   .L_M1_N7:
+//|   .L_M1_N4:
+//|   .L_M1_N3:
+//|   .L_M1_N2:
+//|   .L_M1_N1:
+//.L_M0
+
+    PROLOGUE
+    push_if_used 23, 8
+
+    move       S0,     SRC
+    move       P0,     DST
+
+    PTR_SRAI     T0,     N,     0x04
+    PTR_SRAI     T1,     N,     0x03
+    PTR_SLLI     T0,     T0,    0x04
+    PTR_SLLI     T1,     T1,    0x03
+
+    PTR_MUL      P2,     M,     T1
+    PTR_SLLI     P2,     P2,    0x02
+    PTR_ADD      P2,     DST,   P2
+    PTR_SRAI     T0,     N,     0x02
+    PTR_SRAI     T1,     N,     0x01
+    PTR_SLLI     T0,     T0,    0x02
+    PTR_SLLI     T1,     T1,    0x01
+    PTR_MUL      P3,     M,     T0
+    PTR_MUL      P4,     M,     T1
+    PTR_SLLI     P3,     P3,    0x02
+    PTR_SLLI     P4,     P4,    0x02
+    PTR_ADD      P3,     DST,   P3
+    PTR_ADD      P4,     DST,   P4
+
+    PTR_SLLI     TL,     LDA,   0x02
+    PTR_SRAI     J,      M,     0x03
+    PTR_SLLI     T0,     TL,    0x01
+    PTR_SLLI     T1,     M,     0x05
+    beq          ZERO,   J,     .L_M7
+.align 5
+.L_M8:
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S3,     S1,    T0
+    PTR_ADD      S4,     S2,    T0
+    PTR_ADD      S5,     S3,    T0
+    PTR_ADD      S6,     S4,    T0
+    PTR_ADD      S7,     S5,    T0
+    PTR_ADD      S8,     S6,    T0
+    PTR_ADD      S0,     S7,    T0
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x100
+
+    PTR_SRAI     I,      N,     0x03
+    PTR_ADDI     J,      J,     -1
+    beq          ZERO,   I,     .L_N7
+.L_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+    xvld       U4,     S5,    0x00
+    xvld       U5,     S6,    0x00
+    xvld       U6,     S7,    0x00
+    xvld       U7,     S8,    0x00
+
+    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \
+              U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     S3,     S3,    0x20
+    PTR_ADDI     S4,     S4,    0x20
+    PTR_ADDI     S5,     S5,    0x20
+    PTR_ADDI     S6,     S6,    0x20
+    PTR_ADDI     S7,     S7,    0x20
+    PTR_ADDI     S8,     S8,    0x20
+
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_N8
+.L_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N3
+.L_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
+             $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
+    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \
+             $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     S3,     S3,    0x10
+    PTR_ADDI     S4,     S4,    0x10
+    PTR_ADDI     S5,     S5,    0x10
+    PTR_ADDI     S6,     S6,    0x10
+    PTR_ADDI     S7,     S7,    0x10
+    PTR_ADDI     S8,     S8,    0x10
+    PTR_ADDI     P2,     P2,    0x80
+.L_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N1
+.L_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
+              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
+    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \
+              $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     S3,     S3,    0x08
+    PTR_ADDI     S4,     S4,    0x08
+    PTR_ADDI     S5,     S5,    0x08
+    PTR_ADDI     S6,     S6,    0x08
+    PTR_ADDI     S7,     S7,    0x08
+    PTR_ADDI     S8,     S8,    0x08
+    PTR_ADDI     P3,     P3,    0x40
+.L_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
+              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
+    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \
+              $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     S3,     S3,    0x04
+    PTR_ADDI     S4,     S4,    0x04
+    PTR_ADDI     S5,     S5,    0x04
+    PTR_ADDI     S6,     S6,    0x04
+    PTR_ADDI     S7,     S7,    0x04
+    PTR_ADDI     S8,     S8,    0x04
+    PTR_ADDI     P4,     P4,    0x20
+.L_N0:
+    blt        ZERO,   J,     .L_M8
+
+.L_M7:
+    andi       J,      M,     0x04
+    beq        ZERO,   J,     .L_M3
+.L_M4:
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S3,     S1,    T0
+    PTR_ADD      S4,     S2,    T0
+    PTR_ADD      S0,     S3,    T0
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x80
+
+    PTR_SRAI     I,      N,     0x03
+    beq          ZERO,   I,     .L_M4_N7
+.align 5
+.L_M4_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+
+    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     S3,     S3,    0x20
+    PTR_ADDI     S4,     S4,    0x20
+
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M4_N8
+.L_M4_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M4_N3
+.L_M4_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
+    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     S3,     S3,    0x10
+    PTR_ADDI     S4,     S4,    0x10
+    PTR_ADDI     P2,     P2,    0x40
+.L_M4_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M4_N1
+.L_M4_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
+    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     S3,     S3,    0x08
+    PTR_ADDI     S4,     S4,    0x08
+    PTR_ADDI     P3,     P3,    0x20
+.L_M4_N1:
+    andi        I,      N,     0x01
+    beq         ZERO,   I,     .L_M3
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
+    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     S3,     S3,    0x04
+    PTR_ADDI     S4,     S4,    0x04
+    PTR_ADDI     P4,     P4,    0x10
+.L_M3:
+    andi       J,      M,     0x02
+    beq        ZERO,   J,     .L_M1
+.L_M2:
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S0,     S0,    T0
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x40
+
+    PTR_SRAI     I,      N,     0x03
+    beq          ZERO,   I,     .L_M2_N7
+.align 5
+.L_M2_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    GST xv, , U0, P1, 0x00, U1, P1, 0x20
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M2_N8
+.L_M2_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M2_N3
+.L_M2_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
+    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     P2,     P2,    0x20
+.L_M2_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M2_N1
+.L_M2_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
+    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     P3,     P3,    0x10
+.L_M2_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M1
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
+    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     P4,     P4,    0x08
+.L_M1:
+    andi       J,      M,     0x01
+    beq        ZERO,   J,     .L_M0
+
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x20
+
+    PTR_SRAI     I,      N,     0x03
+    beq          ZERO,   I,     .L_M1_N7
+.align 5
+.L_M1_N8:
+    xvld       U0,     S1,    0x00
+
+    GST xv, , U0, P1, 0x00
+
+    PTR_ADDI     S1,     S1,    0x20
+
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M1_N8
+.L_M1_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M1_N3
+.L_M1_N4:
+    GLD v, , $vr0, S1, 0x00
+    GST v, , $vr0, P2, 0x00
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     P2,     P2,    0x10
+.L_M1_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M1_N1
+.L_M1_N2:
+    GLD f, d, $f0, S1, 0x00
+    GST f, d, $f0, P3, 0x00
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     P3,     P3,    0x08
+.L_M1_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M0
+
+    GLD f, s, $f0, S1, 0x00
+    GST f, s, $f0, P4, 0x00
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     P4,     P4,    0x04
+.L_M0:
+    pop_if_used 23, 8
+    jirl       $r0,    $r1,   0x00
+    EPILOGUE
diff --git a/param.h b/param.h
index 547463b2f..03bf3624f 100644
--- a/param.h
+++ b/param.h
@@ -2848,34 +2848,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(NO_LASX)
 #define DGEMM_DEFAULT_UNROLL_N 8
 #define DGEMM_DEFAULT_UNROLL_M 2
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_M 2
 #else
 #define DGEMM_DEFAULT_UNROLL_N 4
 #define DGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_M 16
 #endif
 
-#define SGEMM_DEFAULT_UNROLL_N 8
 #define QGEMM_DEFAULT_UNROLL_N 2
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_N 4
 #define XGEMM_DEFAULT_UNROLL_N 1
 
-#define SGEMM_DEFAULT_UNROLL_M 2
 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 1
 #define ZGEMM_DEFAULT_UNROLL_M 1
 #define XGEMM_DEFAULT_UNROLL_M 1
 
-#define SGEMM_DEFAULT_P 512
+#define SGEMM_DEFAULT_P 256
 #define DGEMM_DEFAULT_P 32
 #define CGEMM_DEFAULT_P 128
 #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_R 12288
+#define SGEMM_DEFAULT_R 1024
 #define DGEMM_DEFAULT_R 858
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#define SGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 256
 #define DGEMM_DEFAULT_Q 152
 #define CGEMM_DEFAULT_Q 128
 #define ZGEMM_DEFAULT_Q 128

From e9f1b2d26f8c68c2bd1f108565645d72f55b7180 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Aug 2023 16:45:56 +0200
Subject: [PATCH 276/718] Expand the SVE compatibility check for the NVIDIA HPC
 compiler

---
 c_check | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/c_check b/c_check
index 7ee183163..4d12c1674 100755
--- a/c_check
+++ b/c_check
@@ -283,6 +283,9 @@ if [ "$architecture" = "arm64" ]; then
     no_sve=0
     {
         $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
+        $compiler_name $flags $args >/dev/null 2>&1
     } || {
         no_sve=1
     }

From 8794544b4322d478608fbd742200b2d6aea12294 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Aug 2023 16:47:32 +0200
Subject: [PATCH 277/718] Add support for compiling the Neoverse SVE kernels
 with the NVIDIA HPC compiler

---
 kernel/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/Makefile b/kernel/Makefile
index 795f25eec..4bcb571a4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -77,6 +77,10 @@ else ifeq ($(TARGET_CORE), ZEN)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
 else ifeq ($(TARGET_CORE), LOONGSON3R4)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
+else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),)
+ ifeq ($(C_COMPILER), PGI)
+  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics
+ endif
 else
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
 endif

From 49689fbef7b929f0382322b0d21217837ae9b375 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Aug 2023 17:11:04 +0200
Subject: [PATCH 278/718] Add support for compiling SVE kernels with the NVIDIA
 HPC compiler

---
 cmake/cc.cmake     | 32 ++++++++++++++++++++++----------
 cmake/system.cmake | 12 ++++++++++++
 2 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index aeaa76710..7b4ef8947 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -180,22 +180,30 @@ endif ()
 
 if (${CORE} STREQUAL NEOVERSEN2)
   if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
-      set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
     else ()
-      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
-    endif()
+      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+      else ()
+	set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      endif()
+    endif ()
   endif ()
 endif ()
 
 if (${CORE} STREQUAL NEOVERSEV1)
   if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
-      set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
     else ()
-      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+        set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
+      else ()
+        set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      endif()
     endif()
   endif ()
 endif ()
@@ -213,7 +221,11 @@ endif ()
 
 if (${CORE} STREQUAL ARMV8SVE)
   if (NOT DYNAMIC_ARCH)
-    set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
+    else ()
+      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+    endif ()
   endif ()
 endif ()
 
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 414193ec8..bc87f7b44 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -282,23 +282,35 @@ if (DEFINED TARGET)
   endif()
 
 if (${TARGET} STREQUAL NEOVERSEV1)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
+    else ()
     execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
     if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
       set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
     else ()
 	    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
     endif()
+    endif()
   endif()
   if (${TARGET} STREQUAL NEOVERSEN2)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+    else ()
     execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
     if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
       set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
     else ()
 	    message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.")
     endif()
+    endif()
   endif()
   if (${TARGET} STREQUAL ARMV8SVE)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve")
+    else ()
       set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
+    endif()
   endif()
 
 endif()

From 2c3034ff7f6e38dbb8c3fc7a59fbff9a58486d35 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Aug 2023 17:22:51 +0200
Subject: [PATCH 279/718] Disable the C/ZASUM AVX512 microkernels when
 compiling with LLVM17 as well

---
 kernel/x86_64/casum_microk_skylakex-2.c | 2 +-
 kernel/x86_64/zasum_microk_skylakex-2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index ac1dc6fa1..d261962de 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -2,7 +2,7 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309))
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
 
 #if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
 
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index d4f35db63..dddf03fe2 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -2,7 +2,7 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309))
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
 
 #if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
 

From 7f7d3896dd4012eb051d41b9ca47df6219e8a769 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Aug 2023 18:07:47 +0200
Subject: [PATCH 280/718] Fix missing type declaration for main

---
 lapack-netlib/INSTALL/lsametst.c  | 2 +-
 lapack-netlib/INSTALL/secondtst.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lapack-netlib/INSTALL/lsametst.c b/lapack-netlib/INSTALL/lsametst.c
index 4b46115fc..631733841 100644
--- a/lapack-netlib/INSTALL/lsametst.c
+++ b/lapack-netlib/INSTALL/lsametst.c
@@ -426,7 +426,7 @@ static integer c__3 = 3;
 /*     December 2016 */
 
 /*  ===================================================================== */
-/* Main program */ main(void)
+/* Main program */ int main(void)
 {
     /* Format strings */
     static char fmt_9999[] = "(\002 *** Error:  LSAME( \002,a1,\002, \002,"
diff --git a/lapack-netlib/INSTALL/secondtst.c b/lapack-netlib/INSTALL/secondtst.c
index 694679bb5..03e7814e9 100644
--- a/lapack-netlib/INSTALL/secondtst.c
+++ b/lapack-netlib/INSTALL/secondtst.c
@@ -422,7 +422,7 @@ static integer c__1000 = 1000;
 
 /* ===================================================================== */
 
-/* Main program */ main(void)
+/* Main program */ int main(void)
 {
     /* Format strings */
     static char fmt_9999[] = "(\002 Time for \002,g10.3,\002 SAXPY ops = "

From 7a6203ffa15d962bcfde5c23f0f6ff03c1a7b60f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Aug 2023 18:25:51 +0200
Subject: [PATCH 281/718] restore default Neoverse SVE build instructions for
 non-NVIDIA compilers

---
 kernel/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/Makefile b/kernel/Makefile
index 4bcb571a4..1e0a0074f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,6 +80,8 @@ else ifeq ($(TARGET_CORE), LOONGSON3R4)
 else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),)
  ifeq ($(C_COMPILER), PGI)
   override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics
+ else
+  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
  endif
 else
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)

From fc8894dd98f734d48c55e155dbb692ce398c43f4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 26 Aug 2023 00:30:17 +0200
Subject: [PATCH 282/718] Workaround miscompilation by NVIDIA nvc

---
 kernel/arm64/zdot_thunderx2t99.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c
index 728f97fb3..6f65e5cfd 100644
--- a/kernel/arm64/zdot_thunderx2t99.c
+++ b/kernel/arm64/zdot_thunderx2t99.c
@@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#if (NVCOMPVERS < 2309)
+#pragma opt 1
+#endif
+#endif
 
 #include "common.h"
 

From 22a402bc2c09c64e15c72af3da90d116a015892d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 26 Aug 2023 01:58:08 +0200
Subject: [PATCH 283/718] clarify the comment on the out-of-bounds check from
 #723

---
 lapack/getf2/getf2_k.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c
index 80c66dd7a..5795797d3 100644
--- a/lapack/getf2/getf2_k.c
+++ b/lapack/getf2/getf2_k.c
@@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
       GEMV_N(m - j, j, 0, dm1,  a + j, lda, b, 1, b + j, 1, sb);
 
       jp = j + IAMAX_K(m - j, b + j, 1);
-      if (jp>m) jp = m;        //avoid out of boundary
+      if (jp>m) jp = m;        //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723
       ipiv[j + offset] = jp + offset;
       jp--;
       temp1 = *(b + jp);

From 3b6050ac0428893b3698e33f8348023f5c1bd09c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 26 Aug 2023 02:00:00 +0200
Subject: [PATCH 284/718] clarify the comment on the out-of-bounds check from
 #723

---
 lapack/getf2/zgetf2_k.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c
index e3d53c96f..6a2137b3e 100644
--- a/lapack/getf2/zgetf2_k.c
+++ b/lapack/getf2/zgetf2_k.c
@@ -99,7 +99,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
       GEMV_N(m - j, j, 0, dm1,  ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb);
 
       jp = j + IAMAX_K(m - j, b + j * 2, 1);
-      if (jp>m) jp = m;        //avoid out of boundary
+      if (jp>m) jp = m;        //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723
       ipiv[j + offset] = jp + offset;
       jp--;
 

From 394a1fd1bfacb7b968b0cacd877e2919a97d58e3 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Thu, 31 Aug 2023 15:44:22 +0800
Subject: [PATCH 285/718] LoongArch64: Compatible with early internal toolchain

__loongarch_grlen and __loongarch_frlen were introduced in gcc version 8.3.0
(Loongnix 8.3.0-6.lnd.vec.31) internally within Loongson to standardize the
general and floating-point register widths. However, previous versions did
not have them, requiring additional checks to be added.
---
 kernel/loongarch64/dgemv_n_8_lasx.S         | 12 +++++-
 kernel/loongarch64/dgemv_t_8_lasx.S         | 23 ++++++++---
 kernel/loongarch64/loongarch64_asm.S        | 27 ++++++++++++-
 kernel/loongarch64/sgemm_kernel_16x8_lasx.S | 43 ++++++++++++++++-----
 4 files changed, 86 insertions(+), 19 deletions(-)

diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S
index 940d27569..c6523f9ab 100644
--- a/kernel/loongarch64/dgemv_n_8_lasx.S
+++ b/kernel/loongarch64/dgemv_n_8_lasx.S
@@ -386,9 +386,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if __loongarch_grlen == 64
     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
               PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
-#else
+#elif __loongarch_grlen == 32
     GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
               PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
 #endif
     PTR_ALSL    X,      INC_X,  X,  3
     bnez        J,      .L_\XW\()_N_L8
@@ -435,6 +438,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PTR_SUB     K_LDA,  K_LDA,  M8
 #if __loongarch_grlen == 64
     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
 #else
     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
 #endif
@@ -518,9 +523,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if __loongarch_grlen == 64
     GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
               PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
-#else
+#elif __loongarch_grlen == 32
     GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
               PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
 #endif
     la.local    T0,     .L_GAP_TABLE
     PTR_ALSL    I,      I,      T0,     1
diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S
index be90cb1af..7f57c1d88 100644
--- a/kernel/loongarch64/dgemv_t_8_lasx.S
+++ b/kernel/loongarch64/dgemv_t_8_lasx.S
@@ -263,9 +263,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if __loongarch_grlen == 64
     GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
                PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
                PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#else
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
 #endif
     GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
                 $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
@@ -292,9 +295,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if __loongarch_grlen == 64
     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
               PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
-#else
+#elif __loongarch_grlen == 32
     GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
               PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
 #endif
     fst.d   $f11,   Y,      0x00
     fstx.d  $f12,   Y,      INC_Y
@@ -353,8 +359,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if __loongarch_grlen == 64
     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
-#else
+#elif __loongarch_grlen == 32
     GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
 #endif
     fst.d   $f11,   Y,      0x00
     fstx.d  $f12,   Y,      INC_Y
@@ -405,8 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if __loongarch_grlen == 64
     GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
-#else
+#elif __loongarch_grlen == 32
     GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
 #endif
     fst.d   $f11,   Y,      0x00
     fstx.d  $f12,   Y,      INC_Y
@@ -446,9 +456,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if __loongarch_grlen == 64
     GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
               PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
-#else
+#elif __loongarch_grlen == 32
     GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
               PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
 #endif
     la.local    T0,     .L_GAP_TABLE
     PTR_ALSL    I,      I,      T0,     1
diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S
index 89243c620..694dcdaa9 100644
--- a/kernel/loongarch64/loongarch64_asm.S
+++ b/kernel/loongarch64/loongarch64_asm.S
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define PTR_SRAI  srai.d
 #define PTR_MUL   mul.d
 #define PTR_ALSL  alsl.d
-#else
+#elif __loongarch_grlen == 32
 #define LA_REG    int32_t
 #define REG_SIZE  4
 #define REG_LOG   2
@@ -53,6 +53,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define PTR_SRAI  srai.w
 #define PTR_MUL   mul.w
 #define PTR_ALSL  alsl.w
+#else
+// If neither of the above two conditions is supported, it means this is an early
+// internal toolchain. To ensure maximum compatibility, the following approach is taken:
+#define LA_REG    int64_t
+#define REG_SIZE  8
+#define REG_LOG   3
+#define PTR_ADDI  addi.d
+#define PTR_ADD   add.d
+#define PTR_SUB   sub.d
+#define PTR_LD    ld.d
+#define PTR_ST    st.d
+#define PTR_SLLI  slli.d
+#define PTR_SRLI  srli.d
+#define PTR_SRAI  srai.d
+#define PTR_MUL   mul.d
+#define PTR_ALSL  alsl.d
 #endif
 
 #if __loongarch_frlen == 64
@@ -60,11 +76,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define FREG_LOG  3
 #define PTR_FLD   fld.d
 #define PTR_FST   fst.d
-#else
+#elif __loongarch_frlen == 32
 #define FREG_SIZE 4
 #define FREG_LOG  2
 #define PTR_FLD   fld.s
 #define PTR_FST   fst.s
+#else
+// If neither of the above two conditions is supported, it means this is an early
+// internal toolchain. To ensure maximum compatibility, the following approach is taken:
+#define FREG_SIZE 8
+#define FREG_LOG  3
+#define PTR_FLD   fld.d
+#define PTR_FST   fst.d
 #endif
 
 // The max registers available to the user which
diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
index 254dbe052..bd609394e 100644
--- a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
+++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
@@ -335,9 +335,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if __loongarch_grlen == 64
     GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
                C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
                C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
+#else
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
+               C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
 #endif
 .endm
 
@@ -445,9 +448,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if __loongarch_grlen == 64
     GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
                C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
                C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
+#else
+    GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
+               C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
 #endif
 .endm
 
@@ -505,8 +511,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
               D6,  C3, 0x00, D7,  C3, 0x20
 #if __loongarch_grlen == 64
     GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
+#else
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
 #endif
 .endm
 
@@ -585,8 +593,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endif
 #if __loongarch_grlen == 64
     GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
+#else
+    GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
 #endif
 .endm
 
@@ -631,8 +641,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
               D2,  C1, 0x00, D3,  C1, 0x20
 #if __loongarch_grlen == 64
     GADDI , d, C0, C0, 0x40, C1, C1, 0x40
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, C0, C0, 0x40, C1, C1, 0x40
+#else
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40
 #endif
 .endm
 
@@ -703,8 +715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endif
 #if __loongarch_grlen == 64
     GADDI , d, C0, C0, \stride, C1, C1, \stride
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, C0, C0, \stride, C1, C1, \stride
+#else
+    GADDI , d, C0, C0, \stride, C1, C1, \stride
 #endif
 .endm
 
@@ -741,8 +755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GST xv, , D0,  C0, 0x00, D1,  C0, 0x20
 #if __loongarch_grlen == 64
     GADDI , d, C0, C0, 0x40
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, C0, C0, 0x40
+#else
+    GADDI , d, C0, C0, 0x40
 #endif
 .endm
 
@@ -813,8 +829,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endif
 #if __loongarch_grlen == 64
     GADDI , d, C0, C0, \stride
-#else
+#elif __loongarch_grlen == 32
     GADDI , w, C0, C0, \stride
+#else
+    GADDI , d, C0, C0, \stride
 #endif
 .endm
 
@@ -838,9 +856,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if __loongarch_grlen == 64
     GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
               C6, C5, T0, C7, C6, T0
-#else
+#elif __loongarch_grlen == 32
     GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
               C6, C5, T0, C7, C6, T0
+#else
+    GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
+              C6, C5, T0, C7, C6, T0
 #endif
 #if defined(TRMMKERNEL) && defined(LEFT)
     move     OFF,   OFFSET
@@ -1222,8 +1243,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PTR_SLLI   T0,    LDC,   2
 #if __loongarch_grlen == 64
     GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
-#else
+#elif __loongarch_grlen == 32
     GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0
+#else
+    GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
 #endif
 
 #if defined(TRMMKERNEL) && defined(LEFT)

From a2a184572ce12d5e361a974328281d5243323de8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 31 Aug 2023 23:42:12 +0200
Subject: [PATCH 286/718] update zrotg

---
 interface/zrotg.c | 271 ++++++++++++++++++++++++----------------------
 1 file changed, 143 insertions(+), 128 deletions(-)

diff --git a/interface/zrotg.c b/interface/zrotg.c
index dd765f05f..af6f85c1c 100644
--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@@ -18,20 +18,26 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
 
 #ifdef DOUBLE
   long double safmin = DBL_MIN;
+  long double rtmin = sqrt(DBL_MIN/DBL_EPSILON);
 #else
   long double safmin = FLT_MIN;
+  long double rtmin = sqrt(FLT_MIN/FLT_EPSILON);
 #endif
 
-#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
 
-  long double da_r = *(DA + 0);
-  long double da_i = *(DA + 1);
-  long double db_r = *(DB + 0);
-  long double db_i = *(DB + 1);
-  long double r;
+  FLOAT da_r = *(DA+0);
+  FLOAT da_i = *(DA+1);
+  FLOAT db_r = *(DB+0);
+  FLOAT db_i = *(DB+1);
+  //long double r;
+  FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT));
+  FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT));
+  long double d;
 
-  long double ada = fabsl(da_r) + fabsl(da_i);
-  long double adb = sqrt(db_r * db_r + db_i * db_i);
+  FLOAT ada =  da_r * da_r + da_i * da_i; 
+  FLOAT adb =  db_r * db_r + db_i * db_i; 
+  FLOAT adart = sqrt( da_r * da_r + da_i * da_i); 
+  FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i); 
 
   PRINT_DEBUG_NAME;
 
@@ -39,128 +45,137 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
 
   FUNCTION_PROFILE_START();
 
-  if (ada == ZERO) {
-    *C        = ZERO;
-    *(S  + 0) = ONE;
+  if (db_r == ZERO && db_i == ZERO) {
+    *C        = ONE;
+    *(S  + 0) = ZERO;
     *(S  + 1) = ZERO;
-    *(DA + 0) = db_r;
-    *(DA + 1) = db_i;
-  } else {
-    long double alpha_r, alpha_i;
-    long double safmax = 1./safmin;
-    long double sigma;
-    long double maxab = MAX(ada,adb);
-    long double scale = MIN(MAX(safmin,maxab), safmax);
-
-
-    long double aa_r    = da_r / scale;
-    long double aa_i    = da_i / scale;
-    long double bb_r    = db_r / scale;
-    long double bb_i    = db_i / scale;
-
-    if (ada > adb)
-        sigma = copysign(1.,da_r);
-    else
-        sigma = copysign(1.,db_r);
-
-    r = sigma * scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
-
-
-    alpha_r = da_r / ada;
-    alpha_i = da_i / ada;
-
-    *(C + 0)  = ada / r;
-    *(S + 0)  = (alpha_r * db_r + alpha_i *db_i) / r;
-    *(S + 1)  = (alpha_i * db_r - alpha_r *db_i) / r;
-    *(DA + 0) = alpha_r * r;
-    *(DA + 1) = alpha_i * r;
+    return;
   }
-#else
-  FLOAT da_r = *(DA + 0);
-  FLOAT da_i = *(DA + 1);
-  FLOAT db_r = *(DB + 0);
-  FLOAT db_i = *(DB + 1);
-  FLOAT r;
-
-  FLOAT ada = fabs(da_r) + fabs(da_i);
-  FLOAT adb = fabs(db_r) + fabs(db_i);
-
-  PRINT_DEBUG_NAME;
-
-  IDEBUG_START;
-
-  FUNCTION_PROFILE_START();
-
-  if (ada == ZERO) {
-    *C        = ZERO;
-    *(S  + 0) = ONE;
-    *(S  + 1) = ZERO;
-    *(DA + 0) = db_r;
-    *(DA + 1) = db_i;
-  } else {
-    long double safmax = 1./safmin;
-    FLOAT scale, sigma;
-    FLOAT aa_r, aa_i, bb_r, bb_i;
-    FLOAT alpha_r, alpha_i;
-
-    aa_r = fabs(da_r);
-    aa_i = fabs(da_i);
-
-    if (aa_i > aa_r) {
-      aa_r = fabs(da_i);
-      aa_i = fabs(da_r);
-    }
-
-    if (aa_r == ZERO) {
-	ada = 0.;
-    } else {
-        scale = (aa_i / aa_r);
-        ada = aa_r * sqrt(ONE + scale * scale);
-    }
-
-    bb_r = fabs(db_r);
-    bb_i = fabs(db_i);
-
-    if (bb_i > bb_r) {
-      bb_r = fabs(bb_i);
-      bb_i = fabs(bb_r);
-    }
-
-    if (bb_r == ZERO) {
-	adb = 0.;
-    } else {
-    	scale = (bb_i / bb_r);
-    	adb = bb_r * sqrt(ONE + scale * scale);
-    }
-    FLOAT maxab = MAX(ada,adb);
-    scale = MIN(MAX(safmin,maxab), safmax);
-
-    aa_r    = da_r / scale;
-    aa_i    = da_i / scale;
-    bb_r    = db_r / scale;
-    bb_i    = db_i / scale;
 
-    if (ada > adb)
-        sigma = copysign(1.,da_r);
-    else
-        sigma = copysign(1.,db_r);
-
-    r = sigma * scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
-
-    alpha_r = da_r / ada;
-    alpha_i = da_i / ada;
-
-    *(C + 0)  = ada / r;
-    *(S + 0)  = (alpha_r * db_r + alpha_i *db_i) / r;
-    *(S + 1)  = (alpha_i * db_r - alpha_r *db_i) / r;
-    *(DA + 0) = alpha_r * r;
-    *(DA + 1) = alpha_i * r;
-  }
+  long double safmax = 1./safmin;
+#if defined DOUBLE
+  long double rtmax = safmax /DBL_EPSILON;
+#else
+  long double rtmax = safmax /FLT_EPSILON;
 #endif
-
-  FUNCTION_PROFILE_END(4, 4, 4);
-
-  IDEBUG_END;
-
-  return;
+		*(S1 + 0) = *(DB + 0);
+		*(S1 + 1) = *(DB + 1) *-1;
+	if (da_r == ZERO && da_i == ZERO) {
+	    *C = ZERO;	  
+	    if (db_r == ZERO) {
+		    (*DA) = fabsl(db_i);
+		*S = *S1 /da_r;
+		*(S+1) = *(S1+1) /da_r;
+		return;
+	    } else if ( db_i == ZERO) {
+		    *DA = fabsl(db_r);
+		*S = *S1 /da_r;
+		*(S+1) = *(S1+1) /da_r;
+		return;
+	    } else {
+	        long double g1 = MAX( fabsl(db_r), fabsl(db_i));
+	        rtmax =sqrt(safmax/2.);
+	        if (g1 > rtmin && g1 < rtmax) { // unscaled
+		    d = sqrt(adb);
+		    *S = *S1 /d;
+		    *(S+1) = *(S1+1) /d;
+		    *DA = d ;
+		    *(DA+1) = ZERO;
+		    return;
+    	        } else { // scaled algorithm
+		  long double u = MIN ( safmax, MAX ( safmin, g1));
+		    FLOAT gs_r = db_r/u;
+		    FLOAT gs_i = db_i/u;
+		    d = sqrt ( gs_r*gs_r + gs_i*gs_i);
+		    *S = gs_r / d;
+		    *(S + 1) = (gs_i * -1) / d;
+		    *DA = d * u;
+		    *(DA+1) = ZERO;
+		    return;
+	        }
+	    }
+	} else {
+	       FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i));
+	       FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i));
+	       rtmax = sqrt(safmax / 4.);
+	       if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled
+		    long double h = ada + adb;
+	   	    double adahsq = sqrt(ada * h);
+		    if (ada >= h *safmin) {
+			*C = sqrt(ada/h);
+			*R = *DA / *C;
+			*(R+1) = *(DA+1) / *(C+1);
+			rtmax *= 2.;
+			if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
+				*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
+				*(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq);
+			} else {
+				*S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h);
+				*(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h);
+			}
+	    	    } else {
+		        *C = ada / adahsq;
+		        if (*C >= safmin) 
+			    *R = *DA / *C;
+		        else
+			    *R = *DA * (h / adahsq);
+		        *S = *S1 * ada / adahsq;
+		    	*(S+1) = *(S1+1) * ada / adahsq;
+		    }
+		    *DA=*R;
+		    *(DA+1)=*(R+1);
+		    return;
+	        } else {	// scaled
+		   FLOAT fs_r, fs_i, gs_r, gs_i;
+		long double v,w,f2,g2,h;
+		long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1)));
+		    gs_r = db_r/u;
+		    gs_i = db_i/u;
+		    g2 = sqrt ( gs_r*gs_r + gs_i*gs_i);
+		    if (f1 /u < rtmin) {
+			v = MIN (safmax, MAX (safmin, f1));
+			w = v / u;
+			fs_r = *DA/ v;
+			fs_i = *(DA+1) / v;
+		        f2 = sqrt ( fs_r*fs_r + fs_i*fs_i);
+		        h = f2 * w * w + g2;		
+		    } else { // use same scaling for both
+			w = 1.;
+			fs_r = *DA/ u;
+			fs_i = *(DA+1) / u;
+		        f2 = sqrt ( fs_r*fs_r + fs_i*fs_i);
+			h = f2 + g2;
+		    }
+		    if ( f2 >= h * safmin) {
+			    *C = sqrt ( f2 / h );
+			    *DA = fs_r / *C;
+			    *(DA+1) = fs_i / *C;
+			    rtmax *= 2;
+			    if ( f2 > rtmin && h < rtmax) {
+				    *S = gs_r * (fs_r /sqrt(f2*h)) - gs_i * (fs_i / sqrt(f2*h));
+				    *(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h));
+			    } else {
+				    *S = gs_r * (*DA/h) - gs_i * (*(DA+1) / h);
+				    *(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h);
+			    }
+		    } else { // intermediates might overflow
+			d = sqrt ( f2 * h);
+			*C = f2 /d;
+			if (*C >= safmin) {
+				*DA = fs_r / *C;
+				*(DA+1) = fs_i / *C;
+			} else {
+				*DA = fs_r * (h / d);
+				*(DA+1) = fs_i / (h / d);
+			}
+			*S = gs_r * (fs_r /d) - gs_i * (fs_i / d);
+			*(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d);
+	            }
+		    *C *= w;
+		    *DA *= u;
+		    *(DA+1) *= u;
+		    return;
+		}
+	}
 }
+	
\ No newline at end of file

From 5d7304106888d6fe9411f8e7013207708ae7277f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Sep 2023 19:05:53 +0200
Subject: [PATCH 287/718] Update Changelog for 0.3.24

---
 Changelog.txt | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index aa445ae82..3937ef08c 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,104 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.24
+ 03-Sep-2023
+
+general:
+   - declared the arguments of cblas_xerbla as const (in accordance with the reference implementation 
+     and others, the previous discrepancy appears to have dated back to GotoBLAS)
+   - fixed the implementation of ?GEMMT that was added in 0.3.23
+   - made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds
+   - fixed application of SYMBOLSUFFIX in CMAKE builds
+   - fixed missing SSYCONVF function in the shared library
+   - fixed parallel build logic used with gmake
+   - added support for compilation with LLVM17, in particular its new Fortran compiler
+   - added support for CMAKE builds using the NVIDIA HPC compiler
+   - fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler
+   - fixed cross-build detection and management in c_check
+   - disabled building of the tests with CMAKE when ONLY_CBLAS is defined
+   - fixed several issues with the handling of runtime limits on the number of OPENMP threads
+   - corrected the error code returned by SGEADD/DGEADD when LDA is too small
+   - corrected the error code returned by IMATCOPY when LDB is too small
+   - updated ?NRM2 to support negative increment values (as introduced in release 3.10 
+     of the reference BLAS)
+   - fixed OpenMP builds with CLANG for the case where libomp is not in a standard location
+   - fixed a potential overwrite of unrelated memory during thread initialisation on startup
+   - fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK
+   - fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22
+   - fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE
+   - applied additions and corrections from the development branch of Reference-LAPACK:
+   - fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885)
+   - fixed workspace query results in LAPACK ?SYTRF/?TRECV3 (from Reference-LAPACK PR 883)
+   - fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878)
+   - fixed a crash in LAPACK ?GELSDD on NRHS=0 (from Reference-LAPACK PR 876)
+   - added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839)
+	- corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867)
+	- removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860)
+	- updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852)
+	- fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855)
+	- fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849)
+	- added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736)
+	- fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854)
+	- applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847)
+	- removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832)
+	- fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836)
+	- added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837)
+	- updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831)
+	- improved algorithm description in ?GELSY (Reference-LAPACK PR 833)
+	- fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830)
+	- fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768)
+	- added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827)
+	- added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795)
+	- fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820)
+	- adopt refactored ?GEBAL implementation (Reference-LAPACK PR 808)
+
+x86_64:
+   - added cpu model autodetection for Intel Alder Lake N
+   - added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel
+   - worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer
+   - fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG
+   - fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH
+   - fixed feature-based cputype fallback in DYNAMIC_ARCH
+   - added support for building the AVX512 kernels with the NVIDIA HPC compiler
+   - corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case
+   - fixed a potential use of uninitialized variables in ZTRSM
+
+ARM64:
+   - added cpu model autodetection for Apple M2
+   - fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register)
+   - added support for building the SVE kernels with the NVIDIA HPC compiler
+   - added support for building the SVE kernels with the Apple Clang compiler
+   - fixed compiler option handling for building the SVE kernels with LLVM
+   - implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse
+   - activated SVE SGEMM and DGEMM kernels for Neoverse V1
+   - improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1
+   - improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH
+   - fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or
+     container restrictions into account
+   - fixed a potential use of uninitialized variables in ZTRSM
+   - fix a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds
+
+LOONGARCH64:
+   - added ABI detection
+   - added support for cpu affinity handling
+   - fixed compilation with early versions of the Loongson toolchain
+   - added an optimized SGEMM kernel for 3A5000
+   - added optimized DGEMV kernels for 3A5000
+   - improved the performance of the DGEMM kernel for 3A5000
+
+MIPS64:
+   - fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target
+
+POWER:
+   - fixed compiler warnings in the POWER10 SBGEMM kernel
+
+RISCV:
+   - fixed application of the INTERFACE64 option when building with CMAKE
+   - fix a potential misdetection of RISCV hardware as 32bit in CMAKE builds
+   - fixed IDAMAX and DOT kernels for C910V
+   - fixed corner cases in the ROT and SWAP kernels for C910V
+   - fixed compilation of the C910V target with recent vendor compilers
+
 ====================================================================
 Version 0.3.23
  01-Apr-2023

From 3c49711f1e6a3ab1ddd6f3147806e53358541247 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Sep 2023 22:57:22 +0200
Subject: [PATCH 288/718] Update version to 0.3.24

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 86758d8b6..444baa114 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 23.dev)
+set(OpenBLAS_PATCH_VERSION 24)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

From 9f815cf1bf16b4e64d4aee681b33558fc090b62a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Sep 2023 22:58:32 +0200
Subject: [PATCH 289/718] Update version to 0.3.24

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index e210e49e8..80060a0fc 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.23.dev
+VERSION = 0.3.24
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From f2cf9293744c4a17e04cc1690a419ef641fa4c21 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Thu, 31 Aug 2023 16:59:37 +0800
Subject: [PATCH 290/718] LoongArch64: Add sgemv kernel

---
 kernel/loongarch64/KERNEL.LOONGSON3R5 |   3 +
 kernel/loongarch64/sgemv_n_8_lasx.S   | 463 ++++++++++++++++++++++++++
 kernel/loongarch64/sgemv_t_8_lasx.S   | 405 ++++++++++++++++++++++
 3 files changed, 871 insertions(+)
 create mode 100644 kernel/loongarch64/sgemv_n_8_lasx.S
 create mode 100644 kernel/loongarch64/sgemv_t_8_lasx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 67d1fd11c..c23c2fac5 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -21,6 +21,9 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+SGEMVNKERNEL = sgemv_n_8_lasx.S
+SGEMVTKERNEL = sgemv_t_8_lasx.S
 endif
 
 DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S
new file mode 100644
index 000000000..da172ca50
--- /dev/null
+++ b/kernel/loongarch64/sgemv_n_8_lasx.S
@@ -0,0 +1,463 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/30 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA   $f0
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M4      $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $xr1
+#define X0      $xr2
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10
+#define A0      $xr11
+#define A1      $xr12
+#define A2      $xr13
+#define A3      $xr14
+#define A4      $xr15
+#define A5      $xr16
+#define A6      $xr17
+#define A7      $xr18
+
+#define X0_F    $f2
+#define X1_F    $f3
+#define X2_F    $f4
+#define X3_F    $f5
+#define X4_F    $f6
+#define X5_F    $f7
+#define X6_F    $f8
+#define X7_F    $f9
+#define Y0_F    $f10
+#define A0_F    $f11
+#define A1_F    $f12
+#define A2_F    $f13
+#define A3_F    $f14
+#define A4_F    $f15
+#define A5_F    $f16
+#define A6_F    $f17
+#define A7_F    $f18
+
+.macro SLOAD_X_8
+    GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C, \
+                   X4, X, 0x10, X5, X, 0x14, X6, X, 0x18, X7, X, 0x1C
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro SLOAD_X_8_GAP
+    xvldrepl.w  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.w  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X3,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X4,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X5,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X6,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X7,     T0,     0x00
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro SLOAD_X_4
+    GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro SLOAD_X_4_GAP
+    xvldrepl.w  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.w  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X3,     T0,     0x00
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro SLOAD_X_2
+    GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro SLOAD_X_2_GAP
+    xvldrepl.w  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.w  X1,     T0,     0x00
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro SLOAD_X_1
+    GLDREPL xv, w, X0, X, 0x00
+    GMUL xvf, s, X0, X0, VALPHA
+.endm
+
+.macro SLOAD_Y_8
+    GLD xv, , Y0, Y, 0
+.endm
+
+.macro SLOAD_Y_8_GAP
+    fld.s   Y0_F,   Y,  0
+    fldx.s  A0_F,   Y,  INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1
+    fld.s   A1_F,   T0, 0
+    fldx.s  A2_F,   T0, INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  2
+    fld.s   A3_F,   T0, 0
+    fldx.s  A4_F,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    fld.s   A5_F,   T0, 0
+    fldx.s  A6_F,   T0, INC_Y
+    GINSVE0 xv, w, Y0, A0, 1, Y0, A1, 2, Y0, A2, 3, Y0, A3, 4, \
+                   Y0, A4, 5, Y0, A5, 6, Y0, A6, 7
+.endm
+
+.macro SLOAD_Y_1
+    GLD f, s, Y0_F, Y, 0
+.endm
+
+.macro SGEMV_N_8x8
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0, \
+            A2, PA2, 0, A3, PA3, 0, \
+            A4, PA4, 0, A5, PA5, 0, \
+            A6, PA6, 0, A7, PA7, 0
+    GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \
+                  Y0, A2, X2, Y0, Y0, A3, X3, Y0, \
+                  Y0, A4, X4, Y0, Y0, A5, X5, Y0, \
+                  Y0, A6, X6, Y0, Y0, A7, X7, Y0
+.endm
+
+.macro SGEMV_N_1x8
+    GLD_INC f, s, 0x04,                 \
+            A0_F, PA0, 0, A1_F, PA1, 0, \
+            A2_F, PA2, 0, A3_F, PA3, 0, \
+            A4_F, PA4, 0, A5_F, PA5, 0, \
+            A6_F, PA6, 0, A7_F, PA7, 0
+    GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \
+                Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F, \
+                Y0_F, A4_F, X4_F, Y0_F, Y0_F, A5_F, X5_F, Y0_F, \
+                Y0_F, A6_F, X6_F, Y0_F, Y0_F, A7_F, X7_F, Y0_F
+.endm
+
+.macro SGEMV_N_8x4
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0, \
+            A2, PA2, 0, A3, PA3, 0
+    GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \
+                  Y0, A2, X2, Y0, Y0, A3, X3, Y0
+.endm
+
+.macro SGEMV_N_1x4
+    GLD_INC f, s, 0x04,                 \
+            A0_F, PA0, 0, A1_F, PA1, 0, \
+            A2_F, PA2, 0, A3_F, PA3, 0
+    GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \
+                Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F
+.endm
+
+.macro SGEMV_N_8x2
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0
+    GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0
+.endm
+
+.macro SGEMV_N_1x2
+    GLD_INC f, s, 0x04,             \
+            A0_F, PA0, 0, A1_F, PA1, 0
+    GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F
+.endm
+
+.macro SGEMV_N_1x1
+    GLD_INC f, s, 0x04, A0_F, PA0, 0
+    GMADD f, s, Y0_F, A0_F, X0_F, Y0_F
+.endm
+
+.macro SSTORE_Y_8
+    GST xv, , Y0, Y, 0
+.endm
+
+.macro SSTORE_Y_8_GAP
+    xvstelm.w   Y0,     Y,      0,      0
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.w   Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      3
+
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      4
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      5
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      6
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      7
+.endm
+
+.macro SSTORE_Y_1
+    GST f, s, Y0_F, Y, 0
+.endm
+
+.macro SGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
+    PTR_SRLI  J,      N,      3
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M4
+.L_\XW\()_N_L8:
+    SLOAD_\X_8
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    SLOAD_\Y_8
+    SGEMV_N_8x8
+    SSTORE_\Y_8
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    PTR_ADDI    K,      K,      8
+    bnez        I,      .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    SLOAD_\Y_1
+    SGEMV_N_1x8
+    SSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      4
+    beqz        J,      .L_\XW\()_N_3
+    SLOAD_\X_4
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    SLOAD_\Y_8
+    SGEMV_N_8x4
+    SSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    SLOAD_\Y_1
+    SGEMV_N_1x4
+    SSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M4
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2
+.L_\XW\()_N_3:
+    andi        J,      N,      2
+    beqz        J,      .L_\XW\()_N_1
+    SLOAD_\X_2
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+    PTR_SRLI    I,      M,       3
+    beqz    I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    SLOAD_\Y_8
+    SGEMV_N_8x2
+    SSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    andi    I,      M,      7
+    beqz    I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    SLOAD_\Y_1
+    SGEMV_N_1x2
+    SSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M4
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     PA1,    PA1,    K_LDA
+    PTR_ALSL    X,      INC_X,  X,  1
+.L_\XW\()_N_1:
+    andi    J,      N,      1
+    beqz    J,      .L_END
+    SLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    SLOAD_\Y_1
+    SGEMV_N_1x1
+    SSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 7, 19
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1
+    GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2
+    xvreplve0.w     VALPHA, $xr0
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    SGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    SGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    SGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    SGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 19
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S
new file mode 100644
index 000000000..dde3f4a30
--- /dev/null
+++ b/kernel/loongarch64/sgemv_t_8_lasx.S
@@ -0,0 +1,405 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/30 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA   $f0
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M4      $r30
+
+#define VALPHA  $xr0
+#define X0      $xr1
+#define A0      $xr2
+#define A1      $xr3
+#define A2      $xr4
+#define A3      $xr5
+#define A4      $xr6
+#define A5      $xr7
+#define A6      $xr8
+#define A7      $xr9
+#define TP0     $xr10
+#define TP1     $xr11
+#define TP2     $xr12
+#define TP3     $xr13
+#define TP4     $xr14
+#define TP5     $xr15
+#define TP6     $xr16
+#define TP7     $xr17
+#define Y0      $xr2
+#define Y1      $xr3
+#define Y2      $xr4
+#define Y3      $xr5
+#define Y4      $xr6
+#define Y5      $xr7
+#define Y6      $xr8
+#define Y7      $xr9
+
+.macro ZERO_Y8
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
+                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
+.endm
+
+.macro ZERO_Y4
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+
+.macro ZERO_Y2
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
+.endm
+
+.macro ZERO_Y1
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+.macro SLOAD_X8
+    GLD xv, , X0, X, 0x00
+.endm
+
+.macro SLOAD_X8_GAP
+    fld.s       $f1,    X,    0x00
+    fldx.s      $f2,    X,    INC_X
+    PTR_ALSL    T0,     INC_X,      X,      1
+    fld.s       $f3,    T0,   0x00
+    fldx.s      $f4,    T0,   INC_X
+    GINSVE0 xv, w, X0, A0, 1, X0, A1, 2, X0, A2, 3
+    PTR_ALSL    T0,     INC_X,      X,      2
+    fld.s       $f2,    T0,   0x00
+    fldx.s      $f3,    T0,   INC_X
+    PTR_ALSL    T0,     INC_X,      T0,     1
+    fld.s       $f4,    T0,   0x00
+    fldx.s      $f5,    T0,   INC_X
+    GINSVE0 xv, w, X0, A0, 4, X0, A1, 5, X0, A2, 6, X0, A3, 7
+.endm
+
+.macro SGEMV_T_8x8
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0, \
+            A2, PA2, 0, A3, PA3, 0, \
+            A4, PA4, 0, A5, PA5, 0, \
+            A6, PA6, 0, A7, PA7, 0
+    GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \
+                  TP2, A2, X0, TP2, TP3, A3, X0, TP3, \
+                  TP4, A4, X0, TP4, TP5, A5, X0, TP5, \
+                  TP6, A6, X0, TP6, TP7, A7, X0, TP7
+.endm
+
+.macro SGEMV_T_4x8
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0, \
+            A2, PA2, 0, A3, PA3, 0
+    GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \
+                  TP2, A2, X0, TP2, TP3, A3, X0, TP3
+.endm
+
+.macro SGEMV_T_2x8
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0
+    GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1
+.endm
+
+.macro SGEMV_T XW:req X8:req, X4:req
+    PTR_SRLI  J,      N,      3
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M4
+.L_\XW\()_N_L8:
+    ZERO_Y8
+    move      X,      X_ORG
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    SLOAD_\X8
+    SGEMV_T_8x8
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  3
+    bnez        I,  .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    // Accumulated
+    GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
+                 Y5, TP5, Y6, TP6, Y7, TP7
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.s   $f1,    X,      0x00
+    fld.s   $f10,   PA0,    0x00
+    fld.s   $f11,   PA1,    0x00
+    fld.s   $f12,   PA2,    0x00
+    fld.s   $f13,   PA3,    0x00
+    fld.s   $f14,   PA4,    0x00
+    fld.s   $f15,   PA5,    0x00
+    fld.s   $f16,   PA6,    0x00
+    fld.s   $f17,   PA7,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \
+               PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \
+               PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04
+#else
+    GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \
+               PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04
+#endif
+    GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, \
+                $f6, $f14, $f1, $f6, $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9,
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    fld.s   $f10,   Y,  0x00
+    fldx.s  $f11,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.s   $f12,   PY0,    0x00
+    fldx.s  $f13,   PY0,    INC_Y
+    PTR_ALSL    PY1, INC_Y,  Y,  2
+    fld.s   $f14,   PY1,    0x00
+    fldx.s  $f15,   PY1,    INC_Y
+    PTR_ALSL    PY2, INC_Y,  PY1, 1
+    fld.s   $f16,   PY2,    0x00
+    fldx.s  $f17,   PY2,    INC_Y
+
+    GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, \
+                $f14, ALPHA, $f6, $f14, $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    fst.s   $f10,   Y,      0x00
+    fstx.s  $f11,   Y,      INC_Y
+    fst.s   $f12,   PY0,    0x00
+    fstx.s  $f13,   PY0,    INC_Y
+    fst.s   $f14,   PY1,    0x00
+    fstx.s  $f15,   PY1,    INC_Y
+    fst.s   $f16,   PY2,    0x00
+    fstx.s  $f17,   PY2,    INC_Y
+
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      4
+    beqz        J,      .L_\XW\()_N_3
+    ZERO_Y4
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    SLOAD_\X8
+    SGEMV_T_4x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3
+    bnez      I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    // Accumulated
+    GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    fld.s   $f1,    X,      0x00
+    GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00, $f12, PA2, 0x00, $f13, PA3, 0x00
+    GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    fld.s   $f10,   Y,  0x00
+    fldx.s  $f11,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.s   $f12,   PY0,    0x00
+    fldx.s  $f13,   PY0,    INC_Y
+
+    GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13
+
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M4
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    fst.s   $f10,   Y,      0x00
+    fstx.s  $f11,   Y,      INC_Y
+    fst.s   $f12,   PY0,    0x00
+    fstx.s  $f13,   PY0,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+.L_\XW\()_N_3:
+    andi        J,      N,      2
+    beqz        J,      .L_\XW\()_N_1
+    ZERO_Y2
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    SLOAD_\X8
+    SGEMV_T_2x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3
+    bnez      I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    // Accumulated
+    GACC xvf, s, Y0, TP0, Y1, TP1
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    fld.s   $f1,    X,      0x00
+    GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00
+    GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    fld.s   $f10,   Y,  0x00
+    fldx.s  $f11,   Y,  INC_Y
+
+    GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11
+
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M4
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    fst.s   $f10,   Y,      0x00
+    fstx.s  $f11,   Y,      INC_Y
+    PTR_ALSL        Y,      INC_Y,  Y,  1
+.L_\XW\()_N_1:
+    andi    J,      N,      1
+    beqz    J,      .L_END
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.s   $f2,    PA0,    0x00
+    fld.s   $f1,    X,      0x00
+    fmadd.s $f10,   $f2,    $f1,    $f10
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x04
+    bnez      I,      .L_\XW\()_N_1_M_L1
+
+    fld.s     $f2,    Y,      0x00
+    fmadd.s   $f2,    ALPHA,  $f10,  $f2
+    fst.s     $f2,    Y,      0x00
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 8, 18
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2
+    xvreplve0.w     VALPHA, $xr0
+    move     X_ORG,  X
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    SGEMV_T GAP_0, X8, X4
+.L_GAP_1: /* if (incx != 1) */
+    SGEMV_T GAP_1, X8_GAP, X4_GAP
+.L_END:
+    pop_if_used 17 + 8, 18
+    jirl    $r0, $r1, 0x0
+    EPILOGUE

From 4867cf5dd78c47fff3cc18a49267575903651a1f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 4 Sep 2023 08:39:40 +0200
Subject: [PATCH 291/718] Update version to 0.3.24.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 444baa114..35077f3c2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 24)
+set(OpenBLAS_PATCH_VERSION 24.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

From c3f2a3c0ca8d4a3700f81f176dd3f1bc08cff3dd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 4 Sep 2023 08:40:25 +0200
Subject: [PATCH 292/718] Update version to 0.3.24.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index 80060a0fc..707924904 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.24
+VERSION = 0.3.24.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From 09911f077e62818109b3a393ca4c2fdf9a2112f7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Sep 2023 16:33:40 +0200
Subject: [PATCH 293/718] Disable SVE targets for DYNAMIC_ARCH when compiling
 with (homebrew)gcc on macOS/arm64

---
 Makefile.system | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index b3968d739..5a4af9698 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -397,6 +397,9 @@ ifeq ($(OSNAME), Darwin)
 ifndef MACOSX_DEPLOYMENT_TARGET
 ifeq ($(ARCH), arm64)
 export MACOSX_DEPLOYMENT_TARGET=11.0
+ifeq ($(C_COMPILER), GCC)
+export NO_SVE = 1
+endif
 else
 export MACOSX_DEPLOYMENT_TARGET=10.8
 endif

From 6a611db560089b11d2b786179cf26443171798fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tiziano=20M=C3=BCller?= <tiziano.mueller@hpe.com>
Date: Sun, 10 Sep 2023 08:44:07 +0200
Subject: [PATCH 294/718] memory: show correct number of max threads

---
 driver/others/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 4fceae754..fb7f36cdc 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -3012,7 +3012,7 @@ void *blas_memory_alloc(int procpos){
  if (memory_overflowed) goto terminate;
   fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
   fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
-  fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS);
+  fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", MAX_CPU_NUMBER);
   memory_overflowed=1;
   new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
   newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));

From fb97cc4d5e81d788ba2c99b016cb3b7628e64229 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 14 Sep 2023 10:46:23 +0200
Subject: [PATCH 295/718] Add la_constants.o to SCLAUX/DZLAUX

---
 lapack-netlib/SRC/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index c75fd5f49..26314c4df 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -85,7 +85,7 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \
    ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o
 
 ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" ""
-SCLAUX = \
+SCLAUX = la_constants.o \
    sbdsvdx.o sstevx.o sstein.o \
    sbdsdc.o \
    sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o  slaebz.o \
@@ -106,7 +106,7 @@ SCLAUX = \
 endif
 
 ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
-DZLAUX = \
+DZLAUX = la_constants.o\
    dcombssq.o \
    dbdsvdx.o dstevx.o dstein.o \
    dbdsdc.o \

From 7779bb6fb1938dad2961f819cc13cdb4316442da Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 14 Sep 2023 20:21:06 +0200
Subject: [PATCH 296/718] Make IWORK array larger to avoid overflow

---
 lapack-netlib/SRC/dtgex2.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/dtgex2.f b/lapack-netlib/SRC/dtgex2.f
index 00c23a833..1ccd92c8f 100644
--- a/lapack-netlib/SRC/dtgex2.f
+++ b/lapack-netlib/SRC/dtgex2.f
@@ -254,7 +254,7 @@
      $                   THRESHA, THRESHB
 *     ..
 *     .. Local Arrays ..
-      INTEGER            IWORK( LDST )
+      INTEGER            IWORK( LDST + 2 )
       DOUBLE PRECISION   AI( 2 ), AR( 2 ), BE( 2 ), IR( LDST, LDST ),
      $                   IRCOP( LDST, LDST ), LI( LDST, LDST ),
      $                   LICOP( LDST, LDST ), S( LDST, LDST ),

From 1285b53e398ec9cbab79368f762062f5154ed383 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 14 Sep 2023 20:22:11 +0200
Subject: [PATCH 297/718] Make IWORK array larger to avoid overflow

---
 lapack-netlib/SRC/stgex2.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/stgex2.f b/lapack-netlib/SRC/stgex2.f
index d61da2327..885739ab5 100644
--- a/lapack-netlib/SRC/stgex2.f
+++ b/lapack-netlib/SRC/stgex2.f
@@ -255,7 +255,7 @@
      $                   THRESHA, THRESHB
 *     ..
 *     .. Local Arrays ..
-      INTEGER            IWORK( LDST )
+      INTEGER            IWORK( LDST + 2 )
       REAL               AI( 2 ), AR( 2 ), BE( 2 ), IR( LDST, LDST ),
      $                   IRCOP( LDST, LDST ), LI( LDST, LDST ),
      $                   LICOP( LDST, LDST ), S( LDST, LDST ),

From 7e939fb8312d512c7e3948d7977d0af4ba660371 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 19 Sep 2023 23:33:39 +0200
Subject: [PATCH 298/718] Fix handling of additional buffer structures in case
 of overflow

---
 driver/others/memory.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index fb7f36cdc..b27fec431 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
+#define NEW_BUFFERS 512
 #ifndef likely
 #ifdef __GNUC__
 #define likely(x) __builtin_expect(!!(x), 1)
@@ -2897,7 +2898,7 @@ void *blas_memory_alloc(int procpos){
 #endif
       position ++;
 
-    } while (position < 512+NUM_BUFFERS);
+    } while (position < NEW_BUFFERS + NUM_BUFFERS);
   }
 #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
   UNLOCK_COMMAND(&alloc_lock);
@@ -3014,9 +3015,10 @@ void *blas_memory_alloc(int procpos){
   fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
   fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", MAX_CPU_NUMBER);
   memory_overflowed=1;
-  new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
-  newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));
-  for (i = 0; i < 512; i++) {
+  MB;
+  new_release_info = (struct release_t*) malloc(NEW_BUFFERS * sizeof(struct release_t));
+  newmemory = (struct newmemstruct*) malloc(NEW_BUFFERS * sizeof(struct newmemstruct));
+  for (i = 0; i < NEW_BUFFERS; i++) {
   newmemory[i].addr   = (void *)0;
 #if defined(WHEREAMI) && !defined(USE_OPENMP)
   newmemory[i].pos    = -1;
@@ -3129,12 +3131,12 @@ void blas_memory_free(void *free_area){
   printf("  Position : %d\n", position);
 #endif
   if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) {
-    while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area))
+    while ((position < NUM_BUFFERS+NEW_BUFFERS) && (newmemory[position-NUM_BUFFERS].addr != free_area))
       position++;
   // arm: ensure all writes are finished before other thread takes this memory
   WMB;
-
-  newmemory[position].used = 0;
+if (position - NUM_BUFFERS >= NEW_BUFFERS) goto error;
+  newmemory[position-NUM_BUFFERS].used = 0;
 #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
   UNLOCK_COMMAND(&alloc_lock);
 #endif
@@ -3213,7 +3215,7 @@ void blas_shutdown(void){
     memory[pos].lock   = 0;
   }
   if (memory_overflowed)
-    for (pos = 0; pos < 512; pos ++){
+    for (pos = 0; pos < NEW_BUFFERS; pos ++){
       newmemory[pos].addr   = (void *)0;
       newmemory[pos].used   = 0;
 #if defined(WHEREAMI) && !defined(USE_OPENMP)

From 6876ae0c3b77b3fb08185e8619801343e4388868 Mon Sep 17 00:00:00 2001
From: Angelika Schwarz <17718454+angsch@users.noreply.github.com>
Date: Wed, 20 Sep 2023 19:10:08 +0200
Subject: [PATCH 299/718] Fix division by zero in zrotg

The cases
[ c  s ] * [ 0      ] = [ |db_i| ]
[-s  c ]   [ i*db_i ]   [  0     ]
and
[ c  s ] * [ 0      ] = [ |db_r| ]
[-s  c ]   [ db_r   ]   [  0     ]
computed s incorrectly. To flip the entries of vector,
s should be conjg(db)/|db| and not conjg(db) / da,
where da == 0.0.
---
 interface/zrotg.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/interface/zrotg.c b/interface/zrotg.c
index af6f85c1c..4d2a9d510 100644
--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@@ -61,16 +61,16 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
 		*(S1 + 0) = *(DB + 0);
 		*(S1 + 1) = *(DB + 1) *-1;
 	if (da_r == ZERO && da_i == ZERO) {
-	    *C = ZERO;	  
+	    *C = ZERO;
 	    if (db_r == ZERO) {
 		    (*DA) = fabsl(db_i);
-		*S = *S1 /da_r;
-		*(S+1) = *(S1+1) /da_r;
+		*S = *S1 /(*DA);
+		*(S+1) = *(S1+1) /(*DA);
 		return;
 	    } else if ( db_i == ZERO) {
 		    *DA = fabsl(db_r);
-		*S = *S1 /da_r;
-		*(S+1) = *(S1+1) /da_r;
+		*S = *S1 /(*DA);
+		*(S+1) = *(S1+1) /(*DA);
 		return;
 	    } else {
 	        long double g1 = MAX( fabsl(db_r), fabsl(db_i));

From db3a43c8edeb36ecc9e7cde10b1c06be3f2147fc Mon Sep 17 00:00:00 2001
From: Angelika Schwarz <17718454+angsch@users.noreply.github.com>
Date: Wed, 20 Sep 2023 19:42:13 +0200
Subject: [PATCH 300/718] Simplify rotg

* The check da != ZERO is no longer necessary since there
  is a special case ada == ZERO, where ada = |da|.
* Add the missing check c != ZERO before the division.

Note that with these two changes the long double code
follows the float/double version of the code.
---
 interface/rotg.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/interface/rotg.c b/interface/rotg.c
index 8d40d9c53..423ebda21 100644
--- a/interface/rotg.c
+++ b/interface/rotg.c
@@ -66,13 +66,8 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
     c = da / r;
     s = db / r;
     z = ONE;
-    if (da != ZERO) {
-      if (ada > adb){
-	z = s;
-      } else {
-	z = ONE / c;
-      }
-    }
+    if (ada > adb) z = s;
+    if ((ada <= adb) && (c != ZERO)) z = ONE / c;
 
     *C = c;
     *S = s;

From 44e6e5479b87f697b4d4fc92030c162f2451b384 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 21 Sep 2023 23:01:21 +0200
Subject: [PATCH 301/718] Use the C compiler for the C SBGEMM test source

---
 test/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Makefile b/test/Makefile
index 46a7b1158..fa054f15b 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -326,7 +326,7 @@ endif
 
 ifeq ($(BUILD_BFLOAT16),1)
 test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME)
-	$(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
+	$(CC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
 endif
 
 ifeq ($(BUILD_COMPLEX),1)

From 2390e0bfbc203f5566b0fede523b1caf1c344deb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 21 Sep 2023 23:04:25 +0200
Subject: [PATCH 302/718] Quote the BU (underscore) option as it may not be set

---
 exports/Makefile | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/exports/Makefile b/exports/Makefile
index d81735342..7682f851d 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -272,23 +272,23 @@ static : ../$(LIBNAME)
 	rm -f goto.$(SUFFIX)
 
 osx.def : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
+	./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 aix.def : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
+	./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
+	./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 objconv.def : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
+	./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 test : linktest.c
 	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
 	rm -f linktest
 
 linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) linktest  $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
+	./$(GENSYM) linktest  $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
 
 clean ::
 	@rm -f *.def *.dylib __.SYMDEF* *.renamed

From b926e70ebd879bb022d265a3859bfb5481b4d99f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 21 Sep 2023 23:07:32 +0200
Subject: [PATCH 303/718] Fix typo in build rule of "profiled" sbgemm

---
 interface/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/Makefile b/interface/Makefile
index 2ac9663d6..78335357b 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -1301,7 +1301,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
 ifeq ($(BUILD_BFLOAT16),1)
 sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
-sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h
+sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 endif
 

From bb4718322294bd9f28b0343a643d0986e9046a2b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 24 Sep 2023 10:13:47 +0200
Subject: [PATCH 304/718] Force -qextname for trailing underscore generation
 when IBM xlf is used with gcc

---
 Makefile.system | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 5a4af9698..ae6db40b0 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1167,6 +1167,10 @@ endif
 
 ifeq ($(F_COMPILER), IBM)
 CCOMMON_OPT += -DF_INTERFACE_IBM
+FEXTRALIB += -lxlf90
+ifeq ($(C_COMPILER), GCC)
+FCOMMON_OPT += -qextname
+endif
 # FCOMMON_OPT	+= -qarch=440
 ifdef BINARY64
 FCOMMON_OPT += -q64

From 8012afcabbc912e32961924a77858e334ee75356 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 24 Sep 2023 10:15:12 +0200
Subject: [PATCH 305/718] Avoid using some gcc-specific flags with IBM xlf

---
 Makefile.power | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/Makefile.power b/Makefile.power
index 28a0bae08..33702c932 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -42,15 +42,14 @@ FCOMMON_OPT += -O2 -qrecur -qnosave
 else
 FCOMMON_OPT += -O2 -frecursive -fno-fast-math
 endif
-ifeq ($(C_COMPILER), GCC)
+
+ifeq ($(F_COMPILER), GFORTRAN)
 ifneq ($(GCCVERSIONGT4), 1)
 $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
 FCOMMON_OPT += -mcpu=power8 -mtune=power8 
 else
 FCOMMON_OPT += -mcpu=power9 -mtune=power9 
 endif
-else
-FCOMMON_OPT += -mcpu=power9 -mtune=power9
 endif
 else
 FCOMMON_OPT += -O2 -Mrecursive
@@ -84,12 +83,16 @@ CCOMMON_OPT += -DUSE_OPENMP -fopenmp
 else
 CCOMMON_OPT += -DUSE_OPENMP -mp
 endif
+ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -DUSE_OPENMP 
+else
 ifneq ($(F_COMPILER), PGI)
 FCOMMON_OPT += -DUSE_OPENMP -fopenmp
 else
 FCOMMON_OPT += -DUSE_OPENMP -mp
 endif
 endif
+endif
 
 # workaround for C->FORTRAN ABI violation in LAPACKE
 ifeq ($(F_COMPILER), GFORTRAN)

From 4de963dc17eb682e774a85a494a28001d6e6aa98 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 24 Sep 2023 10:16:37 +0200
Subject: [PATCH 306/718] Enforce trailing underscores on symbols when IBM xlf
 is combined with gcc

---
 f_check | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/f_check b/f_check
index 526c41dc6..f30231bc4 100755
--- a/f_check
+++ b/f_check
@@ -155,6 +155,10 @@ else
             *'IBM XL'*)
                 vendor=IBM
                 openmp='-openmp'
+		case "$CC" in *gcc*)
+		bu=_
+		;;
+		esac
                 ;;
             *NAG*)
                 vendor=NAG
@@ -223,6 +227,10 @@ else
             *ppuf*|*xlf*)
                 vendor=IBM
                 openmp='-openmp'
+		case "$CC" in *gcc*)
+		bu=_
+		;;
+		esac
                 ;;
             *open64*)
                 vendor=OPEN64

From 7a96908d0cb0ee3cc5b49390a5ec0ca3a71fefdf Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 24 Sep 2023 10:18:24 +0200
Subject: [PATCH 307/718] Add -lgomp when IBM xlf is combined with gcc in
 OPENMP builds

---
 ctest/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ctest/Makefile b/ctest/Makefile
index 9e85d23b9..af5b34a36 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -214,6 +214,11 @@ endif
 ifeq ($(F_COMPILER), NAG)
 CEXTRALIB = -lgomp
 endif
+ifeq ($(F_COMPILER), IBM)
+ifeq ($(C_COMPILER), GCC)
+CEXTRALIB += -lgomp
+endif
+endif
 endif
 
 ifeq ($(BUILD_SINGLE),1)

From 2a9981a2442106f67d963ee68cb4ee3b1a7a0334 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 24 Sep 2023 10:19:11 +0200
Subject: [PATCH 308/718] Add -lgomp when IBM xlf is combined with gcc in
 OPENMP builds

---
 test/Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/Makefile b/test/Makefile
index fa054f15b..715842b4d 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -271,6 +271,11 @@ endif
 ifeq ($(F_COMPILER), NAG)
 CEXTRALIB = -lgomp
 endif
+ifeq ($(F_COMPILER), IBM)
+ifeq ($(C_COMPILER), GCC)
+CEXTRALIB += -lgomp
+endif
+endif
 endif
 
 ifeq ($(BUILD_SINGLE),1)
@@ -326,7 +331,7 @@ endif
 
 ifeq ($(BUILD_BFLOAT16),1)
 test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME)
-	$(CC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
+	$(CC) $(CFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
 endif
 
 ifeq ($(BUILD_COMPLEX),1)

From 4670eb1462b73ffa82699a70bc383bfd41461a0f Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Wed, 20 Sep 2023 09:09:35 +0800
Subject: [PATCH 309/718] LoongArch64: Add dtrsm kernel

---
 kernel/loongarch64/KERNEL.LOONGSON3R5         |   10 +-
 .../loongarch64/dtrsm_kernel_LN_16x4_lasx.S   | 1366 +++++++++++
 .../loongarch64/dtrsm_kernel_LT_16x4_lasx.S   |  959 ++++++++
 .../loongarch64/dtrsm_kernel_RN_16x4_lasx.S   |  882 +++++++
 .../loongarch64/dtrsm_kernel_RT_16x4_lasx.S   |  953 ++++++++
 kernel/loongarch64/dtrsm_kernel_macro.S       | 2147 +++++++++++++++++
 6 files changed, 6312 insertions(+), 5 deletions(-)
 create mode 100644 kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S
 create mode 100644 kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S
 create mode 100644 kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S
 create mode 100644 kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S
 create mode 100644 kernel/loongarch64/dtrsm_kernel_macro.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index c23c2fac5..011e8b89e 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -24,12 +24,12 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 SGEMVNKERNEL = sgemv_n_8_lasx.S
 SGEMVTKERNEL = sgemv_t_8_lasx.S
-endif
 
-DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+DTRSMKERNEL_LN  = dtrsm_kernel_LN_16x4_lasx.S
+DTRSMKERNEL_LT  = dtrsm_kernel_LT_16x4_lasx.S
+DTRSMKERNEL_RN  = dtrsm_kernel_RN_16x4_lasx.S
+DTRSMKERNEL_RT  = dtrsm_kernel_RT_16x4_lasx.S
+endif
 
 STRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S
new file mode 100644
index 000000000..3315daccb
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S
@@ -0,0 +1,1366 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/07/26 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ *          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+ */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+#define OFFSET $r11  // param 9: offset
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25
+#define T1     $r26
+#define T2     $r27
+#define KK     $r28
+#define AA     $r29
+#define CC     $r30
+#undef  ZERO
+#define ZERO   $r0
+
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+#include "dtrsm_kernel_macro.S"
+
+// By integrating the dgemm and dsolve processes, the following advantages can be obtained:
+// 1. Avoid the overhead of function calls (by not invoking dgemm_kernel)
+// 2. Reduce the storage and retrieval of C data
+// 3. Vectorization of dsolve
+// GEMM_UNROLL_M x DGEMM_UNROLL_N is 16x4, which is a fairly large size.
+// To achieve finer-grained optimization, 15 scenarios have been addressed:
+// 16x4, 16x2, 16x1, 8x4, 8x2, 8x1, 4x4, 4x2, 4x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1.
+
+.macro dsolve_16 N
+// if N = 4 the data layout of C is as follows:
+// U0  U1  U2  U3
+// U4  U5  U6  U7
+// U8  U9  U10 U11
+// U12 U13 U14 U15
+// if N = 2 the dat layout of C is as follows:
+// U0 U1 U2 U3
+// U4 U5 U6 U7
+// if N = 1 the dat layout of C is as follows:
+// U0 U1 U2 U3
+// The matrix A has dimensions of 16x16, and
+// it will be divided into 4 segments for processing.
+
+#define G12 U3
+#define G13 U7
+#define G14 U11
+#define G15 U15
+    GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1
+    // A
+    // G12 G13 G14 G15
+    // -----------------
+    // 204             | D9
+    // 220 221         | D8 D7
+    // 236 237 238     | D6 D5 D4
+    // 252 253 254 255 | D3 D2 D1 D0
+    PTR_ADDI T0, A0, 252 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 236 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 220 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 204 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G15,    G15,    D0
+    GNMSUB xvf, d, G14, G15, D1, G14
+    xvfmul.d    G14,    G14,    D4
+    GNMSUB xvf, d, G13, G15, D2, G13, G13, G14, D5, G13
+    xvfmul.d    G13,    G13,    D7
+    GNMSUB xvf, d, G12, G15, D3, G12, G12, G14, D6, G12, G12, G13, D8, G12
+    xvfmul.d    G12,    G12,    D9
+    // Store B
+.if \N == 4
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // b48 b49 b50 b51 ... b60 b61 b62 b63
+    GST xv, , G12, B0, 48 * 8, G13, B0, 52 * 8, G14, B0, 56 * 8, G15, B0, 60 * 8
+.elseif \N == 2
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // b24 b25 b26 b27 b28 b29 b30 b31
+    GST v, , $vr3, B0, 24 * 8, $vr7, B0, 26 * 8, $vr11, B0, 28 * 8, $vr15, B0, 30 * 8
+.elseif \N == 1
+    // x x x x
+    // x x x x
+    // x x x x
+    // b12 b13 b14 b15
+    GST f, d, $f3, B0, 12 * 8, $f7, B0, 13 * 8, $f11, B0, 14 * 8, $f15, B0, 15 * 8
+.endif
+    // Transpose G15 G14 G13 G12
+    GTRANSPOSE4x4_D G12, G13, G14, G15, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    // x x x x ... c12 c13 c14 c15
+    // x x x x ... c28 c29 c30 c31
+    // x x x x ... c44 c45 c46 c47
+    // x x x x ... c60 c61 c62 c63
+    GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8, D2, C2, 12 * 8, D3, C3, 12 * 8
+.elseif \N == 2
+    // x x x x ... c12 c13 c14 c15
+    // x x x x ... c28 c29 c30 c31
+    GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8
+.elseif \N == 1
+    // Store C
+    // x x x x ... c12 c13 c14 c15
+    GST xv, , D0, C0, 12 * 8
+.endif
+
+#define G8  U2
+#define G9  U6
+#define G10 U10
+#define G11 U14
+    GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1
+    // A
+    // G8  G9  G10  G11
+    // -----------------
+    // 136             | D9
+    // 152 153         | D8  D7
+    // 168 169 170     | D6  D5  D4
+    // 184 185 186 187 | D3  D2  D1  D0
+    // 200 201 202 203 | D15 D14 D13 D12
+    // 216 217 218 219 | D11 D10 D9  D8
+    // 232 233 234 235 | D7  D6  D5  D4
+    // 248 249 250 251 | D3  D2  D1  D0
+    PTR_ADDI  T0, A0, 248 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 232 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 216 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 200 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G11, G15, D0,  G11, G10, G15, D1,  G10, G9, G15, D2,  G9, G8, G15, D3,  G8, \
+                   G11, G14, D4,  G11, G10, G14, D5,  G10, G9, G14, D6,  G9, G8, G14, D7,  G8, \
+                   G11, G13, D8,  G11, G10, G13, D9,  G10, G9, G13, D10, G9, G8, G13, D11, G8, \
+                   G11, G12, D12, G11, G10, G12, D13, G10, G9, G12, D14, G9, G8, G12, D15, G8
+    PTR_ADDI T0, A0, 184 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 168 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 152 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 136 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G11,    G11,    D0
+    GNMSUB xvf, d, G10, G11, D1, G10, G9, G11, D2, G9, G8, G11, D3, G8
+    xvfmul.d    G10,    G10,    D4
+    GNMSUB xvf, d, G9, G10, D5, G9, G8, G10, D6, G8
+    xvfmul.d    G9,     G9,     D7
+    GNMSUB xvf, d, G8, G9, D8, G8
+    xvfmul.d    G8,     G8,     D9
+    // Store B
+.if \N == 4
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // b32 b33 b34 b34 ... b44 b45 b46 b47
+    // b48 b49 b50 b51 ... b60 b61 b62 b63
+    GST xv, , G8, B0, 32 * 8, G9, B0, 36 * 8, G10, B0, 40 * 8, G11, B0, 44 * 8
+.elseif \N == 2
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // b16 b17 b18 b19 b20 b21 b22 b23
+    // b24 b25 b26 b27 b28 b29 b30 b31
+    GST v, , $vr2, B0, 16 * 8, $vr6, B0, 18 * 8, $vr10, B0, 20 * 8, $vr14, B0, 22 * 8
+.elseif \N == 1
+    // x x x x
+    // x x x x
+    // b8  b9  b10 b11
+    // b12 b13 b14 b15
+    GST f, d, $f2, B0, 8 * 8, $f6, B0, 9 * 8, $f10, B0, 10 * 8, $f14, B0, 11 * 8
+.endif
+    // Transpose G11 G10 G9 G8
+    GTRANSPOSE4x4_D G8, G9, G10, G11, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    // x x x x ... c8  c9  c10 c11 c12 c13 c14 c15
+    // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31
+    // x x x x ... c40 c41 c42 c43 c44 c45 c46 c47
+    // x x x x ... c56 c57 c58 c59 c60 c61 c62 c63
+    GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8, D2, C2, 8 * 8, D3, C3, 8 * 8
+.elseif \N == 2
+    // x x x x ... c8  c9  c10 c11 c12 c13 c14 c15
+    // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31
+    GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8
+.elseif \N == 1
+    // x x x x ... c8  c9  c10 c11 c12 c13 c14 c15
+    GST xv, , D0, C0, 8 * 8
+.endif
+
+#define G4 U1
+#define G5 U5
+#define G6 U9
+#define G7 U13
+    GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, D0, D1
+    // A
+    // G4   G5  G6   G7
+    // ------------------
+    // 68               | D9
+    // 84	85          | D8  D7
+    // 100	101	102     | D6  D5  D4
+    // 116	117	118	119 | D3  D2  D1  D0
+    // 132	133	134	135 | D15 D14 D13 D12
+    // 148	149	150	151 | D11 D10 D9  D8
+    // 164	165	166	167 | D7  D6  D5  D4
+    // 180	181	182	183 | D3  D2  D1  D0
+    // 196	197	198	199 | D15 D14 D13 D12
+    // 212	213	214	215 | D11 D10 D9  D8
+    // 228	229	230	231 | D7  D6  D5  D4
+    // 244	245	246	247 | D3  D2  D1  D0
+    PTR_ADDI  T0, A0, 244 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 228 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 212 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 196 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G7, G15, D0,  G7, G6, G15, D1,  G6, G5, G15, D2,  G5, G4, G15, D3,  G4, \
+                   G7, G14, D4,  G7, G6, G14, D5,  G6, G5, G14, D6,  G5, G4, G14, D7,  G4, \
+                   G7, G13, D8,  G7, G6, G13, D9,  G6, G5, G13, D10, G5, G4, G13, D11, G4, \
+                   G7, G12, D12, G7, G6, G12, D13, G6, G5, G12, D14, G5, G4, G12, D15, G4
+    PTR_ADDI  T0, A0, 180 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 164 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 148 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 132 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G7, G11, D0,  G7, G6, G11, D1,  G6, G5, G11, D2,  G5, G4, G11, D3,  G4, \
+                   G7, G10, D4,  G7, G6, G10, D5,  G6, G5, G10, D6,  G5, G4, G10, D7,  G4, \
+                   G7, G9,  D8,  G7, G6, G9,  D9,  G6, G5, G9,  D10, G5, G4, G9,  D11, G4, \
+                   G7, G8,  D12, G7, G6, G8,  D13, G6, G5, G8,  D14, G5, G4, G8,  D15, G4
+    PTR_ADDI T0, A0, 116 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 100 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 84 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 68 * 8
+    GLDREPL xv, d, D9, T0, 0
+    xvfmul.d    G7,     G7,     D0
+    GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4
+    xvfmul.d    G6,     G6,     D4
+    GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4
+    xvfmul.d    G5,     G5,     D7
+    GNMSUB xvf, d, G4, G5, D8, G4
+    xvfmul.d    G4,     G4,     D9
+    // Store B
+.if \N == 4
+    // x x x x ... x x x x
+    // b16 b17 b18 b19 ... b28 b29 b30 b31
+    // b32 b33 b34 b34 ... b44 b45 b46 b47
+    // b48 b49 b50 b51 ... b60 b61 b62 b63
+    GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8
+.elseif \N == 2
+    // x x x x ... x x x x
+    // b8  b9  b10 b11 b12 b13 b14 b15
+    // b16 b17 b18 b19 b20 b21 b22 b23
+    // b24 b25 b26 b27 b28 b29 b30 b31
+    GST v, , $vr1, B0, 8 * 8, $vr5, B0, 10 * 8, $vr9, B0, 12 * 8, $vr13, B0, 14 * 8
+.elseif \N == 1
+    // x x x x
+    // b4  b5  b6  b7
+    // b8  b9  b10 b11
+    // b12 b13 b14 b15
+    GST f, d, $f1, B0, 4 * 8, $f5, B0, 5 * 8, $f9, B0, 6 * 8, $f13, B0, 7 * 8
+.endif
+    // Transpose G7 G6 G5 G4
+    GTRANSPOSE4x4_D G4, G5, G6, G7, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    // x x x x c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
+    // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31
+    // x x x x c36 c37 c38 c39 c40 c41 c42 c43 c44 c45 c46 c47
+    // x x x x c52 c53 c54 c55 c56 c57 c58 c59 c60 c61 c62 c63
+    GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8, D2, C2, 4 * 8, D3, C3, 4 * 8
+.elseif \N == 2
+    // x x x x c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
+    // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31
+    GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8
+.elseif \N == 1
+    // x x x x c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
+    GST xv, , D0, C0, 4 * 8
+.endif
+
+#define G0 U0
+#define G1 U4
+#define G2 U8
+#define G3 U12
+    GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, D0, D1
+    // A
+    // G0   G1  G2   G3
+    // ------------------
+    // 0                 | D9
+    // 16	17           | D8  D7
+    // 32	33	34       | D6  D5  D4
+    // 48	49	50	51   | D3  D2  D1  D0
+    // 64	65	66	67   | D15 D14 D13 D12
+    // 80	81	82	83   | D11 D10 D9  D8
+    // 96	97	98	99   | D7  D6  D5  D4
+    // 112	113	114	115  | D3  D2  D1  D0
+    // 128	129	130	131  | D15 D14 D13 D12
+    // 144	145	146	147  | D11 D10 D9  D8
+    // 160	161	162	163  | D7  D6  D5  D4
+    // 176	177	178	179  | D3  D2  D1  D0
+    // 192	193	194	195  | D15 D14 D13 D12
+    // 208	209	210	211  | D11 D10 D9  D8
+    // 224	225	226	227  | D7  D6  D5  D4
+    // 240	241	242	243  | D3  D2  D1  D0
+    PTR_ADDI  T0, A0, 240 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 224 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 208 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 192 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G3, G15, D0,  G3, G2, G15, D1,  G2, G1, G15, D2,  G1, G0, G15, D3,  G0, \
+                   G3, G14, D4,  G3, G2, G14, D5,  G2, G1, G14, D6,  G1, G0, G14, D7,  G0, \
+                   G3, G13, D8,  G3, G2, G13, D9,  G2, G1, G13, D10, G1, G0, G13, D11, G0, \
+                   G3, G12, D12, G3, G2, G12, D13, G2, G1, G12, D14, G1, G0, G12, D15, G0
+    PTR_ADDI  T0, A0, 176 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 160 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 144 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 128 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G3, G11, D0,  G3, G2, G11, D1,  G2, G1, G11, D2,  G1, G0, G11, D3,  G0, \
+                   G3, G10, D4,  G3, G2, G10, D5,  G2, G1, G10, D6,  G1, G0, G10, D7,  G0, \
+                   G3, G9,  D8,  G3, G2, G9,  D9,  G2, G1, G9,  D10, G1, G0,  G9, D11, G0, \
+                   G3, G8,  D12, G3, G2, G8,  D13, G2, G1, G8,  D14, G1, G0,  G8, D15, G0
+    PTR_ADDI  T0, A0, 112 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 96 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 80 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 64 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G3, G7, D0,  G3, G2, G7, D1,  G2, G1, G7, D2,  G1, G0, G7, D3,  G0, \
+                   G3, G6, D4,  G3, G2, G6, D5,  G2, G1, G6, D6,  G1, G0, G6, D7,  G0, \
+                   G3, G5, D8,  G3, G2, G5, D9,  G2, G1, G5, D10, G1, G0, G5, D11, G0, \
+                   G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0
+    PTR_ADDI T0, A0, 48 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 32 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 16 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 0 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G3,     G3,     D0
+    GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
+    xvfmul.d    G2,     G2,     D4
+    GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
+    xvfmul.d    G1,     G1,     D7
+    GNMSUB  xvf, d, G0, G1, D8, G0
+    xvfmul.d    G0,     G0,     D9
+    // Store B
+.if \N == 4
+    // b0  b1  b2  b3  ... b12 b13 b14 b15
+    // b16 b17 b18 b19 ... b28 b29 b30 b31
+    // b32 b33 b34 b34 ... b44 b45 b46 b47
+    // b48 b49 b50 b51 ... b60 b61 b62 b63
+    GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
+.elseif \N == 2
+    // b0  b1  b2  b3  b4  b5  b6  b7
+    // b8  b9  b10 b11 b12 b13 b14 b15
+    // b16 b17 b18 b19 b20 b21 b22 b23
+    // b24 b25 b26 b27 b28 b29 b30 b31
+    GST v, , $vr0, B0, 0, $vr4, B0, 2 * 8, $vr8, B0, 4 * 8, $vr12, B0, 6 * 8
+.elseif \N == 1
+    // b0  b1  b2  b3
+    // b4  b5  b6  b7
+    // b8  b9  b10 b11
+    // b12 b13 b14 b15
+    GST f, d, $f0, B0, 0, $f4, B0, 1 * 8, $f8, B0, 2 * 8, $f12, B0, 3 * 8
+.endif
+    // Transpose C3 C2 C1 C0
+    GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    // c0  c1  c2  c3  ... c12 c13 c14 c15
+    // c16 c17 c18 c19 ... c28 c29 c30 c31
+    // c32 c33 c34 c34 ... c44 c45 c46 c47
+    // c48 c49 c50 c51 ... c60 c61 c62 c63
+    GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
+.elseif \N == 2
+    // c0  c1  c2  c3  ... c12 c13 c14 c15
+    // c16 c17 c18 c19 ... c28 c29 c30 c31
+    GST xv, , D0, C0, 0, D1, C1, 0
+.elseif \N == 1
+    // c0  c1  c2  c3  ... c12 c13 c14 c15
+    GST xv, , D0, C0, 0
+.endif
+
+#undef G0
+#undef G1
+#undef G2
+#undef G3
+#undef G4
+#undef G5
+#undef G6
+#undef G7
+#undef G8
+#undef G9
+#undef G10
+#undef G11
+#undef G12
+#undef G13
+#undef G14
+#undef G15
+.endm
+
+.macro dsolve_8 N
+// if N = 4 the data layout of C is as follows:
+// U0  U1
+// U2  U3
+// U4  U5
+// U6  U7
+// if N = 2 the dat layout of C is as follows:
+// U0  U1
+// U2  U3
+// if N = 1 the dat layout of C is as follows:
+// U0  U1
+// The matrix A has dimensions of 8x8, and
+// it will be divided into 2 segments for processing.
+
+#define G4 U1
+#define G5 U3
+#define G6 U5
+#define G7 U7
+    // Transpose U7 U5 U3 U1
+    GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, D0, D1
+    // A
+    // G4   G5  G6  G7
+    // ---------------
+    // 36              | D9
+    // 44	45         | D8 D7
+    // 52	53	54     | D6 D5 D4
+    // 60	61	62	63 | D3 D2 D1 D0
+    PTR_ADDI      T0,     A0,     60 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI      T0,     A0,     52 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI      T0,     A0,     44 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI      T0,     A0,     36 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G7,     G7,     D0
+    GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4
+    xvfmul.d    G6,     G6,     D4
+    GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4
+    xvfmul.d    G5,     G5,     D7
+    GNMSUB xvf, d, G4, G5, D8, G4
+    xvfmul.d    G4,     G4,     D9
+    // Store B
+.if \N == 4
+    GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8
+.elseif \N == 2
+    GST v, , $vr1, B0, 8 * 8, $vr3, B0, 10 * 8, $vr5, B0, 12 * 8, $vr7, B0, 14 * 8
+.elseif \N == 1
+    GST f, d, $f1, B0, 4 * 8, $f3, B0, 5 * 8, $f5, B0, 6 * 8, $f7, B0, 7 * 8
+.endif
+    // Transpose
+    GTRANSPOSE4x4_D G4, G5, G6, G7, D4, D5, D6, D7, D8, D9
+    // Store C
+.if \N == 4
+    GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8, D6, C2, 4 * 8, D7, C3, 4 * 8
+.elseif \N == 2
+    GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8
+.elseif \N == 1
+    GST xv, , D4, C0, 4 * 8
+.endif
+
+#define G0 U0
+#define G1 U2
+#define G2 U4
+#define G3 U6
+    // Transpose U6 U4 U2 U0
+    GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, D0, D1
+    // A
+    // G0  G1   G2  G3
+    //-----------------
+    // 0               | D9
+    // 8	9          | D8  D7
+    // 16	17	18     | D6  D5  D4
+    // 24	25	26	27 | D3  D2  D1  D0
+    // 32	33	34	35 | D15 D14 D13 D12
+    // 40	41	42	43 | D11 D10 D9  D8
+    // 48	49	50	51 | D7  D6  D5  D4
+    // 56	57	58	59 | D3  D2  D1  D0
+    PTR_ADDI  T0, A0, 56 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 48 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 40 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 32 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G3, G7, D0,  G3, G2, G7, D1,  G2, G1, G7, D2,  G1, G0, G7, D3,  G0, \
+                   G3, G6, D4,  G3, G2, G6, D5,  G2, G1, G6, D6,  G1, G0, G6, D7,  G0, \
+                   G3, G5, D8,  G3, G2, G5, D9,  G2, G1, G5, D10, G1, G0, G5, D11, G0, \
+                   G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0
+    PTR_ADDI T0, A0, 24 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 16 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 8 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 0 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G3,     G3,     D0
+    GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
+    xvfmul.d    G2,     G2,     D4
+    GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
+    xvfmul.d    G1,     G1,     D7
+    GNMSUB xvf, d, G0, G1, D8, G0
+    xvfmul.d    G0,     G0,     D9
+    // Store B
+.if \N == 4
+    GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
+.elseif \N == 2
+    GST v, , $vr0, B0, 0, $vr2, B0, 2 * 8, $vr4, B0, 4 * 8, $vr6, B0, 6 * 8
+.elseif \N == 1
+    GST f, d, $f0, B0, 0, $f2, B0, 1 * 8, $f4, B0, 2 * 8, $f6, B0, 3 * 8
+.endif
+    // Transpose
+    GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
+.elseif \N == 2
+    GST xv, , D0, C0, 0, D1, C1, 0
+.elseif \N == 1
+    GST xv, , D0, C0, 0
+.endif
+
+#undef G0
+#undef G1
+#undef G2
+#undef G3
+#undef G4
+#undef G5
+#undef G6
+#undef G7
+.endm
+
+.macro dsolve_4 N
+// if N = 4 the data layout of C is as follows:
+// U0
+// U1
+// U2
+// U3
+// if N = 2 the dat layout of C is as follows:
+// U0
+// U1
+// if N = 1 the dat layout of C is as follows:
+// U0
+// The matrix A has dimensions of 4x4, and
+// it will be divided into 1 segments for processing.
+
+#define G0 U0
+#define G1 U1
+#define G2 U2
+#define G3 U3
+    // Transpose U3 U2 U1 U0
+    GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, D0, D1
+    // A
+    // G0 G1 G2 G3
+    //-------------
+    // 0           | D9
+    // 4  5        | D8 D7
+    // 8  9  10    | D6 D5 D4
+    // 12 13 14 15 | D3 D2 D1 D0
+    GLDREPL xv, d, D3, A0, 12 * 8, D2, A0, 13 * 8, D1, A0, 14 * 8, D0, A0, 15 * 8, \
+                   D6, A0, 8 * 8,  D5, A0, 9 * 8,  D4, A0, 10 * 8, \
+                   D8, A0, 4 * 8,  D7, A0, 5 * 8, \
+                   D9, A0, 0 * 8
+    xvfmul.d    G3,     G3,     D0
+    GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
+    xvfmul.d    G2,     G2,     D4
+    GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
+    xvfmul.d    G1,     G1,     D7
+    GNMSUB xvf, d, G0, G1, D8, G0
+    xvfmul.d    G0,     G0,     D9
+    // Store B
+.if \N == 4
+    GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
+.elseif \N == 2
+    GST v, , $vr0, B0, 0, $vr1, B0, 2 * 8, $vr2, B0, 4 * 8, $vr3, B0, 6 * 8
+.elseif \N == 1
+    GST f, d, $f0, B0, 0, $f1, B0, 1 * 8, $f2, B0, 2 * 8, $f3, B0, 3 * 8
+.endif
+    // Transpose
+    GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
+.elseif \N == 2
+    GST xv, , D0, C0, 0, D1, C1, 0
+.elseif \N == 1
+    GST xv, , D0, C0, 0
+.endif
+
+#undef G0
+#undef G1
+#undef G2
+#undef G3
+.endm
+
+.macro dsolve_2 N
+#define G0  U2
+#define G1  U3
+    // Transpose
+    GSBUTTERFLY xv, d, G0, G1, U1, U0
+    // A
+    // G0 G1
+    // ------
+    // 0    | D2
+    // 2  3 | D1 D0
+    GLDREPL xv, d, D2, A0, 0, D1, A0, 2 * 8, D0, A0, 3 * 8
+    xvfmul.d    G1,     G1,     D0
+    GNMSUB xvf, d, G0, G1, D1, G0
+    xvfmul.d    G0,     G0,     D2
+    // Store B
+.if \N == 4
+    GST xv, , G0, B0, 0, G1, B0, 4 * 8
+.elseif \N == 2
+    GST v, , $vr2, B0, 0, $vr3, B0, 2 * 8
+.elseif \N == 1
+    GST f, d, $f2, B0, 0, $f3, B0, 8
+.endif
+    // Transpose
+    GSBUTTERFLY xv, d, D0, D1, G1, G0
+    // Store C
+.if \N == 4
+    vst       $vr16,    C0,      0x00
+    vst       $vr17,    C1,      0x00
+    xvstelm.d D0,  C2,  0x00,    0x02
+    xvstelm.d D1,  C3,  0x00,    0x02
+    xvstelm.d D0,  C2,  0x08,    0x03
+    xvstelm.d D1,  C3,  0x08,    0x03
+.elseif \N == 2
+    GST v, , $vr16, C0, 0, $vr17, C1, 0
+.elseif \N == 1
+    GST v, , $vr16, C0, 0
+.endif
+
+#undef G0
+#undef G1
+.endm
+
+.macro dgemm_dsolve_16x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_16x4_load
+    dgemm_16x4
+    b	.L_dsolve_16x4
+.L_dsolve_16x4_load:
+    // Load C
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+/********************** solver ******************/
+.L_dsolve_16x4:
+    PTR_ADDI    A0,    T1,    -(16 * 8 * 8)
+    PTR_ADDI    A0,    A0,    -(16 * 8 * 8)
+    PTR_ADDI    B0,    T2,    -(16 * 4 * 8)
+    dsolve_16 4
+.endm
+
+.macro dgemm_dsolve_1x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,    .L_dsolve_1x4_load
+    dgemm_1x4
+    b   .L_dsolve_1x4
+.L_dsolve_1x4_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+    xvinsve0.d  U0,     U1,     0x01
+    xvinsve0.d  U0,     U2,     0x02
+    xvinsve0.d  U0,     U3,     0x03
+.L_dsolve_1x4:
+    or      A0,     T1,     T1
+    or      B0,     T2,     T2
+    GLDREPL xv, d, D0, A0, -1 * 8
+    GMUL xvf, d, U0, U0, D0
+    // Store C
+    xvstelm.d   U0,     C0,     0x00,       0x00
+    xvstelm.d   U0,     C1,     0x00,       0x01
+    xvstelm.d   U0,     C2,     0x00,       0x02
+    xvstelm.d   U0,     C3,     0x00,       0x03
+    // Store B
+    xvst    U0,     B0,     -32
+.endm
+
+.macro dgemm_dsolve_2x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,    .L_dsolve_2x4_load
+    dgemm_2x4
+    b   .L_dsolve_2x4
+.L_dsolve_2x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+
+    xvpermi.q   U0, U2, 0x02
+    xvpermi.q   U1, U3, 0x02
+/********************** solver ******************/
+.L_dsolve_2x4:
+    PTR_ADDI      A0,     T1,     -(2 * 2 * 8)
+    PTR_ADDI      B0,     T2,     -(2 * 4 * 8)
+    dsolve_2 4
+.endm
+
+.macro dgemm_dsolve_4x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,    .L_dsolve_4x4_load
+    dgemm_4x4
+    b .L_dsolve_4x4
+.L_dsolve_4x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/************** solver *****************/
+.L_dsolve_4x4:
+    PTR_ADDI      A0,     T1,     -(4 * 4 * 8)
+    PTR_ADDI      B0,     T2,     -(4 * 4 * 8)
+
+    dsolve_4 4
+.endm
+
+.macro dgemm_dsolve_8x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_8x4_load
+    dgemm_8x4
+    b .L_dsolve_8x4
+.L_dsolve_8x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+/********* solver *********/
+.L_dsolve_8x4:
+    PTR_ADDI      A0,     T1,     -(8 * 8 * 8)
+    PTR_ADDI      B0,     T2,     -(8 * 4 * 8)
+
+    dsolve_8 4
+.endm
+
+.macro dgemm_dsolve_4x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_4x2_load
+    dgemm_4x2
+    b .L_dsolve_4x2
+.L_dsolve_4x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_4x2:
+    PTR_ADDI      A0,     T1,     -(4 * 4 * 8)
+    PTR_ADDI      B0,     T2,     -(4 * 2 * 8)
+
+    dsolve_4 2
+.endm
+
+.macro dgemm_dsolve_2x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_2x2_load
+    dgemm_2x2
+    b .L_dsolve_2x2
+.L_dsolve_2x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_2x2:
+    PTR_ADDI     A0,  T1,     -(2 * 2 * 8)
+    PTR_ADDI     B0,  T2,     -(2 * 2 * 8)
+
+    dsolve_2 2
+.endm
+
+.macro dgemm_dsolve_8x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_8x2_load
+    dgemm_8x2
+    b .L_dsolve_8x2
+.L_dsolve_8x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+.L_dsolve_8x2:
+    PTR_ADDI     A0,  T1,     -(8 * 8 * 8)
+    PTR_ADDI     B0,  T2,     -(8 * 2 * 8)
+
+    dsolve_8 2
+.endm
+
+.macro dgemm_dsolve_16x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_16x2_load
+    dgemm_16x2
+    b .L_dsolve_16x2
+.L_dsolve_16x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1  */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+.L_dsolve_16x2:
+    PTR_ADDI    A0,    T1,    -(16 * 8 * 8)
+    PTR_ADDI    A0,    A0,    -(16 * 8 * 8)
+    PTR_ADDI    B0,    T2,    -(16 * 2 * 8)
+
+    dsolve_16 2
+.endm
+
+.macro dgemm_dsolve_2x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_2x1_load
+    dgemm_2x1
+    b .L_dsolve_2x1
+.L_dsolve_2x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_2x1:
+    PTR_ADDI     A0,  T1,     -(2 * 2 * 8)
+    PTR_ADDI     B0,  T2,     -(2 * 1 * 8)
+
+    dsolve_2 1
+.endm
+
+.macro dgemm_dsolve_4x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_4x1_load
+    dgemm_4x1
+    b .L_dsolve_4x1
+.L_dsolve_4x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_4x1:
+    PTR_ADDI      A0,     T1,     -(4 * 4 * 8)
+    PTR_ADDI      B0,     T2,     -(4 * 1 * 8)
+
+    dsolve_4 1
+.endm
+
+.macro dgemm_dsolve_8x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_8x1_load
+    dgemm_8x1
+    b .L_dsolve_8x1
+.L_dsolve_8x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+.L_dsolve_8x1:
+    PTR_ADDI     A0,  T1,     -(8 * 8 * 8)
+    PTR_ADDI     B0,  T2,     -(8 * 1 * 8)
+
+    dsolve_8 1
+.endm
+
+.macro dgemm_dsolve_16x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_16x1_load
+    dgemm_16x1
+    b .L_dsolve_16x1
+.L_dsolve_16x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+.L_dsolve_16x1:
+    PTR_ADDI    A0,    T1,    -(16 * 8 * 8)
+    PTR_ADDI    A0,    A0,    -(16 * 8 * 8)
+    PTR_ADDI    B0,    T2,    -(16 * 1 * 8)
+
+    dsolve_16 1
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    PTR_SLLI   LDC,   LDC,   3
+    /* if (!(N >> 2)) goto L_N3 */
+    PTR_SRAI   J,     N,     2     /* J = bn >> 2 */
+    andi     N,     N,     0x03
+    beq      ZERO,  J,     .L_N3
+.align 5
+.L_J1:
+    PTR_ADDI   J,     J,     -1
+    PTR_ADD    KK,    M,     OFFSET
+
+    andi      I,    M,      15
+    beq       ZERO, I,      .L_M16
+    andi      I,    M,      1
+    beqz      I,    .L_M2
+.L_M1:
+    PTR_ADDI    T0,   M,      -1
+    PTR_SLLI    T0,   T0,     3
+    PTR_MUL     AA,   T0,     K
+    PTR_ADD     AA,   AA,     A
+    PTR_ALSL    A0,   KK,     AA,     3 /* a + (m - 1) * k + kk */
+    PTR_ADD     CC,   T0,     C         /* c + (m - 1) */
+
+    PTR_SLLI   T0,    KK,     5
+    PTR_ADD    B0,    B,      T0 /* b + 4 * kk */
+    PTR_SUB    L,     K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    dgemm_dsolve_1x4
+    PTR_ADDI   KK,    KK,     -1
+.L_M2:
+    andi    I,      M,      2
+    beqz    I,      .L_M4
+    PTR_SRLI  T0,     M,      1
+    PTR_SLLI  T0,     T0,     1
+    PTR_ADDI  T0,     T0,     -2
+    PTR_SLLI  T0,     T0,     3 /* ((m & -2) - 2) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -2) - 2)*/
+    PTR_SLLI  T1,     KK,     4
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -2) - 2) * k + 2 * kk */
+    PTR_SLLI  T0,     KK,     5
+    PTR_ADD   B0,     B,      T0 /* b + 4 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    dgemm_dsolve_2x4
+    PTR_ADDI  KK,     KK,     -2
+.L_M4:
+    andi    I,      M,      4
+    beqz    I,      .L_M8
+    PTR_SRLI  T0,     M,      2
+    PTR_SLLI  T0,     T0,     2
+    PTR_ADDI  T0,     T0,     -4
+    PTR_SLLI  T0,     T0,     3 /* ((m & -4) - 4) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -4) - 4)*/
+    PTR_SLLI  T1,     KK,     5
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -4) - 4) * k + 4 * kk */
+    PTR_SLLI  T0,     KK,     5
+    PTR_ADD   B0,     B,      T0 /* b + 4 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    dgemm_dsolve_4x4
+    PTR_ADDI  KK,     KK,     -4
+.L_M8:
+    andi    I,      M,      8
+    beqz    I,      .L_M16
+    PTR_SRLI  T0,     M,      3
+    PTR_SLLI  T0,     T0,     3
+    PTR_ADDI  T0,     T0,     -8
+    PTR_SLLI  T0,     T0,     3 /* ((m & -8) - 8) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -8) - 8)*/
+    PTR_SLLI  T1,     KK,     6
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -8) - 8) * k + 8 * kk */
+    PTR_SLLI  T0,     KK,     5
+    PTR_ADD   B0,     B,      T0 /* b + 4 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    dgemm_dsolve_8x4
+    PTR_ADDI  KK,     KK,     -8
+.L_M16:
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_M0
+
+    PTR_SRLI   T0,    M,      4
+    PTR_SLLI   T0,    T0,     4
+    PTR_ADDI   T0,    T0,     -16 /* ((M & -16)) - 16) */
+    PTR_SLLI   T0,    T0,     3
+    PTR_MUL    AA,    T0,     K
+    PTR_ADD    AA,    A,      AA
+    PTR_ADD    CC,    C,      T0
+.align 5
+.L_I1:
+    PTR_SLLI   T0,    KK,     5
+    PTR_ADD    B0,    B,      T0
+    PTR_SUB    L,     K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_SLLI   T0,    KK,     7
+    PTR_ADD    A0,    AA,     T0
+    dgemm_dsolve_16x4
+    PTR_ADDI   I,     I,      -1
+    PTR_ADDI   KK,    KK,     -16
+    PTR_ADDI   CC,    CC,     -(16 * 8)
+    PTR_SLLI   T0,    K,      7
+    PTR_SUB    AA,    AA,     T0
+    blt      ZERO,  I,      .L_I1
+.L_M0:
+    PTR_SLLI   T0,    K,      3
+    PTR_ALSL   B,     T0,     B,      2 // b += 4 * k;
+    PTR_ALSL   C,     LDC,    C,      2 // c += 4 * ldc
+    blt      ZERO,  J,      .L_J1
+.L_N3:
+    andi    J,      N,      2
+    beq     ZERO,   J,      .L_N1
+
+    PTR_ADD    KK,    M,     OFFSET
+    andi      I,    M,      15
+    beq       ZERO, I,      .L_N3_M16
+    andi      I,    M,      1
+    beqz      I,    .L_N3_M2
+.L_N3_M1:
+    PTR_ADDI    KK,   KK,     -1
+
+    PTR_ADDI    T0,   M,      -1
+    PTR_SLLI    T0,   T0,     3
+    PTR_MUL     AA,   T0,     K
+    PTR_ADD     AA,   AA,     A
+    PTR_ALSL    A0,   KK,     AA,     3 /* a + (m - 1) * k + kk */
+    PTR_ADD     CC,   T0,     C         /* c + (m - 1) */
+
+    PTR_SLLI   T0,    KK,     4
+    PTR_ADD    B0,    B,      T0 /* b + 2 * kk */
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    // dgemm_dsolve_1x2
+    GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0
+    GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0
+    GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8
+.L_N3_M2:
+    andi    I,      M,      2
+    beqz    I,      .L_N3_M4
+    PTR_SRLI  T0,     M,      1
+    PTR_SLLI  T0,     T0,     1
+    PTR_ADDI  T0,     T0,     -2
+    PTR_SLLI  T0,     T0,     3 /* ((m & -2) - 2) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -2) - 2)*/
+    PTR_SLLI  T1,     KK,     4
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -2) - 2) * k + 2 * kk */
+    PTR_SLLI  T0,     KK,     4
+    PTR_ADD   B0,     B,      T0 /* b + 2 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    dgemm_dsolve_2x2
+    PTR_ADDI  KK,     KK,     -2
+.L_N3_M4:
+    andi    I,      M,      4
+    beqz    I,      .L_N3_M8
+    PTR_SRLI  T0,     M,      2
+    PTR_SLLI  T0,     T0,     2
+    PTR_ADDI  T0,     T0,     -4
+    PTR_SLLI  T0,     T0,     3 /* ((m & -4) - 4) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -4) - 4)*/
+    PTR_SLLI  T1,     KK,     5
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -4) - 4) * k + 4 * kk */
+    PTR_SLLI  T0,     KK,     4
+    PTR_ADD   B0,     B,      T0 /* b + 2 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    dgemm_dsolve_4x2
+    PTR_ADDI  KK,     KK,     -4
+.L_N3_M8:
+    andi    I,      M,      8
+    beqz    I,      .L_N3_M16
+    PTR_SRLI  T0,     M,      3
+    PTR_SLLI  T0,     T0,     3
+    PTR_ADDI  T0,     T0,     -8
+    PTR_SLLI  T0,     T0,     3 /* ((m & -8) - 8) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -8) - 8)*/
+    PTR_SLLI  T1,     KK,     6
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -8) - 8) * k + 8 * kk */
+    PTR_SLLI  T0,     KK,     4
+    PTR_ADD   B0,     B,      T0 /* b + 2 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    dgemm_dsolve_8x2
+    PTR_ADDI  KK,     KK,     -8
+.L_N3_M16:
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N3_M0
+
+    PTR_SRLI   T0,    M,      4
+    PTR_SLLI   T0,    T0,     4
+    PTR_ADDI   T0,    T0,     -16 /* ((M & -16)) - 16) */
+    PTR_SLLI   T0,    T0,     3
+    PTR_MUL    AA,    T0,     K
+    PTR_ADD    AA,    A,      AA
+    PTR_ADD    CC,    C,      T0
+.align 5
+.L_N3_I1:
+    PTR_SLLI   T0,    KK,     4
+    PTR_ADD    B0,    B,      T0
+    PTR_SUB    L,     K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_SLLI   T0,    KK,     7
+    PTR_ADD    A0,    AA,     T0
+    dgemm_dsolve_16x2
+    PTR_ADDI   I,     I,      -1
+    PTR_ADDI   KK,    KK,     -16
+    PTR_ADDI   CC,    CC,     -(16 * 8)
+    PTR_SLLI   T0,    K,      7
+    PTR_SUB    AA,    AA,     T0
+    blt      ZERO,  I,      .L_N3_I1
+.L_N3_M0:
+    PTR_SLLI   T0,    K,      3
+    PTR_ALSL   B,     T0,     B,      1 // b += 2 * k;
+    PTR_ALSL   C,     LDC,    C,      1 // c += 2 * ldc
+.L_N1:
+    andi    J,      N,      1
+    beq     ZERO,   J,      .L_N0
+
+    PTR_ADD    KK,    M,     OFFSET
+    andi      I,    M,      15
+    beq       ZERO, I,      .L_N1_M16
+    andi      I,    M,      1
+    beqz      I,    .L_N1_M2
+.L_N1_M1:
+    PTR_ADDI    KK,   KK,     -1
+
+    PTR_ADDI    T0,   M,      -1
+    PTR_SLLI    T0,   T0,     3
+    PTR_MUL     AA,   T0,     K
+    PTR_ADD     AA,   AA,     A
+    PTR_ALSL    A0,   KK,     AA,     3 /* a + (m - 1) * k + kk */
+    PTR_ADD     CC,   T0,     C         /* c + (m - 1) */
+
+    PTR_SLLI   T0,    KK,     3
+    PTR_ADD    B0,    B,      T0 /* b + kk */
+    GADD , d, C0, CC, ZERO
+    // dgemm_dsolve_1x1
+    GLD f, d, $f0, A0, 0, $f1, C0, 0
+    GMUL f, d, $f1, $f1, $f0
+    GST f, d, $f1, C0, 0, $f1, B0, 0
+.L_N1_M2:
+    andi    I,      M,      2
+    beqz    I,      .L_N1_M4
+    PTR_SRLI  T0,     M,      1
+    PTR_SLLI  T0,     T0,     1
+    PTR_ADDI  T0,     T0,     -2
+    PTR_SLLI  T0,     T0,     3 /* ((m & -2) - 2) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -2) - 2)*/
+    PTR_SLLI  T1,     KK,     4
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -2) - 2) * k + 2 * kk */
+    PTR_SLLI  T0,     KK,     3
+    PTR_ADD   B0,     B,      T0 /* b + kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO
+    dgemm_dsolve_2x1
+    PTR_ADDI  KK,     KK,     -2
+.L_N1_M4:
+    andi    I,      M,      4
+    beqz    I,      .L_N1_M8
+    PTR_SRLI  T0,     M,      2
+    PTR_SLLI  T0,     T0,     2
+    PTR_ADDI  T0,     T0,     -4
+    PTR_SLLI  T0,     T0,     3 /* ((m & -4) - 4) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -4) - 4)*/
+    PTR_SLLI  T1,     KK,     5
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -4) - 4) * k + 4 * kk */
+    PTR_SLLI  T0,     KK,     3
+    PTR_ADD   B0,     B,      T0 /* b + kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO
+    dgemm_dsolve_4x1
+    PTR_ADDI  KK,     KK,     -4
+.L_N1_M8:
+    andi    I,      M,      8
+    beqz    I,      .L_N1_M16
+    PTR_SRLI  T0,     M,      3
+    PTR_SLLI  T0,     T0,     3
+    PTR_ADDI  T0,     T0,     -8
+    PTR_SLLI  T0,     T0,     3 /* ((m & -8) - 8) */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -8) - 8)*/
+    PTR_SLLI  T1,     KK,     6
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -8) - 8) * k + 8 * kk */
+    PTR_SLLI  T0,     KK,     3
+    PTR_ADD   B0,     B,      T0 /* b +  kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO
+    dgemm_dsolve_8x1
+    PTR_ADDI  KK,     KK,     -8
+.L_N1_M16:
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N1_M0
+
+    PTR_SRLI   T0,    M,      4
+    PTR_SLLI   T0,    T0,     4
+    PTR_ADDI   T0,    T0,     -16 /* ((M & -16)) - 16) */
+    PTR_SLLI   T0,    T0,     3
+    PTR_MUL    AA,    T0,     K
+    PTR_ADD    AA,    A,      AA
+    PTR_ADD    CC,    C,      T0
+.align 5
+.L_N1_I1:
+    PTR_SLLI   T0,    KK,     3
+    PTR_ADD    B0,    B,      T0
+    PTR_SUB    L,     K,      KK
+    GADD , d, C0, CC, ZERO
+    PTR_SLLI   T0,    KK,     7
+    PTR_ADD    A0,    AA,     T0
+    dgemm_dsolve_16x1
+    PTR_ADDI   I,     I,      -1
+    PTR_ADDI   KK,    KK,     -16
+    PTR_ADDI   CC,    CC,     -(16 * 8)
+    PTR_SLLI   T0,    K,      7
+    PTR_SUB    AA,    AA,     T0
+    blt      ZERO,  I,      .L_N1_I1
+.L_N1_M0:
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S
new file mode 100644
index 000000000..0e2cacccf
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S
@@ -0,0 +1,959 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/26 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ *          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+ */
+
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+#define OFFSET $r11  // param 9: offset
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25
+#define T1     $r26
+#define T2     $r27
+#define KK     $r28
+#define AA     $r29
+#define CC     $r30
+#define BB     B0
+#undef  ZERO
+#define ZERO   $r0
+
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+#define G0     D0
+#define G1     D1
+#define G2     D2
+#define G3     D3
+#define G4     D4
+#define G5     D5
+#define G6     D6
+#define G7     D7
+#define G8     D8
+#define G9     D9
+#define G10    D10
+#define G11    D11
+#define G12    D12
+#define G13    D13
+#define G14    D14
+#define G15    D15
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+#include "dtrsm_kernel_macro.S"
+
+.macro ldrepl_macro start, end, stride
+// Load Ux (x = 0...15)
+.if \start <= \end
+    GLDREPL xv, d, $xr\start, A0, \stride * 8
+    ldrepl_macro %start + 1, \end, %stride + 1
+.endif
+.endm
+.macro nmsub_macro start0, end0, start1, reg
+// Gx -= reg * Ux
+.if \start0 <= \end0
+    xvfnmsub.d  $xr\start0, \reg, $xr\start1, $xr\start0
+    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.endif
+.endm
+.macro B_st_macro start, end, stride, N
+// Store Gx(x = 16...31)
+.if \start <= \end
+.if \N == 4
+    xvst    $xr\start, B0, \stride * 0x20
+.elseif \N == 2
+    vst     $vr\start, B0, \stride * 0x10
+.elseif \N == 1
+    fst.d   $f\start, B0, \stride * 0x08
+.endif
+    B_st_macro %start + 1, \end, %stride + 1, \N
+.endif
+.endm
+
+.macro dsolve_16 N
+// The data layout of C (4x16) is as follows (store 4 data in each register):
+// U0  U1  U2  U3
+// U4  U5  U6  U7
+// U8  U9  U10 U11
+// U12 U13 U14 U15
+// The first step is to transpose the result of C
+    GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1
+    GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1
+    GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7
+    GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7
+// Now we have the following memory layout of C:
+//     0     1    2   3    ...    15
+// 0 |    |    |    |    |     |     |
+// 1 | G0 | G1 | G2 | G3 | ... | G15 |
+// 2 |    |    |    |    |     |     |
+// 3 |    |    |    |    |     |     |
+// Next we are going to process matrix A with a size of 16x16,
+// using only the upper triangular portion. The memory layout of
+// matrix A is as follows, quite large.
+//0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
+//	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31
+//		34	35	36	37	38	39	40	41	42	43	44	45	46	47
+//			51	52	53	54	55	56	57	58	59	60	61	62	63
+//				68	69	70	71	72	73	74	75	76	77	78	79
+//					85	86	87	88	89	90	91	92	93	94	95
+//						102	103	104	105	106	107	108	109	110	111
+//							119	120	121	122	123	124	125	126	127
+//								136	137	138	139	140	141	142	143
+//									153	154	155	156	157	158	159
+//										170	171	172	173	174	175
+//											187	188	189	190	191
+//												204	205	206	207
+//													221	222	223
+//														238	239
+//															255
+// Sequentially extract data from A in row order
+// Load 0
+    ldrepl_macro 0, 15, 0
+    GMUL xvf, d, G0, G0, U0
+    nmsub_macro 17, 31, 1, G0
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 1
+    ldrepl_macro 1, 15, 0
+    GMUL xvf, d, G1, G1, U1
+    nmsub_macro 18, 31, 2, G1
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 2
+    ldrepl_macro 2, 15, 0
+    GMUL xvf, d, G2, G2, U2
+    nmsub_macro 19, 31, 3, G2
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 3
+    ldrepl_macro 3, 15, 0
+    GMUL xvf, d, G3, G3, U3
+    nmsub_macro 20, 31, 4, G3
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 4
+    ldrepl_macro 4, 15, 0
+    GMUL xvf, d, G4, G4, U4
+    nmsub_macro 21, 31, 5, G4
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 5
+    ldrepl_macro 5, 15, 0
+    GMUL xvf, d, G5, G5, U5
+    nmsub_macro 22, 31, 6, G5
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 6
+    ldrepl_macro 6, 15, 0
+    GMUL xvf, d, G6, G6, U6
+    nmsub_macro 23, 31, 7, G6
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 7
+    ldrepl_macro 7, 15, 0
+    GMUL xvf, d, G7, G7, U7
+    nmsub_macro 24, 31, 8, G7
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 8
+    ldrepl_macro 8, 15, 0
+    GMUL xvf, d, G8, G8, U8
+    nmsub_macro 25, 31, 9, G8
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 9
+    ldrepl_macro 9, 15, 0
+    GMUL xvf, d, G9, G9, U9
+    nmsub_macro 26, 31, 10, G9
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 10
+    ldrepl_macro 10, 15, 0
+    GMUL xvf, d, G10, G10, U10
+    nmsub_macro 27, 31, 11, G10
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 11
+    ldrepl_macro 11, 15, 0
+    GMUL xvf, d, G11, G11, U11
+    nmsub_macro 28, 31, 12, G11
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 12
+    ldrepl_macro 12, 15, 0
+    GMUL xvf, d, G12, G12, U12
+    nmsub_macro 29, 31, 13, G12
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 13
+    ldrepl_macro 13, 15, 0
+    GMUL xvf, d, G13, G13, U13
+    nmsub_macro 30, 31, 14, G13
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 14
+    ldrepl_macro 14, 15, 0
+    GMUL xvf, d, G14, G14, U14
+    nmsub_macro 31, 31, 15, G14
+    PTR_ADDI    A0,      A0,      17 * 8
+// Load 15
+    ldrepl_macro 15, 15, 0
+    GMUL xvf, d, G15, G15, U15
+// Finally, We can store the result.
+// For B, stored sequentially, and  C, first transpose and then store
+    B_st_macro 16, 31, 0, \N
+    GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
+    GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
+    GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1
+    GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1
+.if \N == 4
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8,  C0, 0x40, G12, C0, 0x60, \
+              G1, C1, 0x00, G5, C1, 0x20, G9,  C1, 0x40, G13, C1, 0x60, \
+              G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \
+              G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60
+.elseif \N == 2
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8,  C0, 0x40, G12, C0, 0x60, \
+              G1, C1, 0x00, G5, C1, 0x20, G9,  C1, 0x40, G13, C1, 0x60
+.elseif \N == 1
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8,  C0, 0x40, G12, C0, 0x60
+.endif
+.endm
+
+.macro dgemm_dsolve_16x4
+    bge     ZERO,       KK,     .L_dsolve_16x4_load
+    dgemm_16x4
+    b	.L_dsolve_16x4
+.L_dsolve_16x4_load:
+    // Load C
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+/********************** solver ******************/
+.L_dsolve_16x4:
+    dsolve_16 4
+.endm
+
+.macro dsolve_8 N
+// The data layout of C (4x8) is as follows (store 4 data in each register):
+// U0  U1
+// U2  U3
+// U4  U5
+// U6  U7
+// The first step is to transpose the result of C
+    GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9
+    GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9
+// Now we have the following memory layout of C:
+//     0     1    2   3    ...   7
+// 0 |    |    |    |    |     |    |
+// 1 | G0 | G1 | G2 | G3 | ... | G7 |
+// 2 |    |    |    |    |     |    |
+// 3 |    |    |    |    |     |    |
+// Next we are going to process matrix A with a size of 8x8,
+// using only the upper triangular portion. The memory layout of
+// matrix A is as follows:
+//0	1	2	3	4	5	6	7
+//	9	10	11	12	13	14	15
+//		18	19	20	21	22	23
+//			27	28	29	30	31
+//				36	37	38	39
+//					45	46	47
+//						54	55
+//							63
+// Sequentially extract data from A in row order
+// Load 0
+    ldrepl_macro 0, 7, 0
+    GMUL xvf, d, G0, G0, U0
+    nmsub_macro 17, 23, 1, G0
+    PTR_ADDI    A0,      A0,      9 * 8
+// Load 1
+    ldrepl_macro 1, 7, 0
+    GMUL xvf, d, G1, G1, U1
+    nmsub_macro 18, 23, 2, G1
+    PTR_ADDI    A0,      A0,      9 * 8
+// Load 2
+    ldrepl_macro 2, 7, 0
+    GMUL xvf, d, G2, G2, U2
+    nmsub_macro 19, 23, 3, G2
+    PTR_ADDI    A0,      A0,      9 * 8
+// Load 3
+    ldrepl_macro 3, 7, 0
+    GMUL xvf, d, G3, G3, U3
+    nmsub_macro 20, 23, 4, G3
+    PTR_ADDI    A0,      A0,      9 * 8
+// Load 4
+    ldrepl_macro 4, 7, 0
+    GMUL xvf, d, G4, G4, U4
+    nmsub_macro 21, 23, 5, G4
+    PTR_ADDI    A0,      A0,      9 * 8
+// Load 5
+    ldrepl_macro 5, 7, 0
+    GMUL xvf, d, G5, G5, U5
+    nmsub_macro 22, 23, 6, G5
+    PTR_ADDI    A0,      A0,      9 * 8
+// Load 6
+    ldrepl_macro 6, 7, 0
+    GMUL xvf, d, G6, G6, U6
+    nmsub_macro 23, 23, 7, G6
+    PTR_ADDI    A0,      A0,      9 * 8
+// Load 7
+    ldrepl_macro 7, 7, 0
+    GMUL xvf, d, G7, G7, U7
+// Finally, We can store the result.
+// For B, stored sequentially, and  C, first transpose and then store
+    B_st_macro 16, 23, 0, \N
+    GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
+    GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
+.if \N == 4
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, \
+              G1, C1, 0x00, G5, C1, 0x20, \
+              G2, C2, 0x00, G6, C2, 0x20, \
+              G3, C3, 0x00, G7, C3, 0x20
+.elseif \N == 2
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, \
+              G1, C1, 0x00, G5, C1, 0x20
+.elseif \N == 1
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20
+.endif
+.endm
+
+.macro dgemm_dsolve_8x4
+    bge   ZERO, L,	.L_dsolve_8x4_load
+    dgemm_8x4
+    b .L_dsolve_8x4
+.L_dsolve_8x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+/********* solver *********/
+.L_dsolve_8x4:
+    dsolve_8 4
+.endm
+
+.macro dsolve_4 N
+// The data layout of C (4x4) is as follows (store 4 data in each register):
+// U0
+// U1
+// U2
+// U3
+// The first step is to transpose the result of C
+    GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5
+// Now we have the following memory layout of C:
+//     0     1    2   3
+// 0 |    |    |    |    |
+// 1 | G0 | G1 | G2 | G3 |
+// 2 |    |    |    |    |
+// 3 |    |    |    |    |
+// Next we are going to process matrix A with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix A is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from A in row order
+// Load 0
+    ldrepl_macro 0, 3, 0
+    GMUL xvf, d, G0, G0, U0
+    nmsub_macro 17, 19, 1, G0
+    PTR_ADDI    A0,      A0,      5 * 8
+// Load 1
+    ldrepl_macro 1, 3, 0
+    GMUL xvf, d, G1, G1, U1
+    nmsub_macro 18, 19, 2, G1
+    PTR_ADDI    A0,      A0,      5 * 8
+// Load 2
+    ldrepl_macro 2, 3, 0
+    GMUL xvf, d, G2, G2, U2
+    nmsub_macro 19, 19, 3, G2
+    PTR_ADDI    A0,      A0,      5 * 8
+// Load 3
+    ldrepl_macro 3, 3, 0
+    GMUL xvf, d, G3, G3, U3
+// Finally, We can store the result.
+// For B, stored sequentially, and  C, first transpose and then store
+    B_st_macro 16, 19, 0, \N
+    GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1
+.if \N == 4
+    GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00
+.elseif \N == 2
+    GST xv, , G0, C0, 0x00, G1, C1, 0x00
+.elseif \N == 1
+    GST xv, , G0, C0, 0x00
+.endif
+.endm
+
+.macro dgemm_dsolve_4x4
+    bge   ZERO, L,    .L_dsolve_4x4_load
+    dgemm_4x4
+    b .L_dsolve_4x4
+.L_dsolve_4x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/************** solver *****************/
+.L_dsolve_4x4:
+    dsolve_4 4
+.endm
+
+.macro dsolve_2 N
+// Transpose
+    GSBUTTERFLY xv, d, G0, G1, U1, U0
+// Now we have the following memory layout of C:
+//     0     1
+// 0 |    |    |
+// 1 | G0 | G1 |
+// 2 |    |    |
+// 3 |    |    |
+// Next we are going to process matrix A with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix A is as follows:
+//0	1
+//	3
+// Sequentially extract data from A in row order
+// Load 0
+    ldrepl_macro 0, 1, 0
+    GMUL xvf, d, G0, G0, U0
+    nmsub_macro 17, 17, 1, G0
+    PTR_ADDI    A0,      A0,      3 * 8
+// Load 1
+    ldrepl_macro 1, 1, 0
+    GMUL xvf, d, G1, G1, U1
+// Finally, We can store the result.
+// For B, stored sequentially, and  C, first transpose and then store
+    B_st_macro 16, 17, 0, \N
+    GSBUTTERFLY xv, d, U0, U1, G1, G0
+.if \N == 4
+    vst       $vr0,     C0,      0x00
+    vst       $vr1,     C1,      0x00
+    xvstelm.d U0,  C2,  0x00,    0x02
+    xvstelm.d U1,  C3,  0x00,    0x02
+    xvstelm.d U0,  C2,  0x08,    0x03
+    xvstelm.d U1,  C3,  0x08,    0x03
+.elseif \N == 2
+    vst       $vr0,     C0,      0x00
+    vst       $vr1,     C1,      0x00
+.elseif \N == 1
+    vst       $vr0,     C0,      0x00
+.endif
+.endm
+
+.macro dgemm_dsolve_2x4
+    bge   ZERO, L,    .L_dsolve_2x4_load
+    dgemm_2x4
+    b   .L_dsolve_2x4
+.L_dsolve_2x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+
+    xvpermi.q   U0, U2, 0x02
+    xvpermi.q   U1, U3, 0x02
+/********************** solver ******************/
+.L_dsolve_2x4:
+    dsolve_2 4
+.endm
+
+.macro dgemm_dsolve_1x4
+    bge   ZERO, L,    .L_dsolve_1x4_load
+    dgemm_1x4
+    b   .L_dsolve_1x4
+.L_dsolve_1x4_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+    xvinsve0.d  U0,     U1,     0x01
+    xvinsve0.d  U0,     U2,     0x02
+    xvinsve0.d  U0,     U3,     0x03
+.L_dsolve_1x4:
+    GLDREPL xv, d, D0, A0, 0x00
+    GMUL xvf, d, U0, U0, D0
+    // Store C
+    xvstelm.d   U0,     C0,     0x00,       0x00
+    xvstelm.d   U0,     C1,     0x00,       0x01
+    xvstelm.d   U0,     C2,     0x00,       0x02
+    xvstelm.d   U0,     C3,     0x00,       0x03
+    // Store B
+    xvst    U0,     B0,     0x00
+.endm
+
+.macro dgemm_dsolve_16x2
+    bge   ZERO, L,	.L_dsolve_16x2_load
+    dgemm_16x2
+    b .L_dsolve_16x2
+.L_dsolve_16x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1  */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+.L_dsolve_16x2:
+    dsolve_16 2
+.endm
+
+.macro dgemm_dsolve_8x2
+    bge   ZERO, L,	.L_dsolve_8x2_load
+    dgemm_8x2
+    b .L_dsolve_8x2
+.L_dsolve_8x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+.L_dsolve_8x2:
+    dsolve_8 2
+.endm
+
+.macro dgemm_dsolve_4x2
+    bge   ZERO, L,	.L_dsolve_4x2_load
+    dgemm_4x2
+    b .L_dsolve_4x2
+.L_dsolve_4x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_4x2:
+    dsolve_4 2
+.endm
+
+.macro dgemm_dsolve_1x2
+    bge   ZERO, L,    .L_dsolve_1x2_load
+    dgemm_1x2
+    b   .L_dsolve_1x2
+.L_dsolve_1x2_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    xvinsve0.d  U0,     U1,     0x01
+.L_dsolve_1x2:
+    GLDREPL xv, d, D0, A0, 0x00
+    GMUL xvf, d, U0, U0, D0
+    // Store C
+    xvstelm.d   U0,     C0,     0x00,       0x00
+    xvstelm.d   U0,     C1,     0x00,       0x01
+    // Store B
+    vst    $vr0,     B0,     0x00
+.endm
+
+.macro dgemm_dsolve_2x2
+    bge   ZERO, L,	.L_dsolve_2x2_load
+    dgemm_2x2
+    b .L_dsolve_2x2
+.L_dsolve_2x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_2x2:
+    dsolve_2 2
+.endm
+
+.macro dgemm_dsolve_16x1
+    bge   ZERO, L,	.L_dsolve_16x1_load
+    dgemm_16x1
+    b .L_dsolve_16x1
+.L_dsolve_16x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+.L_dsolve_16x1:
+    dsolve_16 1
+.endm
+
+.macro dgemm_dsolve_8x1
+    bge   ZERO, L,	.L_dsolve_8x1_load
+    dgemm_8x1
+    b .L_dsolve_8x1
+.L_dsolve_8x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+.L_dsolve_8x1:
+    dsolve_8 1
+.endm
+
+.macro dgemm_dsolve_4x1
+    bge   ZERO, L,	.L_dsolve_4x1_load
+    dgemm_4x1
+    b .L_dsolve_4x1
+.L_dsolve_4x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_4x1:
+    dsolve_4 1
+.endm
+
+.macro dgemm_dsolve_2x1
+    bge   ZERO, L,	.L_dsolve_2x1_load
+    dgemm_2x1
+    b .L_dsolve_2x1
+.L_dsolve_2x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_2x1:
+    dsolve_2 1
+.endm
+
+.macro dgemm_dsolve_1x1
+    bge   ZERO, L,    .L_dsolve_1x1_load
+    dgemm_1x1
+    b .L_dsolve_1x1
+.L_dsolve_1x1_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+.L_dsolve_1x1:
+    GLDREPL xv, d, D0, A0, 0x00
+    GMUL xvf, d, U0, U0, D0
+    // Store C
+    xvstelm.d   U0,     C0,     0x00,       0x00
+    // Store B
+    xvstelm.d   U0,     B0,     0x00,       0x00
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    PTR_SLLI   LDC,   LDC,   3
+    /* if (!(N >> 2)) goto L_N3 */
+    PTR_SRAI   J,     N,     2     /* J = bn >> 2 */
+    andi       N,     N,     0x03
+    beq        ZERO,  J,     .L_N3
+.align 5
+.L_J1:
+    PTR_ADDI    J,      J,     -1
+    move        KK,     OFFSET
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_M15
+.align 4
+.L_I1:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x4
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADDI    KK,     KK,     0x10 // kk += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_I1
+.L_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_M7
+.L_M8:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x4
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADDI    KK,     KK,     0x08 // kk += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_M3
+.L_M4:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x4
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADDI    KK,     KK,     0x04 // kk += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_M1
+.L_M2:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x4
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADDI    KK,     KK,     0x02 // kk += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_M0
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x4
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADDI    KK,     KK,     0x01 // kk += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_M0:
+    PTR_SLLI    T0,     K,      5
+    PTR_SLLI    T1,     LDC,    2
+    PTR_ADD     B,      B,      T0 // b += 4 * k
+    PTR_ADD     C,      C,      T1 // c += 4 * ldc
+    bnez        J,      .L_J1
+.L_N3:
+    andi    J,      N,      2
+    beq     ZERO,   J,      .L_N1
+.L_N2:
+    move        KK,     OFFSET
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N2_M15
+.align 4
+.L_N2_I1:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x2
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADDI    KK,     KK,     0x10 // kk += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N2_I1
+.L_N2_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N2_M7
+.L_N2_M8:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x2
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADDI    KK,     KK,     0x08 // kk += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N2_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N2_M3
+.L_N2_M4:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x2
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADDI    KK,     KK,     0x04 // kk += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N2_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N2_M1
+.L_N2_M2:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x2
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADDI    KK,     KK,     0x02 // kk += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N2_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N2_M0
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x2
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADDI    KK,     KK,     0x01 // kk += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N2_M0:
+    PTR_SLLI    T0,     K,      4
+    PTR_SLLI    T1,     LDC,    1
+    PTR_ADD     B,      B,      T0 // b += 2 * k
+    PTR_ADD     C,      C,      T1 // c += 2 * ldc
+.L_N1:
+    andi    J,      N,      1
+    beq     ZERO,   J,      .L_N0
+
+    move        KK,     OFFSET
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N1_M15
+.align 4
+.L_N1_I1:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x1
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADDI    KK,     KK,     0x10 // kk += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N1_I1
+.L_N1_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N1_M7
+.L_N1_M8:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x1
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADDI    KK,     KK,     0x08 // kk += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N1_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N1_M3
+.L_N1_M4:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x1
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADDI    KK,     KK,     0x04 // kk += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N1_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N1_M1
+.L_N1_M2:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x1
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADDI    KK,     KK,     0x02 // kk += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N1_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N1_M0
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x1
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADDI    KK,     KK,     0x01 // kk += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N1_M0:
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S
new file mode 100644
index 000000000..421339736
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S
@@ -0,0 +1,882 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/09/26 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ *          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+ */
+
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+#define OFFSET $r11  // param 9: offset
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25
+#define T1     $r26
+#define T2     $r27
+#define KK     $r28
+#define AA     $r29
+#define CC     $r30
+#define BB     B0
+#undef  ZERO
+#define ZERO   $r0
+
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+#define G0     D0
+#define G1     D1
+#define G2     D2
+#define G3     D3
+#define G4     D4
+#define G5     D5
+#define G6     D6
+#define G7     D7
+#define G8     D8
+#define G9     D9
+#define G10    D10
+#define G11    D11
+#define G12    D12
+#define G13    D13
+#define G14    D14
+#define G15    D15
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+#include "dtrsm_kernel_macro.S"
+
+.macro ldrepl_macro start, end, stride
+// Load Ux (x = 0...15)
+.if \start <= \end
+    GLDREPL xv, d, $xr\start, B0, \stride * 8
+    ldrepl_macro %start + 1, \end, %stride + 1
+.endif
+.endm
+
+.macro nmsub_macro start0, end0, start1, reg
+// Ux -= reg * Dx
+.if \start0 <= \end0
+    xvfnmsub.d  $xr\start0, \reg, $xr\start1, $xr\start0
+    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.endif
+.endm
+
+.macro A_st_macro start, end, stride, N
+// Store Ux(x = 0...15)
+.if \start <= \end
+.if \N == 4
+    xvst    $xr\start, A0, \stride * 0x20
+.elseif \N == 2
+    vst     $vr\start, A0, \stride * 0x10
+.elseif \N == 1
+    fst.d   $f\start, A0, \stride * 0x08
+.endif
+    A_st_macro %start + 1, \end, %stride + 1, \N
+.endif
+.endm
+
+.macro dsolve_16x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+    ldrepl_macro 20, 22, 5
+    nmsub_macro 4, 7, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7
+    ldrepl_macro 25, 25, 15
+    nmsub_macro 8, 11, 0, D2
+    nmsub_macro 8, 11, 4, D5
+    GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11
+    nmsub_macro 12, 15, 0, D3
+    nmsub_macro 12, 15, 4, D6
+    nmsub_macro 12, 15, 8, D8
+    GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
+// Store A
+    A_st_macro 0, 15, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60, \
+              U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60, \
+              U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \
+              U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+.endm
+
+.macro dsolve_16x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+    ldrepl_macro 18, 18, 3
+    nmsub_macro 4, 7, 0, D1
+    GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
+// Store A
+    A_st_macro 0, 7, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60, \
+              U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+.endm
+
+.macro dsolve_8x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+    ldrepl_macro 20, 22, 5
+    nmsub_macro 2, 3, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U2, D4, U2, U3, D4, U3
+    ldrepl_macro 25, 25, 15
+    nmsub_macro 4, 5, 0, D2
+    nmsub_macro 4, 5, 2, D5
+    GMUL xvf, d, U4, D7, U4, U5, D7, U5
+    nmsub_macro 6, 7, 0, D3
+    nmsub_macro 6, 7, 2, D6
+    nmsub_macro 6, 7, 4, D8
+    GMUL xvf, d, U6, D9, U6, U7, D9, U7
+// Store A
+    A_st_macro 0, 7, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
+              U2, C1, 0x00, U3, C1, 0x20, \
+              U4, C2, 0x00, U5, C2, 0x20, \
+              U6, C3, 0x00, U7, C3, 0x20
+.endm
+
+.macro dsolve_8x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+    ldrepl_macro 18, 18, 3
+    nmsub_macro 2, 3, 0, D1
+    GMUL xvf, d, U2, D2, U2, U3, D2, U3
+// Store A
+    A_st_macro 0, 3, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, \
+              U2,  C1, 0x00, U3,  C1, 0x20
+.endm
+
+.macro dsolve_4x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 20, 22, 5
+    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U1, D4, U1
+    ldrepl_macro 25, 25, 15
+    nmsub_macro 2, 2, 0, D2
+    nmsub_macro 2, 2, 1, D5
+    GMUL xvf, d, U2, D7, U2
+    nmsub_macro 3, 3, 0, D3
+    nmsub_macro 3, 3, 1, D6
+    nmsub_macro 3, 3, 2, D8
+    GMUL xvf, d, U3, D9, U3
+// Store A
+    A_st_macro 0, 3, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
+.endm
+
+.macro dsolve_4x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 18, 18, 3
+    nmsub_macro 1, 1, 0, D1
+    GMUL xvf, d, U1, D2, U1
+// Store A
+    A_st_macro 0, 1, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C1, 0x00
+.endm
+
+.macro dsolve_2x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 20, 22, 5
+    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U1, D4, U1
+
+    ldrepl_macro 25, 25, 15
+    nmsub_macro 2, 2, 0, D2
+    nmsub_macro 2, 2, 1, D5
+    GMUL xvf, d, U2, D7, U2
+    nmsub_macro 3, 3, 0, D3
+    nmsub_macro 3, 3, 1, D6
+    nmsub_macro 3, 3, 2, D8
+    GMUL xvf, d, U3, D9, U3
+// Store A
+    A_st_macro 0, 3, 0, 2
+// Store C
+    GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00,
+.endm
+
+.macro dsolve_2x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 18, 18, 3
+    nmsub_macro 1, 1, 0, D1
+    GMUL xvf, d, U1, D2, U1
+// Store A
+    A_st_macro 0, 1, 0, 2
+// Store C
+    GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
+.endm
+
+.macro dsolve_1x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 20, 22, 5
+    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U1, D4, U1
+
+    ldrepl_macro 25, 25, 15
+    nmsub_macro 2, 2, 0, D2
+    nmsub_macro 2, 2, 1, D5
+    GMUL xvf, d, U2, D7, U2
+    nmsub_macro 3, 3, 0, D3
+    nmsub_macro 3, 3, 1, D6
+    nmsub_macro 3, 3, 2, D8
+    GMUL xvf, d, U3, D9, U3
+// Store A
+    A_st_macro 0, 3, 0, 1
+// Store C
+    GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
+.endm
+
+.macro dsolve_1x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 18, 18, 3
+    nmsub_macro 1, 1, 0, D1
+    GMUL xvf, d, U1, D2, U1
+// Store A
+    A_st_macro 0, 1, 0, 1
+// Store C
+    GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
+.endm
+
+.macro dgemm_dsolve_16x4
+    bge   ZERO, L,	.L_dsolve_16x4_load
+    dgemm_16x4
+    b	.L_dsolve_16x4
+.L_dsolve_16x4_load:
+    // Load C
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+/********************** solver ******************/
+.L_dsolve_16x4:
+    dsolve_16x4
+.endm
+
+.macro dgemm_dsolve_8x4
+    bge   ZERO, L,	.L_dsolve_8x4_load
+    dgemm_8x4
+    b .L_dsolve_8x4
+.L_dsolve_8x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+/********* solver *********/
+.L_dsolve_8x4:
+    dsolve_8x4
+.endm
+
+.macro dgemm_dsolve_4x4
+    bge   ZERO, L,    .L_dsolve_4x4_load
+    dgemm_4x4
+    b .L_dsolve_4x4
+.L_dsolve_4x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/************** solver *****************/
+.L_dsolve_4x4:
+    dsolve_4x4
+.endm
+
+.macro dgemm_dsolve_2x4
+    bge   ZERO, L,    .L_dsolve_2x4_load
+    dgemm_2x4
+    xvpermi.q   U2,     U0,     0x01
+    xvpermi.q   U3,     U1,     0x01
+    b   .L_dsolve_2x4
+.L_dsolve_2x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/********************** solver ******************/
+.L_dsolve_2x4:
+    dsolve_2x4
+.endm
+
+.macro dgemm_dsolve_1x4
+    bge   ZERO, L,    .L_dsolve_1x4_load
+    dgemm_1x4
+    xvpackod.d  U1,     U0,     U0
+    xvpermi.q   U2,     U0,     0x01
+    xvpermi.q   U3,     U1,     0x01
+    b   .L_dsolve_1x4
+.L_dsolve_1x4_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+.L_dsolve_1x4:
+    dsolve_1x4
+.endm
+
+.macro dgemm_dsolve_16x2
+    bge   ZERO, L,	.L_dsolve_16x2_load
+    dgemm_16x2
+    b .L_dsolve_16x2
+.L_dsolve_16x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1  */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+.L_dsolve_16x2:
+    dsolve_16x2
+.endm
+
+.macro dgemm_dsolve_8x2
+    bge   ZERO, L,	.L_dsolve_8x2_load
+    dgemm_8x2
+    b .L_dsolve_8x2
+.L_dsolve_8x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+.L_dsolve_8x2:
+    dsolve_8x2
+.endm
+
+.macro dgemm_dsolve_4x2
+    bge   ZERO, L,	.L_dsolve_4x2_load
+    dgemm_4x2
+    b .L_dsolve_4x2
+.L_dsolve_4x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_4x2:
+    dsolve_4x2
+.endm
+
+.macro dgemm_dsolve_2x2
+    bge   ZERO, L,	.L_dsolve_2x2_load
+    dgemm_2x2
+    b .L_dsolve_2x2
+.L_dsolve_2x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_2x2:
+    dsolve_2x2
+.endm
+
+.macro dgemm_dsolve_1x2
+    bge   ZERO, L,    .L_dsolve_1x2_load
+    dgemm_1x2
+    xvpackod.d  U1,     U0,     U0
+    b   .L_dsolve_1x2
+.L_dsolve_1x2_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+.L_dsolve_1x2:
+    dsolve_1x2
+.endm
+
+.macro dgemm_dsolve_16x1
+    bge   ZERO, L,	.L_dsolve_16x1_load
+    dgemm_16x1
+    b .L_dsolve_16x1
+.L_dsolve_16x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+.L_dsolve_16x1:
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+    // Store A
+    A_st_macro 0, 3, 0, 4
+    // Strore C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
+.endm
+
+.macro dgemm_dsolve_8x1
+    bge   ZERO, L,	.L_dsolve_8x1_load
+    dgemm_8x1
+    b .L_dsolve_8x1
+.L_dsolve_8x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+.L_dsolve_8x1:
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+    // Store A
+    A_st_macro 0, 1, 0, 4
+    // Strore C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20
+.endm
+
+.macro dgemm_dsolve_4x1
+    bge   ZERO, L,	.L_dsolve_4x1_load
+    dgemm_4x1
+    b .L_dsolve_4x1
+.L_dsolve_4x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_4x1:
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 4
+    // Strore C
+    GST xv, , U0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_2x1
+    bge   ZERO, L,	.L_dsolve_2x1_load
+    dgemm_2x1
+    b .L_dsolve_2x1
+.L_dsolve_2x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_2x1:
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 2
+    // Strore C
+    GST v, , $vr0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_1x1
+    bge   ZERO, L,    .L_dsolve_1x1_load
+    dgemm_1x1
+    b .L_dsolve_1x1
+.L_dsolve_1x1_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+.L_dsolve_1x1:
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 1
+    // Strore C
+    GST f, d, $f0, C0, 0x00
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    PTR_SLLI   LDC,   LDC,   3
+    PTR_SUB    KK,    ZERO,  OFFSET
+    /* if (!(N >> 2)) goto L_N3 */
+    PTR_SRAI   J,     N,     2     /* J = bn >> 2 */
+    andi       N,     N,     0x03
+    beq        ZERO,  J,     .L_N3
+.align 5
+.L_J1:
+    PTR_ADDI    J,      J,     -1
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_M15
+.align 4
+.L_I1:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x4
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_I1
+.L_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_M7
+.L_M8:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x4
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_M3
+.L_M4:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x4
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_M1
+.L_M2:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x4
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_M0
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x4
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_M0:
+    PTR_SLLI    T0,     K,      5
+    PTR_SLLI    T1,     LDC,    2
+    PTR_ADD     B,      B,      T0 // b += 4 * k
+    PTR_ADD     C,      C,      T1 // c += 4 * ldc
+    PTR_ADDI    KK,     KK,     4 // kk += 4
+    bnez        J,      .L_J1
+.L_N3:
+    andi    J,      N,      2
+    beq     ZERO,   J,      .L_N1
+.L_N2:
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N2_M15
+.align 4
+.L_N2_I1:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x2
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N2_I1
+.L_N2_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N2_M7
+.L_N2_M8:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x2
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N2_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N2_M3
+.L_N2_M4:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x2
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N2_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N2_M1
+.L_N2_M2:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x2
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N2_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N2_M0
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x2
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N2_M0:
+    PTR_SLLI    T0,     K,      4
+    PTR_SLLI    T1,     LDC,    1
+    PTR_ADD     B,      B,      T0 // b += 2 * k
+    PTR_ADD     C,      C,      T1 // c += 2 * ldc
+    PTR_ADDI    KK,     KK,     2 // kk += 2
+.L_N1:
+    andi    J,      N,      1
+    beq     ZERO,   J,      .L_N0
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N1_M15
+.align 4
+.L_N1_I1:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x1
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N1_I1
+.L_N1_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N1_M7
+.L_N1_M8:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x1
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N1_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N1_M3
+.L_N1_M4:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x1
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N1_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N1_M1
+.L_N1_M2:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x1
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N1_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N1_M0
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x1
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N1_M0:
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S
new file mode 100644
index 000000000..5f86d75b5
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S
@@ -0,0 +1,953 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/09/26 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ *          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+ */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+#define OFFSET $r11  // param 9: offset
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25
+#define T1     $r26
+#define T2     $r27
+#define KK     $r28
+#define AA     $r29
+#define CC     $r30
+#define BB     $r31
+#undef  ZERO
+#define ZERO   $r0
+
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+#include "dtrsm_kernel_macro.S"
+
+.macro ldrepl_macro start, end, stride
+// Load Ux (x = 0...15)
+.if \start <= \end
+    GLDREPL xv, d, $xr\start, B0, \stride * 8
+    ldrepl_macro %start + 1, \end, %stride + 1
+.endif
+.endm
+
+.macro nmsub_macro start0, end0, start1, reg
+// Ux -= reg * Dx
+.if \start0 <= \end0
+    xvfnmsub.d  $xr\start0, \reg, $xr\start1, $xr\start0
+    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.endif
+.endm
+
+.macro A_st_macro start, end, stride, N
+// Store Ux(x = 0...15)
+.if \start <= \end
+.if \N == 4
+    xvst    $xr\start, A0, \stride * 0x20
+.elseif \N == 2
+    vst     $vr\start, A0, \stride * 0x10
+.elseif \N == 1
+    fst.d   $f\start, A0, \stride * 0x08
+.endif
+    A_st_macro %start + 1, \end, %stride + 1, \N
+.endif
+.endm
+
+.macro dsolve_16x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//2	 3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 16, 0
+    ldrepl_macro 17, 18, 2
+    GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
+    nmsub_macro 0, 3, 4, D1
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+// Store A
+    A_st_macro 0, 7, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60, \
+              U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+.endm
+
+.macro dsolve_8x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//2	 3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 16, 0
+    ldrepl_macro 17, 18, 2
+    GMUL xvf, d, U2, D2, U2, U3, D2, U3
+    nmsub_macro 0, 1, 2, D1
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+// Store A
+    A_st_macro 0, 3, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, \
+              U2,  C1, 0x00, U3,  C1, 0x20
+.endm
+
+.macro dsolve_4x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//2	 3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 16, 0
+    ldrepl_macro 17, 18, 2
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 1, D1
+    GMUL xvf, d, U0, D0, U0
+// Store A
+    A_st_macro 0, 1, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C1, 0x00
+.endm
+
+.macro dsolve_2x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//2	 3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 16, 0
+    ldrepl_macro 17, 18, 2
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 1, D1
+    GMUL xvf, d, U0, D0, U0
+// Store A
+    A_st_macro 0, 1, 0, 2
+// Store C
+    GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
+.endm
+
+.macro dsolve_1x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//2	 3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 16, 0
+    ldrepl_macro 17, 18, 2
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 1, D1
+    GMUL xvf, d, U0, D0, U0
+// Store A
+    A_st_macro 0, 1, 0, 1
+// Store C
+    GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
+.endm
+
+.macro dsolve_16x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Sequentially extract data from B in row order
+    ldrepl_macro 22, 25, 12
+    GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
+    ldrepl_macro 19, 21, 8
+    nmsub_macro 8, 11, 12, D8
+    ldrepl_macro 17, 18, 4
+    GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11
+    ldrepl_macro 16, 16, 0
+    nmsub_macro 4, 7, 12, D7
+    nmsub_macro 4, 7, 8, D4
+    GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
+    nmsub_macro 0, 3, 12, D6
+    nmsub_macro 0, 3, 8, D3
+    nmsub_macro 0, 3, 4, D1
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+// Store A
+    A_st_macro 0, 15, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60, \
+              U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60, \
+              U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \
+              U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+.endm
+
+.macro dsolve_8x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Sequentially extract data from B in row order
+    ldrepl_macro 22, 25, 12
+    GMUL xvf, d, U6, D9, U6, U7, D9, U7
+    ldrepl_macro 19, 21, 8
+    nmsub_macro 4, 5, 6, D8
+    ldrepl_macro 17, 18, 4
+    GMUL xvf, d, U4, D5, U4, U5, D5, U5
+    ldrepl_macro 16, 16, 0
+    nmsub_macro 2, 3, 6, D7
+    nmsub_macro 2, 3, 4, D4
+    GMUL xvf, d, U2, D2, U2, U3, D2, U3
+    nmsub_macro 0, 1, 6, D6
+    nmsub_macro 0, 1, 4, D3
+    nmsub_macro 0, 1, 2, D1
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+// Store A
+    A_st_macro 0, 7, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
+              U2, C1, 0x00, U3, C1, 0x20, \
+              U4, C2, 0x00, U5, C2, 0x20, \
+              U6, C3, 0x00, U7, C3, 0x20
+.endm
+
+.macro dsolve_4x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Sequentially extract data from B in row order
+    ldrepl_macro 22, 25, 12
+    GMUL xvf, d, U3, D9, U3
+    ldrepl_macro 19, 21, 8
+    nmsub_macro 2, 2, 3, D8
+    ldrepl_macro 17, 18, 4
+    GMUL xvf, d, U2, D5, U2
+    ldrepl_macro 16, 16, 0
+    nmsub_macro 1, 1, 3, D7
+    nmsub_macro 1, 1, 2, D4
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 3, D6
+    nmsub_macro 0, 0, 2, D3
+    nmsub_macro 0, 0, 1, D1
+    GMUL xvf, d, U0, D0, U0
+// Store A
+    A_st_macro 0, 3, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
+.endm
+
+.macro dsolve_2x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Sequentially extract data from B in row order
+    ldrepl_macro 22, 25, 12
+    GMUL xvf, d, U3, D9, U3
+    ldrepl_macro 19, 21, 8
+    nmsub_macro 2, 2, 3, D8
+    ldrepl_macro 17, 18, 4
+    GMUL xvf, d, U2, D5, U2
+    ldrepl_macro 16, 16, 0
+    nmsub_macro 1, 1, 3, D7
+    nmsub_macro 1, 1, 2, D4
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 3, D6
+    nmsub_macro 0, 0, 2, D3
+    nmsub_macro 0, 0, 1, D1
+    GMUL xvf, d, U0, D0, U0
+// Store A
+    A_st_macro 0, 3, 0, 2
+// Store C
+    GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
+.endm
+
+.macro dsolve_1x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Sequentially extract data from B in row order
+    ldrepl_macro 22, 25, 12
+    GMUL xvf, d, U3, D9, U3
+    ldrepl_macro 19, 21, 8
+    nmsub_macro 2, 2, 3, D8
+    ldrepl_macro 17, 18, 4
+    GMUL xvf, d, U2, D5, U2
+    ldrepl_macro 16, 16, 0
+    nmsub_macro 1, 1, 3, D7
+    nmsub_macro 1, 1, 2, D4
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 3, D6
+    nmsub_macro 0, 0, 2, D3
+    nmsub_macro 0, 0, 1, D1
+    GMUL xvf, d, U0, D0, U0
+// Store A
+    A_st_macro 0, 3, 0, 1
+// Store C
+    GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
+.endm
+
+.macro dgemm_dsolve_16x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_16x1_load
+    dgemm_16x1
+    b .L_dsolve_16x1
+.L_dsolve_16x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+.L_dsolve_16x1:
+    PTR_ADDI    A0,     T1,     -16 * 8
+    PTR_ADDI    B0,     T2,     -1 * 8
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+    // Store A
+    A_st_macro 0, 3, 0, 4
+    // Strore C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
+.endm
+
+.macro dgemm_dsolve_8x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_8x1_load
+    dgemm_8x1
+    b .L_dsolve_8x1
+.L_dsolve_8x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+.L_dsolve_8x1:
+    PTR_ADDI    A0,     T1,     -8 * 8
+    PTR_ADDI    B0,     T2,     -1 * 8
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+    // Store A
+    A_st_macro 0, 1, 0, 4
+    // Strore C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20
+.endm
+
+.macro dgemm_dsolve_4x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_4x1_load
+    dgemm_4x1
+    b .L_dsolve_4x1
+.L_dsolve_4x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_4x1:
+    PTR_ADDI    A0,     T1,     -4 * 8
+    PTR_ADDI    B0,     T2,     -1 * 8
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 4
+    // Strore C
+    GST xv, , U0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_2x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_2x1_load
+    dgemm_2x1
+    b .L_dsolve_2x1
+.L_dsolve_2x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_2x1:
+    PTR_ADDI    A0,     T1,     -2 * 8
+    PTR_ADDI    B0,     T2,     -1 * 8
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 2
+    // Strore C
+    GST v, , $vr0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_1x1
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,    .L_dsolve_1x1_load
+    dgemm_1x1
+    b .L_dsolve_1x1
+.L_dsolve_1x1_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+.L_dsolve_1x1:
+    PTR_ADDI    A0,     T1,     -1 * 8
+    PTR_ADDI    B0,     T2,     -1 * 8
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 1
+    // Strore C
+    GST f, d, $f0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_16x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_16x2_load
+    dgemm_16x2
+    b .L_dsolve_16x2
+.L_dsolve_16x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1  */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+.L_dsolve_16x2:
+    PTR_ADDI    A0,     T1,     -(16 * 2) * 8
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8
+    dsolve_16x2
+.endm
+
+.macro dgemm_dsolve_8x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_8x2_load
+    dgemm_8x2
+    b .L_dsolve_8x2
+.L_dsolve_8x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+.L_dsolve_8x2:
+    PTR_ADDI    A0,     T1,     -(8 * 2) * 8
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8
+    dsolve_8x2
+.endm
+
+.macro dgemm_dsolve_4x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_4x2_load
+    dgemm_4x2
+    b .L_dsolve_4x2
+.L_dsolve_4x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_4x2:
+    PTR_ADDI    A0,     T1,     -(4 * 2) * 8
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8
+    dsolve_4x2
+.endm
+
+.macro dgemm_dsolve_2x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_2x2_load
+    dgemm_2x2
+    b .L_dsolve_2x2
+.L_dsolve_2x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_2x2:
+    PTR_ADDI    A0,     T1,     -(2 * 2) * 8
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8
+    dsolve_2x2
+.endm
+
+.macro dgemm_dsolve_1x2
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,    .L_dsolve_1x2_load
+    dgemm_1x2
+    xvpackod.d  U1,     U0,     U0
+    b   .L_dsolve_1x2
+.L_dsolve_1x2_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+.L_dsolve_1x2:
+    PTR_ADDI    A0,     T1,     -(1 * 2) * 8
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8
+    dsolve_1x2
+.endm
+
+.macro dgemm_dsolve_16x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_16x4_load
+    dgemm_16x4
+    b	.L_dsolve_16x4
+.L_dsolve_16x4_load:
+    // Load C
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+/********************** solver ******************/
+.L_dsolve_16x4:
+    PTR_ADDI    A0,     T1,     -(16 * 4) * 8
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8
+    dsolve_16x4
+.endm
+
+.macro dgemm_dsolve_8x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_8x4_load
+    dgemm_8x4
+    b .L_dsolve_8x4
+.L_dsolve_8x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+/********* solver *********/
+.L_dsolve_8x4:
+    PTR_ADDI    A0,     T1,     -(8 * 4) * 8
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8
+    dsolve_8x4
+.endm
+
+.macro dgemm_dsolve_4x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,    .L_dsolve_4x4_load
+    dgemm_4x4
+    b .L_dsolve_4x4
+.L_dsolve_4x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/************** solver *****************/
+.L_dsolve_4x4:
+    PTR_ADDI    A0,     T1,     -(4 * 4) * 8
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8
+    dsolve_4x4
+.endm
+
+.macro dgemm_dsolve_2x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,    .L_dsolve_2x4_load
+    dgemm_2x4
+    xvpermi.q   U2,     U0,     0x01
+    xvpermi.q   U3,     U1,     0x01
+    b   .L_dsolve_2x4
+.L_dsolve_2x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/********************** solver ******************/
+.L_dsolve_2x4:
+    PTR_ADDI    A0,     T1,     -(2 * 4) * 8
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8
+    dsolve_2x4
+.endm
+
+.macro dgemm_dsolve_1x4
+    or    T1,   A0,     A0
+    or    T2,   B0,     B0
+    bge   ZERO, L,    .L_dsolve_1x4_load
+    dgemm_1x4
+    xvpackod.d  U1,     U0,     U0
+    xvpermi.q   U2,     U0,     0x01
+    xvpermi.q   U3,     U1,     0x01
+    b   .L_dsolve_1x4
+.L_dsolve_1x4_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+.L_dsolve_1x4:
+    PTR_ADDI    A0,     T1,     -(1 * 4) * 8
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8
+    dsolve_1x4
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    PTR_SLLI    LDC,    LDC,    3
+    PTR_SUB     KK,     N,      OFFSET
+    PTR_MUL     T0,     N,      LDC
+    PTR_MUL     T1,     N,      K
+    PTR_ADD     C,      C,      T0 // c += n * ldc
+    PTR_SLLI    T1,     T1,     3
+    PTR_ADD     B,      B,      T1
+
+    andi        J,      N,      1
+    beqz        J,      .L_N2
+.L_N1:
+    move        AA,     A
+    PTR_SUB     C,      C,      LDC // c -= ldc
+    PTR_SLLI    T0,     K,      3
+    PTR_SLLI    T1,     KK,     3
+    PTR_SUB     B,      B,      T0 // b -= k
+    PTR_ADD     BB,     B,      T1 // bb = b + kk
+    move        CC,     C
+
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N1_M15
+.align 4
+.L_N1_I1:
+    PTR_SLLI    T1,     KK,     7
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 16 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_16x1
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N1_I1
+.L_N1_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N1_M7
+.L_N1_M8:
+    PTR_SLLI    T1,     KK,     6
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 8 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_8x1
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N1_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N1_M3
+.L_N1_M4:
+    PTR_SLLI    T1,     KK,     5
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 4 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_4x1
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N1_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N1_M1
+.L_N1_M2:
+    PTR_SLLI    T1,     KK,     4
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 2 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_2x1
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N1_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N1_M0
+    PTR_SLLI    T1,     KK,     3
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_1x1
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N1_M0:
+    PTR_ADDI    KK,     KK,     -1
+.L_N2:
+    andi    J,      N,      2
+    beq     ZERO,   J,      .L_N4
+    move    AA,     A
+    PTR_SLLI    T0, LDC,    1
+    PTR_SLLI    T1, K,      4
+    PTR_SLLI    T2, KK,     4
+    PTR_SUB     B,  B,      T1
+    PTR_SUB     C,  C,      T0
+    PTR_ADD     BB, B,      T2
+    move    CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N2_M15
+.align 4
+.L_N2_I1:
+    PTR_SLLI    T1,     KK,     7
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 16 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_16x2
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N2_I1
+.L_N2_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N2_M7
+.L_N2_M8:
+    PTR_SLLI    T1,     KK,     6
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 8 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_8x2
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N2_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N2_M3
+.L_N2_M4:
+    PTR_SLLI    T1,     KK,     5
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 4 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_4x2
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N2_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N2_M1
+.L_N2_M2:
+    PTR_SLLI    T1,     KK,     4
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 2 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_2x2
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N2_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N2_M0
+    PTR_SLLI    T1,     KK,     3
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_1x2
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N2_M0:
+    PTR_ADDI    KK,     KK,     -2
+.L_N4:
+    PTR_SRAI   J,     N,     2     /* J = bn >> 2 */
+    beq        ZERO,  J,     .L_N0
+.align 5
+.L_J1:
+    PTR_ADDI    J,      J,     -1
+    move    AA,     A
+    PTR_SLLI    T0, LDC,    2
+    PTR_SLLI    T1, K,      5
+    PTR_SLLI    T2, KK,     5
+    PTR_SUB     B,  B,      T1
+    PTR_SUB     C,  C,      T0
+    PTR_ADD     BB, B,      T2
+    move    CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_M15
+.align 4
+.L_I1:
+    PTR_SLLI    T1,     KK,     7
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 16 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_16x4
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_I1
+.L_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_M7
+.L_M8:
+    PTR_SLLI    T1,     KK,     6
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 8 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_8x4
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_M3
+.L_M4:
+    PTR_SLLI    T1,     KK,     5
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 4 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_4x4
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_M1
+.L_M2:
+    PTR_SLLI    T1,     KK,     4
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 2 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_2x4
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_M0
+    PTR_SLLI    T1,     KK,     3
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_1x4
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_M0:
+    PTR_ADDI    KK,     KK,     -4
+    bnez        J,      .L_J1
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_macro.S b/kernel/loongarch64/dtrsm_kernel_macro.S
new file mode 100644
index 000000000..88b7121d1
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_macro.S
@@ -0,0 +1,2147 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/************** Dgemm Kernel 16x4 ****************/
+.macro KERNEL2x16x4
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    preld      0,   B0, B_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D10, U10, U14, D10
+    xvfmadd.d  D11, U11, U14, D11
+
+    preld      0,   A0, A_PRE
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D14, U10, U15, D14
+    xvfmadd.d  D15, U11, U15, D15
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x20
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvld     U10,   A0,    0x40
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvld     U11,   A0,    0x60
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvldrepl.d U12,  B0, 0x00
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    preld      0,   B0, B_PRE
+    xvldrepl.d U13,  B0, 0x08
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
+
+    preld      0,   A0, A_PRE
+    xvldrepl.d U14,  B0, 0x10
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+
+    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x16x4_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    preld      0,   B0, B_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D10, U10, U14, D10
+    xvfmadd.d  D11, U11, U14, D11
+
+    preld      0,   A0, A_PRE
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D14, U10, U15, D14
+    xvfmadd.d  D15, U11, U15, D15
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x20
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    preld      0,   B0, B_PRE
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
+
+    preld      0,   A0, A_PRE
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+
+    preld      0,   A0, A_PRE + 0x40
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
+.endm
+
+.macro KERNEL8x16x4
+.rept 4
+    KERNEL2x16x4
+.endr
+.endm
+
+.macro KERNEL8x16x4_END
+.rept 3
+    KERNEL2x16x4
+.endr
+    KERNEL2x16x4_END
+.endm
+
+.macro KERNEL2x8x4
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+
+    xvldrepl.d U12, B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvldrepl.d U13, B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvldrepl.d U14, B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x8x4_END
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+.endm
+
+.macro KERNEL8x8x4
+.rept 4
+    KERNEL2x8x4
+.endr
+.endm
+
+.macro KERNEL8x8x4_END
+.rept 3
+    KERNEL2x8x4
+.endr
+    KERNEL2x8x4_END
+.endm
+
+.macro KERNEL2x4x4
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+
+    xvld     U8,   A0,    0x00
+
+    xvldrepl.d U12, B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U13, B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U14, B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x4x4_END
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D12, U0, U7, D12
+.endm
+
+.macro KERNEL8x4x4
+.rept 4
+    KERNEL2x4x4
+.endr
+.endm
+
+.macro KERNEL8x4x4_END
+.rept 3
+    KERNEL2x4x4
+.endr
+    KERNEL2x4x4_END
+.endm
+
+.macro KERNEL2x2x4
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvfmadd.d   D1,     U9,     U12,    D1
+
+    xvld    U4,  B0, 0x00
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
+
+    xvldrepl.d     U8,   A0,    0x00
+    xvldrepl.d     U9,   A0,    0x08
+
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
+
+    xvld       U12, B0, 0x00
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x2x4_END
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvfmadd.d   D1,     U9,     U12,    D1
+
+    xvld    U4,  B0, 0x00
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
+
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
+.endm
+
+.macro KERNEL8x2x4
+.rept 4
+    KERNEL2x2x4
+.endr
+.endm
+
+.macro KERNEL8x2x4_END
+.rept 3
+    KERNEL2x2x4
+.endr
+    KERNEL2x2x4_END
+.endm
+
+.macro KERNEL2x1x4
+    xvldrepl.d  U0,     A0,     0x00
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvld        U4,     B0,     0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+
+    xvldrepl.d  U8,     A0,     0x00
+    xvfmadd.d   D0,     U0,     U4,     D0
+    xvld        U12,    B0,     0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x1x4_END
+    xvldrepl.d  U0,     A0,     0x00
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvld        U4,     B0,     0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+
+    xvfmadd.d   D0,     U0,     U4,     D0
+.endm
+
+.macro KERNEL8x1x4
+.rept 4
+    KERNEL2x1x4
+.endr
+.endm
+
+.macro KERNEL8x1x4_END
+.rept 3
+    KERNEL2x1x4
+.endr
+    KERNEL2x1x4_END
+.endm
+
+.macro KERNEL2x16x2
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvld     U10,   A0,    0x40
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvld     U11,   A0,    0x60
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x16x2_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+.endm
+
+.macro KERNEL8x16x2
+.rept 4
+    KERNEL2x16x2
+.endr
+.endm
+
+.macro KERNEL8x16x2_END
+.rept 3
+    KERNEL2x16x2
+.endr
+    KERNEL2x16x2_END
+.endm
+
+.macro KERNEL2x8x2
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x8x2_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+.endm
+
+.macro KERNEL8x8x2
+.rept 4
+    KERNEL2x8x2
+.endr
+.endm
+
+.macro KERNEL8x8x2_END
+.rept 3
+    KERNEL2x8x2
+ .endr
+    KERNEL2x8x2_END
+.endm
+
+.macro KERNEL2x4x2
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x4x2_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
+
+.macro KERNEL8x4x2
+.rept 4
+    KERNEL2x4x2
+.endr
+.endm
+
+.macro KERNEL8x4x2_END
+.rept 3
+    KERNEL2x4x2
+.endr
+    KERNEL2x4x2_END
+.endm
+
+.macro KERNEL2x2x2
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x2x2_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
+
+.macro KERNEL8x2x2
+.rept 4
+    KERNEL2x2x2
+.endr
+.endm
+
+.macro KERNEL8x2x2_END
+.rept 3
+    KERNEL2x2x2
+.endr
+    KERNEL2x2x2_END
+.endm
+
+.macro KERNEL2x1x2
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x1x2_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
+
+.macro KERNEL8x1x2
+.rept 4
+    KERNEL2x1x2
+.endr
+.endm
+
+.macro KERNEL8x1x2_END
+.rept 3
+    KERNEL2x1x2
+.endr
+    KERNEL2x1x2_END
+.endm
+
+.macro KERNEL2x16x1
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvld     U10,   A0,    0x40
+    xvld     U11,   A0,    0x60
+
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x16x1_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+.endm
+
+.macro KERNEL8x16x1
+.rept 4
+    KERNEL2x16x1
+.endr
+.endm
+
+.macro KERNEL8x16x1_END
+.rept 3
+    KERNEL2x16x1
+.endr
+    KERNEL2x16x1_END
+.endm
+
+.macro KERNEL2x8x1
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+    xvld     U1,   A0,    0x20
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvld     U9,   A0,    0x20
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x8x1_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+    xvld     U1,   A0,    0x20
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+.endm
+
+.macro KERNEL8x8x1
+.rept 4
+    KERNEL2x8x1
+.endr
+.endm
+
+.macro KERNEL8x8x1_END
+.rept 3
+    KERNEL2x8x1
+.endr
+    KERNEL2x8x1_END
+.endm
+
+.macro KERNEL2x4x1
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x4x1_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+
+    xvfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x4x1
+.rept 4
+    KERNEL2x4x1
+.endr
+.endm
+
+.macro KERNEL8x4x1_END
+.rept 3
+    KERNEL2x4x1
+.endr
+    KERNEL2x4x1_END
+.endm
+
+.macro KERNEL2x2x1
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x2x1_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+
+    xvfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x2x1
+.rept 4
+    KERNEL2x2x1
+.endr
+.endm
+
+.macro KERNEL8x2x1_END
+.rept 3
+    KERNEL2x2x1
+.endr
+    KERNEL2x2x1_END
+.endm
+
+.macro KERNEL2x1x1
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x1x1_END
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    xvfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x1x1
+.rept 4
+    KERNEL2x1x1
+.endr
+.endm
+
+.macro KERNEL8x1x1_END
+.rept 3
+    KERNEL2x1x1
+.endr
+    KERNEL2x1x1_END
+.endm
+
+.macro dgemm_16x4
+.L_dgemm_16x4: // See dgemm_kernel_16x4.S
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+    xvfmul.d  D6,  U2, U5
+    xvfmul.d  D7,  U3, U5
+
+    xvldrepl.d     U6, B0, 0x10
+    /* line 3 */
+    xvfmul.d  D8,  U0, U6
+    xvfmul.d  D9,  U1, U6
+    xvfmul.d  D10, U2, U6
+    xvfmul.d  D11, U3, U6
+
+    xvldrepl.d     U7, B0, 0x18
+    /* line 4 */
+    xvfmul.d  D12, U0, U7
+    xvfmul.d  D13, U1, U7
+    xvfmul.d  D14, U2, U7
+    xvfmul.d  D15, U3, U7
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x80
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_L7 */
+    beq       ZERO,TL, .L_dgemm_16x4_L7
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_dgemm_16x4_TL1_END
+.align 5
+.L_dgemm_16x4_TL1:
+    KERNEL8x16x4
+    PTR_ADDI  TL,    TL,     -1
+    blt     ZERO,  TL,     .L_dgemm_16x4_TL1
+.L_dgemm_16x4_TL1_END:
+    KERNEL8x16x4_END
+.L_dgemm_16x4_L7:
+    andi      TL,  L,    7
+    beq       TL,  ZERO, .L_dgemm_16x4_L0
+.align 5
+.L_dgemm_16x4_L71:
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
+
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_16x4_L71
+.L_dgemm_16x4_L0:
+    // Load C
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+    GSUB xvf, d, U0,  U0,  D0,  U1,  U1,  D1,  U2,  U2,  D2,  U3,  U3,  D3,  \
+                 U4,  U4,  D4,  U5,  U5,  D5,  U6,  U6,  D6,  U7,  U7,  D7,  \
+                 U8,  U8,  D8,  U9,  U9,  D9,  U10, U10, D10, U11, U11, D11, \
+                 U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15
+.endm
+
+.macro dgemm_1x4
+.L_dgemm_1x4: // See dgemm_kernel_16x4.S
+    xvldrepl.d  U0,     A0,     0x00
+    xvld        U4,     B0,     0x00
+    xvfmul.d    D0,     U0,     U4
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x08
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M1_L7 */
+    beq       ZERO,TL, .L_dgemm_1x4_M1_L7
+    xvldrepl.d  U8,     A0,     0x00
+
+    PTR_ADDI     TL,  TL,  -1
+    xvld       U12, B0,  0x00
+    PTR_ADDI     A0,  A0,  0x08
+    PTR_ADDI     B0,  B0,  0x20
+
+    beq    ZERO,    TL,  .L_dgemm_1x4_M1_TL1_END
+.align 5
+.L_dgemm_1x4_M1_TL1:
+    KERNEL8x1x4
+    PTR_ADDI  TL,     TL,     -1
+    blt     ZERO,TL, .L_dgemm_1x4_M1_TL1
+.L_dgemm_1x4_M1_TL1_END:
+    KERNEL8x1x4_END
+.L_dgemm_1x4_M1_L7:
+    /* if (!(L & 7)) goto L_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_1x4_M1_L0
+.align 5
+.L_dgemm_1x4_M1_L71:
+    xvldrepl.d  U0,     A0,     0x00
+    xvld        U4,     B0,     0x00
+    xvfmadd.d   D0,     U0,     U4,    D0
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x08
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_1x4_M1_L71
+.L_dgemm_1x4_M1_L0:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+    xvinsve0.d  U0,     U1,     0x01
+    xvinsve0.d  U0,     U2,     0x02
+    xvinsve0.d  U0,     U3,     0x03
+    GSUB xvf, d, U0, U0, D0
+.endm
+
+.macro dgemm_2x4
+.L_dgemm_2x4:
+    /* Load 2 * 64 from A0 */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+    xvld    U4,     B0,     0x00
+    xvfmul.d    D0,     U0,     U4
+    xvfmul.d    D1,     U1,     U4
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x10
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M2_L7 */
+    beq       ZERO,TL, .L_dgemm_2x4_M2_L7
+
+    xvldrepl.d     U8,   A0,    0x00
+    xvldrepl.d     U9,   A0,    0x08
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvld       U12, B0, 0x00
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_dgemm_2x4_M2_TL1_END
+.align 5
+.L_dgemm_2x4_M2_TL1:
+    KERNEL8x2x4
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_2x4_M2_TL1
+.L_dgemm_2x4_M2_TL1_END:
+    KERNEL8x2x4_END
+
+.L_dgemm_2x4_M2_L7:
+    /* if (!(L & 7)) goto L_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_2x4_M2_L0
+.align 5
+.L_dgemm_2x4_M2_L71:
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+
+    xvld    U4,  B0, 0x00
+
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_2x4_M2_L71
+.L_dgemm_2x4_M2_L0:
+    xvpackev.d  D4,     D1,     D0
+    xvpackod.d  D5,     D1,     D0
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+
+    xvpermi.q   U0, U2, 0x02
+    xvpermi.q   U1, U3, 0x02
+
+    GSUB xvf, d, U0, U0, D4, U1, U1, D5
+.endm
+
+.macro dgemm_4x4
+.L_dgemm_4x4:
+    /* Load 4 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+
+    xvldrepl.d     U6, B0, 0x10
+    /* line 3 */
+    xvfmul.d  D8,  U0, U6
+
+    xvldrepl.d     U7, B0, 0x18
+    /* line 4 */
+    xvfmul.d  D12, U0, U7
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x20
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M4_L7 */
+    beq       ZERO,TL, .L_dgemm_4x4_M4_L7
+
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_dgemm_4x4_M4_TL1_END
+.align 5
+.L_dgemm_4x4_M4_TL1: /* TL-- */
+    KERNEL8x4x4
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_4x4_M4_TL1
+.L_dgemm_4x4_M4_TL1_END:
+    KERNEL8x4x4_END
+.L_dgemm_4x4_M4_L7:
+    /* if (!(L & 7)) goto L_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_4x4_M4_L0
+.align 5
+.L_dgemm_4x4_M4_L71:
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U4,  B0, 0x08
+    xvfmadd.d  D4,  U0, U4, D4
+
+    xvldrepl.d U4,  B0, 0x10
+    xvfmadd.d  D8,  U0, U4, D8
+
+    xvldrepl.d U4,  B0, 0x18
+    xvfmadd.d  D12, U0, U4, D12
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_4x4_M4_L71
+ .L_dgemm_4x4_M4_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+
+    GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12
+.endm
+
+.macro dgemm_8x4
+.L_dgemm_8x4:
+    /* Load 8 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+
+    xvldrepl.d     U6, B0, 0x10
+    /* line 3 */
+    xvfmul.d  D8,  U0, U6
+    xvfmul.d  D9,  U1, U6
+
+    xvldrepl.d     U7, B0, 0x18
+    /* line 4 */
+    xvfmul.d  D12, U0, U7
+    xvfmul.d  D13, U1, U7
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x40
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M8_L7 */
+    beq       ZERO,TL, .L_dgemm_8x4_M8_L7
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_dgemm_8x4_M8_TL1_END
+.align 5
+.L_dgemm_8x4_M8_TL1: /* TL-- */
+    KERNEL8x8x4
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_8x4_M8_TL1
+
+.L_dgemm_8x4_M8_TL1_END:
+    KERNEL8x8x4_END
+
+.L_dgemm_8x4_M8_L7:
+    /* if (!(L & 7)) goto L_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_8x4_M8_L0
+.align 5
+.L_dgemm_8x4_M8_L71:
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_8x4_M8_L71
+.L_dgemm_8x4_M8_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+
+    GSUB xvf, d, U0, U0, D0,  U1, U1, D1, \
+                 U2, U2, D4,  U3, U3, D5, \
+                 U4, U4, D8,  U5, U5, D9, \
+                 U6, U6, D12, U7, U7, D13
+.endm
+
+.macro dgemm_4x2
+.L_dgemm_4x2:
+    /* Load 4 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x20
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */
+    beq       ZERO,TL, .L_dgemm_4x2_N3_M4_L7
+
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_dgemm_4x2_N3_M4_TL1_END
+.align 5
+.L_dgemm_4x2_N3_M4_TL1: /* TL-- */
+    KERNEL8x4x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_4x2_N3_M4_TL1
+.L_dgemm_4x2_N3_M4_TL1_END:
+    KERNEL8x4x2_END
+
+.L_dgemm_4x2_N3_M4_L7:
+    /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_4x2_N3_M4_L0
+.align 5
+.L_dgemm_4x2_N3_M4_L71:
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_4x2_N3_M4_L71
+
+.L_dgemm_4x2_N3_M4_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    GSUB xvf, d, U0, U0, D0, U1, U1, D4
+.endm
+
+.macro dgemm_2x2
+.L_dgemm_2x2:
+    /* Load 2 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+
+    xvldrepl.d     U4, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x10
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */
+    beq       ZERO,TL, .L_dgemm_2x2_N3_M2_L7
+
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_dgemm_2x2_N3_M2_TL1_END
+.align 5
+.L_dgemm_2x2_N3_M2_TL1: /* TL-- */
+    KERNEL8x2x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_2x2_N3_M2_TL1
+.L_dgemm_2x2_N3_M2_TL1_END:
+    KERNEL8x2x2_END
+
+.L_dgemm_2x2_N3_M2_L7:
+    /* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_2x2_N3_M2_L0
+.align 5
+.L_dgemm_2x2_N3_M2_L71:
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_2x2_N3_M2_L71
+.L_dgemm_2x2_N3_M2_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    GSUB xvf, d, U0, U0, D0, U1, U1, D4
+.endm
+
+.macro dgemm_8x2
+.L_dgemm_8x2:
+    /* Load 8 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x40
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */
+    beq       ZERO,TL, .L_dgemm_8x2_N3_M8_L7
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_dgemm_8x2_N3_M8_TL1_END
+.align 5
+.L_dgemm_8x2_N3_M8_TL1: /* TL-- */
+    KERNEL8x8x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_8x2_N3_M8_TL1
+.L_dgemm_8x2_N3_M8_TL1_END:
+    KERNEL8x8x2_END
+
+.L_dgemm_8x2_N3_M8_L7:
+    /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_8x2_N3_M8_L0
+.align 5
+.L_dgemm_8x2_N3_M8_L71:
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_8x2_N3_M8_L71
+
+.L_dgemm_8x2_N3_M8_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+    GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5
+.endm
+
+.macro dgemm_16x2
+.L_dgemm_16x2:
+    /* Load 16 * 64 from A0
+     * U0 = {a3,  a2,  a1,  a0}
+     * U1 = {a7,  a6,  a5,  a4}
+     * U2 = {a11, a10, a9,  a8}
+     * U3 = {a15, a14, a13, a12}
+     */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+    xvfmul.d  D6,  U2, U5
+    xvfmul.d  D7,  U3, U5
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x80
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_L7 */
+    beq       ZERO,TL, .L_dgemm_16x2_N3_L7
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_dgemm_16x2_N3_TL1_END
+.align 5
+.L_dgemm_16x2_N3_TL1: /* TL-- */
+    KERNEL8x16x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_16x2_N3_TL1
+.L_dgemm_16x2_N3_TL1_END:
+    KERNEL8x16x2_END
+
+.L_dgemm_16x2_N3_L7:
+    /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_16x2_N3_L0
+.align 5
+.L_dgemm_16x2_N3_L71:
+    /* Load 16 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_16x2_N3_L71
+
+.L_dgemm_16x2_N3_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1  */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+    GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \
+                 U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7
+.endm
+
+.macro dgemm_2x1
+.L_dgemm_2x1:
+    /* Load 2 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x10
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */
+    beq       ZERO,TL, .L_dgemm_2x1_N1_M2_L7
+
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_dgemm_2x1_N1_M2_TL1_END
+.align 5
+.L_dgemm_2x1_N1_M2_TL1: /* TL-- */
+    KERNEL8x2x1
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_2x1_N1_M2_TL1
+.L_dgemm_2x1_N1_M2_TL1_END:
+    KERNEL8x2x1_END
+
+.L_dgemm_2x1_N1_M2_L7:
+    /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_2x1_N1_M2_L0
+.align 5
+.L_dgemm_2x1_N1_M2_L71:
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_2x1_N1_M2_L71
+.L_dgemm_2x1_N1_M2_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    GSUB xvf, d, U0, U0, D0
+.endm
+
+.macro dgemm_4x1
+.L_dgemm_4x1:
+    /* Load 4 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x20
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */
+    beq       ZERO,TL, .L_dgemm_4x1_N1_M4_L7
+
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_dgemm_4x1_N1_M4_TL1_END
+.align 5
+.L_dgemm_4x1_N1_M4_TL1: /* TL-- */
+    KERNEL8x4x1
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_4x1_N1_M4_TL1
+.L_dgemm_4x1_N1_M4_TL1_END:
+    KERNEL8x4x1_END
+
+.L_dgemm_4x1_N1_M4_L7:
+    /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_4x1_N1_M4_L0
+.align 5
+.L_dgemm_4x1_N1_M4_L71:
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_4x1_N1_M4_L71
+.L_dgemm_4x1_N1_M4_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    GSUB xvf, d, U0, U0, D0
+.endm
+
+.macro dgemm_8x1
+.L_dgemm_8x1:
+    /* Load 8 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x40
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */
+    beq       ZERO,TL, .L_dgemm_8x1_N1_M8_L7
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_dgemm_8x1_N1_M8_TL1_END
+.align 5
+.L_dgemm_8x1_N1_M8_TL1: /* TL-- */
+    KERNEL8x8x1
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_8x1_N1_M8_TL1
+
+.L_dgemm_8x1_N1_M8_TL1_END:
+    KERNEL8x8x1_END
+
+.L_dgemm_8x1_N1_M8_L7:
+    /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_8x1_N1_M8_L0
+.align 5
+.L_dgemm_8x1_N1_M8_L71:
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_8x1_N1_M8_L71
+.L_dgemm_8x1_N1_M8_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    GSUB xvf, d, U0, U0, D0, U1, U1, D1
+.endm
+
+.macro dgemm_16x1
+.L_dgemm_16x1:
+    /* Load 16 * 64 from A0
+     * U0 = {a3,  a2,  a1,  a0}
+     * U1 = {a7,  a6,  a5,  a4}
+     * U2 = {a11, a10, a9,  a8}
+     * U3 = {a15, a14, a13, a12}
+     */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x80
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_dgemm_16x1_N1_L7 */
+    beq       ZERO,TL, .L_dgemm_16x1_N1_L7
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_dgemm_16x1_N1_TL1_END
+.align 5
+.L_dgemm_16x1_N1_TL1: /* TL-- */
+    KERNEL8x16x1
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_16x1_N1_TL1
+.L_dgemm_16x1_N1_TL1_END:
+    KERNEL8x16x1_END
+
+.L_dgemm_16x1_N1_L7:
+    /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_16x1_N1_L0
+.align 5
+.L_dgemm_16x1_N1_L71:
+    /* Load 16 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_16x1_N1_L71
+.L_dgemm_16x1_N1_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3
+.endm
+
+.macro dgemm_1x2
+.L_dgemm_1x2: // See dgemm_kernel_16x4.S
+    /* Load 1 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+
+    xvldrepl.d     U4, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M1_L7 */
+    beq       ZERO,TL, .L_dgemm_1x2_N3_M1_L7
+
+    xvld     U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+    beq    ZERO,    TL,  .L_dgemm_1x2_N3_M1_TL1_END
+.L_dgemm_1x2_N3_M1_TL1: /* TL-- */
+    KERNEL8x1x2
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_1x2_N3_M1_TL1
+.L_dgemm_1x2_N3_M1_TL1_END:
+    KERNEL8x1x2_END
+.L_dgemm_1x2_N3_M1_L7:
+    /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_1x2_N3_M1_L0
+.L_dgemm_1x2_N3_M1_L71:
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_1x2_N3_M1_L71
+.L_dgemm_1x2_N3_M1_L0:
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C1,  0x00
+    xvinsve0.d  U0,     U1,     0x01
+    xvinsve0.d  D0,     D4,     0x01
+    GSUB xvf, d, U0, U0, D0
+.endm
+
+.macro dgemm_1x1
+.L_dgemm_1x1:
+    /* Load 1 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M1_L7 */
+    beq       ZERO,TL, .L_N1_M1_L7
+
+    xvld     U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_M1_TL1_END
+.L_N1_M1_TL1: /* TL-- */
+    KERNEL8x1x1
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M1_TL1
+.L_N1_M1_TL1_END:
+    KERNEL8x1x1_END
+.L_N1_M1_L7:
+    /* if (!(L & 7)) goto L_N1_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M1_L0
+
+.L_N1_M1_L71:
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M1_L71
+.L_N1_M1_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    GSUB xvf, d, U0, U0, D0
+.endm

From d15e0a055cf73502f0639cec829a5396a780c753 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Wed, 27 Sep 2023 09:40:40 +0800
Subject: [PATCH 310/718] LoongArch64: Fixed compilation issues when enable
 DYNAMIC_ARCH

---
 kernel/loongarch64/dgemv_n_8_lasx.S | 10 +++++-----
 kernel/loongarch64/dgemv_t_8_lasx.S |  6 +++---
 kernel/loongarch64/sgemv_n_8_lasx.S | 10 +++++-----
 kernel/loongarch64/sgemv_t_8_lasx.S |  6 +++---
 lapack/laswp/loongarch64/Makefile   |  5 +++++
 5 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S
index c6523f9ab..a49bf9bb1 100644
--- a/kernel/loongarch64/dgemv_n_8_lasx.S
+++ b/kernel/loongarch64/dgemv_n_8_lasx.S
@@ -341,7 +341,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fmadd.d $f10,   $f12,   $f2,    $f10
 .endm
 
-.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
+.macro DGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
     PTR_SRLI  J,      N,      3
     beqz      J,      .L_\XW\()_N_7
     PTR_SLLI  K_LDA,  LDA,    3
@@ -541,13 +541,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .hword  .L_GAP_1_0 - .L_GAP_TABLE
     .hword  .L_GAP_1_1 - .L_GAP_TABLE
 .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
-    DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
+    DGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
 .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
-    DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
+    DGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
 .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
-    DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
+    DGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
-    DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
+    DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
 .L_END:
     pop_if_used 17 + 7, 24 + 4
     jirl    $r0, $r1, 0x0
diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S
index 7f57c1d88..71f942b0f 100644
--- a/kernel/loongarch64/dgemv_t_8_lasx.S
+++ b/kernel/loongarch64/dgemv_t_8_lasx.S
@@ -220,7 +220,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1
 .endm
 
-.macro DGEMV_T XW:req X8:req, X4:req
+.macro DGEMV_T_LASX XW:req X8:req, X4:req
     PTR_SRLI  J,      N,      3
     beqz      J,      .L_\XW\()_N_7
     PTR_SLLI  K_LDA,  LDA,    3
@@ -472,9 +472,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .hword .L_GAP_0 - .L_GAP_TABLE
     .hword .L_GAP_1 - .L_GAP_TABLE
 .L_GAP_0: /* if (incx == 1) */
-    DGEMV_T GAP_0, X8, X4
+    DGEMV_T_LASX GAP_0, X8, X4
 .L_GAP_1: /* if (incx != 1) */
-    DGEMV_T GAP_1, X8_GAP, X4_GAP
+    DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
 .L_END:
     pop_if_used 17 + 8, 24 + 3
     jirl    $r0, $r1, 0x0
diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S
index da172ca50..52ffc320e 100644
--- a/kernel/loongarch64/sgemv_n_8_lasx.S
+++ b/kernel/loongarch64/sgemv_n_8_lasx.S
@@ -274,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GST f, s, Y0_F, Y, 0
 .endm
 
-.macro SGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
+.macro SGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
     PTR_SRLI  J,      N,      3
     beqz      J,      .L_\XW\()_N_7
     PTR_SLLI  K_LDA,  LDA,    3
@@ -450,13 +450,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .hword  .L_GAP_1_0 - .L_GAP_TABLE
     .hword  .L_GAP_1_1 - .L_GAP_TABLE
 .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
-    SGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
+    SGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
 .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
-    SGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
+    SGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
 .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
-    SGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
+    SGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
 .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
-    SGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
+    SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
 .L_END:
     pop_if_used 17 + 7, 19
     jirl    $r0, $r1, 0x0
diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S
index dde3f4a30..f4bfffb42 100644
--- a/kernel/loongarch64/sgemv_t_8_lasx.S
+++ b/kernel/loongarch64/sgemv_t_8_lasx.S
@@ -160,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1
 .endm
 
-.macro SGEMV_T XW:req X8:req, X4:req
+.macro SGEMV_T_LASX XW:req X8:req, X4:req
     PTR_SRLI  J,      N,      3
     beqz      J,      .L_\XW\()_N_7
     PTR_SLLI  K_LDA,  LDA,    3
@@ -396,9 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .hword .L_GAP_0 - .L_GAP_TABLE
     .hword .L_GAP_1 - .L_GAP_TABLE
 .L_GAP_0: /* if (incx == 1) */
-    SGEMV_T GAP_0, X8, X4
+    SGEMV_T_LASX GAP_0, X8, X4
 .L_GAP_1: /* if (incx != 1) */
-    SGEMV_T GAP_1, X8_GAP, X4_GAP
+    SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
 .L_END:
     pop_if_used 17 + 8, 18
     jirl    $r0, $r1, 0x0
diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile
index 71e5a87cb..1c85667ec 100644
--- a/lapack/laswp/loongarch64/Makefile
+++ b/lapack/laswp/loongarch64/Makefile
@@ -1,6 +1,11 @@
 TOPDIR	= ../../..
 include ../../../Makefile.system
 
+ifeq ($(DYNAMIC_ARCH), 1)
+LASWP	= ../generic/laswp_k_4.c
+ZLASWP	= ../generic/zlaswp_k_4.c
+endif
+
 ifndef LASWP
 LASWP	= ../generic/laswp_k.c
 endif

From 211bb35dee47ba6ade41a28171738f62e5621f4b Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Wed, 27 Sep 2023 10:10:41 +0800
Subject: [PATCH 311/718] gh-actions: Adding DYNAMIC_ARCH test for LoongArch64

---
 .github/workflows/loongarch64.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
index 5501e98e0..e0236ca86 100644
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@@ -18,6 +18,9 @@ jobs:
           - target: LOONGSON2K1000
             triple: loongarch64-unknown-linux-gnu
             opts: NO_SHARED=1 TARGET=LOONGSON2K1000
+          - target: DYNAMIC_ARCH
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
 
     steps:
       - name: Checkout repository

From a92dc25fb3a257ad418da2d1227b092db49bc2b4 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 29 Sep 2023 08:08:29 -0500
Subject: [PATCH 312/718] Fix Makefile.power for xlf

---
 Makefile.power | 4 ++++
 f_check        | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/Makefile.power b/Makefile.power
index 33702c932..46afb2d4a 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -70,8 +70,12 @@ else
 FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8  -fno-fast-math 
 endif
 else
+ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -O2 -qrecur -qnosave
+else
 FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8  -fno-fast-math 
 endif
+endif
 else
 FCOMMON_OPT += -O2 -Mrecursive
 endif
diff --git a/f_check b/f_check
index f30231bc4..31f4376d0 100755
--- a/f_check
+++ b/f_check
@@ -117,6 +117,9 @@ else
                             vendor=PGI
                             openmp='-mp'
                             ;;
+                        *xlf*)
+                            vendor=IBM
+                            ;;
                         *)
                             vendor=G77
                             openmp=''

From a69367c43b28e2e0029d42092e791415565fe804 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 29 Sep 2023 09:29:41 -0500
Subject: [PATCH 313/718] Fix Makefile.system for OpenXL.

---
 Makefile.system | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index ae6db40b0..af840f029 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1170,6 +1170,8 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
 FEXTRALIB += -lxlf90
 ifeq ($(C_COMPILER), GCC)
 FCOMMON_OPT += -qextname
+else ifeq ($(C_COMPILER), CLANG)
+FCOMMON_OPT += -qextname
 endif
 # FCOMMON_OPT	+= -qarch=440
 ifdef BINARY64

From a11e1e10f436f4601275669bdf7b951e6e8df0e8 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Fri, 29 Sep 2023 10:32:34 -0500
Subject: [PATCH 314/718] powerpc: Fix build errors with xlf

This patch fixes errors when using xlf as fortran compiler on Linux.
Tested with gcc/xlf and clang/xlf compiler combinations.
---
 Makefile.power  | 4 ++++
 Makefile.system | 2 +-
 f_check         | 3 +++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/Makefile.power b/Makefile.power
index 33702c932..46afb2d4a 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -70,8 +70,12 @@ else
 FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8  -fno-fast-math 
 endif
 else
+ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -O2 -qrecur -qnosave
+else
 FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8  -fno-fast-math 
 endif
+endif
 else
 FCOMMON_OPT += -O2 -Mrecursive
 endif
diff --git a/Makefile.system b/Makefile.system
index ae6db40b0..b1a357fdf 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1168,7 +1168,7 @@ endif
 ifeq ($(F_COMPILER), IBM)
 CCOMMON_OPT += -DF_INTERFACE_IBM
 FEXTRALIB += -lxlf90
-ifeq ($(C_COMPILER), GCC)
+ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
 FCOMMON_OPT += -qextname
 endif
 # FCOMMON_OPT	+= -qarch=440
diff --git a/f_check b/f_check
index f30231bc4..31f4376d0 100755
--- a/f_check
+++ b/f_check
@@ -117,6 +117,9 @@ else
                             vendor=PGI
                             openmp='-mp'
                             ;;
+                        *xlf*)
+                            vendor=IBM
+                            ;;
                         *)
                             vendor=G77
                             openmp=''

From ccbb91e4a739ca05b3cefb130863f89dcc1e92f3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 12:46:34 +0200
Subject: [PATCH 315/718] fix improper function prototypes (empty parentheses)

---
 common_thread.h | 20 ++++++++++----------
 cpuid_x86.c     | 10 +++++-----
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/common_thread.h b/common_thread.h
index 06a7a1a38..70c724597 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -141,14 +141,14 @@ static __inline int num_cpu_avail(int level) {
 
 #ifdef USE_OPENMP
 int openmp_nthreads;
-	openmp_nthreads=omp_get_max_threads();
+	openmp_nthreads=omp_get_max_threads(void);
 #endif
 
 #ifndef USE_OPENMP 
   if (blas_cpu_number == 1
 #endif
 #ifdef USE_OPENMP
-     if (openmp_nthreads == 1 || omp_in_parallel()
+     if (openmp_nthreads == 1 || omp_in_parallel(void)
 #endif
       ) return 1;        
 
@@ -192,27 +192,27 @@ int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer);
 int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
 		       void *a, BLASLONG lda,
 		       void *b, BLASLONG ldb,
-		       void *c, BLASLONG ldc, int (*function)(), int threads);
+		       void *c, BLASLONG ldc, int (*function)(void), int threads);
 
-int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
+int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *, void *, BLASLONG);
 
-int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
+int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG), void *, void *, BLASLONG);
 
-int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
+int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG);
 
-int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG);
+int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG, BLASLONG);
 
 int trsm_thread(int mode, BLASLONG m, BLASLONG n,
 		double alpha_r, double alpha_i,
 		void *a, BLASLONG lda,
-		void *c, BLASLONG ldc, int (*function)(), void *buffer);
+		void *c, BLASLONG ldc, int (*function)(void), void *buffer);
 
-int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
+int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void*, void*, BLASLONG);
 
 int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
 		 void *offsetA, BLASLONG lda,
 		 void *offsetB, BLASLONG jb,
-		 void *ipiv, BLASLONG offset, int (*function)(), void *buffer);
+		 void *ipiv, BLASLONG offset, int (*function)(void), void *buffer);
 
 #endif  /* ENDIF ASSEMBLER */
 
diff --git a/cpuid_x86.c b/cpuid_x86.c
index c485f3ddf..fdcead8bd 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -194,7 +194,7 @@ static C_INLINE void xgetbv(int op, int * eax, int * edx){
 }
 #endif
 
-int support_avx(){
+int support_avx(void){
 #ifndef NO_AVX
   int eax, ebx, ecx, edx;
   int ret=0;
@@ -212,7 +212,7 @@ int support_avx(){
 #endif
 }
 
-int support_avx2(){
+int support_avx2(void){
 #ifndef NO_AVX2
   int eax, ebx, ecx=0, edx;
   int ret=0;
@@ -228,7 +228,7 @@ int support_avx2(){
 #endif
 }
 
-int support_avx512(){
+int support_avx512(void){
 #if !defined(NO_AVX) && !defined(NO_AVX512)
   int eax, ebx, ecx, edx;
   int ret=0;
@@ -250,7 +250,7 @@ int support_avx512(){
 #endif
 }
 
-int support_avx512_bf16(){
+int support_avx512_bf16(void){
 #if !defined(NO_AVX) && !defined(NO_AVX512)
   int eax, ebx, ecx, edx;
   int ret=0;
@@ -271,7 +271,7 @@ int support_avx512_bf16(){
 #define BIT_AMX_BF16	0x00400000
 #define BIT_AMX_ENBD	0x00060000
 
-int support_amx_bf16() {
+int support_amx_bf16(void) {
 #if !defined(NO_AVX) && !defined(NO_AVX512)
   int eax, ebx, ecx, edx;
   int ret=0;

From c4bd4a2e5dbbb648ac8198f6193657ea403d088b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 12:49:24 +0200
Subject: [PATCH 316/718] fix improper function prototypes (empty parentheses)

---
 driver/level3/gemm_thread_m.c        | 2 +-
 driver/level3/gemm_thread_mn.c       | 2 +-
 driver/level3/gemm_thread_n.c        | 2 +-
 driver/level3/gemm_thread_variable.c | 2 +-
 driver/level3/syrk_thread.c          | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/driver/level3/gemm_thread_m.c b/driver/level3/gemm_thread_m.c
index 8813e5529..353ae0be9 100644
--- a/driver/level3/gemm_thread_m.c
+++ b/driver/level3/gemm_thread_m.c
@@ -40,7 +40,7 @@
 #include <stdlib.h>
 #include "common.h"
 
-int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {
+int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG nthreads) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range[MAX_CPU_NUMBER + 1];
diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c
index 6b52df884..4f370999a 100644
--- a/driver/level3/gemm_thread_mn.c
+++ b/driver/level3/gemm_thread_mn.c
@@ -60,7 +60,7 @@ static const int divide_rule[][2] =
    { 1, 61}, { 2, 31}, { 7,  9}, { 8,  8},
 };
 
-int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {
+int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG nthreads) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
 
diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c
index 9668841bb..d583456bd 100644
--- a/driver/level3/gemm_thread_n.c
+++ b/driver/level3/gemm_thread_n.c
@@ -40,7 +40,7 @@
 #include <stdlib.h>
 #include "common.h"
 
-int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {
+int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *sa, void *sb, BLASLONG nthreads) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range[MAX_CPU_NUMBER + 1];
diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c
index 162a75f70..75e49cb1a 100644
--- a/driver/level3/gemm_thread_variable.c
+++ b/driver/level3/gemm_thread_variable.c
@@ -42,7 +42,7 @@
 
 int CNAME(int mode,
 	  blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
-	  int (*function)(), void *sa, void *sb, BLASLONG divM, BLASLONG divN) {
+	  int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG divM, BLASLONG divN) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
 
diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c
index 12808afd5..a40122e38 100644
--- a/driver/level3/syrk_thread.c
+++ b/driver/level3/syrk_thread.c
@@ -41,7 +41,7 @@
 #include <math.h>
 #include "common.h"
 
-int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {
+int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void *sa, void *sb, BLASLONG nthreads) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range[MAX_CPU_NUMBER + 1];

From c6b1d8e7a31f96b6e17fdd92fb6dbbbb2ef7b562 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 12:52:06 +0200
Subject: [PATCH 317/718] fix improper function prototypes (empty parentheses)

---
 driver/others/blas_l1_thread.c        |  4 ++--
 driver/others/blas_server.c           |  2 +-
 driver/others/memory.c                | 10 +++++-----
 driver/others/openblas_env.c          | 18 +++++++++---------
 driver/others/openblas_error_handle.c |  2 +-
 driver/others/openblas_get_config.c   |  8 ++++----
 driver/others/openblas_get_parallel.c |  6 +++---
 driver/others/parameter.c             |  2 +-
 8 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c
index 06039c952..01b254f5d 100644
--- a/driver/others/blas_l1_thread.c
+++ b/driver/others/blas_l1_thread.c
@@ -43,7 +43,7 @@
 int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
 		       void *a, BLASLONG lda,
 		       void *b, BLASLONG ldb,
-		       void *c, BLASLONG ldc, int (*function)(), int nthreads){
+		       void *c, BLASLONG ldc, int (*function)(void), int nthreads){
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   blas_arg_t   args [MAX_CPU_NUMBER];
@@ -141,7 +141,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
 int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
 		       void *a, BLASLONG lda,
 		       void *b, BLASLONG ldb,
-		       void *c, BLASLONG ldc, int (*function)(), int nthreads){
+		       void *c, BLASLONG ldc, int (*function)(void), int nthreads){
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   blas_arg_t   args [MAX_CPU_NUMBER];
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index a8a84acbb..2fcb37192 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -93,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-extern unsigned int openblas_thread_timeout();
+extern unsigned int openblas_thread_timeout(void);
 
 #ifdef SMP_SERVER
 
diff --git a/driver/others/memory.c b/driver/others/memory.c
index b27fec431..a74cdabd4 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -1999,7 +1999,7 @@ int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
 
-void openblas_fork_handler()
+void openblas_fork_handler(void)
 {
   // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
   // built with "make USE_OPENMP=0".
@@ -2016,9 +2016,9 @@ void openblas_fork_handler()
 #endif
 }
 
-extern int openblas_num_threads_env();
-extern int openblas_goto_num_threads_env();
-extern int openblas_omp_num_threads_env();
+extern int openblas_num_threads_env(void);
+extern int openblas_goto_num_threads_env(void);
+extern int openblas_omp_num_threads_env(void);
 
 int blas_get_cpu_number(void){
 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
@@ -3339,7 +3339,7 @@ static void gotoblas_memory_init(void) {
 /* Initialization for all function; this function should be called before main */
 
 static int gotoblas_initialized = 0;
-extern void openblas_read_env();
+extern void openblas_read_env(void);
 
 void CONSTRUCTOR gotoblas_init(void) {
 
diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c
index 35b2270d4..c65f0f320 100644
--- a/driver/others/openblas_env.c
+++ b/driver/others/openblas_env.c
@@ -41,15 +41,15 @@ static int openblas_env_goto_num_threads=0;
 static int openblas_env_omp_num_threads=0;
 static int openblas_env_omp_adaptive=0;
 
-int openblas_verbose() { return openblas_env_verbose;}
-unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
-int openblas_block_factor() { return openblas_env_block_factor;}
-int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
-int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
-int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
-int openblas_omp_adaptive_env() { return openblas_env_omp_adaptive;}
-
-void openblas_read_env() {
+int openblas_verbose(void) { return openblas_env_verbose;}
+unsigned int openblas_thread_timeout(void) { return openblas_env_thread_timeout;}
+int openblas_block_factor(void) { return openblas_env_block_factor;}
+int openblas_num_threads_env(void) { return openblas_env_openblas_num_threads;}
+int openblas_goto_num_threads_env(void) { return openblas_env_goto_num_threads;}
+int openblas_omp_num_threads_env(void) { return openblas_env_omp_num_threads;}
+int openblas_omp_adaptive_env(void) { return openblas_env_omp_adaptive;}
+
+void openblas_read_env(void) {
   int ret=0;
   env_var_t p;
   if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c
index 9ac72c15d..aa0aa776a 100644
--- a/driver/others/openblas_error_handle.c
+++ b/driver/others/openblas_error_handle.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-extern int openblas_verbose();
+extern int openblas_verbose(void);
 
 void openblas_warning(int verbose, const char * msg) {
   int current_verbose;
diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c
index 7a5cbeb62..867d0e361 100644
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -69,13 +69,13 @@ static char* openblas_config_str=""
   ;
 
 #ifdef DYNAMIC_ARCH
-char *gotoblas_corename();
+char *gotoblas_corename(void);
 #endif
 
 static char tmp_config_str[256];
-int openblas_get_parallel();
+int openblas_get_parallel(void);
 
-char* CNAME() {
+char* CNAME(void) {
 char tmpstr[20];
   strcpy(tmp_config_str, openblas_config_str);
 #ifdef DYNAMIC_ARCH
@@ -90,7 +90,7 @@ char tmpstr[20];
 }
 
 
-char* openblas_get_corename() {
+char* openblas_get_corename(void) {
 #ifndef DYNAMIC_ARCH 
   return CHAR_CORENAME;
 #else
diff --git a/driver/others/openblas_get_parallel.c b/driver/others/openblas_get_parallel.c
index 5dfda6e59..becfa0a3a 100644
--- a/driver/others/openblas_get_parallel.c
+++ b/driver/others/openblas_get_parallel.c
@@ -42,17 +42,17 @@ static int parallel = 0;
 
 
 #ifdef NEEDBUNDERSCORE
-int CNAME() {
+int CNAME(void) {
   return parallel;
 }
 
-int NAME() {
+int NAME(void) {
   return parallel;
 }
 
 #else
 //The CNAME and NAME are the same.
-int NAME() {
+int NAME(void) {
   return parallel;
 }
 #endif
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index 0d5c6aec0..de6bf0de4 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -40,7 +40,7 @@
 #include <string.h>
 #include "common.h"
 
-extern int openblas_block_factor();
+extern int openblas_block_factor(void);
 int get_L2_size(void);
 
 #define DEFAULT_GEMM_P 128

From 13ba4edf4373c324bc46a97ec6e96764d44fb873 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 12:53:35 +0200
Subject: [PATCH 318/718] fix function prototypes (empty parentheses)

---
 interface/lapack/laswp.c  | 2 +-
 interface/lapack/zlaswp.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c
index 0dde33ae3..6544dbc5b 100644
--- a/interface/lapack/laswp.c
+++ b/interface/lapack/laswp.c
@@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
 
   blas_level1_thread(mode, n, k1, k2, dummyalpha,
 		     a, lda, NULL, 0, ipiv, incx,
-		     (int(*)())laswp[flag], nthreads);
+		     (int(*)(void))laswp[flag], nthreads);
   }
 #endif
 
diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c
index b77a40985..7bb4a659e 100644
--- a/interface/lapack/zlaswp.c
+++ b/interface/lapack/zlaswp.c
@@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
   mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
 #endif
 
-  blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads);
+  blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)(void))laswp[flag], nthreads);
   }
 #endif
 

From 675cd551da315af964b6b097e6e5ab7b35bd6e59 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 12:56:38 +0200
Subject: [PATCH 319/718] fix improper function prototypes (empty parentheses)

---
 kernel/x86_64/ddot.c | 2 +-
 kernel/x86_64/drot.c | 2 +-
 kernel/x86_64/srot.c | 2 +-
 kernel/x86_64/zdot.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c
index f3b9ee701..569ed2416 100644
--- a/kernel/x86_64/ddot.c
+++ b/kernel/x86_64/ddot.c
@@ -159,7 +159,7 @@ static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
 
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
         BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
-        void *c, BLASLONG ldc, int (*function)(), int nthreads);
+        void *c, BLASLONG ldc, int (*function)(void), int nthreads);
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c
index 40c9cf19d..6fdf4ae56 100644
--- a/kernel/x86_64/drot.c
+++ b/kernel/x86_64/drot.c
@@ -169,7 +169,7 @@ static int rot_thread_function(blas_arg_t *args)
     return 0;
 }
 
-extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(void), int nthreads);
 #endif
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c
index a49544616..05724b427 100644
--- a/kernel/x86_64/srot.c
+++ b/kernel/x86_64/srot.c
@@ -171,7 +171,7 @@ static int rot_thread_function(blas_arg_t *args)
     return 0;
 }
 
-extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(void), int nthreads);
 #endif
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index 72a712a9e..51efa2dfe 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -92,7 +92,7 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
         BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
-        void *c, BLASLONG ldc, int (*function)(), int nthreads);
+        void *c, BLASLONG ldc, int (*function)(void), int nthreads);
 #endif
                 
                 

From cd8ac192a901b38980755583faaa35559df7910a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 12:58:30 +0200
Subject: [PATCH 320/718] fix improper function prototypes (empty parentheses)

---
 lapack-netlib/LAPACKE/src/lapacke_nancheck.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_nancheck.c b/lapack-netlib/LAPACKE/src/lapacke_nancheck.c
index c7d5c33f1..bb894f351 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_nancheck.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_nancheck.c
@@ -39,7 +39,7 @@ void LAPACKE_set_nancheck( int flag )
     nancheck_flag = ( flag ) ? 1 : 0;
 }
 
-int LAPACKE_get_nancheck( )
+int LAPACKE_get_nancheck( void )
 {
     char* env;
     if ( nancheck_flag != -1 ) {

From f4f31fb53b5f4069ae19a1840035bb770e237945 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 12:59:44 +0200
Subject: [PATCH 321/718] fix improper function prototypes (empty parentheses)

---
 lapack/lauum/lauum_L_parallel.c | 4 ++--
 lapack/lauum/lauum_U_parallel.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c
index 1b32e4519..0f4eaefaa 100644
--- a/lapack/lauum/lauum_L_parallel.c
+++ b/lapack/lauum/lauum_L_parallel.c
@@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     newarg.c = a;
 
     syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO,
-		&newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads);
+		&newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LC, sa, sb, args -> nthreads);
 
     newarg.m = bk;
     newarg.n = i;
@@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     newarg.b = a + (i          ) * COMPSIZE;
 
     gemm_thread_n(mode | BLAS_TRANSA_T,
-		  &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads);
+		  &newarg, NULL, NULL, (int (*)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG))TRMM_LCLN, sa, sb, args -> nthreads);
 
     newarg.m = bk;
     newarg.n = bk;
diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c
index f5ea54c88..77bfeebc7 100644
--- a/lapack/lauum/lauum_U_parallel.c
+++ b/lapack/lauum/lauum_U_parallel.c
@@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     newarg.c = a;
 
     syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
-		&newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads);
+		&newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UN, sa, sb, args -> nthreads);
 
     newarg.m = i;
     newarg.n = bk;
@@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     newarg.b = a + (    i * lda) * COMPSIZE;
 
     gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE,
-		  &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads);
+		  &newarg, NULL, NULL, (int (*)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG))TRMM_RCUN, sa, sb, args -> nthreads);
 
     newarg.m = bk;
     newarg.n = bk;

From 1d4aa8d7d52f469724c26c1378ddf9cca778ce99 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 13:00:51 +0200
Subject: [PATCH 322/718] fix improper function prototypes (empty parentheses)

---
 lapack/potrf/potrf_L_parallel.c | 2 +-
 lapack/potrf/potrf_U_parallel.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c
index 986816d1a..7d6bcd776 100644
--- a/lapack/potrf/potrf_L_parallel.c
+++ b/lapack/potrf/potrf_L_parallel.c
@@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
       newarg.b = a + (i + bk + i * lda) * COMPSIZE;
 
       gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
-		    &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads);
+		    &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))TRSM_RCLN, sa, sb, args -> nthreads);
 
       newarg.n = n - i - bk;
       newarg.k = bk;
diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c
index cc6ff9912..1f1427276 100644
--- a/lapack/potrf/potrf_U_parallel.c
+++ b/lapack/potrf/potrf_U_parallel.c
@@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
       newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
 
       gemm_thread_n(mode | BLAS_TRANSA_T,
-		    &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads);
+		    &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))TRSM_LCUN, sa, sb, args -> nthreads);
 
       newarg.n = n - i - bk;
       newarg.k = bk;

From 60ff5872af834658dbc2cf5d72f375c37052032e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 13:01:44 +0200
Subject: [PATCH 323/718] fix improper function prototypes (empty parentheses)

---
 utest/ctest.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/utest/ctest.h b/utest/ctest.h
index 79961badf..b158b4538 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -41,7 +41,7 @@ typedef void (*RunWithDataFunc)(void*);
 struct ctest {
     const char* ssname;  // suite name
     const char* ttname;  // test name
-    void (*run)();
+    void (*run)(void);
     int skip;
 
     void* data;
@@ -159,9 +159,9 @@ struct ctest {
     void WEAK sname##_teardown(struct sname##_data* data)
 
 #define __CTEST_INTERNAL(sname, tname, _skip) \
-    void __FNAME(sname, tname)(); \
+    void __FNAME(sname, tname)(void); \
     __CTEST_STRUCT(sname, tname, _skip, NULL, NULL, NULL) \
-    void __FNAME(sname, tname)()
+    void __FNAME(sname, tname)(void)
 
 #ifdef __CTEST_APPLE
 #define SETUP_FNAME(sname) NULL
@@ -366,7 +366,7 @@ void __ctest_addTest(struct ctest *test)
 #ifndef __CTEST_MSVC
 /* Add all tests to linked list automatically.
  */
-static void __ctest_linkTests()
+static void __ctest_linkTests(void)
 {
     struct ctest ** test;
     struct ctest ** ctest_begin = (struct ctest **)__PNAME(suite, test);
@@ -401,7 +401,7 @@ static void __ctest_linkTests()
     __ctest_head_p = ctest_begin;
 }
 #else //for msvc
-static void __ctest_linkTests()
+static void __ctest_linkTests(void)
 {
     struct ctest ** ctest_start = __ctest_head_p;
     struct ctest ** test;
@@ -450,7 +450,7 @@ static void msg_start(const char* color, const char* title) {
     print_errormsg("  %s: ", title);
 }
 
-static void msg_end() {
+static void msg_end(void) {
     if (color_output) {
     	print_errormsg(ANSI_NORMAL);
     }
@@ -634,7 +634,7 @@ static int suite_test_filter(struct ctest* t) {
 
 
 #ifndef __CTEST_NO_TIME
-static uint64_t getCurrentTime() {
+static uint64_t getCurrentTime(void) {
     struct timeval now;
     gettimeofday(&now, NULL);
     uint64_t now64 = (uint64_t) now.tv_sec;

From 2dba455d2e950da2ea40cd9c6696294419b2a538 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 15:33:55 +0200
Subject: [PATCH 324/718] revert accidental changes

---
 common_thread.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common_thread.h b/common_thread.h
index 70c724597..6e18d2a8e 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -141,14 +141,14 @@ static __inline int num_cpu_avail(int level) {
 
 #ifdef USE_OPENMP
 int openmp_nthreads;
-	openmp_nthreads=omp_get_max_threads(void);
+	openmp_nthreads=omp_get_max_threads();
 #endif
 
 #ifndef USE_OPENMP 
   if (blas_cpu_number == 1
 #endif
 #ifdef USE_OPENMP
-     if (openmp_nthreads == 1 || omp_in_parallel(void)
+     if (openmp_nthreads == 1 || omp_in_parallel()
 #endif
       ) return 1;        
 

From cf2174fb6967db717ec5ec640f3a88d0ccf0372e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 17:04:39 +0200
Subject: [PATCH 325/718] fix improper function prototypes (empty parentheses)

---
 driver/others/blas_server_omp.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index fe6b4a7c0..3e2179373 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -70,7 +70,7 @@
 int blas_server_avail = 0;
 int blas_omp_number_max = 0;
 
-extern int openblas_omp_adaptive_env();
+extern int openblas_omp_adaptive_env(void);
 
 static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
 #ifdef HAVE_C11
@@ -79,7 +79,7 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 #endif
 
-static void adjust_thread_buffers() {
+static void adjust_thread_buffers(void) {
 
   int i=0, j=0;
 
@@ -124,9 +124,9 @@ void openblas_set_num_threads(int num_threads) {
 }
 
 int blas_thread_init(void){
-if(blas_omp_number_max <= 0)
-  blas_omp_number_max = omp_get_max_threads();
-
+  if(blas_omp_number_max <= 0)
+    blas_omp_number_max = omp_get_max_threads();
+	
   blas_get_cpu_number();
 
   adjust_thread_buffers();

From 90f890ee675945cdb7d7d9887e4baf50c7d5bb29 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Sep 2023 23:12:36 +0200
Subject: [PATCH 326/718] fix improper function prototypes (empty parentheses)
 (USE_TLS branch)

---
 driver/others/memory.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index a74cdabd4..caef3e2b7 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -427,9 +427,9 @@ int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
 
-static void blas_memory_init();
+static void blas_memory_init(void);
 
-void openblas_fork_handler()
+void openblas_fork_handler(void)
 {
   // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
   // built with "make USE_OPENMP=0".
@@ -446,9 +446,9 @@ void openblas_fork_handler()
 #endif
 }
 
-extern int openblas_num_threads_env();
-extern int openblas_goto_num_threads_env();
-extern int openblas_omp_num_threads_env();
+extern int openblas_num_threads_env(void);
+extern int openblas_goto_num_threads_env(void);
+extern int openblas_omp_num_threads_env(void);
 
 int blas_get_cpu_number(void){
 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
@@ -592,7 +592,7 @@ static BLASULONG  key_lock = 0UL;
 #endif
 
 /* Returns a pointer to the start of the per-thread memory allocation data */
-static __inline struct alloc_t ** get_memory_table() {
+static __inline struct alloc_t ** get_memory_table(void) {
 #if defined(SMP)
 LOCK_COMMAND(&key_lock);
 lsk=local_storage_key;
@@ -1145,7 +1145,7 @@ static void blas_memory_cleanup(void* ptr){
   }
 }
 
-static void blas_memory_init(){
+static void blas_memory_init(void){
 #if defined(SMP)
 #  if defined(OS_WINDOWS)
   local_storage_key = TlsAlloc();
@@ -1502,7 +1502,7 @@ static void gotoblas_memory_init(void) {
 /* Initialization for all function; this function should be called before main */
 
 static int gotoblas_initialized = 0;
-extern void openblas_read_env();
+extern void openblas_read_env(void);
 
 void CONSTRUCTOR gotoblas_init(void) {
 

From 3b1150fcee164922ed932c7d46b28a8ffec744a8 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Mon, 2 Oct 2023 12:00:48 -0500
Subject: [PATCH 327/718] Fix CPU identification to work on AIX.

---
 driver/others/dynamic_power.c | 216 +++++++++-------------------------
 1 file changed, 58 insertions(+), 158 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 2847ea9ae..7f8bfd5b9 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -20,12 +20,10 @@ static char *corename[] = {
 	"POWER10"
 };
 
-#define NUM_CORETYPES 4
+#define NUM_CORETYPES 5
 
 char *gotoblas_corename(void) {
-#ifndef C_PGI
 	if (gotoblas == &gotoblas_POWER6)	return corename[1];
-#endif
 	if (gotoblas == &gotoblas_POWER8)	return corename[2];
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 	if (gotoblas == &gotoblas_POWER9)	return corename[3];
@@ -36,177 +34,81 @@ char *gotoblas_corename(void) {
 	return corename[0];
 }
 
-#if defined(__clang__)
-static int __builtin_cpu_supports(char* arg) 
-{
-	return 0;
-}
-#endif
-
-#if defined(C_PGI) || defined(__clang__)
-/*
- * NV HPC compilers do not yet implement __builtin_cpu_is().
- * Fake a version here for use in the CPU detection code below.
- *
- * Strategy here is to first check the CPU to see what it actually is,
- * and then test the input to see if what the CPU actually is matches
- * what was requested.
- */
+#ifdef _AIX
+#include <sys/systemcfg.h>
 
-#include <string.h>
-
-/*
- *  Define POWER processor version table.
- *
- *  NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
- */
-
-#define CPU_UNKNOWN 0
-#define CPU_POWER5 5
-#define CPU_POWER6 6
-#define CPU_POWER8 8
-#define CPU_POWER9 9
+#define CPU_UNKNOWN  0
+#define CPU_POWER6   6
+#define CPU_POWER7   7
+#define CPU_POWER8   8
+#define CPU_POWER9   9
 #define CPU_POWER10 10
 
-static  struct {
-    uint32_t    pvr_mask;
-    uint32_t    pvr_value;
-    const char* cpu_name;
-    uint32_t    cpu_type;
-} pvrPOWER [] = {
-
-    {   /* POWER6 in P5+ mode; 2.04-compliant processor */
-        .pvr_mask       = 0xffffffff,
-        .pvr_value      = 0x0f000001,
-        .cpu_name       = "POWER5+",
-        .cpu_type       = CPU_POWER5,
-    },
-
-    {   /* Power6 aka POWER6X*/
-        .pvr_mask       = 0xffff0000,
-        .pvr_value      = 0x003e0000,
-        .cpu_name       = "POWER6 (raw)",
-        .cpu_type       = CPU_POWER6,
-    },
-
-    {   /* Power7 */
-        .pvr_mask       = 0xffff0000,
-        .pvr_value      = 0x003f0000,
-        .cpu_name       = "POWER7 (raw)",
-        .cpu_type       = CPU_POWER6,
-    },
-
-    {   /* Power7+ */
-        .pvr_mask       = 0xffff0000,
-        .pvr_value      = 0x004A0000,
-        .cpu_name       = "POWER7+ (raw)",
-        .cpu_type       = CPU_POWER6,
-    },
-
-    {   /* Power8E */
-        .pvr_mask       = 0xffff0000,
-        .pvr_value      = 0x004b0000,
-        .cpu_name       = "POWER8E (raw)",
-        .cpu_type       = CPU_POWER8,
-    },
-
-    {   /* Power8NVL */
-        .pvr_mask       = 0xffff0000,
-        .pvr_value      = 0x004c0000,
-        .cpu_name       = "POWER8NVL (raw)",
-        .cpu_type       = CPU_POWER8,
-    },
-
-    {   /* Power8 */
-        .pvr_mask       = 0xffff0000,
-        .pvr_value      = 0x004d0000,
-        .cpu_name       = "POWER8 (raw)",
-        .cpu_type       = CPU_POWER8,
-    },
-
-    {   /* Power9 DD2.0 */
-        .pvr_mask       = 0xffffefff,
-        .pvr_value      = 0x004e0200,
-        .cpu_name       = "POWER9 (raw)",
-        .cpu_type       = CPU_POWER9,
-    },
-
-    {   /* Power9 DD 2.1 */
-        .pvr_mask       = 0xffffefff,
-        .pvr_value      = 0x004e0201,
-        .cpu_name       = "POWER9 (raw)",
-        .cpu_type       = CPU_POWER9,
-    },
-
-    {   /* Power9 DD2.2 or later */
-        .pvr_mask       = 0xffff0000,
-        .pvr_value      = 0x004e0000,
-        .cpu_name       = "POWER9 (raw)",
-        .cpu_type       = CPU_POWER9,
-    },
-
-    {   /* Power10 */
-        .pvr_mask       = 0xffff0000,
-        .pvr_value      = 0x00800000,
-        .cpu_name       = "POWER10 (raw)",
-        .cpu_type       = CPU_POWER10,
-    },
-
-    {   /* End of table, pvr_mask and pvr_value must be zero */
-        .pvr_mask       = 0x0,
-        .pvr_value      = 0x0,
-        .cpu_name       = "Unknown",
-        .cpu_type       = CPU_UNKNOWN,
-    },
-};
-
-static int __builtin_cpu_is(const char *cpu) {
-	int i;
-	uint32_t pvr;
-	uint32_t cpu_type;
-
-	asm("mfpvr    %0" : "=r"(pvr));
-
-	for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
-		if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
-			break;
-		}
-	}
-
-#if defined(DEBUG)
-	printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
-		pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
+int cpuid()
+{
+    int arch = _system_configuration.implementation;
+#ifdef POWER_6
+    if (arch == POWER_6) return CPU_POWER6;
 #endif
-	cpu_type = pvrPOWER[i].cpu_type;
-
-	if (!strcmp(cpu, "power8"))
-		return cpu_type == CPU_POWER8;
-	if (!strcmp(cpu, "power9"))
-		return cpu_type == CPU_POWER9;
-	return 0;
+#ifdef POWER_7
+    else if (arch == POWER_7) return CPU_POWER7;
+#endif
+#ifdef POWER_8
+    else if (arch == POWER_8) return CPU_POWER8;
+#endif
+#ifdef POWER_9
+    else if (arch == POWER_9) return CPU_POWER9;
+#endif
+#ifdef POWER_10
+    else if (arch == POWER_10) return CPU_POWER10;
+#endif
+    return CPU_UNKNOWN;
 }
 
-#endif  /* C_PGI */
+#ifndef __BUILTIN_CPU_SUPPORTS__
+static int __builtin_cpu_supports(char* arg)
+{
+    static int ipinfo = -1;
+    if (ipinfo < 0) {
+        ipinfo = cpuid();
+    }
+    if (ipinfo >= CPU_POWER10) {
+        if (!strcmp(arg, "power10")) return 1;
+    }
+    if (ipinfo >= CPU_POWER9) {
+        if (!strcmp(arg, "power9")) return 1;
+    }
+    if (ipinfo >= CPU_POWER8) {
+        if (!strcmp(arg, "power8")) return 1;
+    }
+    if (ipinfo >= CPU_POWER6) {
+        if (!strcmp(arg, "power6")) return 1;
+    }
+    return 0;
+}
+#endif
 
 static gotoblas_t *get_coretype(void) {
 
-#ifndef C_PGI
-	if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
+	if (__builtin_cpu_supports("power6"))
 		return &gotoblas_POWER6;
-#endif
-	if (__builtin_cpu_is("power8"))
+	if (__builtin_cpu_supports("power8"))
 		return &gotoblas_POWER8;
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
-	if (__builtin_cpu_is("power9"))
+	if (__builtin_cpu_supports("power9"))
 		return &gotoblas_POWER9;
 #endif
 #ifdef HAVE_P10_SUPPORT
-	if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
+#ifdef _AIX
+    if (__builtin_cpu_supports("power10"))
+#else
+	if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
+#endif
 		return &gotoblas_POWER10;
 #endif
 	/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
-#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
-	if (__builtin_cpu_is("power10"))
+#if (!defined __GNUC__) || ( __GNUC__ < 11) || (__GNUC__ == 10 && __GNUC_MINOR__ < 2)
+	if (__builtin_cpu_supports("power10"))
 		return &gotoblas_POWER9;
 #endif	
 	return NULL;
@@ -229,9 +131,7 @@ static gotoblas_t *force_coretype(char * coretype) {
 
 	switch (found)
 	{
-#ifndef C_PGI
 	case  1: return (&gotoblas_POWER6);
-#endif
 	case  2: return (&gotoblas_POWER8);
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
 	case  3: return (&gotoblas_POWER9);

From eb738d99293dc658bd6941cc6c2b76cd6ece0c11 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Mon, 2 Oct 2023 12:14:46 -0500
Subject: [PATCH 328/718] Minor changes.

---
 Makefile.system               | 2 --
 driver/others/dynamic_power.c | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 1fd47e68e..b1a357fdf 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1170,8 +1170,6 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
 FEXTRALIB += -lxlf90
 ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
 FCOMMON_OPT += -qextname
-else ifeq ($(C_COMPILER), CLANG)
-FCOMMON_OPT += -qextname
 endif
 # FCOMMON_OPT	+= -qarch=440
 ifdef BINARY64
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 7f8bfd5b9..1d3f36875 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -44,7 +44,7 @@ char *gotoblas_corename(void) {
 #define CPU_POWER9   9
 #define CPU_POWER10 10
 
-int cpuid()
+static int cpuid(void)
 {
     int arch = _system_configuration.implementation;
 #ifdef POWER_6

From 12130ee9613936f2fa49fd58a7f6bf8210a65552 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Mon, 2 Oct 2023 12:19:22 -0500
Subject: [PATCH 329/718] Remove tab.

---
 driver/others/dynamic_power.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 1d3f36875..3c5f1f3c1 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -102,7 +102,7 @@ static gotoblas_t *get_coretype(void) {
 #ifdef _AIX
     if (__builtin_cpu_supports("power10"))
 #else
-	if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
+    if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
 #endif
 		return &gotoblas_POWER10;
 #endif

From a922a07e610e0508e2f2f84ae158c46e2e3d7a0e Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Mon, 2 Oct 2023 12:24:30 -0500
Subject: [PATCH 330/718] Cleanup white spaces.

---
 driver/others/dynamic_power.c | 158 +++++++++++++++++-----------------
 1 file changed, 79 insertions(+), 79 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 3c5f1f3c1..40f00a634 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -13,25 +13,25 @@ extern gotoblas_t gotoblas_POWER10;
 extern void openblas_warning(int verbose, const char *msg);
 
 static char *corename[] = {
-	"unknown",
-	"POWER6",
-	"POWER8",
-	"POWER9",
-	"POWER10"
+    "unknown",
+    "POWER6",
+    "POWER8",
+    "POWER9",
+    "POWER10"
 };
 
 #define NUM_CORETYPES 5
 
 char *gotoblas_corename(void) {
-	if (gotoblas == &gotoblas_POWER6)	return corename[1];
-	if (gotoblas == &gotoblas_POWER8)	return corename[2];
+    if (gotoblas == &gotoblas_POWER6)    return corename[1];
+    if (gotoblas == &gotoblas_POWER8)    return corename[2];
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
-	if (gotoblas == &gotoblas_POWER9)	return corename[3];
+    if (gotoblas == &gotoblas_POWER9)    return corename[3];
 #endif
 #ifdef HAVE_P10_SUPPORT
-	if (gotoblas == &gotoblas_POWER10)	return corename[4];
+    if (gotoblas == &gotoblas_POWER10)   return corename[4];
 #endif
-	return corename[0];
+    return corename[0];
 }
 
 #ifdef _AIX
@@ -90,13 +90,13 @@ static int __builtin_cpu_supports(char* arg)
 
 static gotoblas_t *get_coretype(void) {
 
-	if (__builtin_cpu_supports("power6"))
-		return &gotoblas_POWER6;
-	if (__builtin_cpu_supports("power8"))
-		return &gotoblas_POWER8;
+    if (__builtin_cpu_supports("power6"))
+        return &gotoblas_POWER6;
+    if (__builtin_cpu_supports("power8"))
+        return &gotoblas_POWER8;
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
-	if (__builtin_cpu_supports("power9"))
-		return &gotoblas_POWER9;
+    if (__builtin_cpu_supports("power9"))
+        return &gotoblas_POWER9;
 #endif
 #ifdef HAVE_P10_SUPPORT
 #ifdef _AIX
@@ -104,84 +104,84 @@ static gotoblas_t *get_coretype(void) {
 #else
     if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
 #endif
-		return &gotoblas_POWER10;
+        return &gotoblas_POWER10;
 #endif
-	/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
+    /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
 #if (!defined __GNUC__) || ( __GNUC__ < 11) || (__GNUC__ == 10 && __GNUC_MINOR__ < 2)
-	if (__builtin_cpu_supports("power10"))
-		return &gotoblas_POWER9;
-#endif	
-	return NULL;
+    if (__builtin_cpu_supports("power10"))
+        return &gotoblas_POWER9;
+#endif
+    return NULL;
 }
 
 static gotoblas_t *force_coretype(char * coretype) {
 
-	int i ;
-	int found = -1;
-	char message[128];
-
-	for ( i = 0 ; i < NUM_CORETYPES; i++)
-	{
-		if (!strncasecmp(coretype, corename[i], 20))
-		{
-			found = i;
-			break;
-		}
-	}
-
-	switch (found)
-	{
-	case  1: return (&gotoblas_POWER6);
-	case  2: return (&gotoblas_POWER8);
+    int i ;
+    int found = -1;
+    char message[128];
+
+    for ( i = 0 ; i < NUM_CORETYPES; i++)
+    {
+        if (!strncasecmp(coretype, corename[i], 20))
+        {
+            found = i;
+            break;
+        }
+    }
+
+    switch (found)
+    {
+    case  1: return (&gotoblas_POWER6);
+    case  2: return (&gotoblas_POWER8);
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
-	case  3: return (&gotoblas_POWER9);
+    case  3: return (&gotoblas_POWER9);
 #endif
 #ifdef HAVE_P10_SUPPORT
-	case  4: return (&gotoblas_POWER10);
+    case  4: return (&gotoblas_POWER10);
 #endif
-	default: return NULL;
-	}
-	snprintf(message, 128, "Core not found: %s\n", coretype);
-	openblas_warning(1, message);
+    default: return NULL;
+    }
+    snprintf(message, 128, "Core not found: %s\n", coretype);
+    openblas_warning(1, message);
 }
 
 void gotoblas_dynamic_init(void) {
 
-	char coremsg[128];
-	char coren[22];
-	char *p;
-
-
-	if (gotoblas) return;
-
-	p = getenv("OPENBLAS_CORETYPE");
-	if ( p )
-	{
-		gotoblas = force_coretype(p);
-	}
-	else
-	{
-		gotoblas = get_coretype();
-	}
-
-	if (gotoblas == NULL)
-	{
-		snprintf(coremsg, 128, "Falling back to POWER8 core\n");
-		openblas_warning(1, coremsg);
-		gotoblas = &gotoblas_POWER8;
-	}
-
-	if (gotoblas && gotoblas -> init) {
-		strncpy(coren,gotoblas_corename(),20);
-		sprintf(coremsg, "Core: %s\n",coren);
-		openblas_warning(2, coremsg);
-		gotoblas -> init();
-	} else {
-		openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
-		exit(1);
-	}
+    char coremsg[128];
+    char coren[22];
+    char *p;
+
+
+    if (gotoblas) return;
+
+    p = getenv("OPENBLAS_CORETYPE");
+    if ( p )
+    {
+        gotoblas = force_coretype(p);
+    }
+    else
+    {
+        gotoblas = get_coretype();
+    }
+
+    if (gotoblas == NULL)
+    {
+        snprintf(coremsg, 128, "Falling back to POWER8 core\n");
+        openblas_warning(1, coremsg);
+        gotoblas = &gotoblas_POWER8;
+    }
+
+    if (gotoblas && gotoblas -> init) {
+        strncpy(coren,gotoblas_corename(),20);
+        sprintf(coremsg, "Core: %s\n",coren);
+        openblas_warning(2, coremsg);
+        gotoblas -> init();
+    } else {
+        openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
+        exit(1);
+    }
 }
 
 void gotoblas_dynamic_quit(void) {
-	gotoblas = NULL;
+    gotoblas = NULL;
 }

From 10210748de17a217fd67f6cb8501272b8bfa88c2 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Mon, 2 Oct 2023 12:44:07 -0500
Subject: [PATCH 331/718] Revert PGI changes.

---
 driver/others/dynamic_power.c | 310 +++++++++++++++++++++++++---------
 1 file changed, 234 insertions(+), 76 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 40f00a634..0f5b06be5 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -13,27 +13,181 @@ extern gotoblas_t gotoblas_POWER10;
 extern void openblas_warning(int verbose, const char *msg);
 
 static char *corename[] = {
-    "unknown",
-    "POWER6",
-    "POWER8",
-    "POWER9",
-    "POWER10"
+	"unknown",
+	"POWER6",
+	"POWER8",
+	"POWER9",
+	"POWER10"
 };
 
 #define NUM_CORETYPES 5
 
 char *gotoblas_corename(void) {
-    if (gotoblas == &gotoblas_POWER6)    return corename[1];
-    if (gotoblas == &gotoblas_POWER8)    return corename[2];
+#ifndef C_PGI
+	if (gotoblas == &gotoblas_POWER6)	return corename[1];
+#endif
+	if (gotoblas == &gotoblas_POWER8)	return corename[2];
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
-    if (gotoblas == &gotoblas_POWER9)    return corename[3];
+	if (gotoblas == &gotoblas_POWER9)	return corename[3];
 #endif
 #ifdef HAVE_P10_SUPPORT
-    if (gotoblas == &gotoblas_POWER10)   return corename[4];
+	if (gotoblas == &gotoblas_POWER10)	return corename[4];
+#endif
+	return corename[0];
+}
+
+#if defined(__clang__)
+static int __builtin_cpu_supports(char* arg) 
+{
+	return 0;
+}
+#endif
+
+#if defined(C_PGI) || defined(__clang__)
+/*
+ * NV HPC compilers do not yet implement __builtin_cpu_is().
+ * Fake a version here for use in the CPU detection code below.
+ *
+ * Strategy here is to first check the CPU to see what it actually is,
+ * and then test the input to see if what the CPU actually is matches
+ * what was requested.
+ */
+
+#include <string.h>
+
+/*
+ *  Define POWER processor version table.
+ *
+ *  NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
+ */
+
+#define CPU_UNKNOWN 0
+#define CPU_POWER5 5
+#define CPU_POWER6 6
+#define CPU_POWER8 8
+#define CPU_POWER9 9
+#define CPU_POWER10 10
+
+static  struct {
+    uint32_t    pvr_mask;
+    uint32_t    pvr_value;
+    const char* cpu_name;
+    uint32_t    cpu_type;
+} pvrPOWER [] = {
+
+    {   /* POWER6 in P5+ mode; 2.04-compliant processor */
+        .pvr_mask       = 0xffffffff,
+        .pvr_value      = 0x0f000001,
+        .cpu_name       = "POWER5+",
+        .cpu_type       = CPU_POWER5,
+    },
+
+    {   /* Power6 aka POWER6X*/
+        .pvr_mask       = 0xffff0000,
+        .pvr_value      = 0x003e0000,
+        .cpu_name       = "POWER6 (raw)",
+        .cpu_type       = CPU_POWER6,
+    },
+
+    {   /* Power7 */
+        .pvr_mask       = 0xffff0000,
+        .pvr_value      = 0x003f0000,
+        .cpu_name       = "POWER7 (raw)",
+        .cpu_type       = CPU_POWER6,
+    },
+
+    {   /* Power7+ */
+        .pvr_mask       = 0xffff0000,
+        .pvr_value      = 0x004A0000,
+        .cpu_name       = "POWER7+ (raw)",
+        .cpu_type       = CPU_POWER6,
+    },
+
+    {   /* Power8E */
+        .pvr_mask       = 0xffff0000,
+        .pvr_value      = 0x004b0000,
+        .cpu_name       = "POWER8E (raw)",
+        .cpu_type       = CPU_POWER8,
+    },
+
+    {   /* Power8NVL */
+        .pvr_mask       = 0xffff0000,
+        .pvr_value      = 0x004c0000,
+        .cpu_name       = "POWER8NVL (raw)",
+        .cpu_type       = CPU_POWER8,
+    },
+
+    {   /* Power8 */
+        .pvr_mask       = 0xffff0000,
+        .pvr_value      = 0x004d0000,
+        .cpu_name       = "POWER8 (raw)",
+        .cpu_type       = CPU_POWER8,
+    },
+
+    {   /* Power9 DD2.0 */
+        .pvr_mask       = 0xffffefff,
+        .pvr_value      = 0x004e0200,
+        .cpu_name       = "POWER9 (raw)",
+        .cpu_type       = CPU_POWER9,
+    },
+
+    {   /* Power9 DD 2.1 */
+        .pvr_mask       = 0xffffefff,
+        .pvr_value      = 0x004e0201,
+        .cpu_name       = "POWER9 (raw)",
+        .cpu_type       = CPU_POWER9,
+    },
+
+    {   /* Power9 DD2.2 or later */
+        .pvr_mask       = 0xffff0000,
+        .pvr_value      = 0x004e0000,
+        .cpu_name       = "POWER9 (raw)",
+        .cpu_type       = CPU_POWER9,
+    },
+
+    {   /* Power10 */
+        .pvr_mask       = 0xffff0000,
+        .pvr_value      = 0x00800000,
+        .cpu_name       = "POWER10 (raw)",
+        .cpu_type       = CPU_POWER10,
+    },
+
+    {   /* End of table, pvr_mask and pvr_value must be zero */
+        .pvr_mask       = 0x0,
+        .pvr_value      = 0x0,
+        .cpu_name       = "Unknown",
+        .cpu_type       = CPU_UNKNOWN,
+    },
+};
+
+static int __builtin_cpu_is(const char *cpu) {
+	int i;
+	uint32_t pvr;
+	uint32_t cpu_type;
+
+	asm("mfpvr    %0" : "=r"(pvr));
+
+	for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
+		if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
+			break;
+		}
+	}
+
+#if defined(DEBUG)
+	printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
+		pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
 #endif
-    return corename[0];
+	cpu_type = pvrPOWER[i].cpu_type;
+
+	if (!strcmp(cpu, "power8"))
+		return cpu_type == CPU_POWER8;
+	if (!strcmp(cpu, "power9"))
+		return cpu_type == CPU_POWER9;
+	return 0;
 }
 
+#endif  /* C_PGI */
+
 #ifdef _AIX
 #include <sys/systemcfg.h>
 
@@ -90,98 +244,102 @@ static int __builtin_cpu_supports(char* arg)
 
 static gotoblas_t *get_coretype(void) {
 
-    if (__builtin_cpu_supports("power6"))
-        return &gotoblas_POWER6;
-    if (__builtin_cpu_supports("power8"))
-        return &gotoblas_POWER8;
+#ifndef C_PGI
+	if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
+		return &gotoblas_POWER6;
+#endif
+	if (__builtin_cpu_is("power8"))
+		return &gotoblas_POWER8;
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
-    if (__builtin_cpu_supports("power9"))
-        return &gotoblas_POWER9;
+	if (__builtin_cpu_is("power9"))
+		return &gotoblas_POWER9;
 #endif
 #ifdef HAVE_P10_SUPPORT
 #ifdef _AIX
-    if (__builtin_cpu_supports("power10"))
+	if (__builtin_cpu_supports("power10"))
 #else
-    if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
-#endif
-        return &gotoblas_POWER10;
+	if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
 #endif
-    /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
-#if (!defined __GNUC__) || ( __GNUC__ < 11) || (__GNUC__ == 10 && __GNUC_MINOR__ < 2)
-    if (__builtin_cpu_supports("power10"))
-        return &gotoblas_POWER9;
+		return &gotoblas_POWER10;
 #endif
-    return NULL;
+	/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
+#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
+	if (__builtin_cpu_is("power10"))
+		return &gotoblas_POWER9;
+#endif	
+	return NULL;
 }
 
 static gotoblas_t *force_coretype(char * coretype) {
 
-    int i ;
-    int found = -1;
-    char message[128];
-
-    for ( i = 0 ; i < NUM_CORETYPES; i++)
-    {
-        if (!strncasecmp(coretype, corename[i], 20))
-        {
-            found = i;
-            break;
-        }
-    }
+	int i ;
+	int found = -1;
+	char message[128];
 
-    switch (found)
-    {
-    case  1: return (&gotoblas_POWER6);
-    case  2: return (&gotoblas_POWER8);
+	for ( i = 0 ; i < NUM_CORETYPES; i++)
+	{
+		if (!strncasecmp(coretype, corename[i], 20))
+		{
+			found = i;
+			break;
+		}
+	}
+
+	switch (found)
+	{
+#ifndef C_PGI
+	case  1: return (&gotoblas_POWER6);
+#endif
+	case  2: return (&gotoblas_POWER8);
 #if (!defined __GNUC__) || ( __GNUC__ >= 6)
-    case  3: return (&gotoblas_POWER9);
+	case  3: return (&gotoblas_POWER9);
 #endif
 #ifdef HAVE_P10_SUPPORT
-    case  4: return (&gotoblas_POWER10);
+	case  4: return (&gotoblas_POWER10);
 #endif
-    default: return NULL;
-    }
-    snprintf(message, 128, "Core not found: %s\n", coretype);
-    openblas_warning(1, message);
+	default: return NULL;
+	}
+	snprintf(message, 128, "Core not found: %s\n", coretype);
+	openblas_warning(1, message);
 }
 
 void gotoblas_dynamic_init(void) {
 
-    char coremsg[128];
-    char coren[22];
-    char *p;
+	char coremsg[128];
+	char coren[22];
+	char *p;
 
 
-    if (gotoblas) return;
+	if (gotoblas) return;
 
-    p = getenv("OPENBLAS_CORETYPE");
-    if ( p )
-    {
-        gotoblas = force_coretype(p);
-    }
-    else
-    {
-        gotoblas = get_coretype();
-    }
+	p = getenv("OPENBLAS_CORETYPE");
+	if ( p )
+	{
+		gotoblas = force_coretype(p);
+	}
+	else
+	{
+		gotoblas = get_coretype();
+	}
 
-    if (gotoblas == NULL)
-    {
-        snprintf(coremsg, 128, "Falling back to POWER8 core\n");
-        openblas_warning(1, coremsg);
-        gotoblas = &gotoblas_POWER8;
-    }
+	if (gotoblas == NULL)
+	{
+		snprintf(coremsg, 128, "Falling back to POWER8 core\n");
+		openblas_warning(1, coremsg);
+		gotoblas = &gotoblas_POWER8;
+	}
 
-    if (gotoblas && gotoblas -> init) {
-        strncpy(coren,gotoblas_corename(),20);
-        sprintf(coremsg, "Core: %s\n",coren);
-        openblas_warning(2, coremsg);
-        gotoblas -> init();
-    } else {
-        openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
-        exit(1);
-    }
+	if (gotoblas && gotoblas -> init) {
+		strncpy(coren,gotoblas_corename(),20);
+		sprintf(coremsg, "Core: %s\n",coren);
+		openblas_warning(2, coremsg);
+		gotoblas -> init();
+	} else {
+		openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
+		exit(1);
+	}
 }
 
 void gotoblas_dynamic_quit(void) {
-    gotoblas = NULL;
+	gotoblas = NULL;
 }

From e5dc376912dab278afdf677cb112008d36ead0fe Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Mon, 2 Oct 2023 12:48:47 -0500
Subject: [PATCH 332/718] Remove duplicate defines.

---
 driver/others/dynamic_power.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 0f5b06be5..7b0b4ea01 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -43,6 +43,13 @@ static int __builtin_cpu_supports(char* arg)
 }
 #endif
 
+#define CPU_UNKNOWN  0
+#define CPU_POWER6   6
+#define CPU_POWER7   7
+#define CPU_POWER8   8
+#define CPU_POWER9   9
+#define CPU_POWER10 10
+
 #if defined(C_PGI) || defined(__clang__)
 /*
  * NV HPC compilers do not yet implement __builtin_cpu_is().
@@ -61,13 +68,6 @@ static int __builtin_cpu_supports(char* arg)
  *  NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
  */
 
-#define CPU_UNKNOWN 0
-#define CPU_POWER5 5
-#define CPU_POWER6 6
-#define CPU_POWER8 8
-#define CPU_POWER9 9
-#define CPU_POWER10 10
-
 static  struct {
     uint32_t    pvr_mask;
     uint32_t    pvr_value;
@@ -191,13 +191,6 @@ static int __builtin_cpu_is(const char *cpu) {
 #ifdef _AIX
 #include <sys/systemcfg.h>
 
-#define CPU_UNKNOWN  0
-#define CPU_POWER6   6
-#define CPU_POWER7   7
-#define CPU_POWER8   8
-#define CPU_POWER9   9
-#define CPU_POWER10 10
-
 static int cpuid(void)
 {
     int arch = _system_configuration.implementation;

From b677d0d5fd175768e63d02253b12d1b0ccb2d242 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Mon, 2 Oct 2023 13:09:12 -0500
Subject: [PATCH 333/718] Adding missing endif

---
 driver/others/dynamic_power.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 7b0b4ea01..6ed26ad1e 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -234,6 +234,7 @@ static int __builtin_cpu_supports(char* arg)
     return 0;
 }
 #endif
+#endif
 
 static gotoblas_t *get_coretype(void) {
 

From a8c90eb3ed5cae583bdc289846fe7d37fdc42d28 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Tue, 3 Oct 2023 10:24:04 -0500
Subject: [PATCH 334/718] Added cpu_is

---
 driver/others/dynamic_power.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 6ed26ad1e..252e409b3 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -233,8 +233,27 @@ static int __builtin_cpu_supports(char* arg)
     }
     return 0;
 }
+
+static int __builtin_cpu_is(char *arg)
+{
+    static int ipinfo = -1;
+    if (ipinfo < 0) {
+        ipinfo = cpuid();
+    }
+    if (ipinfo == CPU_POWER10) {
+        if (!strcmp(arg, "power10") return 1;
+    } else if (ipinfo == CPU_POWER9) {
+        if (!strcmp(arg, "power9") return 1;
+    } else if (ipinfo == CPU_POWER8) {
+        if (!strcmp(arg, "power8") return 1;
+    } else if (ipinfo == CPU_POWER6) {
+        if (!strcmp(arg, "power6") return 1;
+    } else {
+        return 0;
+    }
+}
 #endif
-#endif
+#endif   /* _AIX */
 
 static gotoblas_t *get_coretype(void) {
 

From 2d0b2334259d41c2003b51a07580dbd25cfe267c Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Tue, 3 Oct 2023 10:26:14 -0500
Subject: [PATCH 335/718] Fix missing parens.

---
 driver/others/dynamic_power.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 252e409b3..734122178 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -241,13 +241,13 @@ static int __builtin_cpu_is(char *arg)
         ipinfo = cpuid();
     }
     if (ipinfo == CPU_POWER10) {
-        if (!strcmp(arg, "power10") return 1;
+        if (!strcmp(arg, "power10")) return 1;
     } else if (ipinfo == CPU_POWER9) {
-        if (!strcmp(arg, "power9") return 1;
+        if (!strcmp(arg, "power9")) return 1;
     } else if (ipinfo == CPU_POWER8) {
-        if (!strcmp(arg, "power8") return 1;
+        if (!strcmp(arg, "power8")) return 1;
     } else if (ipinfo == CPU_POWER6) {
-        if (!strcmp(arg, "power6") return 1;
+        if (!strcmp(arg, "power6")) return 1;
     } else {
         return 0;
     }

From 09212f84bff0ca8173f928c59ec81da3ab00933b Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Tue, 3 Oct 2023 12:23:21 -0500
Subject: [PATCH 336/718] Fix default case for cpu_is.

---
 driver/others/dynamic_power.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 734122178..252baaeeb 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -248,9 +248,8 @@ static int __builtin_cpu_is(char *arg)
         if (!strcmp(arg, "power8")) return 1;
     } else if (ipinfo == CPU_POWER6) {
         if (!strcmp(arg, "power6")) return 1;
-    } else {
-        return 0;
     }
+    return 0;
 }
 #endif
 #endif   /* _AIX */

From 3cc72a3797ac050841975ff38d317f34ecfeb503 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <chip.kerchner@ibm.com>
Date: Wed, 4 Oct 2023 09:54:37 -0500
Subject: [PATCH 337/718] Only include cpu_id and cpu_supports in AIX and fix
 parameter types.

---
 driver/others/dynamic_power.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 252baaeeb..c01d112bc 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -36,7 +36,7 @@ char *gotoblas_corename(void) {
 	return corename[0];
 }
 
-#if defined(__clang__)
+#if defined(__clang__) && !defined(_AIX)
 static int __builtin_cpu_supports(char* arg) 
 {
 	return 0;
@@ -50,7 +50,7 @@ static int __builtin_cpu_supports(char* arg)
 #define CPU_POWER9   9
 #define CPU_POWER10 10
 
-#if defined(C_PGI) || defined(__clang__)
+#if defined(C_PGI) || (defined(__clang__) && !defined(_AIX))
 /*
  * NV HPC compilers do not yet implement __builtin_cpu_is().
  * Fake a version here for use in the CPU detection code below.
@@ -213,7 +213,7 @@ static int cpuid(void)
 }
 
 #ifndef __BUILTIN_CPU_SUPPORTS__
-static int __builtin_cpu_supports(char* arg)
+static int __builtin_cpu_supports(const char* arg)
 {
     static int ipinfo = -1;
     if (ipinfo < 0) {
@@ -234,7 +234,7 @@ static int __builtin_cpu_supports(char* arg)
     return 0;
 }
 
-static int __builtin_cpu_is(char *arg)
+static int __builtin_cpu_is(const char *arg)
 {
     static int ipinfo = -1;
     if (ipinfo < 0) {

From db0805906ba0d7477ff2adad41bf815cd25e9d06 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Wed, 4 Oct 2023 14:04:03 -0500
Subject: [PATCH 338/718] powerpc: Fix build errors with Open XL C

This patch fixes errors when using Open XL C compiler on AIX.
Tested with gcc/xlf and ibm-clang/xlf compiler combinations.
---
 Makefile.power  |  7 +++++++
 c_check         | 18 +++++++++++++-----
 kernel/Makefile |  6 ------
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/Makefile.power b/Makefile.power
index 46afb2d4a..ada51b2e1 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -98,6 +98,9 @@ endif
 endif
 endif
 
+ifeq ($(C_COMPILER), CLANG)
+CCOMMON_OPT += -fno-integrated-as
+endif
 # workaround for C->FORTRAN ABI violation in LAPACKE
 ifeq ($(F_COMPILER), GFORTRAN)
 FCOMMON_OPT += -fno-optimize-sibling-calls
@@ -133,7 +136,11 @@ ifdef BINARY64
 
 
 ifeq ($(OSNAME), AIX)
+ifeq ($(C_COMPILER), GCC)
 CCOMMON_OPT	+= -mpowerpc64 -maix64
+else
+CCOMMON_OPT	+= -m64
+endif
 ifeq ($(COMPILER_F77), g77)
 FCOMMON_OPT	+= -mpowerpc64 -maix64
 endif
diff --git a/c_check b/c_check
index 4d12c1674..b018c10a8 100755
--- a/c_check
+++ b/c_check
@@ -96,11 +96,19 @@ esac
 defined=0
 
 if [ "$os" = "AIX" ]; then
-    case "$BINARY" in
-        32) compiler_name="$compiler_name -maix32" ;;
-        64) compiler_name="$compiler_name -maix64" ;;
-    esac
-    defined=1
+    if [ "$compiler" = "GCC" ]; then
+        case "$BINARY" in
+            32) compiler_name="$compiler_name -maix32" ;;
+            64) compiler_name="$compiler_name -maix64" ;;
+        esac
+        defined=1
+    else
+        case "$BINARY" in
+            32) compiler_name="$compiler_name -m32" ;;
+            64) compiler_name="$compiler_name -m64" ;;
+        esac
+        defined=1
+    fi
 fi
 
 case "$architecture" in
diff --git a/kernel/Makefile b/kernel/Makefile
index 1e0a0074f..3f9afd3fa 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,6 @@ endif
 TOPDIR	= ..
 include $(TOPDIR)/Makefile.system
 
-ifeq ($(ARCH), power)
-ifeq ($(C_COMPILER), CLANG)
- override CFLAGS += -fno-integrated-as
-endif
-endif
-
 AVX2OPT = 
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0

From c60f9d9c084a97d1c416d63d921a8fcb30b090ac Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 6 Oct 2023 09:49:17 -0500
Subject: [PATCH 339/718] Add missing CPU_POWER5.

---
 driver/others/dynamic_power.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index c01d112bc..8c5caada0 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -44,6 +44,7 @@ static int __builtin_cpu_supports(char* arg)
 #endif
 
 #define CPU_UNKNOWN  0
+#define CPU_POWER5   5
 #define CPU_POWER6   6
 #define CPU_POWER7   7
 #define CPU_POWER8   8

From 71c6689af4e61cc4891eba3d996fb39920798e37 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 6 Oct 2023 12:20:40 -0500
Subject: [PATCH 340/718] Fix dynamic dispatch to work for clang.

---
 driver/others/dynamic_power.c | 141 ++++++++++++++--------------------
 1 file changed, 59 insertions(+), 82 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 8c5caada0..c43738ef4 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -36,13 +36,6 @@ char *gotoblas_corename(void) {
 	return corename[0];
 }
 
-#if defined(__clang__) && !defined(_AIX)
-static int __builtin_cpu_supports(char* arg) 
-{
-	return 0;
-}
-#endif
-
 #define CPU_UNKNOWN  0
 #define CPU_POWER5   5
 #define CPU_POWER6   6
@@ -51,7 +44,31 @@ static int __builtin_cpu_supports(char* arg)
 #define CPU_POWER9   9
 #define CPU_POWER10 10
 
-#if defined(C_PGI) || (defined(__clang__) && !defined(_AIX))
+#ifdef _AIX
+#include <sys/systemcfg.h>
+
+static int cpuid(void)
+{
+    int arch = _system_configuration.implementation;
+#ifdef POWER_6
+    if (arch == POWER_6) return CPU_POWER6;
+#endif
+#ifdef POWER_7
+    else if (arch == POWER_7) return CPU_POWER7;
+#endif
+#ifdef POWER_8
+    else if (arch == POWER_8) return CPU_POWER8;
+#endif
+#ifdef POWER_9
+    else if (arch == POWER_9) return CPU_POWER9;
+#endif
+#ifdef POWER_10
+    else if (arch == POWER_10) return CPU_POWER10;
+#endif
+    return CPU_UNKNOWN;
+}
+#else
+#if defined(C_PGI) || defined(__clang__)
 /*
  * NV HPC compilers do not yet implement __builtin_cpu_is().
  * Fake a version here for use in the CPU detection code below.
@@ -61,8 +78,6 @@ static int __builtin_cpu_supports(char* arg)
  * what was requested.
  */
 
-#include <string.h>
-
 /*
  *  Define POWER processor version table.
  *
@@ -161,79 +176,32 @@ static  struct {
     },
 };
 
-static int __builtin_cpu_is(const char *cpu) {
-	int i;
-	uint32_t pvr;
-	uint32_t cpu_type;
+static int cpuid(void)
+{
+    int i;
+    uint32_t pvr;
+    uint32_t cpu_type;
 
-	asm("mfpvr    %0" : "=r"(pvr));
+    asm("mfpvr    %0" : "=r"(pvr));
 
-	for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
-		if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
-			break;
-		}
-	}
+    for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
+        if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
+            break;
+        }
+    }
 
 #if defined(DEBUG)
-	printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
-		pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
+    printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
+        pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
 #endif
-	cpu_type = pvrPOWER[i].cpu_type;
-
-	if (!strcmp(cpu, "power8"))
-		return cpu_type == CPU_POWER8;
-	if (!strcmp(cpu, "power9"))
-		return cpu_type == CPU_POWER9;
-	return 0;
+    cpu_type = pvrPOWER[i].cpu_type;
+    return (int)(cpu_type);
 }
-
 #endif  /* C_PGI */
-
-#ifdef _AIX
-#include <sys/systemcfg.h>
-
-static int cpuid(void)
-{
-    int arch = _system_configuration.implementation;
-#ifdef POWER_6
-    if (arch == POWER_6) return CPU_POWER6;
-#endif
-#ifdef POWER_7
-    else if (arch == POWER_7) return CPU_POWER7;
-#endif
-#ifdef POWER_8
-    else if (arch == POWER_8) return CPU_POWER8;
-#endif
-#ifdef POWER_9
-    else if (arch == POWER_9) return CPU_POWER9;
-#endif
-#ifdef POWER_10
-    else if (arch == POWER_10) return CPU_POWER10;
-#endif
-    return CPU_UNKNOWN;
-}
+#endif  /* _AIX */
 
 #ifndef __BUILTIN_CPU_SUPPORTS__
-static int __builtin_cpu_supports(const char* arg)
-{
-    static int ipinfo = -1;
-    if (ipinfo < 0) {
-        ipinfo = cpuid();
-    }
-    if (ipinfo >= CPU_POWER10) {
-        if (!strcmp(arg, "power10")) return 1;
-    }
-    if (ipinfo >= CPU_POWER9) {
-        if (!strcmp(arg, "power9")) return 1;
-    }
-    if (ipinfo >= CPU_POWER8) {
-        if (!strcmp(arg, "power8")) return 1;
-    }
-    if (ipinfo >= CPU_POWER6) {
-        if (!strcmp(arg, "power6")) return 1;
-    }
-    return 0;
-}
+#include <string.h>
 
 static int __builtin_cpu_is(const char *arg)
 {
@@ -241,19 +209,28 @@ static int __builtin_cpu_is(const char *arg)
     if (ipinfo < 0) {
         ipinfo = cpuid();
     }
+#ifdef HAVE_P10_SUPPORT
     if (ipinfo == CPU_POWER10) {
         if (!strcmp(arg, "power10")) return 1;
-    } else if (ipinfo == CPU_POWER9) {
+    }
+#endif
+    if (ipinfo == CPU_POWER9) {
         if (!strcmp(arg, "power9")) return 1;
     } else if (ipinfo == CPU_POWER8) {
         if (!strcmp(arg, "power8")) return 1;
+#ifndef C_PGI
     } else if (ipinfo == CPU_POWER6) {
         if (!strcmp(arg, "power6")) return 1;
+#endif
     }
     return 0;
 }
+
+static int __builtin_cpu_supports(const char *arg)
+{
+    return 0;
+}
 #endif
-#endif   /* _AIX */
 
 static gotoblas_t *get_coretype(void) {
 
@@ -268,18 +245,18 @@ static gotoblas_t *get_coretype(void) {
 		return &gotoblas_POWER9;
 #endif
 #ifdef HAVE_P10_SUPPORT
-#ifdef _AIX
-	if (__builtin_cpu_supports("power10"))
+#if defined(_AIX) || defined(__clang__)
+    if (__builtin_cpu_is("power10"))
 #else
 	if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
 #endif
 		return &gotoblas_POWER10;
 #endif
-	/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
+    /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
 #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
-	if (__builtin_cpu_is("power10"))
-		return &gotoblas_POWER9;
-#endif	
+    if (__builtin_cpu_is("power10"))
+        return &gotoblas_POWER9;
+#endif
 	return NULL;
 }
 

From 298bf1f240afcac73d306f4c2da35b314c39dba6 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 6 Oct 2023 12:50:28 -0500
Subject: [PATCH 341/718] Reduce differences.

---
 driver/others/dynamic_power.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index c43738ef4..4e8710bc7 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -178,23 +178,23 @@ static  struct {
 
 static int cpuid(void)
 {
-    int i;
-    uint32_t pvr;
-    uint32_t cpu_type;
+	int i;
+	uint32_t pvr;
+	uint32_t cpu_type;
 
-    asm("mfpvr    %0" : "=r"(pvr));
+	asm("mfpvr    %0" : "=r"(pvr));
 
-    for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
-        if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
-            break;
-        }
-    }
+	for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
+		if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
+			break;
+		}
+	}
 
 #if defined(DEBUG)
-    printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
-        pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
+	printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
+		pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
 #endif
-    cpu_type = pvrPOWER[i].cpu_type;
+	cpu_type = pvrPOWER[i].cpu_type;
     return (int)(cpu_type);
 }
 #endif  /* C_PGI */
@@ -252,10 +252,10 @@ static gotoblas_t *get_coretype(void) {
 #endif
 		return &gotoblas_POWER10;
 #endif
-    /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
+	/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
 #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
-    if (__builtin_cpu_is("power10"))
-        return &gotoblas_POWER9;
+	if (__builtin_cpu_is("power10"))
+		return &gotoblas_POWER9;
 #endif
 	return NULL;
 }

From 36e08f69946321a7ca3f9ef495d198802e1b5b17 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 6 Oct 2023 13:08:41 -0500
Subject: [PATCH 342/718] One more small change.

---
 driver/others/dynamic_power.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 4e8710bc7..311987d31 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -195,7 +195,7 @@ static int cpuid(void)
 		pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
 #endif
 	cpu_type = pvrPOWER[i].cpu_type;
-    return (int)(cpu_type);
+	return (int)(cpu_type);
 }
 #endif  /* C_PGI */
 #endif  /* _AIX */

From 3655632611173f191c22a36d7c9e0950cdcc202e Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 6 Oct 2023 13:11:40 -0500
Subject: [PATCH 343/718] Another small change.

---
 driver/others/dynamic_power.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 311987d31..f98fedd45 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -246,7 +246,7 @@ static gotoblas_t *get_coretype(void) {
 #endif
 #ifdef HAVE_P10_SUPPORT
 #if defined(_AIX) || defined(__clang__)
-    if (__builtin_cpu_is("power10"))
+	if (__builtin_cpu_is("power10"))
 #else
 	if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
 #endif

From 880af052dde230595328d8a19d10e42f39369a43 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 6 Oct 2023 13:41:49 -0500
Subject: [PATCH 344/718] Fix dynamic dispatch P9 for clang.

---
 driver/others/dynamic_power.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index f98fedd45..db04e635f 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -3,7 +3,7 @@
 
 extern gotoblas_t gotoblas_POWER6;
 extern gotoblas_t gotoblas_POWER8;
-#if (!defined __GNUC__) || ( __GNUC__ >= 6)
+#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__)
 extern gotoblas_t gotoblas_POWER9;
 #endif
 #ifdef HAVE_P10_SUPPORT
@@ -27,7 +27,7 @@ char *gotoblas_corename(void) {
 	if (gotoblas == &gotoblas_POWER6)	return corename[1];
 #endif
 	if (gotoblas == &gotoblas_POWER8)	return corename[2];
-#if (!defined __GNUC__) || ( __GNUC__ >= 6)
+#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__)
 	if (gotoblas == &gotoblas_POWER9)	return corename[3];
 #endif
 #ifdef HAVE_P10_SUPPORT
@@ -240,7 +240,7 @@ static gotoblas_t *get_coretype(void) {
 #endif
 	if (__builtin_cpu_is("power8"))
 		return &gotoblas_POWER8;
-#if (!defined __GNUC__) || ( __GNUC__ >= 6)
+#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__)
 	if (__builtin_cpu_is("power9"))
 		return &gotoblas_POWER9;
 #endif
@@ -281,7 +281,7 @@ static gotoblas_t *force_coretype(char * coretype) {
 	case  1: return (&gotoblas_POWER6);
 #endif
 	case  2: return (&gotoblas_POWER8);
-#if (!defined __GNUC__) || ( __GNUC__ >= 6)
+#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__)
 	case  3: return (&gotoblas_POWER9);
 #endif
 #ifdef HAVE_P10_SUPPORT

From b626544ca32396cbd6bd138d75669bee05330877 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Oct 2023 22:31:03 +0200
Subject: [PATCH 345/718] complete function prototypes and remove unused
 functions

---
 ctest/c_cblat1c.c | 303 +++--------------------------
 ctest/c_cblat2c.c | 366 +++++++----------------------------
 ctest/c_cblat3c.c | 124 ------------
 ctest/c_dblat1c.c | 192 +++---------------
 ctest/c_dblat2c.c | 352 +++++++--------------------------
 ctest/c_dblat3c.c | 435 ++++++-----------------------------------
 ctest/c_sblat1c.c | 318 +++---------------------------
 ctest/c_sblat2c.c | 481 +++++++---------------------------------------
 ctest/c_sblat3c.c | 437 ++++++-----------------------------------
 ctest/c_zblat1c.c | 304 +++--------------------------
 ctest/c_zblat2c.c | 367 +++++++----------------------------
 ctest/c_zblat3c.c | 463 +++++++-------------------------------------
 12 files changed, 549 insertions(+), 3593 deletions(-)

diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c
index 8c0dd140c..2f84da43b 100644
--- a/ctest/c_cblat1c.c
+++ b/ctest/c_cblat1c.c
@@ -242,251 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-#if 0
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-#endif
 
 /* Common Block Declarations */
 
@@ -503,16 +258,16 @@ static integer c__1 = 1;
 static integer c__5 = 5;
 static real c_b43 = (float)1.;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
     static real sfac = (float)9.765625e-4;
 
     /* Local variables */
-    extern /* Subroutine */ int check1_(), check2_();
+    extern /* Subroutine */ int check1_(real*), check2_(real*);
     static integer ic;
-    extern /* Subroutine */ int header_();
+    extern /* Subroutine */ int header_(void);
 
 /*     Test program for the COMPLEX    Level 1 CBLAS. */
 /*     Based upon the original CBLAS test routine together with: */
@@ -553,7 +308,7 @@ static real c_b43 = (float)1.;
 
 } /* MAIN__ */
 
-/* Subroutine */ int header_()
+/* Subroutine */ int header_(void)
 {
     /* Initialized data */
 
@@ -564,7 +319,7 @@ static real c_b43 = (float)1.;
     /* Format strings */
 
     /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
+    integer s_wsfe(void), do_fio(void), e_wsfe(void);
 
 /*     .. Parameters .. */
 /*     .. Scalars in Common .. */
@@ -577,8 +332,7 @@ static real c_b43 = (float)1.;
 
 } /* header_ */
 
-/* Subroutine */ int check1_(sfac)
-real *sfac;
+/* Subroutine */ int check1_(real* sfac)
 {
     /* Initialized data */
 
@@ -683,15 +437,15 @@ real *sfac;
 
     /* Local variables */
     static integer i__;
-    extern /* Subroutine */ int ctest_();
+    extern /* Subroutine */ int ctest_(int*, complex*, complex*, complex*, real*);
     static complex mwpcs[5], mwpct[5];
-    extern /* Subroutine */ int itest1_(), stest1_();
+    extern /* Subroutine */ int itest1_(int*, int*), stest1_(real*,real*,real*,real*);
     static complex cx[8];
-    extern real scnrm2test_();
+    extern real scnrm2test_(int*, complex*, int*);
     static integer np1;
-    extern integer icamaxtest_();
-    extern /* Subroutine */ int csscaltest_();
-    extern real scasumtest_();
+    extern integer icamaxtest_(int*, complex*, int*);
+    extern /* Subroutine */ int csscaltest_(int*, real*, complex*, int*);
+    extern real scasumtest_(int*, complex*, int*);
     static integer len;
 
 /*     .. Parameters .. */
@@ -808,8 +562,7 @@ real *sfac;
     return 0;
 } /* check1_ */
 
-/* Subroutine */ int check2_(sfac)
-real *sfac;
+/* Subroutine */ int check2_(real* sfac)
 {
     /* Initialized data */
 
@@ -981,10 +734,10 @@ real *sfac;
     static complex cdot[1];
     static integer lenx, leny, i__;
     static complex ctemp;
-    extern /* Subroutine */ int ctest_();
+    extern /* Subroutine */ int ctest_(int*, complex*, complex*, complex*, real*);
     static integer ksize;
-    extern /* Subroutine */ int cdotctest_(), ccopytest_(), cdotutest_(), 
-	    cswaptest_(), caxpytest_();
+    extern /* Subroutine */ int cdotctest_(int*, complex*, int*, complex*, int*,complex*), ccopytest_(int*, complex*, int*, complex*, int*), cdotutest_(int*, complex*, int*, complex*, int*, complex*), 
+	    cswaptest_(int*, complex*, int*, complex*, int*), caxpytest_(int*, complex*, complex*, int*, complex*, int*);
     static integer ki, kn;
     static complex cx[7], cy[7];
     static integer mx, my;
@@ -1067,9 +820,7 @@ real *sfac;
     return 0;
 } /* check2_ */
 
-/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac)
-integer *len;
-real *scomp, *strue, *ssize, *sfac;
+/* Subroutine */ int stest_(integer* len, real* scomp, real* strue, real* ssize,real* sfac)
 {
     /* System generated locals */
     integer i__1;
@@ -1077,7 +828,7 @@ real *scomp, *strue, *ssize, *sfac;
 
     /* Local variables */
     static integer i__;
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*, real*);
     static real sd;
 
 /*     ********************************* STEST ************************** */
@@ -1133,11 +884,10 @@ L40:
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac)
-real *scomp1, *strue1, *ssize, *sfac;
+/* Subroutine */ int stest1_(real* scomp1, real* strue1, real* ssize, real* sfac)
 {
     static real scomp[1], strue[1];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(int*, real*, real*, real*, real*);
 
 /*     ************************* STEST1 ***************************** */
 
@@ -1164,8 +914,7 @@ real *scomp1, *strue1, *ssize, *sfac;
     return 0;
 } /* stest1_ */
 
-doublereal sdiff_(sa, sb)
-real *sa, *sb;
+doublereal sdiff_(real* sa, real* sb)
 {
     /* System generated locals */
     real ret_val;
@@ -1179,10 +928,7 @@ real *sa, *sb;
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int ctest_(len, ccomp, ctrue, csize, sfac)
-integer *len;
-complex *ccomp, *ctrue, *csize;
-real *sfac;
+/* Subroutine */ int ctest_(integer* len, complex* ccomp, complex* ctrue, complex* csize, real* sfac)
 {
     /* System generated locals */
     integer i__1, i__2;
@@ -1193,7 +939,7 @@ real *sfac;
     /* Local variables */
     static integer i__;
     static real scomp[20], ssize[20], strue[20];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(int*, real*,real*,real*,real*);
 
 /*     **************************** CTEST ***************************** */
 
@@ -1231,8 +977,7 @@ real *sfac;
     return 0;
 } /* ctest_ */
 
-/* Subroutine */ int itest1_(icomp, itrue)
-integer *icomp, *itrue;
+/* Subroutine */ int itest1_(integer* icomp, integer* itrue)
 {
     /* Local variables */
     static integer id;
diff --git a/ctest/c_cblat2c.c b/ctest/c_cblat2c.c
index b7616b11d..c0cf0b9e2 100644
--- a/ctest/c_cblat2c.c
+++ b/ctest/c_cblat2c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -396,7 +273,7 @@ static integer c_n1 = -1;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
@@ -414,17 +291,21 @@ static logical c_false = FALSE_;
     static logical same;
     static integer ninc, nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int cchk1_(), cchk2_(), cchk3_(), cchk4_(), 
-	    cchk5_(), cchk6_();
+    extern /* Subroutine */ int cchk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk6_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen);
     static complex a[4225]	/* was [65][65] */;
     static real g[65];
     static integer i__, j, n;
     static logical fatal;
     static complex x[65], y[65], z__[130];
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*, real*);
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static char snaps[32], trans[1];
     static integer isnum;
     static logical ltest[17];
@@ -438,11 +319,11 @@ static logical c_false = FALSE_;
     static char snamet[12];
     static real thresh;
     static logical rorder;
-    extern /* Subroutine */ int cc2chke_();
+    extern /* Subroutine */ void cc2chke_(char*, ftnlen);
     static integer layout;
     static logical ltestt, tsterr;
     static complex alf[7];
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static integer inc[7], nkb;
     static complex bet[7];
     static real eps, err;
@@ -983,22 +864,7 @@ L240:
 
 } /* MAIN__ */
 
-/* Subroutine */ int cchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-complex *alf;
-integer *nbet;
-complex *bet;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, complex* alf, integer* nbet, complex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1015,10 +881,10 @@ ftnlen sname_len;
     static integer incx, incy;
     static logical full, tran, null;
     static integer i__, m, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static complex alpha;
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys;
@@ -1026,14 +892,15 @@ ftnlen sname_len;
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns;
-    extern /* Subroutine */ int ccgbmv_(), ccgemv_();
-    extern logical lceres_();
+    extern /* Subroutine */ int ccgbmv_(integer*, char*, integer*, integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ void ccgemv_(integer*, char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen);
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
     static char ctrans[14];
     static real errmax;
     static complex transl;
     static char transs[1];
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static complex als, bls;
     static real err;
     static integer iku, kls, kus;
@@ -1448,22 +1315,7 @@ L140:
 
 } /* cchk1_ */
 
-/* Subroutine */ int cchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-complex *alf;
-integer *nbet;
-complex *bet;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, complex* alf, integer* nbet, complex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1481,10 +1333,10 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, k, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static complex alpha;
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -1495,13 +1347,14 @@ ftnlen sname_len;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, iy, ns, lx, ly;
-    extern /* Subroutine */ int cchbmv_(), cchemv_();
-    extern logical lceres_();
-    extern /* Subroutine */ int cchpmv_();
+    extern /* Subroutine */ void cchbmv_(integer*, char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ void cchemv_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen);
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cchpmv_(integer*, char*, integer*, complex*, complex*, complex*, integer*, complex*, complex*, integer*, ftnlen);
     static real errmax;
     static complex transl;
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static complex als, bls;
     static real err;
 
@@ -1906,19 +1759,7 @@ L130:
 
 } /* cchk2_ */
 
-/* Subroutine */ int cchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, xt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *xt;
-real *g;
-complex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* xt, real* g, complex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1937,10 +1778,10 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1], cdiag[14];
     static integer i__, k, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static char diags[1];
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -1950,17 +1791,19 @@ ftnlen sname_len;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, ns, lx;
-    extern logical lceres_();
-    extern /* Subroutine */ int cctbmv_(), cctbsv_();
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cctbmv_(integer*, char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cctbsv_(integer*, char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen);
     static char ctrans[14];
-    extern /* Subroutine */ int cctpmv_();
+    extern /* Subroutine */ void cctpmv_(integer*, char*, char*, char*, integer*, complex*, complex*, integer*, ftnlen, ftnlen, ftnlen);
     static real errmax;
-    extern /* Subroutine */ int cctrmv_(), cctpsv_();
+    extern /* Subroutine */ void cctrmv_(integer*, char*, char*, char*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cctpsv_(integer*, char*, char*, char*, integer*, complex*, complex*, integer*, ftnlen, ftnlen, ftnlen);
     static complex transl;
-    extern /* Subroutine */ int cctrsv_();
+    extern /* Subroutine */ void cctrsv_(integer*, char*, char*, char*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, icd, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static integer ict, icu;
     static real err;
 
@@ -2418,21 +2261,7 @@ L130:
 
 } /* cchk3_ */
 
-/* Subroutine */ int cchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-complex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-complex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
@@ -2444,21 +2273,21 @@ ftnlen sname_len;
     static integer incx, incy;
     static logical null;
     static integer i__, j, m, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static complex alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys, ia, nc, nd, im, in;
-    extern /* Subroutine */ int ccgerc_();
+    extern /* Subroutine */ void ccgerc_(integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*);
     static integer ms, ix, iy, ns, lx, ly;
-    extern /* Subroutine */ int ccgeru_();
-    extern logical lceres_();
+    extern /* Subroutine */ void ccgeru_(integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*);
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
     static real errmax;
     static complex transl;
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static complex als;
     static real err;
 
@@ -2786,21 +2615,7 @@ L150:
 
 } /* cchk4_ */
 
-/* Subroutine */ int cchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-complex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-complex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2818,10 +2633,12 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, j, n;
-    extern /* Subroutine */ int cmake_(), ccher_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void ccher_(integer*, char*, integer*, real*, complex*, integer*, complex*, integer*, ftnlen);
     static complex alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int cchpr_(), cmvch_();
+    extern /* Subroutine */ void cchpr_(integer*, char*, integer*, real*, complex*, integer*, complex*, ftnlen);
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -2832,11 +2649,11 @@ ftnlen sname_len;
     static logical packed;
     static integer ix, ns, lx;
     static real ralpha;
-    extern logical lceres_();
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
     static real errmax;
     static complex transl;
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static real err;
 
 /*  Tests CHER and CHPR. */
@@ -3160,21 +2977,7 @@ L130:
 
 } /* cchk5_ */
 
-/* Subroutine */ int cchk6_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-complex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-complex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk6_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -3192,25 +2995,26 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, j, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static complex alpha, w[2];
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ int ccher2_(), cchpr2_();
+    extern /* Subroutine */ void ccher2_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ void cchpr2_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, ftnlen);
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, iy, ns, lx, ly;
-    extern logical lceres_();
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
     static real errmax;
     static complex transl;
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static complex als;
     static real err;
 
@@ -3597,24 +3401,7 @@ L170:
 
 } /* cchk6_ */
 
-/* Subroutine */ int cmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, 
-	incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len)
-char *trans;
-integer *m, *n;
-complex *alpha, *a;
-integer *nmax;
-complex *x;
-integer *incx;
-complex *beta, *y;
-integer *incy;
-complex *yt;
-real *g;
-complex *yy;
-real *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen trans_len;
+/* Subroutine */ int cmvch_(char* trans, integer* m, integer* n, complex* alpha, complex* a, integer* nmax, complex* x, integer* incx, complex* beta, complex* y, integer* incy, complex* yt, real* g, complex* yy, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len)
 {
 
     /* System generated locals */
@@ -3812,9 +3599,7 @@ L80:
 
 } /* cmvch_ */
 
-logical lce_(ri, rj, lr)
-complex *ri, *rj;
-integer *lr;
+logical lce_(complex* ri, complex* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1, i__2, i__3;
@@ -3861,13 +3646,7 @@ L30:
 
 } /* lce_ */
 
-logical lceres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-complex *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lceres_(char* type__, char* uplo, integer* m, integer* n, complex* aa, complex* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
@@ -3960,9 +3739,7 @@ L80:
 
 } /* lceres_ */
 
-/* Complex */ VOID cbeg_( ret_val, reset)
-complex * ret_val;
-logical *reset;
+/* Complex */ VOID cbeg_(complex* ret_val, logical* reset)
 {
     /* System generated locals */
     real r__1, r__2;
@@ -4023,8 +3800,7 @@ L10:
 
 } /* cbeg_ */
 
-doublereal sdiff_(x, y)
-real *x, *y;
+doublereal sdiff_(real* x, real* y)
 {
     /* System generated locals */
     real ret_val;
@@ -4044,19 +3820,7 @@ real *x, *y;
 
 } /* sdiff_ */
 
-/* Subroutine */ int cmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, 
-	ku, reset, transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-complex *a;
-integer *nmax;
-complex *aa;
-integer *lda, *kl, *ku;
-logical *reset;
-complex *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int cmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, complex* a, integer* nmax, complex* aa, integer* lda, integer* kl, integer* ku, logical* reset, complex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -4064,7 +3828,7 @@ ftnlen diag_len;
     complex q__1, q__2;
 
     /* Local variables */
-    extern /* Complex */ VOID cbeg_();
+    extern /* Complex */ VOID cbeg_(complex*, logical*);
     static integer ibeg, iend, ioff;
     static logical unit;
     static integer i__, j;
diff --git a/ctest/c_cblat3c.c b/ctest/c_cblat3c.c
index 1f4b967b0..5ad9b8bd8 100644
--- a/ctest/c_cblat3c.c
+++ b/ctest/c_cblat3c.c
@@ -242,130 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Common Block Declarations */
diff --git a/ctest/c_dblat1c.c b/ctest/c_dblat1c.c
index bf2f7a781..f0141f2a5 100644
--- a/ctest/c_dblat1c.c
+++ b/ctest/c_dblat1c.c
@@ -21,19 +21,6 @@ typedef float real;
 typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
-#ifdef _MSC_VER
-static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
-static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
-static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
-static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
-#else
-static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
-static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
-#endif
-#define pCf(z) (*_pCf(z))
-#define pCd(z) (*_pCd(z))
 typedef int logical;
 typedef short int shortlogical;
 typedef char logical1;
@@ -242,124 +229,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
 
 /* Common Block Declarations */
 
@@ -375,16 +244,16 @@ struct {
 static integer c__1 = 1;
 static doublereal c_b34 = 1.;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
     static doublereal sfac = 9.765625e-4;
 
     /* Local variables */
-    extern /* Subroutine */ int check0_(), check1_(), check2_(), check3_();
+    extern /* Subroutine */ int check0_(doublereal*), check1_(doublereal*), check2_(doublereal*), check3_(doublereal*);
     static integer ic;
-    extern /* Subroutine */ int header_();
+    extern /* Subroutine */ int header_(void);
 
 /*     Test program for the DOUBLE PRECISION Level 1 CBLAS. */
 /*     Based upon the original CBLAS test routine together with: */
@@ -431,7 +300,7 @@ static doublereal c_b34 = 1.;
 
 } /* MAIN__ */
 
-/* Subroutine */ int header_()
+/* Subroutine */ int header_(void)
 {
     /* Initialized data */
 
@@ -450,8 +319,7 @@ static doublereal c_b34 = 1.;
 
 } /* header_ */
 
-/* Subroutine */ int check0_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check0_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -464,7 +332,7 @@ doublereal *sfac;
 
     /* Local variables */
     static integer k;
-    extern /* Subroutine */ int drotgtest_(), stest1_();
+    extern /* Subroutine */ int drotgtest_(doublereal*,doublereal*,doublereal*,doublereal*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static doublereal sa, sb, sc, ss;
 
 /*     .. Parameters .. */
@@ -509,8 +377,7 @@ L40:
     return 0;
 } /* check0_ */
 
-/* Subroutine */ int check1_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check1_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -535,14 +402,14 @@ doublereal *sfac;
 
     /* Local variables */
     static integer i__;
-    extern doublereal dnrm2test_();
+    extern doublereal dnrm2test_(int*, doublereal*, int*);
     static doublereal stemp[1], strue[8];
-    extern /* Subroutine */ int stest_(), dscaltest_();
-    extern doublereal dasumtest_();
-    extern /* Subroutine */ int itest1_(), stest1_();
+    extern /* Subroutine */ int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(int*,doublereal*,doublereal*,int*);
+    extern doublereal dasumtest_(int*,doublereal*,int*);
+    extern /* Subroutine */ int itest1_(int*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static doublereal sx[8];
     static integer np1;
-    extern integer idamaxtest_();
+    extern integer idamaxtest_(int*,doublereal*,int*);
     static integer len;
 
 /*     .. Parameters .. */
@@ -603,8 +470,7 @@ doublereal *sfac;
     return 0;
 } /* check1_ */
 
-/* Subroutine */ int check2_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check2_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -649,10 +515,10 @@ doublereal *sfac;
 
     /* Local variables */
     static integer lenx, leny;
-    extern doublereal ddottest_();
+    extern doublereal ddottest_(int*,doublereal*,int*,doublereal*,int*);
     static integer i__, j, ksize;
-    extern /* Subroutine */ int stest_(), dcopytest_(), dswaptest_(), 
-	    daxpytest_(), stest1_();
+    extern /* Subroutine */ int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(int*,doublereal*,int*,doublereal*,int*), dswaptest_(int*,doublereal*,int*,doublereal*,int*), 
+	    daxpytest_(int*,doublereal*,doublereal*,int*,doublereal*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static integer ki, kn, mx, my;
     static doublereal sx[7], sy[7], stx[7], sty[7];
 
@@ -733,8 +599,7 @@ doublereal *sfac;
     return 0;
 } /* check2_ */
 
-/* Subroutine */ int check3_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check3_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -753,9 +618,9 @@ doublereal *sfac;
 	    ;
 
     /* Local variables */
-    extern /* Subroutine */ int drottest_();
+    extern /* Subroutine */ int drottest_(int*,doublereal*,int*,doublereal*,int*,doublereal*,doublereal*);
     static integer i__, k, ksize;
-    extern /* Subroutine */int stest_(), drotmtest_();
+    extern /* Subroutine */int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(int*,doublereal*,int*,doublereal*,int*,doublereal*);
     static integer ki, kn;
     static doublereal dparam[5], sx[10], sy[10], stx[10], sty[10];
 
@@ -826,9 +691,7 @@ doublereal *sfac;
     return 0;
 } /* check3_ */
 
-/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac)
-integer *len;
-doublereal *scomp, *strue, *ssize, *sfac;
+/* Subroutine */ int stest_(int* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac)
 {
     /* System generated locals */
     integer i__1;
@@ -836,7 +699,7 @@ doublereal *scomp, *strue, *ssize, *sfac;
 
     /* Local variables */
     static integer i__;
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(doublereal*,doublereal*);
     static doublereal sd;
 
 /*     ********************************* STEST ************************** */
@@ -892,11 +755,10 @@ L40:
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac)
-doublereal *scomp1, *strue1, *ssize, *sfac;
+/* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac)
 {
     static doublereal scomp[1], strue[1];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(int*, doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     ************************* STEST1 ***************************** */
 
@@ -923,8 +785,7 @@ doublereal *scomp1, *strue1, *ssize, *sfac;
     return 0;
 } /* stest1_ */
 
-doublereal sdiff_(sa, sb)
-doublereal *sa, *sb;
+doublereal sdiff_(doublereal* sa, doublereal* sb)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -938,8 +799,7 @@ doublereal *sa, *sb;
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int itest1_(icomp, itrue)
-integer *icomp, *itrue;
+/* Subroutine */ int itest1_(int* icomp, int* itrue)
 {
     /* Local variables */
     static integer id;
@@ -1188,4 +1048,4 @@ doublereal *dparam;
     return 0;
 } /* drotm_ */
 
-#endif
\ No newline at end of file
+#endif
diff --git a/ctest/c_dblat2c.c b/ctest/c_dblat2c.c
index f94dbc1fe..547aa808e 100644
--- a/ctest/c_dblat2c.c
+++ b/ctest/c_dblat2c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -395,7 +272,7 @@ static integer c_n1 = -1;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
@@ -413,17 +290,21 @@ static logical c_false = FALSE_;
     static logical same;
     static integer ninc, nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int dchk1_(), dchk2_(), dchk3_(), dchk4_(), 
-	    dchk5_(), dchk6_();
+    extern /* Subroutine */ int dchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk6_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
     static doublereal a[4225]	/* was [65][65] */, g[65];
     static integer i__, j;
-    extern doublereal ddiff_();
+    extern doublereal ddiff_(doublereal*, doublereal*);
     static integer n;
     static logical fatal;
     static doublereal x[65], y[65], z__[130];
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static char snaps[32], trans[1];
     static integer isnum;
     static logical ltest[16];
@@ -437,11 +318,11 @@ static logical c_false = FALSE_;
     static char snamet[12];
     static doublereal thresh;
     static logical rorder;
-    extern /* Subroutine */ int cd2chke_();
+    extern /* Subroutine */ void cd2chke_(char*, ftnlen);
     static integer layout;
     static logical ltestt, tsterr;
     static doublereal alf[7];
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static integer inc[7], nkb;
     static doublereal bet[7],eps,err;
     char   tmpchar;
@@ -977,21 +858,7 @@ L240:
 
 } /* MAIN__ */
 
-/* Subroutine */ int dchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1007,10 +874,10 @@ ftnlen sname_len;
     static integer incx, incy;
     static logical full, tran, null;
     static integer i__, m, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys;
@@ -1018,13 +885,14 @@ ftnlen sname_len;
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns;
-    extern /* Subroutine */ int cdgbmv_(), cdgemv_();
-    extern logical lderes_();
+    extern /* Subroutine */ void cdgbmv_(integer*, char*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ void cdgemv_(integer*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static char ctrans[14];
     static doublereal errmax, transl;
     static char transs[1];
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, bls, err;
     static integer iku, kls, kus;
 
@@ -1429,21 +1297,7 @@ L140:
 
 } /* dchk1_ */
 
-/* Subroutine */ int dchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1460,10 +1314,10 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, k, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -1474,12 +1328,13 @@ ftnlen sname_len;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, iy, ns, lx, ly;
-    extern logical lderes_();
-    extern /* Subroutine */ int cdsbmv_(), cdspmv_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdsbmv_(integer*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ void cdspmv_(integer*, char*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
     static doublereal errmax, transl;
-    extern /* Subroutine */ int cdsymv_();
+    extern /* Subroutine */ void cdsymv_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, bls, err;
 
 
@@ -1882,17 +1737,7 @@ L130:
 
 } /* dchk2_ */
 
-/* Subroutine */ int dchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, xt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *xt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* xt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1911,10 +1756,10 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1], cdiag[14];
     static integer i__, k, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static char diags[1];
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -1924,16 +1769,19 @@ ftnlen sname_len;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, ns, lx;
-    extern logical lderes_();
-    extern /* Subroutine */ int cdtbmv_(), cdtbsv_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtbmv_(integer*, char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtbsv_(integer*, char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static char ctrans[14];
     static doublereal errmax;
-    extern /* Subroutine */ int cdtpmv_(), cdtrmv_();
+    extern /* Subroutine */ void cdtpmv_(integer*, char*, char*, char*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtrmv_(integer*, char*, char*, char*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static doublereal transl;
-    extern /* Subroutine */ int cdtpsv_(), cdtrsv_();
+    extern /* Subroutine */ void cdtpsv_(integer*, char*, char*, char*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtrsv_(integer*, char*, char*, char*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, icd, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static integer ict, icu;
     static doublereal err;
 
@@ -2388,19 +2236,7 @@ L130:
 
 } /* dchk3_ */
 
-/* Subroutine */ int dchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
@@ -2411,17 +2247,18 @@ ftnlen sname_len;
     static integer incx, incy;
     static logical null;
     static integer i__, j, m, n;
-    extern /* Subroutine */ int dmake_(), cdger_();
+    extern /* Subroutine */ void cdger_(integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, integer*);
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys, ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax, transl;
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, err;
 
 
@@ -2727,19 +2564,7 @@ L150:
 
 } /* dchk4_ */
 
-/* Subroutine */ int dchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2757,25 +2582,25 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, j, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int cdspr_();
+    extern /* Subroutine */ void cdspr_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs;
-    extern /* Subroutine */ int cdsyr_();
+    extern /* Subroutine */ void cdsyr_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen);
     static logical upper;
     static char uplos[1];
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, ns, lx;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax, transl;
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, err;
 
 
@@ -3096,19 +2921,7 @@ L130:
 
 } /* dchk5_ */
 
-/* Subroutine */ int dchk6_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk6_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -3125,24 +2938,25 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, j, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha, w[2];
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ int cdspr2_(), cdsyr2_();
+    extern /* Subroutine */ void cdspr2_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, ftnlen);
+    extern /* Subroutine */ void cdsyr2_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen);
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, iy, ns, lx, ly;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax, transl;
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, err;
 
 /*  Tests DSYR2 and DSPR2. */
@@ -3508,25 +3322,13 @@ L170:
 
 } /* dchk6_ */
 
-/* Subroutine */ int dmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, 
-	ku, reset, transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-doublereal *a;
-integer *nmax;
-doublereal *aa;
-integer *lda, *kl, *ku;
-logical *reset;
-doublereal *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int dmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublereal* a, integer* nmax, doublereal* aa, integer* lda, integer* kl, integer* ku, logical* reset, doublereal* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
 
     /* Local variables */
-    extern doublereal dbeg_();
+    extern doublereal dbeg_(logical* );
     static integer ibeg, iend, ioff;
     static logical unit;
     static integer i__, j;
@@ -3752,28 +3554,14 @@ ftnlen diag_len;
 
 } /* dmake_ */
 
-/* Subroutine */ int dmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, 
-	incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len)
-char *trans;
-integer *m, *n;
-doublereal *alpha, *a;
-integer *nmax;
-doublereal *x;
-integer *incx;
-doublereal *beta, *y;
-integer *incy;
-doublereal *yt, *g, *yy, *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen trans_len;
+/* Subroutine */ int dmvch_(char* trans, integer* m, integer* n, doublereal* alpha, doublereal* a, integer* nmax, doublereal* x, integer* incx, doublereal* beta, doublereal* y, integer* incy, doublereal* yt, doublereal* g, doublereal* yy, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
     doublereal d__1;
 
     /* Builtin functions */
-    double sqrt();
+    double sqrt(double);
 
     /* Local variables */
     static doublereal erri;
@@ -3902,9 +3690,7 @@ L70:
 
 } /* dmvch_ */
 
-logical lde_(ri, rj, lr)
-doublereal *ri, *rj;
-integer *lr;
+logical lde_(doublereal* ri, doublereal* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1;
@@ -3949,13 +3735,7 @@ L30:
 
 } /* lde_ */
 
-logical lderes_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-doublereal *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lderes_(char* type__, char* uplo, integer* m, integer* n, doublereal* aa, doublereal* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2;
@@ -4042,8 +3822,7 @@ L80:
 
 } /* lderes_ */
 
-doublereal dbeg_(reset)
-logical *reset;
+doublereal dbeg_(logical* reset)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -4094,8 +3873,7 @@ L10:
 
 } /* dbeg_ */
 
-doublereal ddiff_(x, y)
-doublereal *x, *y;
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;
diff --git a/ctest/c_dblat3c.c b/ctest/c_dblat3c.c
index 05d6b65b0..dc3d6f9e7 100644
--- a/ctest/c_dblat3c.c
+++ b/ctest/c_dblat3c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -393,7 +270,7 @@ static logical c_true = TRUE_;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program  MAIN__() */ int main()
+/* Main program  MAIN__() */ int main(void)
 {
     /* Initialized data */
 
@@ -403,25 +280,24 @@ static logical c_false = FALSE_;
     integer i__1, i__2, i__3;
     doublereal d__1;
 
-    /* Builtin functions */
-    integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), 
-	    e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe();
-    integer f_clos();
 
     /* Local variables */
     static integer nalf, idim[9];
     static logical same;
     static integer nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int dchk1_(), dchk2_(), dchk3_(), dchk4_(), 
-	    dchk5_();
+    extern /* Subroutine */ int dchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* ab, doublereal* aa, doublereal* as, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, doublereal* w, integer* iorder, ftnlen sname_len);
     static doublereal c__[4225]	/* was [65][65] */, g[65];
     static integer i__, j;
-    extern doublereal ddiff_();
+    extern doublereal ddiff_(doublereal*, doublereal*);
     static integer n;
     static logical fatal;
     static doublereal w[130];
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical trace;
     static integer nidim;
     static char snaps[32];
@@ -433,11 +309,11 @@ static logical c_false = FALSE_;
     static char snamet[12], transa[1], transb[1];
     static doublereal thresh;
     static logical rorder;
-    extern /* Subroutine */ int cd3chke_();
+    extern /* Subroutine */ void cd3chke_(char*, ftnlen);
     static integer layout;
     static logical ltestt, tsterr;
     static doublereal alf[7];
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal bet[7], eps, err;
     char tmpchar;
 
@@ -907,21 +783,7 @@ L230:
 
 } /* MAIN__ */
 
-/* Subroutine */ int dchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *nmax;
-doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -931,29 +793,27 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static doublereal beta;
     static integer ldas, ldbs, ldcs;
     static logical same, null;
     static integer i__, k, m, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13], trana, tranb;
     static integer nargs;
     static logical reset;
-    extern /* Subroutine */ void dprcn1_();
+    extern /* Subroutine */ void dprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ia, ib, ma, mb, na, nb, nc, ik, im, in;
-    extern /* Subroutine */ int cdgemm_();
+    extern /* Subroutine */ void cdgemm_(integer*, char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static integer ks, ms, ns;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static char tranas[1], tranbs[1], transa[1], transb[1];
     static doublereal errmax;
     static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, bls, err;
 
 /*  Tests DGEMM. */
@@ -1283,23 +1143,8 @@ L130:
 
 } /* dchk1_ */
 
-/* Subroutine */ void dprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k,
-	 alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *transa, *transb;
-integer *m, *n, *k;
-doublereal *alpha;
-integer *lda, *ldb;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ void dprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char crc[14], cta[14], ctb[14];
@@ -1328,21 +1173,7 @@ ftnlen transb_len;
 } /* dprcn1_ */
 
 
-/* Subroutine */ int dchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *nmax;
-doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1353,8 +1184,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static doublereal beta;
@@ -1364,21 +1193,21 @@ ftnlen sname_len;
     static logical left, null;
     static char uplo[1];
     static integer i__, m, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13];
     static char sides[1];
     static integer nargs;
     static logical reset;
     static char uplos[1];
-    extern /* Subroutine */ void dprcn2_();
+    extern /* Subroutine */ void dprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ia, ib, na, nc, im, in, ms, ns;
-    extern logical lderes_();
-    extern /* Subroutine */ int cdsymm_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdsymm_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
     static integer laa, lbb, lda, lcc, ldb, ldc;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static integer ics;
     static doublereal als, bls;
     static integer icu;
@@ -1692,23 +1521,8 @@ L120:
 } /* dchk2_ */
 
 
-/* Subroutine */ void dprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha,
-	 lda, ldb, beta, ldc, sname_len, side_len, uplo_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo;
-integer *m, *n;
-doublereal *alpha;
-integer *lda, *ldb;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
+/* Subroutine */ void dprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char cs[14], cu[14], crc[14];
@@ -1733,19 +1547,7 @@ ftnlen uplo_len;
 } /* dprcn2_ */
 
 
-/* Subroutine */ int dchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__,
-	 iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nmax;
-doublereal *a, *aa, *as, *b, *bb, *bs, *ct, *g, *c__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* ct, doublereal* g, doublereal* c__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1766,24 +1568,24 @@ ftnlen sname_len;
     static logical left, null;
     static char uplo[1];
     static integer i__, j, m, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
     static char diags[1];
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13];
     static char sides[1];
     static integer nargs;
     static logical reset;
     static char uplos[1];
-    extern /* Subroutine */ void dprcn3_();
+    extern /* Subroutine */ void dprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen);
     static integer ia, na, nc, im, in, ms, ns;
-    extern logical lderes_();
-    extern /* Subroutine */ int cdtrmm_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
     static char tranas[1], transa[1];
-    extern /* Subroutine */ int cdtrsm_();
+    extern /* Subroutine */ void cdtrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
     static doublereal errmax;
     static integer laa, icd, lbb, lda, ldb;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static integer ics;
     static doublereal als;
     static integer ict, icu;
@@ -2165,24 +1967,8 @@ L160:
 } /* dchk3_ */
 
 
-/* Subroutine */ void dprcn3_(nout, nc, sname, iorder, side, uplo, transa, 
-	diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, 
-	transa_len, diag_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo, *transa, *diag;
-integer *m, *n;
-doublereal *alpha;
-integer *lda, *ldb;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
-ftnlen transa_len;
-ftnlen diag_len;
+/* Subroutine */ void dprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublereal* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cd[14], cs[14], cu[14], crc[14];
@@ -2219,21 +2005,7 @@ ftnlen diag_len;
 } /* dprcn3_ */
 
 
-/* Subroutine */ int dchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *nmax;
-doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2244,8 +2016,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static doublereal beta;
@@ -2255,23 +2025,23 @@ ftnlen sname_len;
     static logical tran, null;
     static char uplo[1];
     static integer i__, j, k, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13];
     static integer nargs;
     static logical reset;
     static char trans[1];
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ void dprcn4_();
+    extern /* Subroutine */ void dprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
-    extern /* Subroutine */ int cdsyrk_();
+    extern /* Subroutine */ void cdsyrk_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, lda, lcc, ldc;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als;
     static integer ict, icu;
     static doublereal err;
@@ -2586,23 +2356,8 @@ L130:
 } /* dchk4_ */
 
 
-/* Subroutine */ void dprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublereal *alpha;
-integer *lda;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ void dprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -2629,21 +2384,7 @@ ftnlen transa_len;
 } /* dprcn4_ */
 
 
-/* Subroutine */ int dchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, 
-	c__, cc, cs, ct, g, w, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *nmax;
-doublereal *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct, *g, *w;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* ab, doublereal* aa, doublereal* as, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, doublereal* w, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2653,8 +2394,6 @@ ftnlen sname_len;
     /* System generated locals */
     integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static integer jjab;
@@ -2665,23 +2404,23 @@ ftnlen sname_len;
     static logical tran, null;
     static char uplo[1];
     static integer i__, j, k, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13];
     static integer nargs;
     static logical reset;
     static char trans[1];
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ void dprcn5_();
+    extern /* Subroutine */ void dprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
     static char transs[1];
     static integer laa, lbb, lda, lcc, ldb, ldc;
-    extern logical lde_();
-    extern /* Subroutine */ int cdsyr2k_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
+    extern /* Subroutine */ void cdsyr2k_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal als;
     static integer ict, icu;
     static doublereal err;
@@ -3048,23 +2787,8 @@ L160:
 } /* dchk5_ */
 
 
-/* Subroutine */ void dprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublereal *alpha;
-integer *lda, *ldb;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ void dprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -3091,25 +2815,13 @@ ftnlen transa_len;
 } /* dprcn5_ */
 
 
-/* Subroutine */ int dmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset,
-	 transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-doublereal *a;
-integer *nmax;
-doublereal *aa;
-integer *lda;
-logical *reset;
-doublereal *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int dmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublereal* a, integer* nmax, doublereal* aa, integer* lda, logical* reset, doublereal* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
 
     /* Local variables */
-    extern doublereal dbeg_();
+    extern doublereal dbeg_(logical*);
     static integer ibeg, iend;
     static logical unit;
     static integer i__, j;
@@ -3241,25 +2953,7 @@ ftnlen diag_len;
 
 } /* dmake_ */
 
-/* Subroutine */ int dmmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, 
-	beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, 
-	transa_len, transb_len)
-char *transa, *transb;
-integer *m, *n, *kk;
-doublereal *alpha, *a;
-integer *lda;
-doublereal *b;
-integer *ldb;
-doublereal *beta, *c__;
-integer *ldc;
-doublereal *ct, *g, *cc;
-integer *ldcc;
-doublereal *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ int dmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublereal* alpha, doublereal* a, integer* lda, doublereal* b, integer* ldb, doublereal* beta, doublereal* c__, integer* ldc, doublereal* ct, doublereal* g, doublereal* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
@@ -3267,8 +2961,7 @@ ftnlen transb_len;
     doublereal d__1, d__2;
 
     /* Builtin functions */
-    double sqrt();
-    integer s_wsfe(), e_wsfe(), do_fio();
+    double sqrt(double);
 
     /* Local variables */
     static doublereal erri;
@@ -3432,9 +3125,7 @@ L150:
 
 } /* dmmch_ */
 
-logical lde_(ri, rj, lr)
-doublereal *ri, *rj;
-integer *lr;
+logical lde_(doublereal* ri, doublereal* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1;
@@ -3481,13 +3172,7 @@ L30:
 
 } /* lde_ */
 
-logical lderes_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-doublereal *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lderes_(char* type__, char* uplo, integer* m, integer* n, doublereal* aa, doublereal* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2;
@@ -3576,8 +3261,7 @@ L80:
 
 } /* lderes_ */
 
-doublereal dbeg_(reset)
-logical *reset;
+doublereal dbeg_(logical* reset)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -3629,8 +3313,7 @@ L10:
 
 } /* dbeg_ */
 
-doublereal ddiff_(x, y)
-doublereal *x, *y;
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;
diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c
index 57e4707a9..1424e39b4 100644
--- a/ctest/c_sblat1c.c
+++ b/ctest/c_sblat1c.c
@@ -21,19 +21,6 @@ typedef float real;
 typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
-#ifdef _MSC_VER
-static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
-static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
-static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
-static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
-#else
-static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
-static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
-#endif
-#define pCf(z) (*_pCf(z))
-#define pCd(z) (*_pCd(z))
 typedef int logical;
 typedef short int shortlogical;
 typedef char logical1;
@@ -242,250 +229,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-#if 0
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-#endif
 
 
 /* Common Block Declarations */
@@ -502,16 +245,16 @@ struct {
 static integer c__1 = 1;
 static real c_b34 = (float)1.;
 
-/* Main program */ int main ()
+/* Main program */ int main (void)
 {
     /* Initialized data */
 
     static real sfac = (float)9.765625e-4;
 
     /* Local variables */
-    extern /* Subroutine */ int check0_(), check1_(), check2_(), check3_();
+    extern /* Subroutine */ int check0_(real*), check1_(real*), check2_(real*), check3_(real*);
     static integer ic;
-    extern /* Subroutine */ int header_();
+    extern /* Subroutine */ int header_(void);
 
 /*     Test program for the REAL             Level 1 CBLAS. */
 /*     Based upon the original CBLAS test routine together with: */
@@ -557,7 +300,7 @@ static real c_b34 = (float)1.;
     exit(0);
 } /* MAIN__ */
 
-/* Subroutine */ int header_()
+/* Subroutine */ int header_(void)
 {
     /* Initialized data */
 
@@ -580,8 +323,7 @@ static real c_b34 = (float)1.;
 
 } /* header_ */
 
-/* Subroutine */ int check0_(sfac)
-real *sfac;
+/* Subroutine */ int check0_(real *sfac)
 {
     /* Initialized data */
 
@@ -600,7 +342,7 @@ real *sfac;
 
     /* Local variables */
     static integer k;
-    extern /* Subroutine */ int srotgtest_(), stest1_();
+    extern /* Subroutine */ int srotgtest_(real*,real*,real*,real*), stest1_(real*,real*,real*,real*);
     static real sa, sb, sc, ss;
 
 /*     .. Parameters .. */
@@ -645,8 +387,7 @@ L40:
     return 0;
 } /* check0_ */
 
-/* Subroutine */ int check1_(sfac)
-real *sfac;
+/* Subroutine */ int check1_(real* sfac)
 {
     /* Initialized data */
 
@@ -692,14 +433,14 @@ real *sfac;
 
     /* Local variables */
     static integer i__;
-    extern real snrm2test_();
+    extern real snrm2test_(int*,real*,int*);
     static real stemp[1], strue[8];
-    extern /* Subroutine */ int stest_(), sscaltest_();
-    extern real sasumtest_();
-    extern /* Subroutine */ int itest1_(), stest1_();
+    extern /* Subroutine */ int stest_(int*, real*,real*,real*,real*), sscaltest_(int*,real*,real*,int*);
+    extern real sasumtest_(int*,real*,int*);
+    extern /* Subroutine */ int itest1_(int*,int*), stest1_(real*,real*,real*,real*);
     static real sx[8];
     static integer np1;
-    extern integer isamaxtest_();
+    extern integer isamaxtest_(int*,real*,int*);
     static integer len;
 
 
@@ -761,8 +502,7 @@ real *sfac;
     return 0;
 } /* check1_ */
 
-/* Subroutine */ int check2_(sfac)
-real *sfac;
+/* Subroutine */ int check2_(real* sfac)
 {
     /* Initialized data */
 
@@ -850,12 +590,12 @@ real *sfac;
 
     /* Local variables */
     static integer lenx, leny;
-    extern real sdottest_();
+    extern real sdottest_(int*,real*,int*,real*,int*);
     static integer i__, j, ksize;
-    extern /* Subroutine */ int stest_(), scopytest_(), sswaptest_(), 
-	    saxpytest_();
+    extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*), scopytest_(int*,real*,int*,real*,int*), sswaptest_(int*,real*,int*,real*,int*), 
+	    saxpytest_(int*,real*,real*,int*,real*,int*);
     static integer ki;
-    extern /* Subroutine */ int stest1_();
+    extern /* Subroutine */ int stest1_(real*,real*,real*,real*);
     static integer kn, mx, my;
     static real sx[7], sy[7], stx[7], sty[7];
 
@@ -936,8 +676,7 @@ real *sfac;
     return 0;
 } /* check2_ */
 
-/* Subroutine */ int check3_(sfac)
-real *sfac;
+/* Subroutine */ int check3_(real* sfac)
 {
     /* Initialized data */
 
@@ -969,9 +708,9 @@ real *sfac;
 	    1.17 };
 
     /* Local variables */
-    extern /* Subroutine */ void srottest_();
+    extern /* Subroutine */ void srottest_(int*,real*,int*,real*,int*,real*,real*);
     static integer i__, k, ksize;
-    extern /* Subroutine */ int stest_(), srotmtest_();
+    extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*), srotmtest_(int*,real*,int*,real*,int*,real*);
     static integer ki, kn;
     static real sx[19], sy[19], sparam[5], stx[19], sty[19];
 
@@ -1042,16 +781,14 @@ real *sfac;
     return 0;
 } /* check3_ */
 
-/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac)
-integer *len;
-real *scomp, *strue, *ssize, *sfac;
+/* Subroutine */ int stest_(int* len, real* scomp, real* strue, real* ssize, real* sfac)
 {
     integer i__1;
     real r__1, r__2, r__3, r__4, r__5;
 
     /* Local variables */
     static integer i__;
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*,real*);
     static real sd;
 
 /*     ********************************* STEST ************************** */
@@ -1107,11 +844,10 @@ L40:
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac)
-real *scomp1, *strue1, *ssize, *sfac;
+/* Subroutine */ int stest1_(real* scomp1, real* strue1, real* ssize, real* sfac)
 {
     static real scomp[1], strue[1];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*);
 
 /*     ************************* STEST1 ***************************** */
 
@@ -1138,8 +874,7 @@ real *scomp1, *strue1, *ssize, *sfac;
     return 0;
 } /* stest1_ */
 
-doublereal sdiff_(sa, sb)
-real *sa, *sb;
+doublereal sdiff_(real* sa, real* sb)
 {
     /* System generated locals */
     real ret_val;
@@ -1153,8 +888,7 @@ real *sa, *sb;
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int itest1_(icomp, itrue)
-integer *icomp, *itrue;
+/* Subroutine */ int itest1_(integer* icomp, integer* itrue)
 {
     /* Local variables */
     static integer id;
diff --git a/ctest/c_sblat2c.c b/ctest/c_sblat2c.c
index a646091f7..53295e0c7 100644
--- a/ctest/c_sblat2c.c
+++ b/ctest/c_sblat2c.c
@@ -242,255 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-#if 0
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -521,7 +272,7 @@ static integer c_n1 = -1;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
@@ -539,16 +290,20 @@ static logical c_false = FALSE_;
     static logical same;
     static integer ninc, nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int schk1_(), schk2_(), schk3_(), schk4_(), 
-	    schk5_(), schk6_();
+extern /* Subroutine */ int schk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk6_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, real* z__, integer* iorder, ftnlen sname_len);
     static real a[4225]	/* was [65][65] */, g[65];
     static integer i__, j, n;
     static logical fatal;
     static real x[65], y[65], z__[130];
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*, real*);
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static char snaps[32], trans[1];
     static integer isnum;
     static logical ltest[16];
@@ -564,12 +319,12 @@ static logical c_false = FALSE_;
     static logical rorder;
     static integer layout;
     static logical ltestt;
-    extern /* Subroutine */ int cs2chke_();
+    extern /* Subroutine */ int cs2chke_(char*, ftnlen);
     static logical tsterr;
     static real alf[7];
     static integer inc[7], nkb;
     static real bet[7];
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real eps, err;
     char   tmpchar;
     
@@ -1098,21 +853,7 @@ L240:
 
 } /* MAIN__ */
 
-/* Subroutine */ int schk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, real* alf, integer* nbet, real* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1130,24 +871,25 @@ ftnlen sname_len;
     static integer i__, m, n;
     static real alpha;
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static integer incxs, incys;
     static char trans[1];
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns;
-    extern /* Subroutine */ int csgbmv_(), csgemv_();
+    extern /* Subroutine */ void csgbmv_(integer*, char*, integer*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ void csgemv_(integer*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen);
     static char ctrans[14];
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen ltype_len, ftnlen uplo_len);
     static real transl;
     static char transs[1];
     static integer laa, lda;
     static real als, bls;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
     static integer iku, kls, kus;
 
@@ -1552,21 +1294,7 @@ L140:
 
 } /* schk1_ */
 
-/* Subroutine */ int schk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, real* alf, integer* nbet, real* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1585,9 +1313,9 @@ ftnlen sname_len;
     static integer i__, k, n;
     static real alpha;
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
@@ -1598,13 +1326,14 @@ ftnlen sname_len;
     static logical packed;
     static integer nk, ks, ix, iy, ns, lx, ly;
     static real errmax;
-    extern logical lseres_();
-    extern /* Subroutine */ int cssbmv_();
+    extern logical lseres_(char* , char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cssbmv_(integer*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen);
     static real transl;
-    extern /* Subroutine */ int csspmv_(), cssymv_();
+    extern /* Subroutine */ void csspmv_(integer*, char*, integer*, real*, real*, real*, integer*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ void cssymv_(integer*, char*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen);
     static integer laa, lda;
     static real als, bls;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYMV, SSBMV and SSPMV. */
@@ -2003,17 +1732,7 @@ L130:
 
 } /* schk2_ */
 
-/* Subroutine */ int schk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, xt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *xt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* xt, real* g, real* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2034,9 +1753,9 @@ ftnlen sname_len;
     static integer i__, k, n;
     static char diags[1];
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs;
@@ -2047,14 +1766,17 @@ ftnlen sname_len;
     static integer nk, ks, ix, ns, lx;
     static char ctrans[14];
     static real errmax;
-    extern logical lseres_();
-    extern /* Subroutine */ int cstbmv_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstbmv_(integer*, char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
     static real transl;
-    extern /* Subroutine */ int cstbsv_();
+    extern /* Subroutine */ void cstbsv_(integer*, char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
-    extern /* Subroutine */ int cstpmv_(), cstrmv_(), cstpsv_(), cstrsv_();
+    extern /* Subroutine */ void cstpmv_(integer*, char*, char*, char*, integer*, real*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstrmv_(integer*, char*, char*, char*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstpsv_(integer*, char*, char*, char*, integer*, real*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstrsv_(integer*, char*, char*, char*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
     static integer laa, icd, lda, ict, icu;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. */
@@ -2508,19 +2230,7 @@ L130:
 
 } /* schk3_ */
 
-/* Subroutine */ int schk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, real* z__, integer* iorder, ftnlen sname_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
@@ -2533,17 +2243,18 @@ ftnlen sname_len;
     static integer i__, j, m, n;
     static real alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int smake_(), csger_();
+/* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void csger_(integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, integer*);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static integer incxs, incys, ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly;
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char* , char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static real transl;
     static integer laa, lda;
     static real als;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SGER. */
@@ -2848,19 +2559,7 @@ L150:
 
 } /* schk4_ */
 
-/* Subroutine */ int schk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, real* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2880,25 +2579,25 @@ ftnlen sname_len;
     static integer i__, j, n;
     static real alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs;
-    extern /* Subroutine */ int csspr_();
+    extern /* Subroutine */ void csspr_(integer*, char*, integer*, real*, real*, integer*, real*, ftnlen);
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ int cssyr_();
+    extern /* Subroutine */ void cssyr_(integer*, char*, integer*, real*, real*, integer*, real*, integer*, ftnlen);
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, ns, lx;
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static real transl;
     static integer laa, lda;
     static real als;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYR and SSPR. */
@@ -3218,19 +2917,7 @@ L130:
 
 } /* schk5_ */
 
-/* Subroutine */ int schk6_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk6_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, real* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -3249,26 +2936,26 @@ ftnlen sname_len;
     static integer i__, j, n;
     static real alpha, w[2];
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
     static logical upper;
     static char uplos[1];
     static integer ia, ja, ic;
-    extern /* Subroutine */ int csspr2_();
+    extern /* Subroutine */ void csspr2_(integer*, char*, integer*, real*, real*, integer*, real*, integer*, real*, ftnlen);
     static integer nc, jj, lj, in;
     static logical packed;
-    extern /* Subroutine */ int cssyr2_();
+    extern /* Subroutine */ void cssyr2_(integer*, char*, integer*, real*, real*, integer*, real*, integer*, real*, integer*, ftnlen);
     static integer ix, iy, ns, lx, ly;
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen ltype_len, ftnlen uplo_len);
     static real transl;
     static integer laa, lda;
     static real als;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYR2 and SSPR2. */
@@ -3634,26 +3321,14 @@ L170:
 
 } /* schk6_ */
 
-/* Subroutine */ int smake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, 
-	ku, reset, transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-real *a;
-integer *nmax;
-real *aa;
-integer *lda, *kl, *ku;
-logical *reset;
-real *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
-{
+/* Subroutine */ int smake_(char* type__, char* uplo, char* diag, integer* m, integer* n, real* a, integer* nmax, real* aa, integer* lda, integer* kl, integer* ku, logical* reset, real* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
+ {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
 
     /* Local variables */
     static integer ibeg, iend;
-    extern doublereal sbeg_();
+    extern doublereal sbeg_(logical*);
     static integer ioff;
     static logical unit;
     static integer i__, j;
@@ -3879,28 +3554,14 @@ ftnlen diag_len;
 
 } /* smake_ */
 
-/* Subroutine */ int smvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, 
-	incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len)
-char *trans;
-integer *m, *n;
-real *alpha, *a;
-integer *nmax;
-real *x;
-integer *incx;
-real *beta, *y;
-integer *incy;
-real *yt, *g, *yy, *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen trans_len;
+/* Subroutine */ int smvch_(char* trans, integer* m, integer* n, real* alpha, real* a, integer* nmax, real* x, integer* incx, real* beta, real* y, integer* incy, real* yt, real* g, real* yy, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
     real r__1;
 
     /* Builtin functions */
-    double sqrt();
+    double sqrt(double);
 
     /* Local variables */
     static real erri;
@@ -4029,9 +3690,7 @@ L70:
 
 } /* smvch_ */
 
-logical lse_(ri, rj, lr)
-real *ri, *rj;
-integer *lr;
+logical lse_(real* ri, real* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1;
@@ -4076,13 +3735,7 @@ L30:
 
 } /* lse_ */
 
-logical lseres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-real *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen ltype_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2;
@@ -4169,8 +3822,7 @@ L80:
 
 } /* lseres_ */
 
-doublereal sbeg_(reset)
-logical *reset;
+doublereal sbeg_(logical* reset)
 {
     /* System generated locals */
     real ret_val;
@@ -4221,8 +3873,7 @@ L10:
 
 } /* sbeg_ */
 
-doublereal sdiff_(x, y)
-real *x, *y;
+doublereal sdiff_(real* x, real* y)
 {
     /* System generated locals */
     real ret_val;
diff --git a/ctest/c_sblat3c.c b/ctest/c_sblat3c.c
index 73897a8a5..402c58c8b 100644
--- a/ctest/c_sblat3c.c
+++ b/ctest/c_sblat3c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -393,7 +270,7 @@ static logical c_true = TRUE_;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program  MAIN__() */ int main()
+/* Main program  MAIN__() */ int main(void)
 {
     /* Initialized data */
 
@@ -402,26 +279,25 @@ static logical c_false = FALSE_;
     /* System generated locals */
     integer i__1, i__2, i__3;
     real r__1;
-    /* Builtin functions */
-    integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), 
-	    e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe();
-    integer f_clos();
 
     /* Local variables */
     static integer nalf, idim[9];
     static logical same;
     static integer nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int schk1_(), schk2_(), schk3_(), schk4_(), 
-	    schk5_();
+    extern /* Subroutine */ int schk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int schk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int schk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int schk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int schk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
     static real c__[4225]	/* was [65][65] */, g[65];
     static integer i__, j, n;
     static logical fatal;
     static real w[130];
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*, real*);
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int smmch_();
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static char snaps[32];
     static integer isnum;
     static logical ltest[6];
@@ -433,9 +309,9 @@ static logical c_false = FALSE_;
     static logical rorder;
     static integer layout;
     static logical ltestt, tsterr;
-    extern /* Subroutine */ int cs3chke_();
+    extern /* Subroutine */ void cs3chke_(char*, ftnlen);
     static real alf[7], bet[7];
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real eps, err;
     char tmpchar;
 
@@ -899,21 +775,7 @@ L230:
 
 } /* MAIN__ */
 
-/* Subroutine */ int schk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *nmax;
-real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -923,8 +785,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static real beta;
@@ -936,18 +796,17 @@ ftnlen sname_len;
     static logical trana, tranb;
     static integer nargs;
     static logical reset;
-    extern /* Subroutine */ void sprcn1_();
-    extern /* Subroutine */ int smake_();
-    extern /* Subroutine */ int smmch_();
+    extern /* Subroutine */ void sprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
-    extern /* Subroutine */ int csgemm_();
+    extern /* Subroutine */ void csgemm_(integer*, char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static char tranas[1], tranbs[1], transa[1], transb[1];
     static real errmax;
-    extern logical lseres_();
-    extern logical lse_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern logical lse_(real*, real*, integer*);
     static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
     static real als, bls;
-    extern logical lse_();
     static real err;
 
 /*  Tests SGEMM. */
@@ -1278,23 +1137,8 @@ L130:
 
 
 
-/* Subroutine */ void sprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k,
-	 alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *transa, *transb;
-integer *m, *n, *k;
-real *alpha;
-integer *lda, *ldb;
-real *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ void sprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char crc[14], cta[14], ctb[14];
@@ -1324,21 +1168,7 @@ ftnlen transb_len;
 } /* sprcn1_ */
 
 
-/* Subroutine */ int schk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *nmax;
-real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1349,8 +1179,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static real beta;
@@ -1368,15 +1196,15 @@ ftnlen sname_len;
     static char uplos[1];
     static integer ia, ib, na, nc, im, in, ms, ns;
     static real errmax;
-    extern logical lseres_();
-    extern /* Subroutine */ int cssymm_();
-    extern void sprcn2_();
-    extern int smake_();
-    extern int smmch_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cssymm_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern void sprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static integer laa, lbb, lda, lcc, ldb, ldc, ics;
     static real als, bls;
     static integer icu;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYMM. */
@@ -1685,23 +1513,8 @@ L120:
 } /* schk2_ */
 
 
-/* Subroutine */ void sprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha,
-	 lda, ldb, beta, ldc, sname_len, side_len, uplo_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo;
-integer *m, *n;
-real *alpha;
-integer *lda, *ldb;
-real *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
+/* Subroutine */ void sprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char cs[14], cu[14], crc[14];
@@ -1726,19 +1539,7 @@ ftnlen uplo_len;
 } /* sprcn2_ */
 
 
-/* Subroutine */ int schk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__,
-	 iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nmax;
-real *a, *aa, *as, *b, *bb, *bs, *ct, *g, *c__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* ct, real* g, real* c__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1751,8 +1552,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static char diag[1];
@@ -1769,18 +1568,19 @@ ftnlen sname_len;
     static integer nargs;
     static logical reset;
     static char uplos[1];
-    extern /* Subroutine */ void sprcn3_();
+    extern /* Subroutine */ void sprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, real*, integer*, integer*, ftnlen , ftnlen, ftnlen, ftnlen, ftnlen);
     static integer ia, na, nc, im, in, ms, ns;
     static char tranas[1], transa[1];
     static real errmax;
-    extern int smake_();
-    extern int smmch_();
-    extern logical lseres_();
-    extern /* Subroutine */ int cstrmm_(), cstrsm_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstrmm_(integer*, char*, char*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstrsm_(integer*, char*, char*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
     static integer laa, icd, lbb, lda, ldb, ics;
     static real als;
     static integer ict, icu;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests STRMM and STRSM. */
@@ -2155,24 +1955,8 @@ L160:
 } /* schk3_ */
 
 
-/* Subroutine */ void sprcn3_(nout, nc, sname, iorder, side, uplo, transa, 
-	diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, 
-	transa_len, diag_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo, *transa, *diag;
-integer *m, *n;
-real *alpha;
-integer *lda, *ldb;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
-ftnlen transa_len;
-ftnlen diag_len;
+/* Subroutine */ void sprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, real* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cd[14], cs[14], cu[14], crc[14];
@@ -2210,21 +1994,7 @@ ftnlen diag_len;
 } /* sprcn3_ */
 
 
-/* Subroutine */ int schk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *nmax;
-real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2235,8 +2005,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static real beta;
@@ -2253,18 +2021,18 @@ ftnlen sname_len;
     static char trans[1];
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ void sprcn4_();
-    extern /* Subroutine */ int smake_();
-    extern /* Subroutine */ int smmch_();
+    extern /* Subroutine */ void sprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns;
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static char transs[1];
-    extern /* Subroutine */ int cssyrk_();
+    extern /* Subroutine */ void cssyrk_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static integer laa, lda, lcc, ldc;
     static real als;
     static integer ict, icu;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYRK. */
@@ -2575,23 +2343,8 @@ L130:
 } /* schk4_ */
 
 
-/* Subroutine */ void sprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-real *alpha;
-integer *lda;
-real *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ void sprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, real* alpha, integer* lda, real* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -2619,21 +2372,7 @@ ftnlen transa_len;
 } /* sprcn4_ */
 
 
-/* Subroutine */ int schk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, 
-	c__, cc, cs, ct, g, w, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *nmax;
-real *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct, *g, *w;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* ab, real* aa, real* as, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, real* w, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2643,8 +2382,6 @@ ftnlen sname_len;
     /* System generated locals */
     integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static integer jjab;
@@ -2663,18 +2400,18 @@ ftnlen sname_len;
     static logical upper;
     static char uplos[1];
     static integer ia, ib;
-    extern /* Subroutine */ void sprcn5_();
+    extern /* Subroutine */ void sprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
     static integer jc, ma, na, nc, ik, in, jj, lj, ks, ns;
     static real errmax;
-    extern logical lseres_();
-    extern int smake_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, lbb, lda, lcc, ldb, ldc;
     static real als;
     static integer ict, icu;
-    extern /* Subroutine */ int cssyr2k_();
-    extern logical lse_();
-    extern int smmch_();
+    extern /* Subroutine */ void cssyr2k_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern logical lse_(real*, real*, integer*);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static real err;
 
 /*  Tests SSYR2K. */
@@ -3037,23 +2774,8 @@ L160:
 } /* schk5_ */
 
 
-/* Subroutine */ void sprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-real *alpha;
-integer *lda, *ldb;
-real *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ void sprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -3081,19 +2803,7 @@ ftnlen transa_len;
 } /* sprcn5_ */
 
 
-/* Subroutine */ int smake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset,
-	 transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-real *a;
-integer *nmax;
-real *aa;
-integer *lda;
-logical *reset;
-real *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int smake_(char* type__, char* uplo, char* diag, integer* m, integer* n, real* a, integer* nmax, real* aa, integer* lda, logical* reset, real* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
@@ -3102,7 +2812,7 @@ ftnlen diag_len;
 
     /* Local variables */
     static integer ibeg, iend;
-    extern doublereal sbeg_();
+    extern doublereal sbeg_(logical*);
     static logical unit;
     static integer i__, j;
     static logical lower, upper, gen, tri, sym;
@@ -3233,25 +2943,7 @@ ftnlen diag_len;
 
 } /* smake_ */
 
-/* Subroutine */ int smmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, 
-	beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, 
-	transa_len, transb_len)
-char *transa, *transb;
-integer *m, *n, *kk;
-real *alpha, *a;
-integer *lda;
-real *b;
-integer *ldb;
-real *beta, *c__;
-integer *ldc;
-real *ct, *g, *cc;
-integer *ldcc;
-real *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ int smmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, real* alpha, real* a, integer* lda, real* b, integer* ldb, real* beta, real* c__, integer* ldc, real* ct, real* g, real* cc, integer* ldcc, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len)
 {
 
     /* System generated locals */
@@ -3260,8 +2952,7 @@ ftnlen transb_len;
     real r__1, r__2;
 
     /* Builtin functions */
-    double sqrt();
-    integer s_wsfe(), e_wsfe(), do_fio();
+    double sqrt(double);
 
     /* Local variables */
     static real erri;
@@ -3426,9 +3117,7 @@ L150:
 
 } /* smmch_ */
 
-logical lse_(ri, rj, lr)
-real *ri, *rj;
-integer *lr;
+logical lse_(real* ri, real* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1;
@@ -3475,13 +3164,7 @@ L30:
 
 } /* lse_ */
 
-logical lseres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-real *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2;
@@ -3572,8 +3255,7 @@ L80:
 
 } /* lseres_ */
 
-doublereal sbeg_(reset)
-logical *reset;
+doublereal sbeg_(logical* reset)
 {
     /* System generated locals */
     real ret_val;
@@ -3625,8 +3307,7 @@ L10:
 
 } /* sbeg_ */
 
-doublereal sdiff_(x, y)
-real *x, *y;
+doublereal sdiff_(real* x, real* y)
 {
     /* System generated locals */
     real ret_val;
diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c
index d5b080633..4761e63d7 100644
--- a/ctest/c_zblat1c.c
+++ b/ctest/c_zblat1c.c
@@ -242,250 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-#if 0
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-#endif
 
 /* Common Block Declarations */
 
@@ -502,16 +258,16 @@ static integer c__1 = 1;
 static integer c__5 = 5;
 static doublereal c_b43 = 1.;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
     static doublereal sfac = 9.765625e-4;
 
     /* Local variables */
-    extern /* Subroutine */ int check1_(), check2_();
+    extern /* Subroutine */ int check1_(doublereal*), check2_(doublereal*);
     static integer ic;
-    extern /* Subroutine */ int header_();
+    extern /* Subroutine */ int header_(void);
 
 /*     Test program for the COMPLEX*16 Level 1 CBLAS. */
 /*     Based upon the original CBLAS test routine together with: */
@@ -551,7 +307,7 @@ static doublereal c_b43 = 1.;
     exit(0);
 } /* MAIN__ */
 
-/* Subroutine */ int header_()
+/* Subroutine */ int header_(void)
 {
     /* Initialized data */
 
@@ -570,8 +326,7 @@ static doublereal c_b43 = 1.;
 
 } /* header_ */
 
-/* Subroutine */ int check1_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check1_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -623,15 +378,15 @@ doublereal *sfac;
 
     /* Local variables */
     static integer i__;
-    extern /* Subroutine */ int ctest_();
+    extern /* Subroutine */ int ctest_(int*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
     static doublecomplex mwpcs[5], mwpct[5];
-    extern /* Subroutine */ int zscaltest_(), itest1_(), stest1_();
+    extern /* Subroutine */ int zscaltest_(int*, doublereal*, doublecomplex*, int*), itest1_(int*, int*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
     static doublecomplex cx[8];
-    extern doublereal dznrm2test_();
+    extern doublereal dznrm2test_(integer*, doublecomplex*, integer*);
     static integer np1;
-    extern /* Subroutine */ int zdscaltest_();
-    extern integer izamaxtest_();
-    extern doublereal dzasumtest_();
+    extern /* Subroutine */ int zdscaltest_(integer*, doublereal*, doublecomplex*, integer*);
+    extern integer izamaxtest_(integer*, doublecomplex*, integer*);
+    extern doublereal dzasumtest_(integer*, doublecomplex*, integer*);
     static integer len;
 
 /*     .. Parameters .. */
@@ -748,8 +503,7 @@ doublereal *sfac;
     return 0;
 } /* check1_ */
 
-/* Subroutine */ int check2_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check2_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -834,14 +588,14 @@ doublereal *sfac;
     /* Local variables */
     static doublecomplex cdot[1];
     static integer lenx, leny, i__;
-    extern /* Subroutine */ int ctest_();
+    extern /* Subroutine */ int ctest_(int*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
     static integer ksize;
     static doublecomplex ztemp;
-    extern /* Subroutine */ int zdotctest_(), zcopytest_();
+    extern /* Subroutine */ int zdotctest_(int*, doublecomplex*, int*, doublecomplex*, int*, doublecomplex*), zcopytest_(int*, doublecomplex*, int*, doublecomplex*, int*);
     static integer ki;
-    extern /* Subroutine */ int zdotutest_(), zswaptest_();
+    extern /* Subroutine */ int zdotutest_(int*, doublecomplex*, int*, doublecomplex*, int*, doublecomplex*), zswaptest_(int*, doublecomplex*, int*, doublecomplex*, int*);
     static integer kn;
-    extern /* Subroutine */ int zaxpytest_();
+    extern /* Subroutine */ int zaxpytest_(int*, doublereal*, doublecomplex*, int*, doublecomplex*, int*);
     static doublecomplex cx[7], cy[7];
     static integer mx, my;
 
@@ -923,20 +677,18 @@ doublereal *sfac;
     return 0;
 } /* check2_ */
 
-/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac)
-integer *len;
-doublereal *scomp, *strue, *ssize, *sfac;
+/* Subroutine */ int stest_(integer* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac)
 {
     /* System generated locals */
     integer i__1;
     doublereal d__1, d__2, d__3, d__4, d__5;
 
     /* Builtin functions */
-    integer s_wsfe(), e_wsfe(), do_fio();
+    integer s_wsfe(void), e_wsfe(void), do_fio(void);
 
     /* Local variables */
     static integer i__;
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(doublereal*, doublereal*);
     static doublereal sd;
 
 /*     ********************************* STEST ************************** */
@@ -992,11 +744,10 @@ L40:
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac)
-doublereal *scomp1, *strue1, *ssize, *sfac;
+/* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac)
 {
     static doublereal scomp[1], strue[1];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(int*,doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     ************************* STEST1 ***************************** */
 
@@ -1023,8 +774,7 @@ doublereal *scomp1, *strue1, *ssize, *sfac;
     return 0;
 } /* stest1_ */
 
-doublereal sdiff_(sa, sb)
-doublereal *sa, *sb;
+doublereal sdiff_(doublereal* sa, doublereal* sb)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -1038,10 +788,7 @@ doublereal *sa, *sb;
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int ctest_(len, ccomp, ctrue, csize, sfac)
-integer *len;
-doublecomplex *ccomp, *ctrue, *csize;
-doublereal *sfac;
+/* Subroutine */ int ctest_(integer* len, doublecomplex* ccomp, doublecomplex* ctrue, doublecomplex* csize, doublereal* sfac)
 {
     /* System generated locals */
     integer i__1, i__2;
@@ -1049,7 +796,7 @@ doublereal *sfac;
     /* Local variables */
     static integer i__;
     static doublereal scomp[20], ssize[20], strue[20];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(integer*, doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     **************************** CTEST ***************************** */
 
@@ -1087,8 +834,7 @@ doublereal *sfac;
     return 0;
 } /* ctest_ */
 
-/* Subroutine */ int itest1_(icomp, itrue)
-integer *icomp, *itrue;
+/* Subroutine */ int itest1_(integer* icomp, integer* itrue)
 {
     static integer id;
 
diff --git a/ctest/c_zblat2c.c b/ctest/c_zblat2c.c
index 92f03a045..8abd58394 100644
--- a/ctest/c_zblat2c.c
+++ b/ctest/c_zblat2c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -396,7 +273,7 @@ static integer c_n1 = -1;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
@@ -414,19 +291,23 @@ static logical c_false = FALSE_;
     static logical same;
     static integer ninc, nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int zchk1_(), zchk2_(), zchk3_(), zchk4_(), 
-	    zchk5_(), zchk6_();
+    extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk6_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
     static doublecomplex a[4225]	/* was [65][65] */;
     static doublereal g[65];
     static integer i__, j;
-    extern doublereal ddiff_();
+    extern doublereal ddiff_(doublereal*, doublereal*);
     static integer n;
     static logical fatal;
     static doublecomplex x[65], y[65], z__[130];
     static logical trace;
     static integer nidim;
     static char snaps[32], trans[1];
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer isnum;
     static logical ltest[17];
     static doublecomplex aa[4225];
@@ -441,12 +322,12 @@ static logical c_false = FALSE_;
     static logical rorder;
     static integer layout;
     static logical ltestt, tsterr;
-    extern /* Subroutine */ int cz2chke_();
+    extern /* Subroutine */ void cz2chke_(char*, ftnlen);
     static doublecomplex alf[7];
     static integer inc[7], nkb;
     static doublecomplex bet[7];
     static doublereal eps, err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
     char tmpchar;
 
 /*  Test program for the DOUBLE PRECISION COMPLEX Level 2 Blas. */
@@ -984,22 +865,7 @@ L240:
 
 } /* MAIN__ */
 
-/* Subroutine */ int zchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1018,27 +884,27 @@ ftnlen sname_len;
     static integer i__, m, n;
     static doublecomplex alpha;
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys;
     static char trans[1];
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns;
-    extern /* Subroutine */ int czgbmv_();
+    extern /* Subroutine */ void czgbmv_(integer*, char*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
     static char ctrans[14];
-    extern /* Subroutine */ int czgemv_();
+    extern /* Subroutine */ void czgemv_(integer*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
     static doublereal errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, lda;
     static doublecomplex als, bls;
     static doublereal err;
     static integer iku, kls;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
     static integer kus;
 
 
@@ -1451,22 +1317,7 @@ L140:
 
 } /* zchk1_ */
 
-/* Subroutine */ int zchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1486,27 +1337,28 @@ ftnlen sname_len;
     static integer i__, k, n;
     static doublecomplex alpha;
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static char uplos[1];
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, iy, ns, lx, ly;
-    extern /* Subroutine */ int czhbmv_(), czhemv_();
+    extern /* Subroutine */ void czhbmv_(integer*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ void czhemv_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
     static doublereal errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
-    extern /* Subroutine */ int czhpmv_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void czhpmv_(integer*, char*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
     static integer laa, lda;
     static doublecomplex als, bls;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests CHEMV, CHBMV and CHPMV. */
 
@@ -1909,19 +1761,7 @@ L130:
 
 } /* zchk2_ */
 
-/* Subroutine */ int zchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, xt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *xt;
-doublereal *g;
-doublecomplex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* xt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1942,13 +1782,13 @@ ftnlen sname_len;
     static integer i__, k, n;
     static char diags[1];
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs;
     static char trans[1];
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static char uplos[1];
     static logical banded;
     static integer nc, ik, in;
@@ -1957,14 +1797,17 @@ ftnlen sname_len;
     static char ctrans[14];
     static doublereal errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
-    extern /* Subroutine */ int cztbmv_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztbmv_(integer*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
-    extern /* Subroutine */ int cztbsv_(), cztpmv_(), cztrmv_(), cztpsv_(), 
-	    cztrsv_();
+    extern /* Subroutine */ void cztbsv_(integer*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztpmv_(integer*, char*, char*, char*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztpsv_(integer*, char*, char*, char*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrmv_(integer*, char*, char*, char*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrsv_(integer*, char*, char*, char*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static integer laa, icd, lda, ict, icu;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 
 
@@ -2422,21 +2265,7 @@ L130:
 
 } /* zchk3_ */
 
-/* Subroutine */ int zchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-doublecomplex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
@@ -2450,21 +2279,21 @@ ftnlen sname_len;
     static integer i__, j, m, n;
     static doublecomplex alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys;
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly;
-    extern /* Subroutine */ int czgerc_();
+    extern /* Subroutine */ void czgerc_(integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*);
     static doublereal errmax;
-    extern /* Subroutine */ int czgeru_();
+    extern /* Subroutine */ void czgeru_(integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*);
     static doublecomplex transl;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lda;
     static doublecomplex als;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 
 
@@ -2793,21 +2622,7 @@ L150:
 
 } /* zchk4_ */
 
-/* Subroutine */ int zchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-doublecomplex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2827,13 +2642,14 @@ ftnlen sname_len;
     static integer i__, j, n;
     static doublecomplex alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int czher_();
+    extern /* Subroutine */ void czher_(integer*, char*, integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs;
-    extern /* Subroutine */ int czhpr_(), zmvch_();
+    extern /* Subroutine */ void czhpr_(integer*, char*, integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, ftnlen);
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static logical upper;
     static char uplos[1];
     static integer ia, ja, ic, nc, jj, lj, in;
@@ -2841,10 +2657,10 @@ ftnlen sname_len;
     static integer ix, ns, lx;
     static doublereal ralpha, errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lda;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHER and ZHPR. */
 
@@ -3167,21 +2983,7 @@ L130:
 
 } /* zchk5_ */
 
-/* Subroutine */ int zchk6_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-doublecomplex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk6_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -3201,25 +3003,26 @@ ftnlen sname_len;
     static integer i__, j, n;
     static doublecomplex alpha, w[2];
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ int czher2_(), czhpr2_();
+    extern /* Subroutine */ void czher2_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ void czhpr2_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, ftnlen);
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, iy, ns, lx, ly;
     static doublereal errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lda;
     static doublecomplex als;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHER2 and ZHPR2. */
 
@@ -3604,24 +3407,7 @@ L170:
 
 } /* zchk6_ */
 
-/* Subroutine */ int zmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, 
-	incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len)
-char *trans;
-integer *m, *n;
-doublecomplex *alpha, *a;
-integer *nmax;
-doublecomplex *x;
-integer *incx;
-doublecomplex *beta, *y;
-integer *incy;
-doublecomplex *yt;
-doublereal *g;
-doublecomplex *yy;
-doublereal *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen trans_len;
+/* Subroutine */ int zmvch_(char* trans, integer* m, integer* n, doublecomplex* alpha, doublecomplex* a, integer* nmax, doublecomplex* x, integer* incx, doublecomplex* beta, doublecomplex* y, integer* incy, doublecomplex* yt, doublereal* g, doublecomplex* yy, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len)
 {
 
     /* System generated locals */
@@ -3819,9 +3605,7 @@ L80:
 
 } /* zmvch_ */
 
-logical lze_(ri, rj, lr)
-doublecomplex *ri, *rj;
-integer *lr;
+logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1, i__2, i__3;
@@ -3868,13 +3652,7 @@ L30:
 
 } /* lze_ */
 
-logical lzeres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-doublecomplex *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex* aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
@@ -3967,9 +3745,7 @@ L80:
 
 } /* lzeres_ */
 
-/* Double Complex */ VOID zbeg_( ret_val, reset)
-doublecomplex * ret_val;
-logical *reset;
+/* Double Complex */ VOID zbeg_( doublecomplex* ret_val, logical* reset)
 {
     /* System generated locals */
     doublereal d__1, d__2;
@@ -4030,8 +3806,7 @@ L10:
 
 } /* zbeg_ */
 
-doublereal ddiff_(x, y)
-doublereal *x, *y;
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -4051,19 +3826,7 @@ doublereal *x, *y;
 
 } /* ddiff_ */
 
-/* Subroutine */ int zmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, 
-	ku, reset, transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-doublecomplex *a;
-integer *nmax;
-doublecomplex *aa;
-integer *lda, *kl, *ku;
-logical *reset;
-doublecomplex *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, integer* kl, integer* ku, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -4072,7 +3835,7 @@ ftnlen diag_len;
 
     /* Local variables */
     static integer ibeg, iend, ioff;
-    extern /* Double Complex */ VOID zbeg_();
+    extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*);
     static logical unit;
     static integer i__, j;
     static logical lower;
diff --git a/ctest/c_zblat3c.c b/ctest/c_zblat3c.c
index eca2c3ff6..6025c0052 100644
--- a/ctest/c_zblat3c.c
+++ b/ctest/c_zblat3c.c
@@ -22,14 +22,11 @@ typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
 #ifdef _MSC_VER
-static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
 static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
-static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
 static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
 #else
 static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
 static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
 static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
 #endif
 #define pCf(z) (*_pCf(z))
@@ -242,124 +239,7 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
+
 
 /* Common Block Declarations */
 
@@ -388,7 +268,7 @@ static logical c_true = TRUE_;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program  MAIN__() */ int main()
+/* Main program  MAIN__() */ int main(void)
 {
     /* Initialized data */
 
@@ -400,26 +280,29 @@ static logical c_false = FALSE_;
     doublereal d__1;
 
     /* Builtin functions */
-    integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), 
-	    e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe();
+    integer s_rsle(void), do_lio(void), e_rsle(void), f_open(void), s_wsfe(void), do_fio(void), 
+	    e_wsfe(void), s_wsle(void), e_wsle(void), s_rsfe(void), e_rsfe(void);
 
     /* Local variables */
     static integer nalf, idim[9];
     static logical same;
     static integer nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int zchk1_(), zchk2_(), zchk3_(), zchk4_(), 
-	    zchk5_();
+    extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
     static doublecomplex c__[4225]	/* was [65][65] */;
     static doublereal g[65];
     static integer i__, j;
-    extern doublereal ddiff_();
+    extern doublereal ddiff_(doublereal*, doublereal*);
     static integer n;
     static logical fatal;
     static doublecomplex w[130];
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static char snaps[32];
     static integer isnum;
     static logical ltest[9];
@@ -431,10 +314,10 @@ static logical c_false = FALSE_;
     static logical rorder;
     static integer layout;
     static logical ltestt, tsterr;
-    extern /* Subroutine */ int cz3chke_();
+    extern /* Subroutine */ int cz3chke_(char*, ftnlen);
     static doublecomplex alf[7], bet[7];
     static doublereal eps, err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
     char tmpchar;
     
 /*  Test program for the COMPLEX*16          Level 3 Blas. */
@@ -924,22 +807,7 @@ L230:
 
 } /* MAIN__ */
 
-/* Subroutine */ int zchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *nmax;
-doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -956,21 +824,21 @@ ftnlen sname_len;
     static integer i__, k, m, n;
     static doublecomplex alpha;
     static logical isame[13], trana, tranb;
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical reset;
     static integer ia, ib;
-    extern /* Subroutine */ int zprcn1_();
+    extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
-    extern /* Subroutine */ int czgemm_();
+    extern /* Subroutine */ void czgemm_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static char tranas[1], tranbs[1], transa[1], transb[1];
     static doublereal errmax;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
     static doublecomplex als, bls;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZGEMM. */
 
@@ -1313,20 +1181,7 @@ L130:
 } /* zchk1_ */
 
 
-/* Subroutine */ int zprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k,
-	 alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *transa, *transb;
-integer *m, *n, *k;
-doublecomplex *alpha;
-integer *lda, *ldb;
-doublecomplex *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len)
 {
     /* Local variables */
     static char crc[14], cta[14], ctb[14];
@@ -1357,22 +1212,7 @@ return 0;
 } /* zprcn1_ */
 
 
-/* Subroutine */ int zchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *nmax;
-doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1394,23 +1234,23 @@ ftnlen sname_len;
     static doublecomplex alpha;
     static logical isame[13];
     static char sides[1];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical reset;
     static char uplos[1];
     static integer ia, ib;
-    extern /* Subroutine */ int zprcn2_();
+    extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static integer na, nc, im, in, ms, ns;
-    extern /* Subroutine */ int czhemm_();
+    extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
-    extern logical lzeres_();
-    extern /* Subroutine */ int czsymm_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lbb, lda, lcc, ldb, ldc, ics;
     static doublecomplex als, bls;
     static integer icu;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHEMM and ZSYMM. */
 
@@ -1737,20 +1577,7 @@ L120:
 } /* zchk2_ */
 
 
-/* Subroutine */ int zprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha,
-	 lda, ldb, beta, ldc, sname_len, side_len, uplo_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo;
-integer *m, *n;
-doublecomplex *alpha;
-integer *lda, *ldb;
-doublecomplex *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
+/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len)
 {
     /* Local variables */
     static char cs[14], cu[14], crc[14];
@@ -1777,21 +1604,7 @@ return 0;
 } /* zprcn2_ */
 
 
-/* Subroutine */ int zchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__,
-	 iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nmax;
-doublecomplex *a, *aa, *as, *b, *bb, *bs, *ct;
-doublereal *g;
-doublecomplex *c__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1817,23 +1630,24 @@ ftnlen sname_len;
     static char diags[1];
     static logical isame[13];
     static char sides[1];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical reset;
     static char uplos[1];
     static integer ia, na;
-    extern /* Subroutine */ int zprcn3_();
+    extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen);
     static integer nc, im, in, ms, ns;
     static char tranas[1], transa[1];
     static doublereal errmax;
-    extern logical lzeres_();
-    extern /* Subroutine */ int cztrmm_(), cztrsm_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
     static integer laa, icd, lbb, lda, ldb, ics;
     static doublecomplex als;
     static integer ict, icu;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZTRMM and ZTRSM. */
 
@@ -2227,21 +2041,7 @@ L160:
 } /* zchk3_ */
 
 
-/* Subroutine */ int zprcn3_(nout, nc, sname, iorder, side, uplo, transa, 
-	diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, 
-	transa_len, diag_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo, *transa, *diag;
-integer *m, *n;
-doublecomplex *alpha;
-integer *lda, *ldb;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
-ftnlen transa_len;
-ftnlen diag_len;
+/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len)
 {
 
     /* Local variables */
@@ -2281,22 +2081,7 @@ return 0;
 } /* zprcn3_ */
 
 
-/* Subroutine */ int zchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *nmax;
-doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2320,30 +2105,30 @@ ftnlen sname_len;
     static doublecomplex alpha;
     static doublereal rbeta;
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static doublereal rbets;
     static logical reset;
     static char trans[1];
     static logical upper;
     static char uplos[1];
     static integer ia, ib, jc, ma, na;
-    extern /* Subroutine */ int zprcn4_();
+    extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static integer nc;
-    extern /* Subroutine */ int zprcn6_();
+    extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ik, in, jj, lj, ks, ns;
     static doublereal ralpha;
-    extern /* Subroutine */ int czherk_();
+    extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static char transs[1], transt[1];
-    extern /* Subroutine */ int czsyrk_();
+    extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lda, lcc, ldc;
     static doublecomplex als;
     static integer ict, icu;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHERK and ZSYRK. */
 
@@ -2732,20 +2517,7 @@ L130:
 } /* zchk4_ */
 
 
-/* Subroutine */ int zprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublecomplex *alpha;
-integer *lda;
-doublecomplex *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -2775,20 +2547,7 @@ return 0;
 
 
 
-/* Subroutine */ int zprcn6_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublereal *alpha;
-integer *lda;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
 
     /* Local variables */
@@ -2818,23 +2577,7 @@ return 0;
 } /* zprcn6_ */
 
 
-/* Subroutine */ int zchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, 
-	c__, cc, cs, ct, g, w, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *nmax;
-doublecomplex *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct;
-doublereal *g;
-doublecomplex *w;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2857,27 +2600,28 @@ ftnlen sname_len;
     static doublecomplex alpha;
     static doublereal rbeta;
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static doublereal rbets;
     static logical reset;
     static char trans[1];
     static logical upper;
     static char uplos[1];
     static integer ia, ib, jc, ma, na, nc;
-    extern /* Subroutine */ int zprcn5_(), zprcn7_();
+    extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ik, in, jj, lj, ks, ns;
     static doublereal errmax;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static char transs[1], transt[1];
-    extern /* Subroutine */ int czher2k_();
+    extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lbb, lda, lcc, ldb, ldc;
     static doublecomplex als;
     static integer ict, icu;
-    extern /* Subroutine */ int czsyr2k_();
+    extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHER2K and ZSYR2K. */
 
@@ -3349,20 +3093,7 @@ L160:
 } /* zchk5_ */
 
 
-/* Subroutine */ int zprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublecomplex *alpha;
-integer *lda, *ldb;
-doublecomplex *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -3392,20 +3123,7 @@ return 0;
 
 
 
-/* Subroutine */ int zprcn7_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublecomplex *alpha;
-integer *lda, *ldb;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
 
     /* Local variables */
@@ -3435,19 +3153,7 @@ return 0;
 } /* zprcn7_ */
 
 
-/* Subroutine */ int zmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset,
-	 transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-doublecomplex *a;
-integer *nmax;
-doublecomplex *aa;
-integer *lda;
-logical *reset;
-doublecomplex *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -3456,7 +3162,7 @@ ftnlen diag_len;
 
     /* Local variables */
     static integer ibeg, iend;
-    extern /* Double Complex */ VOID zbeg_();
+    extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*);
     static logical unit;
     static integer i__, j;
     static logical lower, upper;
@@ -3629,27 +3335,7 @@ ftnlen diag_len;
 
 } /* zmake_ */
 
-/* Subroutine */ int zmmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, 
-	beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, 
-	transa_len, transb_len)
-char *transa, *transb;
-integer *m, *n, *kk;
-doublecomplex *alpha, *a;
-integer *lda;
-doublecomplex *b;
-integer *ldb;
-doublecomplex *beta, *c__;
-integer *ldc;
-doublecomplex *ct;
-doublereal *g;
-doublecomplex *cc;
-integer *ldcc;
-doublereal *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len)
 {
 
     /* System generated locals */
@@ -3658,7 +3344,7 @@ ftnlen transb_len;
     doublereal d__1, d__2, d__3, d__4, d__5, d__6;
     doublecomplex z__1, z__2, z__3, z__4;
 
-    double sqrt();
+    double sqrt(double);
     /* Local variables */
     static doublereal erri;
     static integer i__, j, k;
@@ -4031,9 +3717,7 @@ L250:
 
 } /* zmmch_ */
 
-logical lze_(ri, rj, lr)
-doublecomplex *ri, *rj;
-integer *lr;
+logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1, i__2, i__3;
@@ -4082,13 +3766,7 @@ L30:
 
 } /* lze_ */
 
-logical lzeres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-doublecomplex *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
@@ -4184,9 +3862,7 @@ L80:
 
 } /* lzeres_ */
 
-/* Double Complex */ VOID zbeg_( ret_val, reset)
-doublecomplex * ret_val;
-logical *reset;
+/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset)
 {
     /* System generated locals */
     doublereal d__1, d__2;
@@ -4249,8 +3925,7 @@ L10:
 
 } /* zbeg_ */
 
-doublereal ddiff_(x, y)
-doublereal *x, *y;
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;

From 4041b7fb42dcba67d99a0e00e1e820b6cc29f7fb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Oct 2023 22:33:08 +0200
Subject: [PATCH 346/718] fix function prototypes in f2c-converted files

---
 lapack-netlib/INSTALL/dlamch.c           | 230 +--------------------
 lapack-netlib/INSTALL/droundup_lwork.c   | 141 -------------
 lapack-netlib/INSTALL/dsecnd_INT_ETIME.c | 144 +-------------
 lapack-netlib/INSTALL/ilaver.c           | 243 +----------------------
 lapack-netlib/INSTALL/second_INT_ETIME.c | 144 +-------------
 lapack-netlib/INSTALL/slamch.c           | 229 ---------------------
 lapack-netlib/INSTALL/sroundup_lwork.c   | 141 -------------
 7 files changed, 6 insertions(+), 1266 deletions(-)

diff --git a/lapack-netlib/INSTALL/dlamch.c b/lapack-netlib/INSTALL/dlamch.c
index 744130a87..ce6b76a32 100644
--- a/lapack-netlib/INSTALL/dlamch.c
+++ b/lapack-netlib/INSTALL/dlamch.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
@@ -291,217 +273,7 @@ static double dpow_ui(double x, integer n) {
 	}
 	return pow;
 }
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
+
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/INSTALL/droundup_lwork.c b/lapack-netlib/INSTALL/droundup_lwork.c
index d05923a46..6d1f39a12 100644
--- a/lapack-netlib/INSTALL/droundup_lwork.c
+++ b/lapack-netlib/INSTALL/droundup_lwork.c
@@ -223,7 +223,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -237,145 +236,5 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
 #endif
diff --git a/lapack-netlib/INSTALL/dsecnd_INT_ETIME.c b/lapack-netlib/INSTALL/dsecnd_INT_ETIME.c
index 36853ef4b..e4230a6e4 100644
--- a/lapack-netlib/INSTALL/dsecnd_INT_ETIME.c
+++ b/lapack-netlib/INSTALL/dsecnd_INT_ETIME.c
@@ -223,7 +223,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -237,149 +236,10 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
 #endif
-/*  -- translated by f2c (version 20000121).
+
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c
index b274af292..184e9b78d 100644
--- a/lapack-netlib/INSTALL/ilaver.c
+++ b/lapack-netlib/INSTALL/ilaver.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/INSTALL/second_INT_ETIME.c b/lapack-netlib/INSTALL/second_INT_ETIME.c
index 51b7ddb54..f2ad529e3 100644
--- a/lapack-netlib/INSTALL/second_INT_ETIME.c
+++ b/lapack-netlib/INSTALL/second_INT_ETIME.c
@@ -223,7 +223,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -237,149 +236,10 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
 #endif
-/*  -- translated by f2c (version 20000121).
+
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/INSTALL/slamch.c b/lapack-netlib/INSTALL/slamch.c
index a000ed664..2def9337a 100644
--- a/lapack-netlib/INSTALL/slamch.c
+++ b/lapack-netlib/INSTALL/slamch.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,11 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
@@ -279,229 +273,6 @@ static float spow_ui(float x, integer n) {
 	}
 	return pow;
 }
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/INSTALL/sroundup_lwork.c b/lapack-netlib/INSTALL/sroundup_lwork.c
index d05923a46..6d1f39a12 100644
--- a/lapack-netlib/INSTALL/sroundup_lwork.c
+++ b/lapack-netlib/INSTALL/sroundup_lwork.c
@@ -223,7 +223,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -237,145 +236,5 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
 #endif

From 281f1e44325bf4229d55e9b4869ea3cdaf20a745 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Oct 2023 22:36:29 +0200
Subject: [PATCH 347/718] fix function prototypes in f2c-converted files

---
 lapack-netlib/TESTING/MATGEN/clagge.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/claghe.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clagsy.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clahilb.c | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clakf2.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clarge.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clarnd.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/claror.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clarot.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clatm1.c  | 235 -----------------------
 lapack-netlib/TESTING/MATGEN/clatm2.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clatm3.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clatm5.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clatm6.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clatme.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clatmr.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clatms.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/clatmt.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlagge.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlagsy.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlahilb.c | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlakf2.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlaran.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlarge.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlarnd.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlaror.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlarot.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlatm1.c  | 235 -----------------------
 lapack-netlib/TESTING/MATGEN/dlatm2.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlatm3.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlatm5.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlatm6.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlatm7.c  | 235 -----------------------
 lapack-netlib/TESTING/MATGEN/dlatme.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlatmr.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlatms.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/dlatmt.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slagge.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slagsy.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slahilb.c | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slakf2.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slaran.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slarge.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slarnd.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slaror.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slarot.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slatm1.c  | 235 -----------------------
 lapack-netlib/TESTING/MATGEN/slatm2.c  | 247 ------------------------
 lapack-netlib/TESTING/MATGEN/slatm3.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slatm5.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slatm6.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slatm7.c  | 235 -----------------------
 lapack-netlib/TESTING/MATGEN/slatme.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slatmr.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slatms.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/slatmt.c  | 248 ------------------------
 lapack-netlib/TESTING/MATGEN/zlagge.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlaghe.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlagsy.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlahilb.c | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlakf2.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlarge.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlarnd.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlaror.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlarot.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlatm1.c  | 236 -----------------------
 lapack-netlib/TESTING/MATGEN/zlatm2.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlatm3.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlatm5.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlatm6.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlatme.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlatmr.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlatms.c  | 249 -------------------------
 lapack-netlib/TESTING/MATGEN/zlatmt.c  | 249 -------------------------
 74 files changed, 18291 deletions(-)

diff --git a/lapack-netlib/TESTING/MATGEN/clagge.c b/lapack-netlib/TESTING/MATGEN/clagge.c
index f05905bd7..62c33d01e 100644
--- a/lapack-netlib/TESTING/MATGEN/clagge.c
+++ b/lapack-netlib/TESTING/MATGEN/clagge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/claghe.c b/lapack-netlib/TESTING/MATGEN/claghe.c
index 77ed94912..009329dd8 100644
--- a/lapack-netlib/TESTING/MATGEN/claghe.c
+++ b/lapack-netlib/TESTING/MATGEN/claghe.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clagsy.c b/lapack-netlib/TESTING/MATGEN/clagsy.c
index de215bc63..5117f85db 100644
--- a/lapack-netlib/TESTING/MATGEN/clagsy.c
+++ b/lapack-netlib/TESTING/MATGEN/clagsy.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clahilb.c b/lapack-netlib/TESTING/MATGEN/clahilb.c
index 95b00fb3e..1ee293c6f 100644
--- a/lapack-netlib/TESTING/MATGEN/clahilb.c
+++ b/lapack-netlib/TESTING/MATGEN/clahilb.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clakf2.c b/lapack-netlib/TESTING/MATGEN/clakf2.c
index e15bc58ab..2b1b4e905 100644
--- a/lapack-netlib/TESTING/MATGEN/clakf2.c
+++ b/lapack-netlib/TESTING/MATGEN/clakf2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clarge.c b/lapack-netlib/TESTING/MATGEN/clarge.c
index 0215436b7..25801da89 100644
--- a/lapack-netlib/TESTING/MATGEN/clarge.c
+++ b/lapack-netlib/TESTING/MATGEN/clarge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clarnd.c b/lapack-netlib/TESTING/MATGEN/clarnd.c
index 260f246d5..26a22a3e0 100644
--- a/lapack-netlib/TESTING/MATGEN/clarnd.c
+++ b/lapack-netlib/TESTING/MATGEN/clarnd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b CLARND */
diff --git a/lapack-netlib/TESTING/MATGEN/claror.c b/lapack-netlib/TESTING/MATGEN/claror.c
index cd0d15300..b0d73f37c 100644
--- a/lapack-netlib/TESTING/MATGEN/claror.c
+++ b/lapack-netlib/TESTING/MATGEN/claror.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clarot.c b/lapack-netlib/TESTING/MATGEN/clarot.c
index f1e73ed5b..bd5f01cd3 100644
--- a/lapack-netlib/TESTING/MATGEN/clarot.c
+++ b/lapack-netlib/TESTING/MATGEN/clarot.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm1.c b/lapack-netlib/TESTING/MATGEN/clatm1.c
index 665b16057..2651091c8 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm1.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm1.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,11 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
@@ -279,235 +273,6 @@ static float spow_ui(float x, integer n) {
 	}
 	return pow;
 }
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm2.c b/lapack-netlib/TESTING/MATGEN/clatm2.c
index cf60fa296..2dceff497 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm2.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b CLATM2 */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm3.c b/lapack-netlib/TESTING/MATGEN/clatm3.c
index 58cd4e551..fcd8dbfcb 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm3.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm3.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b CLATM3 */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm5.c b/lapack-netlib/TESTING/MATGEN/clatm5.c
index c2b81ccf3..8fbc1c0a6 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm5.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm5.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm6.c b/lapack-netlib/TESTING/MATGEN/clatm6.c
index 96a3bd290..f6992a362 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm6.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm6.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatme.c b/lapack-netlib/TESTING/MATGEN/clatme.c
index a905f5608..fd0aec8ec 100644
--- a/lapack-netlib/TESTING/MATGEN/clatme.c
+++ b/lapack-netlib/TESTING/MATGEN/clatme.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatmr.c b/lapack-netlib/TESTING/MATGEN/clatmr.c
index 4a117a9d4..2f987f744 100644
--- a/lapack-netlib/TESTING/MATGEN/clatmr.c
+++ b/lapack-netlib/TESTING/MATGEN/clatmr.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatms.c b/lapack-netlib/TESTING/MATGEN/clatms.c
index 3f7011166..006b33b7a 100644
--- a/lapack-netlib/TESTING/MATGEN/clatms.c
+++ b/lapack-netlib/TESTING/MATGEN/clatms.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatmt.c b/lapack-netlib/TESTING/MATGEN/clatmt.c
index 3bb7ca4d9..f6a842861 100644
--- a/lapack-netlib/TESTING/MATGEN/clatmt.c
+++ b/lapack-netlib/TESTING/MATGEN/clatmt.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlagge.c b/lapack-netlib/TESTING/MATGEN/dlagge.c
index dcca84326..5482c3ff6 100644
--- a/lapack-netlib/TESTING/MATGEN/dlagge.c
+++ b/lapack-netlib/TESTING/MATGEN/dlagge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlagsy.c b/lapack-netlib/TESTING/MATGEN/dlagsy.c
index 2812db86d..efcbe3f99 100644
--- a/lapack-netlib/TESTING/MATGEN/dlagsy.c
+++ b/lapack-netlib/TESTING/MATGEN/dlagsy.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlahilb.c b/lapack-netlib/TESTING/MATGEN/dlahilb.c
index dccfcd8c7..d671adee6 100644
--- a/lapack-netlib/TESTING/MATGEN/dlahilb.c
+++ b/lapack-netlib/TESTING/MATGEN/dlahilb.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlakf2.c b/lapack-netlib/TESTING/MATGEN/dlakf2.c
index 0b403540a..b4a09204b 100644
--- a/lapack-netlib/TESTING/MATGEN/dlakf2.c
+++ b/lapack-netlib/TESTING/MATGEN/dlakf2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlaran.c b/lapack-netlib/TESTING/MATGEN/dlaran.c
index 3aed1fce0..f13ada834 100644
--- a/lapack-netlib/TESTING/MATGEN/dlaran.c
+++ b/lapack-netlib/TESTING/MATGEN/dlaran.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLARAN */
diff --git a/lapack-netlib/TESTING/MATGEN/dlarge.c b/lapack-netlib/TESTING/MATGEN/dlarge.c
index 5d8a81387..5cc7fbce8 100644
--- a/lapack-netlib/TESTING/MATGEN/dlarge.c
+++ b/lapack-netlib/TESTING/MATGEN/dlarge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlarnd.c b/lapack-netlib/TESTING/MATGEN/dlarnd.c
index 8e0f43ee5..3119097bf 100644
--- a/lapack-netlib/TESTING/MATGEN/dlarnd.c
+++ b/lapack-netlib/TESTING/MATGEN/dlarnd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLARND */
diff --git a/lapack-netlib/TESTING/MATGEN/dlaror.c b/lapack-netlib/TESTING/MATGEN/dlaror.c
index d9e2e46ae..fdd126174 100644
--- a/lapack-netlib/TESTING/MATGEN/dlaror.c
+++ b/lapack-netlib/TESTING/MATGEN/dlaror.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlarot.c b/lapack-netlib/TESTING/MATGEN/dlarot.c
index 1d7a4939d..3eadf15c6 100644
--- a/lapack-netlib/TESTING/MATGEN/dlarot.c
+++ b/lapack-netlib/TESTING/MATGEN/dlarot.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm1.c b/lapack-netlib/TESTING/MATGEN/dlatm1.c
index aa81469e7..aa58300a3 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm1.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm1.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
@@ -291,223 +273,6 @@ static double dpow_ui(double x, integer n) {
 	}
 	return pow;
 }
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLATM1 */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm2.c b/lapack-netlib/TESTING/MATGEN/dlatm2.c
index d74bc9168..7491e9829 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm2.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLATM2 */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm3.c b/lapack-netlib/TESTING/MATGEN/dlatm3.c
index 86f964ced..a9d26c7fc 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm3.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm3.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLATM3 */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm5.c b/lapack-netlib/TESTING/MATGEN/dlatm5.c
index 94b49d6e3..7f1c36428 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm5.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm5.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm6.c b/lapack-netlib/TESTING/MATGEN/dlatm6.c
index fee10724a..0b3fbb42f 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm6.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm6.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm7.c b/lapack-netlib/TESTING/MATGEN/dlatm7.c
index eb42e2b8b..c907feff4 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm7.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm7.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
@@ -291,223 +273,6 @@ static double dpow_ui(double x, integer n) {
 	}
 	return pow;
 }
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLATM7 */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatme.c b/lapack-netlib/TESTING/MATGEN/dlatme.c
index a92c70ef2..e29df164c 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatme.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatme.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatmr.c b/lapack-netlib/TESTING/MATGEN/dlatmr.c
index e6ce5456d..77e224840 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatmr.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatmr.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatms.c b/lapack-netlib/TESTING/MATGEN/dlatms.c
index d56d26a50..c9a8b226e 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatms.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatms.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatmt.c b/lapack-netlib/TESTING/MATGEN/dlatmt.c
index b61e8b779..6b0871257 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatmt.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatmt.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slagge.c b/lapack-netlib/TESTING/MATGEN/slagge.c
index c3ec8f2b8..9213d9688 100644
--- a/lapack-netlib/TESTING/MATGEN/slagge.c
+++ b/lapack-netlib/TESTING/MATGEN/slagge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slagsy.c b/lapack-netlib/TESTING/MATGEN/slagsy.c
index 62614593d..fad4cf5fa 100644
--- a/lapack-netlib/TESTING/MATGEN/slagsy.c
+++ b/lapack-netlib/TESTING/MATGEN/slagsy.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slahilb.c b/lapack-netlib/TESTING/MATGEN/slahilb.c
index ae633d3c9..7367d71a2 100644
--- a/lapack-netlib/TESTING/MATGEN/slahilb.c
+++ b/lapack-netlib/TESTING/MATGEN/slahilb.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slakf2.c b/lapack-netlib/TESTING/MATGEN/slakf2.c
index 67be11662..388d630e1 100644
--- a/lapack-netlib/TESTING/MATGEN/slakf2.c
+++ b/lapack-netlib/TESTING/MATGEN/slakf2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slaran.c b/lapack-netlib/TESTING/MATGEN/slaran.c
index 43982d860..af9b56412 100644
--- a/lapack-netlib/TESTING/MATGEN/slaran.c
+++ b/lapack-netlib/TESTING/MATGEN/slaran.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLARAN */
diff --git a/lapack-netlib/TESTING/MATGEN/slarge.c b/lapack-netlib/TESTING/MATGEN/slarge.c
index 6b37e9400..d5fbd541c 100644
--- a/lapack-netlib/TESTING/MATGEN/slarge.c
+++ b/lapack-netlib/TESTING/MATGEN/slarge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slarnd.c b/lapack-netlib/TESTING/MATGEN/slarnd.c
index af9bf4a57..eabfa13c0 100644
--- a/lapack-netlib/TESTING/MATGEN/slarnd.c
+++ b/lapack-netlib/TESTING/MATGEN/slarnd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLARND */
diff --git a/lapack-netlib/TESTING/MATGEN/slaror.c b/lapack-netlib/TESTING/MATGEN/slaror.c
index 48b532dfd..7e3065432 100644
--- a/lapack-netlib/TESTING/MATGEN/slaror.c
+++ b/lapack-netlib/TESTING/MATGEN/slaror.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slarot.c b/lapack-netlib/TESTING/MATGEN/slarot.c
index a23e186e1..8d2f51f92 100644
--- a/lapack-netlib/TESTING/MATGEN/slarot.c
+++ b/lapack-netlib/TESTING/MATGEN/slarot.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm1.c b/lapack-netlib/TESTING/MATGEN/slatm1.c
index 5c4cdc067..8acbdfa3e 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm1.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm1.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,11 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
@@ -279,235 +273,6 @@ static float spow_ui(float x, integer n) {
 	}
 	return pow;
 }
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLATM1 */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm2.c b/lapack-netlib/TESTING/MATGEN/slatm2.c
index e7b72006f..833ee5dea 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm2.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm2.c
@@ -261,253 +261,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLATM2 */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm3.c b/lapack-netlib/TESTING/MATGEN/slatm3.c
index 4f9f5fee2..cdf96ef51 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm3.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm3.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLATM3 */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm5.c b/lapack-netlib/TESTING/MATGEN/slatm5.c
index 24ee0915d..9122bc041 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm5.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm5.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm6.c b/lapack-netlib/TESTING/MATGEN/slatm6.c
index c50fcd29f..6e3306d29 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm6.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm6.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm7.c b/lapack-netlib/TESTING/MATGEN/slatm7.c
index e1b32ac9f..5ea2e32f6 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm7.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm7.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,11 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
@@ -279,235 +273,6 @@ static float spow_ui(float x, integer n) {
 	}
 	return pow;
 }
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLATM7 */
diff --git a/lapack-netlib/TESTING/MATGEN/slatme.c b/lapack-netlib/TESTING/MATGEN/slatme.c
index a8a6b39a3..126c42121 100644
--- a/lapack-netlib/TESTING/MATGEN/slatme.c
+++ b/lapack-netlib/TESTING/MATGEN/slatme.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatmr.c b/lapack-netlib/TESTING/MATGEN/slatmr.c
index 01de66f54..cc227a9d8 100644
--- a/lapack-netlib/TESTING/MATGEN/slatmr.c
+++ b/lapack-netlib/TESTING/MATGEN/slatmr.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatms.c b/lapack-netlib/TESTING/MATGEN/slatms.c
index 8e85f5db8..95f005858 100644
--- a/lapack-netlib/TESTING/MATGEN/slatms.c
+++ b/lapack-netlib/TESTING/MATGEN/slatms.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatmt.c b/lapack-netlib/TESTING/MATGEN/slatmt.c
index e34bb8c03..cd1c3c7c1 100644
--- a/lapack-netlib/TESTING/MATGEN/slatmt.c
+++ b/lapack-netlib/TESTING/MATGEN/slatmt.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlagge.c b/lapack-netlib/TESTING/MATGEN/zlagge.c
index bfa33cf34..56d0318e2 100644
--- a/lapack-netlib/TESTING/MATGEN/zlagge.c
+++ b/lapack-netlib/TESTING/MATGEN/zlagge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlaghe.c b/lapack-netlib/TESTING/MATGEN/zlaghe.c
index 06a8e31a3..63a8a05d6 100644
--- a/lapack-netlib/TESTING/MATGEN/zlaghe.c
+++ b/lapack-netlib/TESTING/MATGEN/zlaghe.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlagsy.c b/lapack-netlib/TESTING/MATGEN/zlagsy.c
index 431522ebe..8804bfbce 100644
--- a/lapack-netlib/TESTING/MATGEN/zlagsy.c
+++ b/lapack-netlib/TESTING/MATGEN/zlagsy.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlahilb.c b/lapack-netlib/TESTING/MATGEN/zlahilb.c
index 0e9cc1c4a..ff3d06525 100644
--- a/lapack-netlib/TESTING/MATGEN/zlahilb.c
+++ b/lapack-netlib/TESTING/MATGEN/zlahilb.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlakf2.c b/lapack-netlib/TESTING/MATGEN/zlakf2.c
index f7a90eef7..4c08bd112 100644
--- a/lapack-netlib/TESTING/MATGEN/zlakf2.c
+++ b/lapack-netlib/TESTING/MATGEN/zlakf2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlarge.c b/lapack-netlib/TESTING/MATGEN/zlarge.c
index 4b06a818d..848cde007 100644
--- a/lapack-netlib/TESTING/MATGEN/zlarge.c
+++ b/lapack-netlib/TESTING/MATGEN/zlarge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlarnd.c b/lapack-netlib/TESTING/MATGEN/zlarnd.c
index 014d48afb..ff1bf1467 100644
--- a/lapack-netlib/TESTING/MATGEN/zlarnd.c
+++ b/lapack-netlib/TESTING/MATGEN/zlarnd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* > \brief \b ZLARND */
diff --git a/lapack-netlib/TESTING/MATGEN/zlaror.c b/lapack-netlib/TESTING/MATGEN/zlaror.c
index 6ada57b8a..c8a84f215 100644
--- a/lapack-netlib/TESTING/MATGEN/zlaror.c
+++ b/lapack-netlib/TESTING/MATGEN/zlaror.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlarot.c b/lapack-netlib/TESTING/MATGEN/zlarot.c
index 6b2e88b32..be951dcb5 100644
--- a/lapack-netlib/TESTING/MATGEN/zlarot.c
+++ b/lapack-netlib/TESTING/MATGEN/zlarot.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm1.c b/lapack-netlib/TESTING/MATGEN/zlatm1.c
index c75787730..447edc259 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm1.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm1.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
@@ -291,224 +273,6 @@ static double dpow_ui(double x, integer n) {
 	}
 	return pow;
 }
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm2.c b/lapack-netlib/TESTING/MATGEN/zlatm2.c
index e1689d875..e6fe75d06 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm2.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* > \brief \b ZLATM2 */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm3.c b/lapack-netlib/TESTING/MATGEN/zlatm3.c
index c35ffe4d9..6370a9d39 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm3.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm3.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* > \brief \b ZLATM3 */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm5.c b/lapack-netlib/TESTING/MATGEN/zlatm5.c
index 753ee0ce6..5ee6cc8ce 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm5.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm5.c
@@ -248,7 +248,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -262,254 +261,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm6.c b/lapack-netlib/TESTING/MATGEN/zlatm6.c
index e20ec7092..f394cb1df 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm6.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm6.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatme.c b/lapack-netlib/TESTING/MATGEN/zlatme.c
index ded6ca214..11127aa51 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatme.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatme.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatmr.c b/lapack-netlib/TESTING/MATGEN/zlatmr.c
index c42a25f2b..02a4cf16f 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatmr.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatmr.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatms.c b/lapack-netlib/TESTING/MATGEN/zlatms.c
index f0bb66a44..87f26059d 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatms.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatms.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatmt.c b/lapack-netlib/TESTING/MATGEN/zlatmt.c
index 551b17628..64c30ec17 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatmt.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatmt.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */

From 1806cfecbcb30bc8eb442522fe8aca65601ad11d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 7 Oct 2023 22:38:30 +0200
Subject: [PATCH 348/718] fix function prototypes in f2c-converted files

---
 lapack-netlib/SRC/DEPRECATED/cgegs.c  | 241 -------------------------
 lapack-netlib/SRC/DEPRECATED/cgegv.c  | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/cgelsx.c | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/cgeqpf.c | 241 -------------------------
 lapack-netlib/SRC/DEPRECATED/cggsvd.c | 243 +------------------------
 lapack-netlib/SRC/DEPRECATED/cggsvp.c | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/clahrd.c | 248 --------------------------
 lapack-netlib/SRC/DEPRECATED/clatzm.c | 248 --------------------------
 lapack-netlib/SRC/DEPRECATED/ctzrqf.c | 247 -------------------------
 lapack-netlib/SRC/DEPRECATED/dgegs.c  | 242 +------------------------
 lapack-netlib/SRC/DEPRECATED/dgegv.c  | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/dgelsx.c | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/dgeqpf.c | 243 +------------------------
 lapack-netlib/SRC/DEPRECATED/dggsvd.c | 241 -------------------------
 lapack-netlib/SRC/DEPRECATED/dggsvp.c | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/dlahrd.c | 246 -------------------------
 lapack-netlib/SRC/DEPRECATED/dlatzm.c | 248 --------------------------
 lapack-netlib/SRC/DEPRECATED/dtzrqf.c | 247 -------------------------
 lapack-netlib/SRC/DEPRECATED/sgegs.c  | 241 -------------------------
 lapack-netlib/SRC/DEPRECATED/sgegv.c  | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/sgelsx.c | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/sgeqpf.c | 243 +------------------------
 lapack-netlib/SRC/DEPRECATED/sggsvd.c | 241 -------------------------
 lapack-netlib/SRC/DEPRECATED/sggsvp.c | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/slahrd.c | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/slatzm.c | 247 -------------------------
 lapack-netlib/SRC/DEPRECATED/stzrqf.c | 248 --------------------------
 lapack-netlib/SRC/DEPRECATED/zgegs.c  | 241 -------------------------
 lapack-netlib/SRC/DEPRECATED/zgegv.c  | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/zgelsx.c | 242 -------------------------
 lapack-netlib/SRC/DEPRECATED/zgeqpf.c | 243 +------------------------
 lapack-netlib/SRC/DEPRECATED/zggsvd.c | 243 +------------------------
 lapack-netlib/SRC/DEPRECATED/zggsvp.c | 244 +------------------------
 lapack-netlib/SRC/DEPRECATED/zlahrd.c | 248 --------------------------
 lapack-netlib/SRC/DEPRECATED/zlatzm.c | 248 --------------------------
 lapack-netlib/SRC/DEPRECATED/ztzrqf.c | 248 --------------------------
 36 files changed, 7 insertions(+), 8767 deletions(-)

diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.c b/lapack-netlib/SRC/DEPRECATED/cgegs.c
index 35b59e683..270a05ebd 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgegs.c
+++ b/lapack-netlib/SRC/DEPRECATED/cgegs.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/cgegv.c b/lapack-netlib/SRC/DEPRECATED/cgegv.c
index 5a5ea919f..d485b05b6 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgegv.c
+++ b/lapack-netlib/SRC/DEPRECATED/cgegv.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/cgelsx.c b/lapack-netlib/SRC/DEPRECATED/cgelsx.c
index bdc395acf..cb3c33323 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgelsx.c
+++ b/lapack-netlib/SRC/DEPRECATED/cgelsx.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/cgeqpf.c b/lapack-netlib/SRC/DEPRECATED/cgeqpf.c
index 6c810fc79..12f8e5c76 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgeqpf.c
+++ b/lapack-netlib/SRC/DEPRECATED/cgeqpf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvd.c b/lapack-netlib/SRC/DEPRECATED/cggsvd.c
index d6f7b5903..1a8a827c1 100644
--- a/lapack-netlib/SRC/DEPRECATED/cggsvd.c
+++ b/lapack-netlib/SRC/DEPRECATED/cggsvd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvp.c b/lapack-netlib/SRC/DEPRECATED/cggsvp.c
index d29338f73..3bf587172 100644
--- a/lapack-netlib/SRC/DEPRECATED/cggsvp.c
+++ b/lapack-netlib/SRC/DEPRECATED/cggsvp.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/clahrd.c b/lapack-netlib/SRC/DEPRECATED/clahrd.c
index ea62964f8..517e4786b 100644
--- a/lapack-netlib/SRC/DEPRECATED/clahrd.c
+++ b/lapack-netlib/SRC/DEPRECATED/clahrd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/clatzm.c b/lapack-netlib/SRC/DEPRECATED/clatzm.c
index 30393dc26..747f5bc11 100644
--- a/lapack-netlib/SRC/DEPRECATED/clatzm.c
+++ b/lapack-netlib/SRC/DEPRECATED/clatzm.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/ctzrqf.c b/lapack-netlib/SRC/DEPRECATED/ctzrqf.c
index 537e77dc6..3986f7854 100644
--- a/lapack-netlib/SRC/DEPRECATED/ctzrqf.c
+++ b/lapack-netlib/SRC/DEPRECATED/ctzrqf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,252 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 
diff --git a/lapack-netlib/SRC/DEPRECATED/dgegs.c b/lapack-netlib/SRC/DEPRECATED/dgegs.c
index 70fb1aa3b..f034821a4 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgegs.c
+++ b/lapack-netlib/SRC/DEPRECATED/dgegs.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
+
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dgegv.c b/lapack-netlib/SRC/DEPRECATED/dgegv.c
index c52b4b7d2..38df37116 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgegv.c
+++ b/lapack-netlib/SRC/DEPRECATED/dgegv.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dgelsx.c b/lapack-netlib/SRC/DEPRECATED/dgelsx.c
index 86977c94e..5abeebe84 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgelsx.c
+++ b/lapack-netlib/SRC/DEPRECATED/dgelsx.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dgeqpf.c b/lapack-netlib/SRC/DEPRECATED/dgeqpf.c
index 94a431523..21c6a5c7d 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgeqpf.c
+++ b/lapack-netlib/SRC/DEPRECATED/dgeqpf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/dggsvd.c b/lapack-netlib/SRC/DEPRECATED/dggsvd.c
index 71a4010d4..e5993b833 100644
--- a/lapack-netlib/SRC/DEPRECATED/dggsvd.c
+++ b/lapack-netlib/SRC/DEPRECATED/dggsvd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dggsvp.c b/lapack-netlib/SRC/DEPRECATED/dggsvp.c
index 593d7e492..b3a53fad1 100644
--- a/lapack-netlib/SRC/DEPRECATED/dggsvp.c
+++ b/lapack-netlib/SRC/DEPRECATED/dggsvp.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dlahrd.c b/lapack-netlib/SRC/DEPRECATED/dlahrd.c
index 8b74436c4..30f7a0f7a 100644
--- a/lapack-netlib/SRC/DEPRECATED/dlahrd.c
+++ b/lapack-netlib/SRC/DEPRECATED/dlahrd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,251 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
 
 
 
diff --git a/lapack-netlib/SRC/DEPRECATED/dlatzm.c b/lapack-netlib/SRC/DEPRECATED/dlatzm.c
index 39a00abf5..698b09e85 100644
--- a/lapack-netlib/SRC/DEPRECATED/dlatzm.c
+++ b/lapack-netlib/SRC/DEPRECATED/dlatzm.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/dtzrqf.c b/lapack-netlib/SRC/DEPRECATED/dtzrqf.c
index a5cf3dcfe..ff47d4177 100644
--- a/lapack-netlib/SRC/DEPRECATED/dtzrqf.c
+++ b/lapack-netlib/SRC/DEPRECATED/dtzrqf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,252 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 
diff --git a/lapack-netlib/SRC/DEPRECATED/sgegs.c b/lapack-netlib/SRC/DEPRECATED/sgegs.c
index 48cce1ede..6ab9050e0 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgegs.c
+++ b/lapack-netlib/SRC/DEPRECATED/sgegs.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/sgegv.c b/lapack-netlib/SRC/DEPRECATED/sgegv.c
index eb129ab65..39b2d6fb6 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgegv.c
+++ b/lapack-netlib/SRC/DEPRECATED/sgegv.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/sgelsx.c b/lapack-netlib/SRC/DEPRECATED/sgelsx.c
index a887910c9..b2c480481 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgelsx.c
+++ b/lapack-netlib/SRC/DEPRECATED/sgelsx.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/sgeqpf.c b/lapack-netlib/SRC/DEPRECATED/sgeqpf.c
index ac672f804..67121341e 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgeqpf.c
+++ b/lapack-netlib/SRC/DEPRECATED/sgeqpf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/sggsvd.c b/lapack-netlib/SRC/DEPRECATED/sggsvd.c
index 0af712876..a10edf060 100644
--- a/lapack-netlib/SRC/DEPRECATED/sggsvd.c
+++ b/lapack-netlib/SRC/DEPRECATED/sggsvd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/sggsvp.c b/lapack-netlib/SRC/DEPRECATED/sggsvp.c
index bf69f4239..ca47bc629 100644
--- a/lapack-netlib/SRC/DEPRECATED/sggsvp.c
+++ b/lapack-netlib/SRC/DEPRECATED/sggsvp.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/slahrd.c b/lapack-netlib/SRC/DEPRECATED/slahrd.c
index 0cf290be2..920288686 100644
--- a/lapack-netlib/SRC/DEPRECATED/slahrd.c
+++ b/lapack-netlib/SRC/DEPRECATED/slahrd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/slatzm.c b/lapack-netlib/SRC/DEPRECATED/slatzm.c
index 1826db1a4..6e53dceca 100644
--- a/lapack-netlib/SRC/DEPRECATED/slatzm.c
+++ b/lapack-netlib/SRC/DEPRECATED/slatzm.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,252 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 
diff --git a/lapack-netlib/SRC/DEPRECATED/stzrqf.c b/lapack-netlib/SRC/DEPRECATED/stzrqf.c
index 16cf7e33a..72c0083bb 100644
--- a/lapack-netlib/SRC/DEPRECATED/stzrqf.c
+++ b/lapack-netlib/SRC/DEPRECATED/stzrqf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/zgegs.c b/lapack-netlib/SRC/DEPRECATED/zgegs.c
index 7dab0efe9..c053ec432 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgegs.c
+++ b/lapack-netlib/SRC/DEPRECATED/zgegs.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/zgegv.c b/lapack-netlib/SRC/DEPRECATED/zgegv.c
index 991a6ff11..8272bd918 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgegv.c
+++ b/lapack-netlib/SRC/DEPRECATED/zgegv.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/zgelsx.c b/lapack-netlib/SRC/DEPRECATED/zgelsx.c
index f1f39eb9c..82d195550 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgelsx.c
+++ b/lapack-netlib/SRC/DEPRECATED/zgelsx.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/zgeqpf.c b/lapack-netlib/SRC/DEPRECATED/zgeqpf.c
index 12ecc9940..0a2f8c7aa 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgeqpf.c
+++ b/lapack-netlib/SRC/DEPRECATED/zgeqpf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvd.c b/lapack-netlib/SRC/DEPRECATED/zggsvd.c
index 8f1c7e46c..e9d723404 100644
--- a/lapack-netlib/SRC/DEPRECATED/zggsvd.c
+++ b/lapack-netlib/SRC/DEPRECATED/zggsvd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvp.c b/lapack-netlib/SRC/DEPRECATED/zggsvp.c
index cbe9a9083..0409e97b1 100644
--- a/lapack-netlib/SRC/DEPRECATED/zggsvp.c
+++ b/lapack-netlib/SRC/DEPRECATED/zggsvp.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/zlahrd.c b/lapack-netlib/SRC/DEPRECATED/zlahrd.c
index 3c86494cc..41c26d98e 100644
--- a/lapack-netlib/SRC/DEPRECATED/zlahrd.c
+++ b/lapack-netlib/SRC/DEPRECATED/zlahrd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/zlatzm.c b/lapack-netlib/SRC/DEPRECATED/zlatzm.c
index b3c31d994..25c476524 100644
--- a/lapack-netlib/SRC/DEPRECATED/zlatzm.c
+++ b/lapack-netlib/SRC/DEPRECATED/zlatzm.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/ztzrqf.c b/lapack-netlib/SRC/DEPRECATED/ztzrqf.c
index 222cd33c7..3a94fa79a 100644
--- a/lapack-netlib/SRC/DEPRECATED/ztzrqf.c
+++ b/lapack-netlib/SRC/DEPRECATED/ztzrqf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */

From 65bfe1a06d3be4887dadec05a649d5519cd086b9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Oct 2023 11:36:06 +0200
Subject: [PATCH 349/718] fix prototype of itest1 for INTERFACE64

---
 ctest/c_sblat1c.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c
index 1424e39b4..7c049b796 100644
--- a/ctest/c_sblat1c.c
+++ b/ctest/c_sblat1c.c
@@ -437,7 +437,7 @@ L40:
     static real stemp[1], strue[8];
     extern /* Subroutine */ int stest_(int*, real*,real*,real*,real*), sscaltest_(int*,real*,real*,int*);
     extern real sasumtest_(int*,real*,int*);
-    extern /* Subroutine */ int itest1_(int*,int*), stest1_(real*,real*,real*,real*);
+    extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(real*,real*,real*,real*);
     static real sx[8];
     static integer np1;
     extern integer isamaxtest_(int*,real*,int*);

From 2b865da7304f930d44194681bd7ef86cf8edbb4c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Oct 2023 11:55:10 +0200
Subject: [PATCH 350/718] fix prototypes of stest and ctest for INTERFACE64

---
 ctest/c_zblat1c.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c
index 4761e63d7..d8cff5dee 100644
--- a/ctest/c_zblat1c.c
+++ b/ctest/c_zblat1c.c
@@ -378,7 +378,7 @@ static doublereal c_b43 = 1.;
 
     /* Local variables */
     static integer i__;
-    extern /* Subroutine */ int ctest_(int*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
+    extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
     static doublecomplex mwpcs[5], mwpct[5];
     extern /* Subroutine */ int zscaltest_(int*, doublereal*, doublecomplex*, int*), itest1_(int*, int*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
     static doublecomplex cx[8];
@@ -588,7 +588,7 @@ static doublereal c_b43 = 1.;
     /* Local variables */
     static doublecomplex cdot[1];
     static integer lenx, leny, i__;
-    extern /* Subroutine */ int ctest_(int*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
+    extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
     static integer ksize;
     static doublecomplex ztemp;
     extern /* Subroutine */ int zdotctest_(int*, doublecomplex*, int*, doublecomplex*, int*, doublecomplex*), zcopytest_(int*, doublecomplex*, int*, doublecomplex*, int*);
@@ -747,7 +747,7 @@ L40:
 /* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac)
 {
     static doublereal scomp[1], strue[1];
-    extern /* Subroutine */ int stest_(int*,doublereal*, doublereal*, doublereal*, doublereal*);
+    extern /* Subroutine */ integer stest_(int*,doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     ************************* STEST1 ***************************** */
 

From c30b53087860186287b5db0ba455499fbe7d20e5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Oct 2023 11:59:19 +0200
Subject: [PATCH 351/718] fix prototypes of ctest and itest for INTERFACE64

---
 ctest/c_cblat1c.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c
index 2f84da43b..fa97cbf92 100644
--- a/ctest/c_cblat1c.c
+++ b/ctest/c_cblat1c.c
@@ -437,9 +437,9 @@ static real c_b43 = (float)1.;
 
     /* Local variables */
     static integer i__;
-    extern /* Subroutine */ int ctest_(int*, complex*, complex*, complex*, real*);
+    extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
     static complex mwpcs[5], mwpct[5];
-    extern /* Subroutine */ int itest1_(int*, int*), stest1_(real*,real*,real*,real*);
+    extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
     static complex cx[8];
     extern real scnrm2test_(int*, complex*, int*);
     static integer np1;
@@ -734,7 +734,7 @@ static real c_b43 = (float)1.;
     static complex cdot[1];
     static integer lenx, leny, i__;
     static complex ctemp;
-    extern /* Subroutine */ int ctest_(int*, complex*, complex*, complex*, real*);
+    extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
     static integer ksize;
     extern /* Subroutine */ int cdotctest_(int*, complex*, int*, complex*, int*,complex*), ccopytest_(int*, complex*, int*, complex*, int*), cdotutest_(int*, complex*, int*, complex*, int*, complex*), 
 	    cswaptest_(int*, complex*, int*, complex*, int*), caxpytest_(int*, complex*, complex*, int*, complex*, int*);
@@ -939,7 +939,7 @@ doublereal sdiff_(real* sa, real* sb)
     /* Local variables */
     static integer i__;
     static real scomp[20], ssize[20], strue[20];
-    extern /* Subroutine */ int stest_(int*, real*,real*,real*,real*);
+    extern /* Subroutine */ int stest_(integer*, real*,real*,real*,real*);
 
 /*     **************************** CTEST ***************************** */
 

From 769a58e9d18029cefab7647916fec3bd635b604e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Oct 2023 12:51:41 +0200
Subject: [PATCH 352/718] fix prototypes of stest and itest1 for INTERFACE64

---
 ctest/c_dblat1c.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ctest/c_dblat1c.c b/ctest/c_dblat1c.c
index f0141f2a5..d26cd9924 100644
--- a/ctest/c_dblat1c.c
+++ b/ctest/c_dblat1c.c
@@ -404,9 +404,9 @@ L40:
     static integer i__;
     extern doublereal dnrm2test_(int*, doublereal*, int*);
     static doublereal stemp[1], strue[8];
-    extern /* Subroutine */ int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(int*,doublereal*,doublereal*,int*);
+    extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(int*,doublereal*,doublereal*,int*);
     extern doublereal dasumtest_(int*,doublereal*,int*);
-    extern /* Subroutine */ int itest1_(int*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
+    extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static doublereal sx[8];
     static integer np1;
     extern integer idamaxtest_(int*,doublereal*,int*);
@@ -517,7 +517,7 @@ L40:
     static integer lenx, leny;
     extern doublereal ddottest_(int*,doublereal*,int*,doublereal*,int*);
     static integer i__, j, ksize;
-    extern /* Subroutine */ int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(int*,doublereal*,int*,doublereal*,int*), dswaptest_(int*,doublereal*,int*,doublereal*,int*), 
+    extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(int*,doublereal*,int*,doublereal*,int*), dswaptest_(int*,doublereal*,int*,doublereal*,int*), 
 	    daxpytest_(int*,doublereal*,doublereal*,int*,doublereal*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static integer ki, kn, mx, my;
     static doublereal sx[7], sy[7], stx[7], sty[7];
@@ -620,7 +620,7 @@ L40:
     /* Local variables */
     extern /* Subroutine */ int drottest_(int*,doublereal*,int*,doublereal*,int*,doublereal*,doublereal*);
     static integer i__, k, ksize;
-    extern /* Subroutine */int stest_(int*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(int*,doublereal*,int*,doublereal*,int*,doublereal*);
+    extern /* Subroutine */int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(int*,doublereal*,int*,doublereal*,int*,doublereal*);
     static integer ki, kn;
     static doublereal dparam[5], sx[10], sy[10], stx[10], sty[10];
 
@@ -691,7 +691,7 @@ L40:
     return 0;
 } /* check3_ */
 
-/* Subroutine */ int stest_(int* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac)
+/* Subroutine */ int stest_(integer* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac)
 {
     /* System generated locals */
     integer i__1;
@@ -758,7 +758,7 @@ L40:
 /* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac)
 {
     static doublereal scomp[1], strue[1];
-    extern /* Subroutine */ int stest_(int*, doublereal*, doublereal*, doublereal*, doublereal*);
+    extern /* Subroutine */ int stest_(integer*, doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     ************************* STEST1 ***************************** */
 

From d8126c76e770efe8b7394d856de58ada75e5f3b1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Oct 2023 13:38:39 +0200
Subject: [PATCH 353/718] fix prototype

---
 ctest/c_cblat1c.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c
index fa97cbf92..929f3eaf3 100644
--- a/ctest/c_cblat1c.c
+++ b/ctest/c_cblat1c.c
@@ -887,7 +887,7 @@ L40:
 /* Subroutine */ int stest1_(real* scomp1, real* strue1, real* ssize, real* sfac)
 {
     static real scomp[1], strue[1];
-    extern /* Subroutine */ int stest_(int*, real*, real*, real*, real*);
+    extern /* Subroutine */ int stest_(integer*, real*, real*, real*, real*);
 
 /*     ************************* STEST1 ***************************** */
 

From c5e7339c9eb441c56264f30d0b998ff87335cbef Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 8 Oct 2023 16:13:37 +0200
Subject: [PATCH 354/718] correct prototypes for INTERFACE64 builds

---
 ctest/c_cblat1c.c | 12 ++++++------
 ctest/c_dblat1c.c | 20 ++++++++++----------
 ctest/c_sblat1c.c | 22 +++++++++++-----------
 ctest/c_zblat1c.c | 10 +++++-----
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c
index 929f3eaf3..b4c512436 100644
--- a/ctest/c_cblat1c.c
+++ b/ctest/c_cblat1c.c
@@ -441,11 +441,11 @@ static real c_b43 = (float)1.;
     static complex mwpcs[5], mwpct[5];
     extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
     static complex cx[8];
-    extern real scnrm2test_(int*, complex*, int*);
+    extern real scnrm2test_(integer*, complex*, integer*);
     static integer np1;
-    extern integer icamaxtest_(int*, complex*, int*);
-    extern /* Subroutine */ int csscaltest_(int*, real*, complex*, int*);
-    extern real scasumtest_(int*, complex*, int*);
+    extern integer icamaxtest_(integer*, complex*, integer*);
+    extern /* Subroutine */ int csscaltest_(integer*, real*, complex*, integer*);
+    extern real scasumtest_(integer*, complex*, integer*);
     static integer len;
 
 /*     .. Parameters .. */
@@ -736,8 +736,8 @@ static real c_b43 = (float)1.;
     static complex ctemp;
     extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
     static integer ksize;
-    extern /* Subroutine */ int cdotctest_(int*, complex*, int*, complex*, int*,complex*), ccopytest_(int*, complex*, int*, complex*, int*), cdotutest_(int*, complex*, int*, complex*, int*, complex*), 
-	    cswaptest_(int*, complex*, int*, complex*, int*), caxpytest_(int*, complex*, complex*, int*, complex*, int*);
+    extern /* Subroutine */ int cdotctest_(integer*, complex*, integer*, complex*, integer*,complex*), ccopytest_(integer*, complex*, integer*, complex*, integer*), cdotutest_(integer*, complex*, integer*, complex*, integer*, complex*), 
+	    cswaptest_(integer*, complex*, integer*, complex*, integer*), caxpytest_(integer*, complex*, complex*, integer*, complex*, integer*);
     static integer ki, kn;
     static complex cx[7], cy[7];
     static integer mx, my;
diff --git a/ctest/c_dblat1c.c b/ctest/c_dblat1c.c
index d26cd9924..089dca4da 100644
--- a/ctest/c_dblat1c.c
+++ b/ctest/c_dblat1c.c
@@ -402,14 +402,14 @@ L40:
 
     /* Local variables */
     static integer i__;
-    extern doublereal dnrm2test_(int*, doublereal*, int*);
+    extern doublereal dnrm2test_(integer*, doublereal*, integer*);
     static doublereal stemp[1], strue[8];
-    extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(int*,doublereal*,doublereal*,int*);
-    extern doublereal dasumtest_(int*,doublereal*,int*);
+    extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(integer*,doublereal*,doublereal*,integer*);
+    extern doublereal dasumtest_(integer*,doublereal*,integer*);
     extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static doublereal sx[8];
     static integer np1;
-    extern integer idamaxtest_(int*,doublereal*,int*);
+    extern integer idamaxtest_(integer*,doublereal*,integer*);
     static integer len;
 
 /*     .. Parameters .. */
@@ -515,10 +515,10 @@ L40:
 
     /* Local variables */
     static integer lenx, leny;
-    extern doublereal ddottest_(int*,doublereal*,int*,doublereal*,int*);
+    extern doublereal ddottest_(integer*,doublereal*,integer*,doublereal*,integer*);
     static integer i__, j, ksize;
-    extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(int*,doublereal*,int*,doublereal*,int*), dswaptest_(int*,doublereal*,int*,doublereal*,int*), 
-	    daxpytest_(int*,doublereal*,doublereal*,int*,doublereal*,int*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
+    extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(integer*,doublereal*,integer*,doublereal*,integer*), dswaptest_(integer*,doublereal*,integer*,doublereal*,integer*), 
+	    daxpytest_(integer*,doublereal*,doublereal*,integer*,doublereal*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static integer ki, kn, mx, my;
     static doublereal sx[7], sy[7], stx[7], sty[7];
 
@@ -618,9 +618,9 @@ L40:
 	    ;
 
     /* Local variables */
-    extern /* Subroutine */ int drottest_(int*,doublereal*,int*,doublereal*,int*,doublereal*,doublereal*);
+    extern /* Subroutine */ int drottest_(integer*,doublereal*,integer*,doublereal*,integer*,doublereal*,doublereal*);
     static integer i__, k, ksize;
-    extern /* Subroutine */int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(int*,doublereal*,int*,doublereal*,int*,doublereal*);
+    extern /* Subroutine */int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(integer*,doublereal*,integer*,doublereal*,integer*,doublereal*);
     static integer ki, kn;
     static doublereal dparam[5], sx[10], sy[10], stx[10], sty[10];
 
@@ -799,7 +799,7 @@ doublereal sdiff_(doublereal* sa, doublereal* sb)
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int itest1_(int* icomp, int* itrue)
+/* Subroutine */ int itest1_(integer* icomp, integer* itrue)
 {
     /* Local variables */
     static integer id;
diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c
index 7c049b796..7a81e04c1 100644
--- a/ctest/c_sblat1c.c
+++ b/ctest/c_sblat1c.c
@@ -433,14 +433,14 @@ L40:
 
     /* Local variables */
     static integer i__;
-    extern real snrm2test_(int*,real*,int*);
+    extern real snrm2test_(integer*,real*,integer*);
     static real stemp[1], strue[8];
-    extern /* Subroutine */ int stest_(int*, real*,real*,real*,real*), sscaltest_(int*,real*,real*,int*);
-    extern real sasumtest_(int*,real*,int*);
+    extern /* Subroutine */ int stest_(integer*, real*,real*,real*,real*), sscaltest_(integer*,real*,real*,integer*);
+    extern real sasumtest_(integer*,real*,integer*);
     extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(real*,real*,real*,real*);
     static real sx[8];
     static integer np1;
-    extern integer isamaxtest_(int*,real*,int*);
+    extern integer isamaxtest_(integer*,real*,integer*);
     static integer len;
 
 
@@ -590,10 +590,10 @@ L40:
 
     /* Local variables */
     static integer lenx, leny;
-    extern real sdottest_(int*,real*,int*,real*,int*);
+    extern real sdottest_(integer*,real*,integer*,real*,integer*);
     static integer i__, j, ksize;
-    extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*), scopytest_(int*,real*,int*,real*,int*), sswaptest_(int*,real*,int*,real*,int*), 
-	    saxpytest_(int*,real*,real*,int*,real*,int*);
+    extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*), scopytest_(integer*,real*,integer*,real*,integer*), sswaptest_(integer*,real*,integer*,real*,integer*), 
+	    saxpytest_(integer*,real*,real*,integer*,real*,integer*);
     static integer ki;
     extern /* Subroutine */ int stest1_(real*,real*,real*,real*);
     static integer kn, mx, my;
@@ -708,9 +708,9 @@ L40:
 	    1.17 };
 
     /* Local variables */
-    extern /* Subroutine */ void srottest_(int*,real*,int*,real*,int*,real*,real*);
+    extern /* Subroutine */ void srottest_(integer*,real*,integer*,real*,integer*,real*,real*);
     static integer i__, k, ksize;
-    extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*), srotmtest_(int*,real*,int*,real*,int*,real*);
+    extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*), srotmtest_(integer*,real*,integer*,real*,integer*,real*);
     static integer ki, kn;
     static real sx[19], sy[19], sparam[5], stx[19], sty[19];
 
@@ -781,7 +781,7 @@ L40:
     return 0;
 } /* check3_ */
 
-/* Subroutine */ int stest_(int* len, real* scomp, real* strue, real* ssize, real* sfac)
+/* Subroutine */ int stest_(integer* len, real* scomp, real* strue, real* ssize, real* sfac)
 {
     integer i__1;
     real r__1, r__2, r__3, r__4, r__5;
@@ -847,7 +847,7 @@ L40:
 /* Subroutine */ int stest1_(real* scomp1, real* strue1, real* ssize, real* sfac)
 {
     static real scomp[1], strue[1];
-    extern /* Subroutine */ int stest_(int*,real*,real*,real*,real*);
+    extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*);
 
 /*     ************************* STEST1 ***************************** */
 
diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c
index d8cff5dee..f7c0515fc 100644
--- a/ctest/c_zblat1c.c
+++ b/ctest/c_zblat1c.c
@@ -380,7 +380,7 @@ static doublereal c_b43 = 1.;
     static integer i__;
     extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
     static doublecomplex mwpcs[5], mwpct[5];
-    extern /* Subroutine */ int zscaltest_(int*, doublereal*, doublecomplex*, int*), itest1_(int*, int*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
+    extern /* Subroutine */ int zscaltest_(integer*, doublereal*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
     static doublecomplex cx[8];
     extern doublereal dznrm2test_(integer*, doublecomplex*, integer*);
     static integer np1;
@@ -591,11 +591,11 @@ static doublereal c_b43 = 1.;
     extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
     static integer ksize;
     static doublecomplex ztemp;
-    extern /* Subroutine */ int zdotctest_(int*, doublecomplex*, int*, doublecomplex*, int*, doublecomplex*), zcopytest_(int*, doublecomplex*, int*, doublecomplex*, int*);
+    extern /* Subroutine */ int zdotctest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zcopytest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*);
     static integer ki;
-    extern /* Subroutine */ int zdotutest_(int*, doublecomplex*, int*, doublecomplex*, int*, doublecomplex*), zswaptest_(int*, doublecomplex*, int*, doublecomplex*, int*);
+    extern /* Subroutine */ int zdotutest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zswaptest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*);
     static integer kn;
-    extern /* Subroutine */ int zaxpytest_(int*, doublereal*, doublecomplex*, int*, doublecomplex*, int*);
+    extern /* Subroutine */ int zaxpytest_(integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*);
     static doublecomplex cx[7], cy[7];
     static integer mx, my;
 
@@ -747,7 +747,7 @@ L40:
 /* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac)
 {
     static doublereal scomp[1], strue[1];
-    extern /* Subroutine */ integer stest_(int*,doublereal*, doublereal*, doublereal*, doublereal*);
+    extern /* Subroutine */ int stest_(integer*,doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     ************************* STEST1 ***************************** */
 

From 103d6f4e42fbe532ae4ea48e8d90d7d792bc93d2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 10 Oct 2023 16:15:52 +0200
Subject: [PATCH 355/718] Require "classic ld" with XCODE 15.x on Mac

---
 Makefile.system | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index b1a357fdf..77c36c870 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -405,6 +405,13 @@ export MACOSX_DEPLOYMENT_TARGET=10.8
 endif
 endif
 MD5SUM = md5 -r
+XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
+ifeq (x$(XCVER)x,xx)
+XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
+endif
+ifeq (x$(XCVER), x 15)
+CCOMMON_OPT += -Wl,-ld_classic
+endif
 endif
 
 ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))

From 82fc29a57a403c2e6567d06e7dbaf58ae1fe30a5 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Wed, 11 Oct 2023 17:04:42 -0500
Subject: [PATCH 356/718] POWER10: Fallback to POWER8 functions

As cgemm and zgemm kernels are not optimized for big endian falling
back to POWER8 versions.  Tested on AIX using gcc and Open XL C.
---
 kernel/power/KERNEL.POWER10 | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 79d889fe0..58f865322 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -19,8 +19,13 @@ SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 STRMMKERNEL	= sgemm_kernel_power10.c
 DTRMMKERNEL	= dgemm_kernel_power10.c
+ifeq ($(OSNAME), AIX)
+CTRMMKERNEL     = ctrmm_kernel_8x4_power8.S
+ZTRMMKERNEL     = ztrmm_kernel_8x2_power8.S
+else
 CTRMMKERNEL	= cgemm_kernel_power10.S
 ZTRMMKERNEL	= zgemm_kernel_power10.S
+endif
 
 SGEMMKERNEL    =  sgemm_kernel_power10.c
 SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
@@ -62,10 +67,18 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
 DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
 DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
 
+ifeq ($(OSNAME), AIX)
+CGEMMKERNEL    = cgemm_kernel_8x4_power8.S
+else
 CGEMMKERNEL    = cgemm_kernel_power10.S
+endif
 #CGEMMKERNEL     = cgemm_kernel_8x4_power8.S
 CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
+ifeq ($(OSNAME), AIX)
+CGEMMITCOPY    = cgemm_tcopy_8_power8.S
+else
 CGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
+endif
 CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
 CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
@@ -73,7 +86,11 @@ CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
 CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 
+ifeq ($(OSNAME), AIX)
+ZGEMMKERNEL    = zgemm_kernel_8x2_power8.S
+else
 ZGEMMKERNEL    = zgemm_kernel_power10.S
+endif
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
 ZGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
@@ -124,6 +141,7 @@ ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 #SMINKERNEL   = ../arm/min.c
 #DMINKERNEL   = ../arm/min.c
 #
+ifeq ($(C_COMPILER), GCC)
 ifneq ($(GCCVERSIONGTEQ9),1)
 ISAMAXKERNEL = isamax_power9.S
 else
@@ -148,6 +166,15 @@ ICAMINKERNEL = icamin_power9.S
 else
 ICAMINKERNEL = icamin.c
 endif
+else
+ISAMAXKERNEL = isamax.c
+IDAMAXKERNEL = idamax.c
+ICAMAXKERNEL = icamax.c
+IZAMAXKERNEL = izamax.c
+ISAMINKERNEL = isamin.c
+IDAMINKERNEL = idamin.c
+ICAMINKERNEL = icamin.c
+endif
 IZAMINKERNEL = izamin.c
 #
 #ISMAXKERNEL  = ../arm/imax.c

From 71d733e5f735d18f93ab40d15d85b5129980ec91 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Wed, 11 Oct 2023 17:18:42 -0500
Subject: [PATCH 357/718] POWER: Avoid m4 conversions for C files

This patch removes intermediate m4 conversions used in sbgemm
compilation as it is not needed for .c files.
Tested on AIX with gcc and IBM Open XL C.
---
 kernel/Makefile.L3 | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 174a1d41b..7db9d7907 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -634,15 +634,7 @@ $(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY)
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY)
-
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s
-	m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@
-	rm sbgemmotcopy.s sbgemmotcopy_nomacros.s
-else
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
 
 ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
 
@@ -650,14 +642,7 @@ $(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s
-	m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@
-	rm sbgemmitcopy.s sbgemmitcopy_nomacros.s
-else
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
 
 endif
 endif
@@ -829,15 +814,8 @@ endif
 ifeq ($(BUILD_BFLOAT16), 1)
 
 $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX  $< -o - > sbgemm_kernel$(TSUFFIX).s
-	m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@
-	rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s
-else
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
-endif
 
 $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
 ifeq ($(OS), AIX)

From 97a61d0577bcfefe92df0515ece079234400e7eb Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Wed, 11 Oct 2023 17:36:43 -0500
Subject: [PATCH 358/718] Fix bfloat16_bits union so that it always the sizeof
 unsigned short.

---
 test/compare_sgemm_sbgemm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c
index 276fecae9..57d416c94 100644
--- a/test/compare_sgemm_sbgemm.c
+++ b/test/compare_sgemm_sbgemm.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 typedef union
 {
   unsigned short v;
-  struct
+  struct __attribute__((packed))
   {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     unsigned short s:1;

From e98e3c4783fe9250c9bda11cada6e82ef627854e Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Wed, 11 Oct 2023 18:05:55 -0500
Subject: [PATCH 359/718] Fix float32_bits union so that it always the sizeof
 float.

---
 test/compare_sgemm_sbgemm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c
index 57d416c94..b723b6a3b 100644
--- a/test/compare_sgemm_sbgemm.c
+++ b/test/compare_sgemm_sbgemm.c
@@ -49,7 +49,7 @@ typedef union
 typedef union
 {
   float v;
-  struct
+  struct __attribute__((packed))
   {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     uint32_t s:1;

From e7d05402e02c6cb3e9cca60cff976927cbd4f506 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 12 Oct 2023 14:24:53 +0200
Subject: [PATCH 360/718] Fix up S/D GEMM copy function definitions after #4009

---
 kernel/arm64/KERNEL.A64FX | 90 +++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 50 deletions(-)

diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX
index bd25f7cd8..ccbce27e1 100644
--- a/kernel/arm64/KERNEL.A64FX
+++ b/kernel/arm64/KERNEL.A64FX
@@ -57,7 +57,7 @@ CAMAXKERNEL  = zamax.S
 ZAMAXKERNEL  = zamax.S
 
 SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
+DAXPYKERNEL  = daxpy_thunderx2t99.S
 CAXPYKERNEL  = zaxpy.S
 ZAXPYKERNEL  = zaxpy.S
 
@@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S
 CGEMVTKERNEL = zgemv_t.S
 ZGEMVTKERNEL = zgemv_t.S
 
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
+SASUMKERNEL    = sasum_thunderx2t99.c
+DASUMKERNEL    = dasum_thunderx2t99.c
+CASUMKERNEL    = casum_thunderx2t99.c
+ZASUMKERNEL    = zasum_thunderx2t99.c
+
+SCOPYKERNEL    = copy_thunderx2t99.c
+DCOPYKERNEL    = copy_thunderx2t99.c
+CCOPYKERNEL    = copy_thunderx2t99.c
+ZCOPYKERNEL    = copy_thunderx2t99.c
+
+SSWAPKERNEL    = swap_thunderx2t99.S
+DSWAPKERNEL    = swap_thunderx2t99.S
+CSWAPKERNEL    = swap_thunderx2t99.S
+ZSWAPKERNEL    = swap_thunderx2t99.S
+
+ISAMAXKERNEL   = iamax_thunderx2t99.c
+IDAMAXKERNEL   = iamax_thunderx2t99.c
+ICAMAXKERNEL   = izamax_thunderx2t99.c
+IZAMAXKERNEL   = izamax_thunderx2t99.c
+
+SNRM2KERNEL    = scnrm2_thunderx2t99.c
+DNRM2KERNEL    = dznrm2_thunderx2t99.c
+CNRM2KERNEL    = scnrm2_thunderx2t99.c
+ZNRM2KERNEL    = dznrm2_thunderx2t99.c
+
+DDOTKERNEL     = dot.c
+SDOTKERNEL     = dot.c
+CDOTKERNEL     = zdot_thunderx2t99.c
+ZDOTKERNEL     = zdot_thunderx2t99.c
 DSDOTKERNEL    = dot.S
 
 DGEMM_BETA     = dgemm_beta.S
@@ -128,10 +118,10 @@ SGEMM_BETA     = sgemm_beta.S
 SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
 
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
+SGEMMINCOPY    =  gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
+SGEMMITCOPY    =  gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
+SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
 
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
@@ -149,8 +139,8 @@ SSYMMLCOPY_M    =  symm_lcopy_sve.c
 DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
 DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
 
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
+DGEMMINCOPY    =  gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
+DGEMMITCOPY    =  gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
 DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
 DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
 
@@ -170,8 +160,8 @@ DSYMMLCOPY_M    =  symm_lcopy_sve.c
 CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
+CGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+CGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 
@@ -194,8 +184,8 @@ CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
 ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
+ZGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+ZGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 

From d46eba06a7a95a61649cf25a1fd350f46d8050b6 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Thu, 12 Oct 2023 09:41:33 -0500
Subject: [PATCH 361/718] Pack structure only on AIX.

---
 test/compare_sgemm_sbgemm.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c
index b723b6a3b..cf808b56d 100644
--- a/test/compare_sgemm_sbgemm.c
+++ b/test/compare_sgemm_sbgemm.c
@@ -32,7 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 typedef union
 {
   unsigned short v;
+#if defined(_AIX)
   struct __attribute__((packed))
+#else
+  struct
+#endif
   {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     unsigned short s:1;
@@ -49,7 +53,11 @@ typedef union
 typedef union
 {
   float v;
+#if defined(_AIX)
   struct __attribute__((packed))
+#else
+  struct
+#endif
   {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     uint32_t s:1;

From fe75c88a2c65240f4b7265be3dbc4b93132ec24f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 12 Oct 2023 18:20:09 +0200
Subject: [PATCH 362/718] AzureCI: move OSX-Clang jobs to macOS-12 to resolve
 setup/build timeouts

---
 azure-pipelines.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index ff56ad00b..317bc504a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -167,11 +167,10 @@ jobs:
      
 - job: OSX_OpenMP_Clang
   pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-latest'
   variables:
      LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
      LIBRARY_PATH: /usr/local/opt/llvm/lib
-     MACOSX_DEPLOYMENT_TARGET: 11.0
   steps:   
   - script: |
       brew update
@@ -180,7 +179,7 @@ jobs:
 
 - job: OSX_OpenMP_Clang_cmake
   pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-latest'
   variables:
      LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
      LIBRARY_PATH: /usr/local/opt/llvm/lib
@@ -210,7 +209,7 @@ jobs:
 
 - job: OSX_Ifort_Clang
   pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-latest'
   variables:
      LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
      MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg

From 9f42570e33db010665d65bc50f803a673fcc5af0 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Thu, 12 Oct 2023 12:37:40 -0500
Subject: [PATCH 363/718] POWER: Increase macro size limit for AIX

This patch increases the macro size limit from 4096 to 16384 to
allow compiling larger assembly files in AIX.
Tested with GCC and IBM Open XL C.
---
 kernel/Makefile.L3 | 97 +++++++++++++++++++++++++---------------------
 1 file changed, 53 insertions(+), 44 deletions(-)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 7db9d7907..448e096a3 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -61,6 +61,15 @@ ifeq ($(CORE), ZEN)
 USE_TRMM = 1
 endif
 
+ifeq ($(OS), AIX)
+M4VERSION := $(shell m4 --version < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
+ifeq ($(M4VERSION), 0)
+M4_AIX := m4 -l16384
+else
+M4_AIX := m4 -B16384
+endif
+$(info $$var is [${$(M4_AIX)}])
+endif
 ifeq ($(CORE), POWER8)
 ifeq ($(BINARY64),1)
 USE_TRMM = 1
@@ -653,7 +662,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
 $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s
-	m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
+	$(M4_AIX) sgemmotcopy.s > sgemmotcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
 	rm sgemmotcopy.s sgemmotcopy_nomacros.s
 else
@@ -669,7 +678,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
 $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s
-	m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
+	$(M4_AIX) sgemmitcopy.s > sgemmitcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
 	rm sgemmitcopy.s sgemmitcopy_nomacros.s
 else
@@ -681,7 +690,7 @@ endif
 $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s
-	m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
+	$(M4_AIX) dgemm_ncopy.s > dgemm_ncopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
 	rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
 else
@@ -699,7 +708,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
 $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s
-	m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
+	$(M4_AIX) dgemm_itcopy.s > dgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
 	rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
 else
@@ -742,7 +751,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
 $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s
-	m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
+	$(M4_AIX) cgemm_itcopy.s > cgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
 	rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
 else
@@ -765,7 +774,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
 $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s
-	m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
+	$(M4_AIX) zgemm_itcopy.s > zgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
 	rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
 else
@@ -797,7 +806,7 @@ endif
 $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX  $< -o - > sgemm_kernel$(TSUFFIX).s
-	m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
+	$(M4_AIX) sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
 	rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
 else
@@ -820,7 +829,7 @@ endif
 $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
-	m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
+	$(M4_AIX) dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
 	rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
 else
@@ -833,7 +842,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP
 $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s
-	m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
+	$(M4_AIX) cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
 	rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
 else
@@ -843,7 +852,7 @@ endif
 $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s
-	m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
+	$(M4_AIX) cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
 	rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
 else
@@ -853,7 +862,7 @@ endif
 $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC  $< -o - > cgemm_kernel_r.s
-	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
+	$(M4_AIX) cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
 	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
 else
@@ -863,7 +872,7 @@ endif
 $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s
-	m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
+	$(M4_AIX) cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
 	rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
 else
@@ -873,7 +882,7 @@ endif
 $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s
-	m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
+	$(M4_AIX) zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
 	rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
 else ifeq ($(CORE),SANDYBRIDGE)
@@ -885,7 +894,7 @@ endif
 $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s
-	m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
+	$(M4_AIX) zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
 	rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
 else ifeq ($(CORE),SANDYBRIDGE)
@@ -897,7 +906,7 @@ endif
 $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s
-	m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
+	$(M4_AIX) zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
 	rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
 else ifeq ($(CORE),SANDYBRIDGE)
@@ -909,7 +918,7 @@ endif
 $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s
-	m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
+	$(M4_AIX) zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
 	rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
 else ifeq ($(CORE),SANDYBRIDGE)
@@ -935,7 +944,7 @@ ifdef USE_TRMM
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s	
-	m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
+	$(M4_AIX) strmmkernel_ln.s > strmmkernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
 	rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
 else
@@ -945,7 +954,7 @@ endif
 $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s	
-	m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
+	$(M4_AIX) strmmkernel_lt.s > strmmkernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
 	rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
 else
@@ -955,7 +964,7 @@ endif
 $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s	
-	m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
+	$(M4_AIX) strmmkernel_rn.s > strmmkernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
 	rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
 else
@@ -965,7 +974,7 @@ endif
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
+	$(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 else
@@ -975,7 +984,7 @@ endif
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s
-	m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
+	$(M4_AIX) dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
 	rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
 else
@@ -985,7 +994,7 @@ endif
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s
-	m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
+	$(M4_AIX) dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
 	rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
 else
@@ -995,7 +1004,7 @@ endif
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s
-	m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
+	$(M4_AIX) dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
 	rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
 else
@@ -1005,7 +1014,7 @@ endif
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s
-	m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
+	$(M4_AIX) dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
 	rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
 else
@@ -1027,7 +1036,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN  $< -o - > ctrmm_kernel_ln.s
-	m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
+	$(M4_AIX) ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
 	rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
 else
@@ -1037,7 +1046,7 @@ endif
 $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s
-	m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
+	$(M4_AIX) ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
 	rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
 else
@@ -1047,7 +1056,7 @@ endif
 $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s
-	m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
+	$(M4_AIX) ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN  ctrmm_kernel_lr_nomacros.s -o $@
 	rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
 else
@@ -1057,7 +1066,7 @@ endif
 $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s
-	m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
+	$(M4_AIX) ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
 	rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
 else
@@ -1067,7 +1076,7 @@ endif
 $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s
-	m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
+	$(M4_AIX) ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
 	rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
 else
@@ -1077,7 +1086,7 @@ endif
 $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s
-	m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
+	$(M4_AIX) ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
 	rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
 else
@@ -1087,7 +1096,7 @@ endif
 $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s
-	m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
+	$(M4_AIX) ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
 	rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
 else
@@ -1097,7 +1106,7 @@ endif
 $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s
-	m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
+	$(M4_AIX) ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
 	rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
 else
@@ -1107,7 +1116,7 @@ endif
 $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s
-	m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
+	$(M4_AIX) ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
 	rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1119,7 +1128,7 @@ endif
 $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s
-	m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
+	$(M4_AIX) ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
 	rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1131,7 +1140,7 @@ endif
 $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s
-	m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
+	$(M4_AIX) ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
 	rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1143,7 +1152,7 @@ endif
 $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s
-	m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
+	$(M4_AIX) ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
 	rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s 
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1155,7 +1164,7 @@ endif
 $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s
-	m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
+	$(M4_AIX) ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
 	rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1167,7 +1176,7 @@ endif
 $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s
-	m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
+	$(M4_AIX) ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
 	rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1179,7 +1188,7 @@ endif
 $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s
-	m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
+	$(M4_AIX) ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
 	rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1191,7 +1200,7 @@ endif
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s
-	m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
+	$(M4_AIX) ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
 	rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1213,7 +1222,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
+	$(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 else
@@ -1373,7 +1382,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
 $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s
-	m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
+	$(M4_AIX) dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
 	rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
 else
@@ -2965,7 +2974,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
 $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
-	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
+	$(M4_AIX) cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
 	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
 	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s 
 else
@@ -3011,7 +3020,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
+	$(M4_AIX) strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 else

From 301e2ecc49aa9207573a062fe272f902161c48a7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 16 Oct 2023 22:15:46 +0200
Subject: [PATCH 364/718] Cray Fortran uses -O in combinations like -O omp so
 don't filter that out

---
 Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index 299970c67..8621a8b3f 100644
--- a/Makefile
+++ b/Makefile
@@ -35,7 +35,11 @@ export NO_LAPACK
 export C_LAPACK
 endif
 
+ifeq ($(F_COMPILER),CRAY)
+LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS))
+else
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
+endif
 
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
 

From b41cab08756563819e0cbc7ab005ab746fa4721b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 16 Oct 2023 22:20:59 +0200
Subject: [PATCH 365/718] Need to use override to actually strip down the
 already defined FFLAGS for NAG and CCE Fortran

---
 Makefile.system | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 77c36c870..868cca4f9 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1642,11 +1642,11 @@ endif
 
 ifeq ($(F_COMPILER),NAG)
 LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
+override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
 endif
 ifeq ($(F_COMPILER),CRAY)
 LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
+override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
 endif
 
 LAPACK_CFLAGS = $(CFLAGS)

From f8c230c21c0bb20dc61d14988069ae60df6e0423 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 18 Oct 2023 11:58:54 +0200
Subject: [PATCH 366/718] Switch MINGW-W64 jobs to UCRT

---
 .github/workflows/dynamic_arch.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index 4fe6e63fc..0c39bfddf 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -151,13 +151,13 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
+        msystem: [UCRT64, MINGW32, CLANG64, CLANG32]
         idx: [int32, int64]
         build-type: [Release]
         include:
-          - msystem: MINGW64
+          - msystem: UCRT64
             idx: int32
-            target-prefix: mingw-w64-x86_64
+            target-prefix: mingw-w64-ucrt-x86_64
             fc-pkg: fc
           - msystem: MINGW32
             idx: int32
@@ -175,10 +175,10 @@ jobs:
             target-prefix: mingw-w64-clang-i686
             fc-pkg: cc
             c-lapack-flags: -DC_LAPACK=ON
-          - msystem: MINGW64
+          - msystem: UCRT64
             idx: int64
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
-            target-prefix: mingw-w64-x86_64
+            target-prefix: mingw-w64-ucrt-x86_64
             fc-pkg: fc
           - msystem: CLANG64
             idx: int64
@@ -188,9 +188,9 @@ jobs:
             # Compiling with Flang 16 seems to cause test errors on machines
             # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
             no-avx512-flags: -DNO_AVX512=1
-          - msystem: MINGW64
+          - msystem: UCRT64
             idx: int32
-            target-prefix: mingw-w64-x86_64
+            target-prefix: mingw-w64-ucrt-x86_64
             fc-pkg: fc
             build-type: None
         exclude:

From e12aaed13d39a77bb089d5c39478ed203160f196 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 18 Oct 2023 16:28:54 +0200
Subject: [PATCH 367/718] Fix unwanted fallthrough from Intel Family 6 to 15 in
 case of identification failure

---
 driver/others/dynamic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 8e0f53f74..69a473060 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -805,7 +805,8 @@ static gotoblas_t *get_coretype(void){
         }      
 	return NULL;
       }
-      case 0xf:
+      break;
+    case 0xf:
       if (model <= 0x2) return &gotoblas_NORTHWOOD;
       return &gotoblas_PRESCOTT;
     }

From 6b8379d6d998f94ecb9a6adccc6cbd3f1d23f1c7 Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Thu, 19 Oct 2023 11:38:26 +0200
Subject: [PATCH 368/718] Run nightly Homebrew cron job only on the main repo,
 not on forks

I noticed this because GitHub emailed me that it would disable the
nightly job because it hadn't changed for 3 months. It currently takes
30-50 minutes daily, and by default runs on all forks of the main
repository that have the relevant workflow yaml file. That serves little
purpose and wastes quite a bit of energy - so disable the runs outside
of the main repo.

This will not disable the runs on forks already made in the past that
contain this workflow file, but it does save 3 months worth of runs on
every new fork that is created.

[skip ci]
---
 .github/workflows/nightly-Homebrew-build.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml
index 96063565d..eb315f1d4 100644
--- a/.github/workflows/nightly-Homebrew-build.yml
+++ b/.github/workflows/nightly-Homebrew-build.yml
@@ -23,6 +23,7 @@ permissions:
 
 jobs:
   build-OpenBLAS-with-Homebrew:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: macos-latest
     env:
       DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer

From 980f702f72c20d01f4110e40fc44e35352812b8b Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Thu, 19 Oct 2023 18:48:19 -0500
Subject: [PATCH 369/718] POWER: AIX: Make use of power10 optimization

POWER10 optimizations are disabled when using default AIX assembler.
As we have fixed many issues recently, enabling optimization path
for default assembler.
---
 Makefile.system             | 8 ++++----
 kernel/power/KERNEL.POWER10 | 4 ----
 param.h                     | 5 -----
 3 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 868cca4f9..30b0ddec2 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -277,10 +277,6 @@ endif
 ifndef GOTOBLAS_MAKEFILE
 export GOTOBLAS_MAKEFILE = 1
 
-# Determine if the assembler is GNU Assembler
-HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
-GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
-
 # Generating Makefile.conf and config.h
 DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
 
@@ -760,7 +756,11 @@ DYNAMIC_CORE += POWER9
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
 endif
+ifeq ($(OSNAME), AIX)
+LDVERSIONGTEQ35 := 1
+else
 LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
+endif
 ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
 CCOMMON_OPT += -DHAVE_P10_SUPPORT
diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 58f865322..9047c714c 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -1,6 +1,3 @@
-ifeq ($(HAVE_GAS), 1)
-include $(KERNELDIR)/KERNEL.POWER8
-else
 #SGEMM_BETA = ../generic/gemm_beta.c
 #DGEMM_BETA = ../generic/gemm_beta.c
 #CGEMM_BETA = ../generic/zgemm_beta.c
@@ -265,4 +262,3 @@ QCABS_KERNEL	= ../generic/cabs.c
 #Dump kernel
 CGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
 ZGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
-endif
diff --git a/param.h b/param.h
index 03bf3624f..ee4640f57 100644
--- a/param.h
+++ b/param.h
@@ -2600,13 +2600,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
-#if defined(HAVE_GAS) && (HAVE_GAS == 1) 
-#define DGEMM_DEFAULT_UNROLL_M 16
-#define DGEMM_DEFAULT_UNROLL_N 4
-#else
 #define DGEMM_DEFAULT_UNROLL_M 8
 #define DGEMM_DEFAULT_UNROLL_N 8
-#endif
 #define CGEMM_DEFAULT_UNROLL_M 8
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 8

From a7f73c764cee8fada4f7f359ae4a8be6b9810ada Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 27 Oct 2023 16:48:47 +0200
Subject: [PATCH 370/718] Clarify "make" options and the need to repeat them in
 the install step

---
 README.md | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 081d45870..f57cec831 100644
--- a/README.md
+++ b/README.md
@@ -54,10 +54,15 @@ Building OpenBLAS requires the following to be installed:
 
 Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
 To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
-The full target list is in the file `TargetList.txt`. For building with `cmake`, the
-usual conventions apply, i.e. create a build directory either underneath the toplevel
-OpenBLAS source directory or separate from it, and invoke `cmake` there with the path
-to the source tree and any build options you plan to set.
+The full target list is in the file `TargetList.txt`, other build optionss are documented in Makefile.rule and
+can either be set there (typically by removing the comment character from the respective line), or used on the
+`make` command line. 
+Note that when you run `make install` after building, you need to repeat all command line options you provided to `make`
+in the build step, as some settings like the supported maximum number of threads are automatically derived from the
+build host by default, which might not be what you want.
+For building with `cmake`, the usual conventions apply, i.e. create a build directory either underneath the toplevel
+OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any 
+build options you plan to set.
 
 ### Cross compile
 

From f5e1f20f4db408d826cb89638175f1987304cf5b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 27 Oct 2023 17:10:37 +0200
Subject: [PATCH 371/718] Update target list

---
 README.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f57cec831..3c4e38f18 100644
--- a/README.md
+++ b/README.md
@@ -122,7 +122,7 @@ Use `PREFIX=` when invoking `make`, for example
 ```sh
 make install PREFIX=your_installation_directory
 ```
-
+(along with all options you added on the `make` command line in the preceding build step)
 The default installation directory is `/opt/OpenBLAS`.
 
 ## Supported CPUs and Operating Systems
@@ -142,7 +142,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
 - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
 - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
-- **AMD ZEN**: Uses Haswell codes with some optimizations.
+- **AMD ZEN**: Uses Haswell codes with some optimizations for Zen 2/3 (use SkylakeX for Zen4)
 
 #### MIPS32
 
@@ -174,13 +174,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **TSV110**: Optimized some Level-3 helper functions
 - **EMAG 8180**: preliminary support based on A57
 - **Neoverse N1**: (AWS Graviton2) preliminary support
-- **Apple Vortex**: preliminary support based on ARMV8
+- **Neoverse V1**: (AWS Graviton3) optimized Level-3 BLAS
+- **Apple Vortex**: preliminary support based on ThunderX2/3
+- **A64FX**:  preliminary support, optimized Level-3 BLAS
+- **ARMV8SVE**: any ARMV8 cpu with SVE extensions 
 
 #### PPC/PPC64
 
 - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
 - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. 
-- **POWER10**:
+- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2.
 
 #### IBM zEnterprise System
 

From 1cec1c0fc7509a949b65ce5bb50696c18838046e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 28 Oct 2023 14:43:19 +0200
Subject: [PATCH 372/718] Add FreeBSD clang/gfortran build with OpenMP

---
 .cirrus.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.cirrus.yml b/.cirrus.yml
index 02cd40997..c405b958d 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -148,6 +148,15 @@ FreeBSD_task:
   - ls -l /usr/local/lib
   - gmake CC=gcc INTERFACE64=1
 
+FreeBSD_task:
+  name: FreeBSD-clang-openmp
+  freebsd_instance:
+    image_family: freebsd-13-2
+  install_script:
+  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc 
+  compile_script:
+  - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
+
 #task:
 #  name: Windows/LLVM16     --- too slow ---
 #  windows_container:

From 289a5f6d9b8570de6fa5c2bf2789e04abce494ea Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 28 Oct 2023 18:44:58 +0200
Subject: [PATCH 373/718] work around libgfortran install issue on FreeBSD

---
 .cirrus.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.cirrus.yml b/.cirrus.yml
index c405b958d..5a1f2cfda 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -154,6 +154,7 @@ FreeBSD_task:
     image_family: freebsd-13-2
   install_script:
   - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc 
+  - ln -s /usr/local/gcc12/lib/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
   compile_script:
   - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
 

From dc1c880782e33307aaa2b04467b110003f3305e1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 28 Oct 2023 23:14:36 +0200
Subject: [PATCH 374/718] fix libgfortran path on bsd

---
 .cirrus.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 5a1f2cfda..6c2baf8a0 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -154,7 +154,7 @@ FreeBSD_task:
     image_family: freebsd-13-2
   install_script:
   - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc 
-  - ln -s /usr/local/gcc12/lib/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
+  - ln -s /usr/local/lib/gcc12/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
   compile_script:
   - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
 

From d003ad630b1792f169373b8ab35c5ea7a6dfdccd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 31 Oct 2023 10:26:38 +0100
Subject: [PATCH 375/718] Increase the default GEMM buffer size on modern ARM
 server cpus

---
 common_arm64.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/common_arm64.h b/common_arm64.h
index 436ccb8f5..1e593c66f 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -162,7 +162,11 @@ REALNAME:
 #define HUGE_PAGESIZE   ( 4 << 20)
 
 #ifndef BUFFERSIZE
+if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
+#define BUFFER_SIZE     (32 << 22)
+else
 #define BUFFER_SIZE     (32 << 20)
+#endif
 #else
 #define BUFFER_SIZE	(32 << BUFFERSIZE)
 #endif

From 728788f6676bb5e999cdf4fbcda9e2c7b8b9cd53 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 31 Oct 2023 11:08:22 +0100
Subject: [PATCH 376/718] typo fix

---
 common_arm64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common_arm64.h b/common_arm64.h
index 1e593c66f..6ae6a35a3 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -162,9 +162,9 @@ REALNAME:
 #define HUGE_PAGESIZE   ( 4 << 20)
 
 #ifndef BUFFERSIZE
-if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
+#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
 #define BUFFER_SIZE     (32 << 22)
-else
+#else
 #define BUFFER_SIZE     (32 << 20)
 #endif
 #else

From c8882bd9d890c332adaf992a0b9da6be8384bb97 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Wed, 1 Nov 2023 14:53:55 -0500
Subject: [PATCH 377/718] Remove POWER7 from cpu list.

---
 driver/others/dynamic_power.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index db04e635f..b4a1cc6be 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -39,7 +39,6 @@ char *gotoblas_corename(void) {
 #define CPU_UNKNOWN  0
 #define CPU_POWER5   5
 #define CPU_POWER6   6
-#define CPU_POWER7   7
 #define CPU_POWER8   8
 #define CPU_POWER9   9
 #define CPU_POWER10 10
@@ -53,9 +52,6 @@ static int cpuid(void)
 #ifdef POWER_6
     if (arch == POWER_6) return CPU_POWER6;
 #endif
-#ifdef POWER_7
-    else if (arch == POWER_7) return CPU_POWER7;
-#endif
 #ifdef POWER_8
     else if (arch == POWER_8) return CPU_POWER8;
 #endif

From 7dcb2d67f23caa8b70df4ea37c05a12ff8c15898 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Wed, 1 Nov 2023 15:23:28 -0500
Subject: [PATCH 378/718] Have POWER7 return arch=POWER6.

---
 driver/others/dynamic_power.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index b4a1cc6be..10a5d64b3 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -52,6 +52,9 @@ static int cpuid(void)
 #ifdef POWER_6
     if (arch == POWER_6) return CPU_POWER6;
 #endif
+#ifdef POWER_7
+    else if (arch == POWER_7) return CPU_POWER6;
+#endif
 #ifdef POWER_8
     else if (arch == POWER_8) return CPU_POWER8;
 #endif

From 3bfa4d4dccf8616ab330387a7be1ebd709a3214c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 3 Nov 2023 14:55:31 +0100
Subject: [PATCH 379/718] Fix outdated SVE kernel definitions for Cortex cpus
 by aliasing to ARMV8SVE

---
 kernel/arm64/KERNEL.CORTEXA510 | 217 +--------------------------------
 kernel/arm64/KERNEL.CORTEXA710 | 217 +--------------------------------
 kernel/arm64/KERNEL.CORTEXX2   | 217 +--------------------------------
 3 files changed, 3 insertions(+), 648 deletions(-)

diff --git a/kernel/arm64/KERNEL.CORTEXA510 b/kernel/arm64/KERNEL.CORTEXA510
index bd25f7cd8..bc5999097 100644
--- a/kernel/arm64/KERNEL.CORTEXA510
+++ b/kernel/arm64/KERNEL.CORTEXA510
@@ -1,216 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.CORTEXA710 b/kernel/arm64/KERNEL.CORTEXA710
index bd25f7cd8..bc5999097 100644
--- a/kernel/arm64/KERNEL.CORTEXA710
+++ b/kernel/arm64/KERNEL.CORTEXA710
@@ -1,216 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.CORTEXX2 b/kernel/arm64/KERNEL.CORTEXX2
index bd25f7cd8..bc5999097 100644
--- a/kernel/arm64/KERNEL.CORTEXX2
+++ b/kernel/arm64/KERNEL.CORTEXX2
@@ -1,216 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE

From 9019bc494514a74c2042152cdca0a36adea7b42f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 4 Nov 2023 22:10:06 +0100
Subject: [PATCH 380/718] Use SkylakeX ?ASUM microkernel for
 Cooperlake/Sapphirerapids as well

---
 kernel/x86_64/casum.c | 2 +-
 kernel/x86_64/dasum.c | 2 +-
 kernel/x86_64/sasum.c | 2 +-
 kernel/x86_64/zasum.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c
index 60feec0ce..e4d054311 100644
--- a/kernel/x86_64/casum.c
+++ b/kernel/x86_64/casum.c
@@ -4,7 +4,7 @@
 #define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
 #endif
 
-#if defined(SKYLAKEX)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 #include "casum_microk_skylakex-2.c"
 #endif
 
diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c
index a9c40f38f..0147c6978 100644
--- a/kernel/x86_64/dasum.c
+++ b/kernel/x86_64/dasum.c
@@ -4,7 +4,7 @@
 #define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
 #endif
 
-#if defined(SKYLAKEX)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 #include "dasum_microk_skylakex-2.c"
 #elif defined(HASWELL) || defined(ZEN)
 #include "dasum_microk_haswell-2.c"
diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c
index 37a92468f..3f22cb97a 100644
--- a/kernel/x86_64/sasum.c
+++ b/kernel/x86_64/sasum.c
@@ -9,7 +9,7 @@
 
 #endif
 
-#if defined(SKYLAKEX)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 #include "sasum_microk_skylakex-2.c"
 #elif defined(HASWELL) || defined(ZEN)
 #include "sasum_microk_haswell-2.c"
diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c
index 80e95a2c8..3f17ab1cf 100644
--- a/kernel/x86_64/zasum.c
+++ b/kernel/x86_64/zasum.c
@@ -4,7 +4,7 @@
 #define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
 #endif
 
-#if defined(SKYLAKEX)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 #include "zasum_microk_skylakex-2.c"
 #endif
 

From 04bc801999e8d6e6ed101a1ab8ec9720f271ad2c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 4 Nov 2023 23:48:59 +0100
Subject: [PATCH 381/718] (Re)apply fixes for supporting only a subset of
 precision types from PR 3915

---
 kernel/Makefile.L3 | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 448e096a3..863f376e9 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -182,7 +182,7 @@ ifeq ($(BUILD_BFLOAT16),1)
 SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
 endif
 
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" ""
+ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
 SBLASOBJS	+= \
 	sgemm_beta$(TSUFFIX).$(SUFFIX) \
 	strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
@@ -191,7 +191,7 @@ SBLASOBJS	+= \
 	strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) 
 endif
 
-ifeq ($(BUILD_DOUBLE),1)
+ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
 DBLASOBJS	+= \
 	dgemm_beta$(TSUFFIX).$(SUFFIX) \
 	dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
@@ -207,7 +207,7 @@ QBLASOBJS	+= \
 	qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
 	qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX)
 
-ifeq ($(BUILD_COMPLEX),1)
+ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
 CBLASOBJS	+= \
 	ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
 	ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
@@ -255,7 +255,7 @@ XBLASOBJS    +=  xgemm3m_kernel$(TSUFFIX).$(SUFFIX)
 
 endif
 
-ifeq ($(BUILD_SINGLE),1)
+ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
 SBLASOBJS += \
 	strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \
 	strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
@@ -264,10 +264,7 @@ SBLASOBJS += \
 	strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \
 	strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \
 	strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) 
-endif
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" ""
-SBLASOBJS += \
+	strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \
@@ -275,10 +272,7 @@ SBLASOBJS += \
 	strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) 
-endif
-ifeq ($(BUILD_SINGLE),1)
-SBLASOBJS += \
+	strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \
 	ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \
 	ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX)
 endif
@@ -400,7 +394,7 @@ XBLASOBJS += \
 
 ifeq ($(USE_GEMM3M), 1)
 
-ifeq ($(BUILD_COMPLEX),1)
+ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
 CBLASOBJS += \
 	cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
 	cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \

From 3a86fde5a7f762fe94196c6e090377721bc551b5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 4 Nov 2023 23:52:46 +0100
Subject: [PATCH 382/718] Delete misplaced file from PR 3915 after applying in
 the correct place

---
 Makefile.L3 | 5089 ---------------------------------------------------
 1 file changed, 5089 deletions(-)
 delete mode 100644 Makefile.L3

diff --git a/Makefile.L3 b/Makefile.L3
deleted file mode 100644
index 76586d826..000000000
--- a/Makefile.L3
+++ /dev/null
@@ -1,5089 +0,0 @@
-USE_GEMM3M = 0
-OS := $(shell uname)
-
-ifeq ($(ARCH), x86)
-USE_GEMM3M = 1
-endif
-
-ifeq ($(ARCH), x86_64)
-USE_GEMM3M = 1
-endif
-
-ifeq ($(ARCH), x86_64)
-USE_DIRECT_SGEMM = 1
-endif
-
-ifeq ($(ARCH), ia64)
-USE_GEMM3M = 1
-endif
-
-ifeq ($(ARCH), arm)
-USE_TRMM = 1
-endif
-
-ifeq ($(ARCH), arm64)
-USE_TRMM = 1
-endif
-
-ifeq ($(ARCH), riscv64)
-USE_TRMM = 1
-endif
-
-ifneq ($(DYNAMIC_ARCH), 1)
-ifeq ($(TARGET), GENERIC)
-USE_TRMM = 1
-endif
-endif
-
-ifeq ($(CORE), HASWELL)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), SKYLAKEX)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), COOPERLAKE)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), SAPPHIRERAPIDS)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), ZEN)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), POWER8)
-ifeq ($(BINARY64),1)
-USE_TRMM = 1
-endif
-endif
-
-ifeq ($(CORE), POWER9)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), POWER10)
-USE_TRMM = 1
-endif
-
-ifeq ($(ARCH), zarch)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), Z14)
-USE_TRMM = 1
-endif
-
-ifdef USE_DIRECT_SGEMM
-ifndef SGEMMDIRECTKERNEL
-SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c
-SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
-endif
-endif
-
-ifeq ($(BUILD_BFLOAT16), 1)
-ifndef SBGEMMKERNEL
-SBGEMM_BETA = ../generic/gemm_beta.c
-SBGEMMKERNEL    = ../generic/gemmkernel_2x2.c
-SBGEMMINCOPY    = ../generic/gemm_ncopy_2.c
-SBGEMMITCOPY    = ../generic/gemm_tcopy_2.c
-SBGEMMONCOPY    = ../generic/gemm_ncopy_2.c
-SBGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
-SBGEMMINCOPYOBJ =  sbgemm_incopy$(TSUFFIX).$(SUFFIX)
-SBGEMMITCOPYOBJ =  sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SBGEMMONCOPYOBJ =  sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-SBKERNELOBJS	+= \
-	sbgemm_kernel$(TSUFFIX).$(SUFFIX) \
-	$(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \
-	$(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
-endif
-
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
-SKERNELOBJS	+= \
-	sgemm_kernel$(TSUFFIX).$(SUFFIX) \
-	sgemm_beta$(TSUFFIX).$(SUFFIX) \
-	$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
-	$(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ)
-
-ifdef USE_DIRECT_SGEMM
-SKERNELOBJS += \
-	sgemm_direct$(TSUFFIX).$(SUFFIX) \
-	sgemm_direct_performant$(TSUFFIX).$(SUFFIX) 
-endif
-endif
-
-ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
-DKERNELOBJS	+= \
-	dgemm_beta$(TSUFFIX).$(SUFFIX) \
-	dgemm_kernel$(TSUFFIX).$(SUFFIX) \
-	$(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \
-	$(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ)
-endif
-
-QKERNELOBJS	+= \
-	qgemm_kernel$(TSUFFIX).$(SUFFIX) \
-	$(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \
-	$(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ)
-
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CKERNELOBJS	+= \
-	cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
-	cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \
-	$(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \
-	$(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ)
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZKERNELOBJS	+= \
-	zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
-	zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \
-	$(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \
-	$(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ)
-endif
-
-XKERNELOBJS	+= \
-	xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
-	xgemm_kernel_l$(TSUFFIX).$(SUFFIX) xgemm_kernel_b$(TSUFFIX).$(SUFFIX) \
-	$(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \
-	$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)
-
-ifeq ($(BUILD_BFLOAT16),1)
-SBBLASOBJS      += $(SBKERNELOBJS)
-endif
-SBLASOBJS	+= $(SKERNELOBJS)
-DBLASOBJS	+= $(DKERNELOBJS)
-QBLASOBJS	+= $(QKERNELOBJS)
-CBLASOBJS	+= $(CKERNELOBJS)
-ZBLASOBJS	+= $(ZKERNELOBJS)
-XBLASOBJS	+= $(XKERNELOBJS)
-
-ifeq ($(BUILD_BFLOAT16),1)
-SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
-endif
-
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
-SBLASOBJS	+= \
-	sgemm_beta$(TSUFFIX).$(SUFFIX) \
-	strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
-DBLASOBJS	+= \
-	dgemm_beta$(TSUFFIX).$(SUFFIX) \
-	dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) 
-endif
-
-QBLASOBJS	+= \
-	qgemm_beta$(TSUFFIX).$(SUFFIX) \
-	qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX)
-
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CBLASOBJS	+= \
-	ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX)
-endif
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CBLASOBJS	+= \
-	cgemm_beta$(TSUFFIX).$(SUFFIX) \
-	ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZBLASOBJS	+= \
-	zgemm_beta$(TSUFFIX).$(SUFFIX) \
-	ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \
-	ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) 
-endif
-
-XBLASOBJS	+= \
-	xgemm_beta$(TSUFFIX).$(SUFFIX) \
-	xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \
-	xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) 
-
-ifeq ($(USE_GEMM3M), 1)
-
-CBLASOBJS    +=  cgemm3m_kernel$(TSUFFIX).$(SUFFIX)
-ZBLASOBJS    +=  zgemm3m_kernel$(TSUFFIX).$(SUFFIX)
-XBLASOBJS    +=  xgemm3m_kernel$(TSUFFIX).$(SUFFIX)
-
-endif
-
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
-SBLASOBJS += \
-	strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_iltucopy$(TSUFFIX).$(SUFFIX) strsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(BUILD_DOUBLE),1)
-DBLASOBJS += \
-	dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-QBLASOBJS += \
-	qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_outucopy$(TSUFFIX).$(SUFFIX) qtrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) 
-
-ifeq ($(BUILD_COMPLEX),1)
-CBLASOBJS += \
-	ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \
-	chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX)
-endif
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CBLASOBJS += \
-	ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZBLASOBJS += \
-	ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_outucopy$(TSUFFIX).$(SUFFIX) ztrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	zsymm_iutcopy$(TSUFFIX).$(SUFFIX) zsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \
-	zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-XBLASOBJS += \
-	xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_outucopy$(TSUFFIX).$(SUFFIX) xtrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_outucopy$(TSUFFIX).$(SUFFIX) xtrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	xsymm_iutcopy$(TSUFFIX).$(SUFFIX) xsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	xsymm_outcopy$(TSUFFIX).$(SUFFIX) xsymm_oltcopy$(TSUFFIX).$(SUFFIX) \
-	xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX)
-
-ifeq ($(USE_GEMM3M), 1)
-
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CBLASOBJS += \
-	cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \
-	csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \
-	chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZBLASOBJS += \
-	zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX)
-endif
-
-XBLASOBJS += \
-	xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX)
-
-endif
-
-######  BLAS small matrix optimization #####
-ifeq ($(SMALL_MATRIX_OPT), 1)
-
-ifeq ($(BUILD_BFLOAT16),1)
-SBBLASOBJS += \
-	sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
-endif
-
-SBLASOBJS += \
-	sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
-
-DBLASOBJS += \
-	dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
-
-CBLASOBJS += \
-	cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX)
-
-ZBLASOBJS += \
-	zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX)
-
-endif
-
-######  BLAS extensions #####
-
-ifeq ($(BUILD_SINGLE),1)
-SBLASOBJS += \
-	somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	sgeadd_k$(TSUFFIX).$(SUFFIX) 
-endif
-ifeq ($(BUILD_DOUBLE),1)
-DBLASOBJS += \
-	domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	dgeadd_k$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_COMPLEX),1)
-CBLASOBJS += \
-	comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
-	comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
-	cimatcopy_k_cn$(TSUFFIX).$(SUFFIX)  cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	cimatcopy_k_ct$(TSUFFIX).$(SUFFIX)  cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
-	cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
-	cgeadd_k$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZBLASOBJS += \
-	zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
-	zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
-	zimatcopy_k_cn$(TSUFFIX).$(SUFFIX)  zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	zimatcopy_k_ct$(TSUFFIX).$(SUFFIX)  zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
-	zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
-	zgeadd_k$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_BFLOAT16), 1)
-SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-endif
-
-SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SGEMMOTCOPYOBJ_P = $(SGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-DGEMMINCOPYOBJ_P = $(DGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-DGEMMITCOPYOBJ_P = $(DGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-DGEMMONCOPYOBJ_P = $(DGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-DGEMMOTCOPYOBJ_P = $(DGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-QGEMMINCOPYOBJ_P = $(QGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-QGEMMITCOPYOBJ_P = $(QGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-QGEMMONCOPYOBJ_P = $(QGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-QGEMMOTCOPYOBJ_P = $(QGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-CGEMMINCOPYOBJ_P = $(CGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-CGEMMITCOPYOBJ_P = $(CGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-CGEMMONCOPYOBJ_P = $(CGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-CGEMMOTCOPYOBJ_P = $(CGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-ZGEMMINCOPYOBJ_P = $(ZGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-ZGEMMITCOPYOBJ_P = $(ZGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-ZGEMMONCOPYOBJ_P = $(ZGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-ZGEMMOTCOPYOBJ_P = $(ZGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-XGEMMINCOPYOBJ_P = $(XGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-
-ifeq ($(BUILD_BFLOAT16),1)
-$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)qgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)cgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_BETA)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
-
-ifeq ($(ARCH), E2K)
-USE_TRMM = 1
-endif
-
-
-ifeq ($(BUILD_BFLOAT16), 1)
-
-$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY)
-
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s
-	m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@
-	rm sbgemmotcopy.s sbgemmotcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
-
-$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s
-	m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@
-	rm sbgemmitcopy.s sbgemmitcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-endif
-endif
-
-$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s
-	m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
-	rm sgemmotcopy.s sgemmotcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-endif	
-
-
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-
-$(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s
-	m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
-	rm sgemmitcopy.s sgemmitcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ 
-endif
-	
-endif
-
-$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s
-	m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
-	rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-$(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s
-	m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
-	rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-endif
-
-endif
-
-ifdef EXPRECISION
-
-$(KDIR)$(QGEMMONCOPYOBJ) : $(KERNELDIR)/$(QGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(QGEMMOTCOPYOBJ) : $(KERNELDIR)/$(QGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N))
-
-$(KDIR)$(QGEMMINCOPYOBJ) : $(KERNELDIR)/$(QGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(QGEMMITCOPYOBJ) : $(KERNELDIR)/$(QGEMMITCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-endif
-
-$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-
-$(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s
-	m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
-	rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-endif
-
-$(KDIR)$(ZGEMMONCOPYOBJ) : $(KERNELDIR)/$(ZGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(ZGEMMOTCOPYOBJ) : $(KERNELDIR)/$(ZGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-
-$(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s
-	m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
-	rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-endif
-
-endif
-
-ifdef EXPRECISION
-
-$(KDIR)$(XGEMMONCOPYOBJ) : $(KERNELDIR)/$(XGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(XGEMMOTCOPYOBJ) : $(KERNELDIR)/$(XGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N))
-
-$(KDIR)$(XGEMMINCOPYOBJ) : $(KERNELDIR)/$(XGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(XGEMMITCOPYOBJ) : $(KERNELDIR)/$(XGEMMITCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-endif
-
-$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX  $< -o - > sgemm_kernel$(TSUFFIX).s
-	m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
-	rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-ifdef USE_DIRECT_SGEMM
-$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-ifeq ($(BUILD_BFLOAT16), 1)
-
-$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX  $< -o - > sbgemm_kernel$(TSUFFIX).s
-	m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@
-	rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-endif
-
-$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
-	m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
-	rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s
-	m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
-	rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-endif
-
-$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s
-	m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
-	rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
-endif
-
-$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC  $< -o - > cgemm_kernel_r.s
-	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
-	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
-endif
-
-$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s
-	m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
-	rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
-endif
-
-$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s
-	m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
-	rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
-else ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-endif
-
-$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s
-	m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
-	rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
-else ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
-endif
-
-$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s
-	m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
-	rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
-else ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
-endif
-
-$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s
-	m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
-	rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
-else ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
-endif
-
-$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)xgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
-
-
-ifdef USE_TRMM
-$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s	
-	m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
-	rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-endif
-
-$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s	
-	m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
-	rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s	
-	m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
-	rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-endif
-
-$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
-	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s
-	m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
-	rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s
-	m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
-	rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s
-	m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
-	rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s
-	m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
-	rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN  $< -o - > ctrmm_kernel_ln.s
-	m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
-	rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s
-	m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
-	rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s
-	m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN  ctrmm_kernel_lr_nomacros.s -o $@
-	rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s
-	m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
-	rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s
-	m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
-	rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s
-	m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
-	rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s
-	m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
-	rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s
-	m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
-	rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s
-	m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
-	rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s
-	m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
-	rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s
-	m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
-	rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s
-	m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
-	rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s 
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s
-	m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
-	rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s
-	m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
-	rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s
-	m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
-	rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s
-	m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
-	rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-endif
-
-else
-$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
-	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-endif
-$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-endif
-endif
-
-
-
-
-$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)zgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)xgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)strsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s
-	m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
-	rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
-else
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-endif
-
-$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-
-ifdef STRMMUNCOPY_M
-$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef STRMMLNCOPY_M
-$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef STRMMUTCOPY_M
-$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef STRMMLTCOPY_M
-$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef DTRMMUNCOPY_M
-$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef DTRMMLNCOPY_M
-$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef DTRMMUTCOPY_M
-$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef DTRMMLTCOPY_M
-$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef CTRMMUNCOPY_M
-$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef CTRMMLNCOPY_M
-$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef CTRMMUTCOPY_M
-$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef CTRMMLTCOPY_M
-$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef ZTRMMUNCOPY_M
-$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRMMLNCOPY_M
-$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRMMUTCOPY_M
-$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRMMLTCOPY_M
-$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-ifdef SSYMMUCOPY_M
-$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-else
-$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-endif
-
-ifdef SSYMMLCOPY_M
-$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-else
-$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-endif
-
-$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-ifdef DSYMMUCOPY_M
-$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-else
-$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-endif
-
-ifdef DSYMMLCOPY_M
-$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-else
-$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-endif
-
-$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)qsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)qsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)qsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-ifdef CSYMMUCOPY_M
-$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-else
-$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-endif
-
-ifdef CSYMMLCOPY_M
-$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-else
-$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-endif
-
-$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-ifdef ZSYMMUCOPY_M
-$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-else
-$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-endif
-
-ifdef ZSYMMLCOPY_M
-$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-else
-$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-endif
-
-$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)xsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)xsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)xsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-ifdef CHEMMUTCOPY_M
-$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-else
-$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-endif
-
-ifdef CHEMMLTCOPY_M
-$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-else
-$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-endif
-
-$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-ifdef ZHEMMUTCOPY_M
-$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-else
-$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-endif
-
-ifdef ZHEMMLTCOPY_M
-$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-else
-$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-endif
-
-$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)xhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-$(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-
-$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-
-$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-ifdef TRSMCOPYUN_M
-$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYLN_M
-$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYUT_M
-$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYLT_M
-$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef TRSMCOPYUN_M
-$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYLN_M
-$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYUT_M
-$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYLT_M
-$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef ZTRSMCOPYUN_M
-$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYLN_M
-$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYUT_M
-$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYLT_M
-$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef ZTRSMCOPYUN_M
-$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYLN_M
-$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYUT_M
-$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYLT_M
-$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-
-$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifeq ($(BUILD_BFLOAT16),1)
-$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)qgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)cgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM_BETA)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
-
-
-ifeq ($(BUILD_BFLOAT16), 1)
-$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
-$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-endif
-endif
-
-$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(SGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-
-$(SGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-$(DGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-ifdef EXPRECISION
-
-$(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(QGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N))
-
-$(QGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(QGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-endif
-
-$(CGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(CGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-
-$(CGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-$(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(ZGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-
-$(ZGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-ifdef EXPRECISION
-
-$(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(XGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N))
-
-$(XGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(XGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-endif
-
-
-ifeq ($(BUILD_BFLOAT16), 1)
-$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)cgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
-	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
-	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s 
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
-endif
-
-$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
-
-$(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)zgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
-
-$(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)xgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
-
-$(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
-	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)cgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)zgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)xgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)strsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-
-$(KDIR)strmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ssymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)ssymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)ssymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)ssymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)dsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)dsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)dsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)dsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)qsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)qsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)qsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)qsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)csymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)csymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)csymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)csymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)zsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)zsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)zsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)zsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)xsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)xsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)xsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)xsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)chemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)chemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-$(KDIR)chemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-
-$(KDIR)chemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-
-$(KDIR)zhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)zhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-$(KDIR)zhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-
-$(KDIR)zhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-
-$(KDIR)xhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)xhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-$(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-
-$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-
-$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-
-#####  BLAS extensions ######
-
-ifndef DOMATCOPY_CN
-DOMATCOPY_CN = ../arm/omatcopy_cn.c
-endif
-
-$(KDIR)domatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DOMATCOPY_RN
-DOMATCOPY_RN = ../arm/omatcopy_rn.c
-endif
-
-$(KDIR)domatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef DOMATCOPY_CT
-DOMATCOPY_CT = ../arm/omatcopy_ct.c
-endif
-
-$(KDIR)domatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DOMATCOPY_RT
-DOMATCOPY_RT = ../arm/omatcopy_rt.c
-endif
-
-$(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef DIMATCOPY_CN
-DIMATCOPY_CN = ../generic/imatcopy_cn.c
-endif
-
-$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DIMATCOPY_RN
-DIMATCOPY_RN = ../generic/imatcopy_rn.c
-endif
-
-$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef DIMATCOPY_CT
-DIMATCOPY_CT = ../generic/imatcopy_ct.c
-endif
-
-$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DIMATCOPY_RT
-DIMATCOPY_RT = ../generic/imatcopy_rt.c
-endif
-
-$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef SOMATCOPY_CN
-SOMATCOPY_CN = ../arm/omatcopy_cn.c
-endif
-
-$(KDIR)somatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef SOMATCOPY_RN
-SOMATCOPY_RN = ../arm/omatcopy_rn.c
-endif
-
-$(KDIR)somatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef SOMATCOPY_CT
-SOMATCOPY_CT = ../arm/omatcopy_ct.c
-endif
-
-$(KDIR)somatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef SOMATCOPY_RT
-SOMATCOPY_RT = ../arm/omatcopy_rt.c
-endif
-
-$(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef SIMATCOPY_CN
-SIMATCOPY_CN = ../generic/imatcopy_cn.c
-endif
-
-$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef SIMATCOPY_RN
-SIMATCOPY_RN = ../generic/imatcopy_rn.c
-endif
-
-$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef SIMATCOPY_CT
-SIMATCOPY_CT = ../generic/imatcopy_ct.c
-endif
-
-$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef SIMATCOPY_RT
-SIMATCOPY_RT = ../generic/imatcopy_rt.c
-endif
-
-$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-
-ifndef COMATCOPY_CN
-COMATCOPY_CN = ../arm/zomatcopy_cn.c
-endif
-
-$(KDIR)comatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef COMATCOPY_RN
-COMATCOPY_RN = ../arm/zomatcopy_rn.c
-endif
-
-$(KDIR)comatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef COMATCOPY_CT
-COMATCOPY_CT = ../arm/zomatcopy_ct.c
-endif
-
-$(KDIR)comatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef COMATCOPY_RT
-COMATCOPY_RT = ../arm/zomatcopy_rt.c
-endif
-
-$(KDIR)comatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef COMATCOPY_CNC
-COMATCOPY_CNC = ../arm/zomatcopy_cnc.c
-endif
-
-$(KDIR)comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CNC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef COMATCOPY_RNC
-COMATCOPY_RNC = ../arm/zomatcopy_rnc.c
-endif
-
-$(KDIR)comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RNC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef COMATCOPY_CTC
-COMATCOPY_CTC = ../arm/zomatcopy_ctc.c
-endif
-
-$(KDIR)comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CTC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef COMATCOPY_RTC
-COMATCOPY_RTC = ../arm/zomatcopy_rtc.c
-endif
-
-$(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef CIMATCOPY_CN
-CIMATCOPY_CN = ../generic/zimatcopy_cn.c
-endif
-
-$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef CIMATCOPY_RN
-CIMATCOPY_RN = ../generic/zimatcopy_rn.c
-endif
-
-$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef CIMATCOPY_CT
-CIMATCOPY_CT = ../generic/zimatcopy_ct.c
-endif
-
-$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef CIMATCOPY_RT
-CIMATCOPY_RT = ../generic/zimatcopy_rt.c
-endif
-
-$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef CIMATCOPY_CNC
-CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c
-endif
-
-$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef CIMATCOPY_RNC
-CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c
-endif
-
-$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef CIMATCOPY_CTC
-CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c
-endif
-
-$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef CIMATCOPY_RTC
-CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c
-endif
-
-$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-
-
-ifndef ZOMATCOPY_CN
-ZOMATCOPY_CN = ../arm/zomatcopy_cn.c
-endif
-
-$(KDIR)zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef ZOMATCOPY_RN
-ZOMATCOPY_RN = ../arm/zomatcopy_rn.c
-endif
-
-$(KDIR)zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef ZOMATCOPY_CT
-ZOMATCOPY_CT = ../arm/zomatcopy_ct.c
-endif
-
-$(KDIR)zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef ZOMATCOPY_RT
-ZOMATCOPY_RT = ../arm/zomatcopy_rt.c
-endif
-
-$(KDIR)zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef ZOMATCOPY_CNC
-ZOMATCOPY_CNC = ../arm/zomatcopy_cnc.c
-endif
-
-$(KDIR)zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CNC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef ZOMATCOPY_RNC
-ZOMATCOPY_RNC = ../arm/zomatcopy_rnc.c
-endif
-
-$(KDIR)zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RNC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef ZOMATCOPY_CTC
-ZOMATCOPY_CTC = ../arm/zomatcopy_ctc.c
-endif
-
-$(KDIR)zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CTC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef ZOMATCOPY_RTC
-ZOMATCOPY_RTC = ../arm/zomatcopy_rtc.c
-endif
-
-$(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef ZIMATCOPY_CN
-ZIMATCOPY_CN = ../generic/zimatcopy_cn.c
-endif
-
-$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef ZIMATCOPY_RN
-ZIMATCOPY_RN = ../generic/zimatcopy_rn.c
-endif
-
-$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef ZIMATCOPY_CT
-ZIMATCOPY_CT = ../generic/zimatcopy_ct.c
-endif
-
-$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef ZIMATCOPY_RT
-ZIMATCOPY_RT = ../generic/zimatcopy_rt.c
-endif
-
-$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef ZIMATCOPY_CNC
-ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c
-endif
-
-$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef ZIMATCOPY_RNC
-ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c
-endif
-
-$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef ZIMATCOPY_CTC
-ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c
-endif
-
-$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef ZIMATCOPY_RTC
-ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c
-endif
-
-$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-
-ifndef SGEADD_K
-SGEADD_K = ../generic/geadd.c
-endif
-
-$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DGEADD_K
-DGEADD_K = ../generic/geadd.c
-endif
-
-$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef CGEADD_K
-CGEADD_K = ../generic/zgeadd.c
-endif
-
-$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@
-
-ifndef ZGEADD_K
-ZGEADD_K = ../generic/zgeadd.c
-endif
-
-$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@
-
-
-
-######  BLAS small matrix optimization #####
-
-ifndef DGEMM_SMALL_M_PERMIT
-DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
-endif
-
-ifndef DGEMM_SMALL_K_NN
-DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef DGEMM_SMALL_K_NT
-DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef DGEMM_SMALL_K_TN
-DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef DGEMM_SMALL_K_TT
-DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifndef DGEMM_SMALL_K_B0_NN
-DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef DGEMM_SMALL_K_B0_NT
-DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef DGEMM_SMALL_K_B0_TN
-DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef DGEMM_SMALL_K_B0_TT
-DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-ifndef SGEMM_SMALL_M_PERMIT
-SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
-endif
-
-ifndef SGEMM_SMALL_K_NN
-SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef SGEMM_SMALL_K_NT
-SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef SGEMM_SMALL_K_TN
-SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef SGEMM_SMALL_K_TT
-SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifndef SGEMM_SMALL_K_B0_NN
-SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef SGEMM_SMALL_K_B0_NT
-SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef SGEMM_SMALL_K_B0_TN
-SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef SGEMM_SMALL_K_B0_TT
-SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-
-ifeq ($(BUILD_BFLOAT16), 1)
-ifndef SBGEMM_SMALL_M_PERMIT
-SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
-endif
-
-ifndef SBGEMM_SMALL_K_NN
-SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef SBGEMM_SMALL_K_NT
-SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef SBGEMM_SMALL_K_TN
-SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef SBGEMM_SMALL_K_TT
-SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-ifndef SBGEMM_SMALL_K_B0_NN
-SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef SBGEMM_SMALL_K_B0_NT
-SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef SBGEMM_SMALL_K_B0_TN
-SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef SBGEMM_SMALL_K_B0_TT
-SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-endif
-
-ifndef CGEMM_SMALL_M_PERMIT
-CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
-endif
-
-ifndef CGEMM_SMALL_K_NN
-CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
-endif
-
-ifndef CGEMM_SMALL_K_NT
-CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
-endif
-
-ifndef CGEMM_SMALL_K_TN
-CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
-endif
-
-ifndef CGEMM_SMALL_K_TT
-CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-	
-$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@
-
-$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@
-
-$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@
-
-$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@
-
-$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@
-
-$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@
-
-$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@
-
-$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@
-
-$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@
-
-$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@
-
-$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@
-
-$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@
-
-$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
-
-ifndef CGEMM_SMALL_K_B0_NN
-CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
-endif
-
-ifndef CGEMM_SMALL_K_B0_NT
-CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
-endif
-
-ifndef CGEMM_SMALL_K_B0_TN
-CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
-endif
-
-ifndef CGEMM_SMALL_K_B0_TT
-CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@
-	
-$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@
-
-ifndef ZGEMM_SMALL_M_PERMIT
-ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
-endif
-
-ifndef ZGEMM_SMALL_K_NN
-ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
-endif
-
-ifndef ZGEMM_SMALL_K_NT
-ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
-endif
-
-ifndef ZGEMM_SMALL_K_TN
-ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
-endif
-
-ifndef ZGEMM_SMALL_K_TT
-ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@
-
-
-$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-	
-$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@
-
-$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@
-
-$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@
-
-$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@
-
-$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@
-
-$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@
-
-$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@
-
-$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@
-
-$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@
-
-$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@
-
-$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@
-
-$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@
-
-$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
-
-ifndef ZGEMM_SMALL_K_B0_NN
-ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
-endif
-
-ifndef ZGEMM_SMALL_K_B0_NT
-ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
-endif
-
-ifndef ZGEMM_SMALL_K_B0_TN
-ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
-endif
-
-ifndef ZGEMM_SMALL_K_B0_TT
-ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@
-	
-$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@

From cd8eb83bae95f0ccc5308d1612b60d505847296e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 5 Nov 2023 18:13:37 +0100
Subject: [PATCH 383/718] Fix allocations and compiler warnings in ZROTG
 (#4289)

* Clean up ZROTG
---
 interface/zrotg.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/interface/zrotg.c b/interface/zrotg.c
index 4d2a9d510..ea73352dd 100644
--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@@ -30,14 +30,12 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
   FLOAT db_r = *(DB+0);
   FLOAT db_i = *(DB+1);
   //long double r;
-  FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT));
-  FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT));
+  FLOAT S1[2];
+  FLOAT R[2];
   long double d;
 
   FLOAT ada =  da_r * da_r + da_i * da_i; 
   FLOAT adb =  db_r * db_r + db_i * db_i; 
-  FLOAT adart = sqrt( da_r * da_r + da_i * da_i); 
-  FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i); 
 
   PRINT_DEBUG_NAME;
 
@@ -115,10 +113,13 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
 			}
 	    	    } else {
 		        *C = ada / adahsq;
-		        if (*C >= safmin) 
+		        if (*C >= safmin) {
 			    *R = *DA / *C;
-		        else
+			    *(R+1) = *(DA+1) / *(C+1);
+			} else {
 			    *R = *DA * (h / adahsq);
+			    *(R+1) = *(DA+1) * (h / adahsq);
+			}
 		        *S = *S1 * ada / adahsq;
 		    	*(S+1) = *(S1+1) * ada / adahsq;
 		    }
@@ -178,4 +179,4 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
 		}
 	}
 }
-	
\ No newline at end of file
+	

From ac7efc61fd41dd4f93bcf471bc8687e96fa882ac Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 6 Nov 2023 00:03:33 +0100
Subject: [PATCH 384/718] Put more build information into Makefile.conf_last

---
 Makefile         | 20 ++++++++++++++++----
 Makefile.install |  8 ++++++++
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 8621a8b3f..5f6643402 100644
--- a/Makefile
+++ b/Makefile
@@ -35,11 +35,7 @@ export NO_LAPACK
 export C_LAPACK
 endif
 
-ifeq ($(F_COMPILER),CRAY)
-LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS))
-else
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
-endif
 
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
 
@@ -210,9 +206,25 @@ ifeq ($(DYNAMIC_OLDER), 1)
 	@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
 endif	
 endif
+	@echo TARGET=$(CORE) >> Makefile.conf_last
 ifdef USE_THREAD
 	@echo USE_THREAD=$(USE_THREAD) >>  Makefile.conf_last
 endif
+ifdef SMP
+ifdef NUM_THREADS
+	@echo NUM_THREADS=$(NUM_THREADS) >>  Makefile.conf_last
+else
+	@echo NUM_THREADS=$(NUM_CORES) >>  Makefile.conf_last
+endif
+endif
+ifeq ($(USE_OPENMP),1)
+	@echo USE_OPENMP=1 >>  Makefile.conf_last
+endif
+ifeq ($(INTERFACE64),1)
+	@echo INTERFACE64=1 >>  Makefile.conf_last
+endif
+	@echo THELIBNAME=$(LIBNAME) >>  Makefile.conf_last
+	@echo THELIBSONAME=$(LIBSONAME) >>  Makefile.conf_last
 	@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	@touch lib.grd
 
diff --git a/Makefile.install b/Makefile.install
index 01899b970..81f959177 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -3,6 +3,14 @@ export GOTOBLAS_MAKEFILE = 1
 -include $(TOPDIR)/Makefile.conf_last
 include ./Makefile.system
 
+ifdef THELIBNAME
+LIBNAME=$(THELIBNAME)
+LIBSONAME=$(THELIBSONAME)
+endif
+ifeq ($(INTERFACE64),1)
+USE_64BITINT=1
+endif
+
 PREFIX ?= /opt/OpenBLAS
 
 OPENBLAS_INCLUDE_DIR := $(PREFIX)/include

From cf8295da5ca2774a04fed14a89d7dc0de25f9146 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 6 Nov 2023 21:22:26 +0100
Subject: [PATCH 385/718] Fix potential  overflow in the calculation of MAXIT

---
 lapack-netlib/SRC/cbdsqr.f | 34 +++++++++++++++++++++++++---------
 lapack-netlib/SRC/zbdsqr.f | 34 +++++++++++++++++++++++++---------
 2 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/lapack-netlib/SRC/cbdsqr.f b/lapack-netlib/SRC/cbdsqr.f
index 40706644e..cf1459ad2 100644
--- a/lapack-netlib/SRC/cbdsqr.f
+++ b/lapack-netlib/SRC/cbdsqr.f
@@ -204,6 +204,17 @@
 *>          algorithm through its inner loop. The algorithms stops
 *>          (and so fails to converge) if the number of passes
 *>          through the inner loop exceeds MAXITR*N**2.
+*>
+*> \endverbatim
+*
+*> \par Note:
+*  ===========
+*>
+*> \verbatim
+*>  Bug report from Cezary Dendek.
+*>  On November 3rd 2023, the INTEGER variable MAXIT = MAXITR*N**2 is
+*>  removed since it can overflow pretty easily (for N larger or equal
+*>  than 18,919). We instead use MAXITDIVN = MAXITR*N.
 *> \endverbatim
 *
 *  Authors:
@@ -214,7 +225,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup bdsqr
 *
 *  =====================================================================
       SUBROUTINE CBDSQR( UPLO, N, NCVT, NRU, NCC, D, E, VT, LDVT, U,
@@ -255,8 +266,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LOWER, ROTATE
-      INTEGER            I, IDIR, ISUB, ITER, J, LL, LLL, M, MAXIT, NM1,
-     $                   NM12, NM13, OLDLL, OLDM
+      INTEGER            I, IDIR, ISUB, ITER, ITERDIVN, J, LL, LLL, M,
+     $                   MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM
       REAL               ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
      $                   SINR, SLL, SMAX, SMIN, SMINOA,
@@ -389,20 +400,21 @@
    40    CONTINUE
    50    CONTINUE
          SMINOA = SMINOA / SQRT( REAL( N ) )
-         THRESH = MAX( TOL*SMINOA, MAXITR*N*N*UNFL )
+         THRESH = MAX( TOL*SMINOA, MAXITR*(N*(N*UNFL)) )
       ELSE
 *
 *        Absolute accuracy desired
 *
-         THRESH = MAX( ABS( TOL )*SMAX, MAXITR*N*N*UNFL )
+         THRESH = MAX( ABS( TOL )*SMAX, MAXITR*(N*(N*UNFL)) )
       END IF
 *
 *     Prepare for main iteration loop for the singular values
 *     (MAXIT is the maximum number of passes through the inner
 *     loop permitted before nonconvergence signalled.)
 *
-      MAXIT = MAXITR*N*N
-      ITER = 0
+      MAXITDIVN = MAXITR*N
+      ITERDIVN = 0
+      ITER = -1
       OLDLL = -1
       OLDM = -1
 *
@@ -418,8 +430,12 @@
 *
       IF( M.LE.1 )
      $   GO TO 160
-      IF( ITER.GT.MAXIT )
-     $   GO TO 200
+      IF( ITER.GE.N ) THEN
+         ITER = ITER - N
+         ITERDIVN = ITERDIVN + 1
+         IF( ITERDIVN.GE.MAXITDIVN )
+     $      GO TO 200
+      END IF
 *
 *     Find diagonal block of matrix to work on
 *
diff --git a/lapack-netlib/SRC/zbdsqr.f b/lapack-netlib/SRC/zbdsqr.f
index faedafc3c..865bb9dd5 100644
--- a/lapack-netlib/SRC/zbdsqr.f
+++ b/lapack-netlib/SRC/zbdsqr.f
@@ -204,6 +204,17 @@
 *>          algorithm through its inner loop. The algorithms stops
 *>          (and so fails to converge) if the number of passes
 *>          through the inner loop exceeds MAXITR*N**2.
+*>
+*> \endverbatim
+*
+*> \par Note:
+*  ===========
+*>
+*> \verbatim
+*>  Bug report from Cezary Dendek.
+*>  On November 3rd 2023, the INTEGER variable MAXIT = MAXITR*N**2 is
+*>  removed since it can overflow pretty easily (for N larger or equal
+*>  than 18,919). We instead use MAXITDIVN = MAXITR*N.
 *> \endverbatim
 *
 *  Authors:
@@ -214,7 +225,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup bdsqr
 *
 *  =====================================================================
       SUBROUTINE ZBDSQR( UPLO, N, NCVT, NRU, NCC, D, E, VT, LDVT, U,
@@ -255,8 +266,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LOWER, ROTATE
-      INTEGER            I, IDIR, ISUB, ITER, J, LL, LLL, M, MAXIT, NM1,
-     $                   NM12, NM13, OLDLL, OLDM
+      INTEGER            I, IDIR, ISUB, ITER, ITERDIVN, J, LL, LLL, M,
+     $                   MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM
       DOUBLE PRECISION   ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
      $                   SINR, SLL, SMAX, SMIN, SMINOA,
@@ -389,20 +400,21 @@
    40    CONTINUE
    50    CONTINUE
          SMINOA = SMINOA / SQRT( DBLE( N ) )
-         THRESH = MAX( TOL*SMINOA, MAXITR*N*N*UNFL )
+         THRESH = MAX( TOL*SMINOA, MAXITR*(N*(N*UNFL)) )
       ELSE
 *
 *        Absolute accuracy desired
 *
-         THRESH = MAX( ABS( TOL )*SMAX, MAXITR*N*N*UNFL )
+         THRESH = MAX( ABS( TOL )*SMAX, MAXITR*(N*(N*UNFL)) )
       END IF
 *
 *     Prepare for main iteration loop for the singular values
 *     (MAXIT is the maximum number of passes through the inner
 *     loop permitted before nonconvergence signalled.)
 *
-      MAXIT = MAXITR*N*N
-      ITER = 0
+      MAXITDIVN = MAXITR*N
+      ITERDIVN = 0
+      ITER = -1
       OLDLL = -1
       OLDM = -1
 *
@@ -418,8 +430,12 @@
 *
       IF( M.LE.1 )
      $   GO TO 160
-      IF( ITER.GT.MAXIT )
-     $   GO TO 200
+      IF( ITER.GE.N ) THEN
+         ITER = ITER - N
+         ITERDIVN = ITERDIVN + 1
+         IF( ITERDIVN.GE.MAXITDIVN )
+     $      GO TO 200
+      END IF
 *
 *     Find diagonal block of matrix to work on
 *

From f6ec777701d5d5e8271c9c271e260873c9b05911 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 6 Nov 2023 21:40:50 +0100
Subject: [PATCH 386/718] Fix accumulation (Reference-LAPACK PR 909)

---
 lapack-netlib/SRC/classq.f90 | 81 ++++++++++++++++++------------------
 lapack-netlib/SRC/dlassq.f90 | 81 ++++++++++++++++++------------------
 lapack-netlib/SRC/slassq.f90 | 81 ++++++++++++++++++------------------
 lapack-netlib/SRC/zlassq.f90 | 81 ++++++++++++++++++------------------
 4 files changed, 160 insertions(+), 164 deletions(-)

diff --git a/lapack-netlib/SRC/classq.f90 b/lapack-netlib/SRC/classq.f90
index cb4e7971f..c5f793cc0 100644
--- a/lapack-netlib/SRC/classq.f90
+++ b/lapack-netlib/SRC/classq.f90
@@ -34,28 +34,15 @@
 !>
 !> \verbatim
 !>
-!> CLASSQ  returns the values  scl  and  smsq  such that
+!> CLASSQ returns the values scale_out and sumsq_out such that
 !>
-!>    ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+!>    (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq,
 !>
-!> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
 !> assumed to be non-negative.
 !>
 !> scale and sumsq must be supplied in SCALE and SUMSQ and
-!> scl and smsq are overwritten on SCALE and SUMSQ respectively.
-!>
-!> If scale * sqrt( sumsq ) > tbig then
-!>    we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
-!> and if 0 < scale * sqrt( sumsq ) < tsml then
-!>    we require:   scale <= sqrt( HUGE ) / ssml       on entry,
-!> where
-!>    tbig -- upper threshold for values whose square is representable;
-!>    sbig -- scaling constant for big numbers; \see la_constants.f90
-!>    tsml -- lower threshold for values whose square is representable;
-!>    ssml -- scaling constant for small numbers; \see la_constants.f90
-!> and
-!>    TINY*EPS -- tiniest representable number;
-!>    HUGE     -- biggest representable number.
+!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively.
 !>
 !> \endverbatim
 !
@@ -72,7 +59,7 @@
 !> \verbatim
 !>          X is COMPLEX array, dimension (1+(N-1)*abs(INCX))
 !>          The vector for which a scaled sum of squares is computed.
-!>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+!>             x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 !> \endverbatim
 !>
 !> \param[in] INCX
@@ -82,24 +69,24 @@
 !>          If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX = 0, x isn't a vector so there is no need to call
-!>          this subroutine.  If you call it anyway, it will count x(1)
+!>          this subroutine. If you call it anyway, it will count x(1)
 !>          in the vector norm N times.
 !> \endverbatim
 !>
 !> \param[in,out] SCALE
 !> \verbatim
 !>          SCALE is REAL
-!>          On entry, the value  scale  in the equation above.
-!>          On exit, SCALE is overwritten with  scl , the scaling factor
+!>          On entry, the value scale in the equation above.
+!>          On exit, SCALE is overwritten by scale_out, the scaling factor
 !>          for the sum of squares.
 !> \endverbatim
 !>
 !> \param[in,out] SUMSQ
 !> \verbatim
 !>          SUMSQ is REAL
-!>          On entry, the value  sumsq  in the equation above.
-!>          On exit, SUMSQ is overwritten with  smsq , the basic sum of
-!>          squares from which  scl  has been factored out.
+!>          On entry, the value sumsq in the equation above.
+!>          On exit, SUMSQ is overwritten by sumsq_out, the basic sum of
+!>          squares from which scale_out has been factored out.
 !> \endverbatim
 !
 !  Authors:
@@ -130,10 +117,10 @@
 !>
 !> \endverbatim
 !
-!> \ingroup OTHERauxiliary
+!> \ingroup lassq
 !
 !  =====================================================================
-subroutine CLASSQ( n, x, incx, scl, sumsq )
+subroutine CLASSQ( n, x, incx, scale, sumsq )
    use LA_CONSTANTS, &
       only: wp=>sp, zero=>szero, one=>sone, &
             sbig=>ssbig, ssml=>sssml, tbig=>stbig, tsml=>stsml
@@ -145,7 +132,7 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
 !
 !  .. Scalar Arguments ..
    integer :: incx, n
-   real(wp) :: scl, sumsq
+   real(wp) :: scale, sumsq
 !  ..
 !  .. Array Arguments ..
    complex(wp) :: x(*)
@@ -158,10 +145,10 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
 !
 !  Quick return if possible
 !
-   if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return
-   if( sumsq == zero ) scl = one
-   if( scl == zero ) then
-      scl = one
+   if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return
+   if( sumsq == zero ) scale = one
+   if( scale == zero ) then
+      scale = one
       sumsq = zero
    end if
    if (n <= 0) then
@@ -207,15 +194,27 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
 !  Put the existing sum of squares into one of the accumulators
 !
    if( sumsq > zero ) then
-      ax = scl*sqrt( sumsq )
+      ax = scale*sqrt( sumsq )
       if (ax > tbig) then
-!        We assume scl >= sqrt( TINY*EPS ) / sbig
-         abig = abig + (scl*sbig)**2 * sumsq
+         if (scale > one) then
+            scale = scale * sbig
+            abig = abig + scale * (scale * sumsq)
+         else
+            ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable
+            abig = abig + scale * (scale * (sbig * (sbig * sumsq)))
+         end if
       else if (ax < tsml) then
-!        We assume scl <= sqrt( HUGE ) / ssml
-         if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+         if (notbig) then
+            if (scale < one) then
+               scale = scale * ssml
+               asml = asml + scale * (scale * sumsq)
+            else
+               ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable
+               asml = asml + scale * (scale * (ssml * (ssml * sumsq)))
+            end if
+         end if
       else
-         amed = amed + scl**2 * sumsq
+         amed = amed + scale * (scale * sumsq)
       end if
    end if
 !
@@ -229,7 +228,7 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
       if (amed > zero .or. LA_ISNAN(amed)) then
          abig = abig + (amed*sbig)*sbig
       end if
-      scl = one / sbig
+      scale = one / sbig
       sumsq = abig
    else if (asml > zero) then
 !
@@ -245,17 +244,17 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
             ymin = asml
             ymax = amed
          end if
-         scl = one
+         scale = one
          sumsq = ymax**2*( one + (ymin/ymax)**2 )
       else
-         scl = one / ssml
+         scale = one / ssml
          sumsq = asml
       end if
    else
 !
 !     Otherwise all values are mid-range or zero
 !
-      scl = one
+      scale = one
       sumsq = amed
    end if
    return
diff --git a/lapack-netlib/SRC/dlassq.f90 b/lapack-netlib/SRC/dlassq.f90
index fddd1bf38..37626844b 100644
--- a/lapack-netlib/SRC/dlassq.f90
+++ b/lapack-netlib/SRC/dlassq.f90
@@ -34,28 +34,15 @@
 !>
 !> \verbatim
 !>
-!> DLASSQ  returns the values  scl  and  smsq  such that
+!> DLASSQ returns the values scale_out and sumsq_out such that
 !>
-!>    ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+!>    (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq,
 !>
-!> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
 !> assumed to be non-negative.
 !>
 !> scale and sumsq must be supplied in SCALE and SUMSQ and
-!> scl and smsq are overwritten on SCALE and SUMSQ respectively.
-!>
-!> If scale * sqrt( sumsq ) > tbig then
-!>    we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
-!> and if 0 < scale * sqrt( sumsq ) < tsml then
-!>    we require:   scale <= sqrt( HUGE ) / ssml       on entry,
-!> where
-!>    tbig -- upper threshold for values whose square is representable;
-!>    sbig -- scaling constant for big numbers; \see la_constants.f90
-!>    tsml -- lower threshold for values whose square is representable;
-!>    ssml -- scaling constant for small numbers; \see la_constants.f90
-!> and
-!>    TINY*EPS -- tiniest representable number;
-!>    HUGE     -- biggest representable number.
+!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively.
 !>
 !> \endverbatim
 !
@@ -72,7 +59,7 @@
 !> \verbatim
 !>          X is DOUBLE PRECISION array, dimension (1+(N-1)*abs(INCX))
 !>          The vector for which a scaled sum of squares is computed.
-!>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+!>             x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 !> \endverbatim
 !>
 !> \param[in] INCX
@@ -82,24 +69,24 @@
 !>          If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX = 0, x isn't a vector so there is no need to call
-!>          this subroutine.  If you call it anyway, it will count x(1)
+!>          this subroutine. If you call it anyway, it will count x(1)
 !>          in the vector norm N times.
 !> \endverbatim
 !>
 !> \param[in,out] SCALE
 !> \verbatim
 !>          SCALE is DOUBLE PRECISION
-!>          On entry, the value  scale  in the equation above.
-!>          On exit, SCALE is overwritten with  scl , the scaling factor
+!>          On entry, the value scale in the equation above.
+!>          On exit, SCALE is overwritten by scale_out, the scaling factor
 !>          for the sum of squares.
 !> \endverbatim
 !>
 !> \param[in,out] SUMSQ
 !> \verbatim
 !>          SUMSQ is DOUBLE PRECISION
-!>          On entry, the value  sumsq  in the equation above.
-!>          On exit, SUMSQ is overwritten with  smsq , the basic sum of
-!>          squares from which  scl  has been factored out.
+!>          On entry, the value sumsq in the equation above.
+!>          On exit, SUMSQ is overwritten by sumsq_out, the basic sum of
+!>          squares from which scale_out has been factored out.
 !> \endverbatim
 !
 !  Authors:
@@ -130,10 +117,10 @@
 !>
 !> \endverbatim
 !
-!> \ingroup OTHERauxiliary
+!> \ingroup lassq
 !
 !  =====================================================================
-subroutine DLASSQ( n, x, incx, scl, sumsq )
+subroutine DLASSQ( n, x, incx, scale, sumsq )
    use LA_CONSTANTS, &
       only: wp=>dp, zero=>dzero, one=>done, &
             sbig=>dsbig, ssml=>dssml, tbig=>dtbig, tsml=>dtsml
@@ -145,7 +132,7 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
 !
 !  .. Scalar Arguments ..
    integer :: incx, n
-   real(wp) :: scl, sumsq
+   real(wp) :: scale, sumsq
 !  ..
 !  .. Array Arguments ..
    real(wp) :: x(*)
@@ -158,10 +145,10 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
 !
 !  Quick return if possible
 !
-   if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return
-   if( sumsq == zero ) scl = one
-   if( scl == zero ) then
-      scl = one
+   if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return
+   if( sumsq == zero ) scale = one
+   if( scale == zero ) then
+      scale = one
       sumsq = zero
    end if
    if (n <= 0) then
@@ -198,15 +185,27 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
 !  Put the existing sum of squares into one of the accumulators
 !
    if( sumsq > zero ) then
-      ax = scl*sqrt( sumsq )
+      ax = scale*sqrt( sumsq )
       if (ax > tbig) then
-!        We assume scl >= sqrt( TINY*EPS ) / sbig
-         abig = abig + (scl*sbig)**2 * sumsq
+         if (scale > one) then
+            scale = scale * sbig
+            abig = abig + scale * (scale * sumsq)
+         else
+            ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable
+            abig = abig + scale * (scale * (sbig * (sbig * sumsq)))
+         end if
       else if (ax < tsml) then
-!        We assume scl <= sqrt( HUGE ) / ssml
-         if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+         if (notbig) then
+            if (scale < one) then
+               scale = scale * ssml
+               asml = asml + scale * (scale * sumsq)
+            else
+               ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable
+               asml = asml + scale * (scale * (ssml * (ssml * sumsq)))
+            end if
+         end if
       else
-         amed = amed + scl**2 * sumsq
+         amed = amed + scale * (scale * sumsq)
       end if
    end if
 !
@@ -220,7 +219,7 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
       if (amed > zero .or. LA_ISNAN(amed)) then
          abig = abig + (amed*sbig)*sbig
       end if
-      scl = one / sbig
+      scale = one / sbig
       sumsq = abig
    else if (asml > zero) then
 !
@@ -236,17 +235,17 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
             ymin = asml
             ymax = amed
          end if
-         scl = one
+         scale = one
          sumsq = ymax**2*( one + (ymin/ymax)**2 )
       else
-         scl = one / ssml
+         scale = one / ssml
          sumsq = asml
       end if
    else
 !
 !     Otherwise all values are mid-range or zero
 !
-      scl = one
+      scale = one
       sumsq = amed
    end if
    return
diff --git a/lapack-netlib/SRC/slassq.f90 b/lapack-netlib/SRC/slassq.f90
index 19f49402b..c8959f4a7 100644
--- a/lapack-netlib/SRC/slassq.f90
+++ b/lapack-netlib/SRC/slassq.f90
@@ -34,28 +34,15 @@
 !>
 !> \verbatim
 !>
-!> SLASSQ  returns the values  scl  and  smsq  such that
+!> SLASSQ returns the values scale_out and sumsq_out such that
 !>
-!>    ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+!>    (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq,
 !>
-!> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
 !> assumed to be non-negative.
 !>
 !> scale and sumsq must be supplied in SCALE and SUMSQ and
-!> scl and smsq are overwritten on SCALE and SUMSQ respectively.
-!>
-!> If scale * sqrt( sumsq ) > tbig then
-!>    we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
-!> and if 0 < scale * sqrt( sumsq ) < tsml then
-!>    we require:   scale <= sqrt( HUGE ) / ssml       on entry,
-!> where
-!>    tbig -- upper threshold for values whose square is representable;
-!>    sbig -- scaling constant for big numbers; \see la_constants.f90
-!>    tsml -- lower threshold for values whose square is representable;
-!>    ssml -- scaling constant for small numbers; \see la_constants.f90
-!> and
-!>    TINY*EPS -- tiniest representable number;
-!>    HUGE     -- biggest representable number.
+!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively.
 !>
 !> \endverbatim
 !
@@ -72,7 +59,7 @@
 !> \verbatim
 !>          X is REAL array, dimension (1+(N-1)*abs(INCX))
 !>          The vector for which a scaled sum of squares is computed.
-!>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+!>             x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 !> \endverbatim
 !>
 !> \param[in] INCX
@@ -82,24 +69,24 @@
 !>          If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX = 0, x isn't a vector so there is no need to call
-!>          this subroutine.  If you call it anyway, it will count x(1)
+!>          this subroutine. If you call it anyway, it will count x(1)
 !>          in the vector norm N times.
 !> \endverbatim
 !>
 !> \param[in,out] SCALE
 !> \verbatim
 !>          SCALE is REAL
-!>          On entry, the value  scale  in the equation above.
-!>          On exit, SCALE is overwritten with  scl , the scaling factor
+!>          On entry, the value scale in the equation above.
+!>          On exit, SCALE is overwritten by scale_out, the scaling factor
 !>          for the sum of squares.
 !> \endverbatim
 !>
 !> \param[in,out] SUMSQ
 !> \verbatim
 !>          SUMSQ is REAL
-!>          On entry, the value  sumsq  in the equation above.
-!>          On exit, SUMSQ is overwritten with  smsq , the basic sum of
-!>          squares from which  scl  has been factored out.
+!>          On entry, the value sumsq in the equation above.
+!>          On exit, SUMSQ is overwritten by sumsq_out, the basic sum of
+!>          squares from which scale_out has been factored out.
 !> \endverbatim
 !
 !  Authors:
@@ -130,10 +117,10 @@
 !>
 !> \endverbatim
 !
-!> \ingroup OTHERauxiliary
+!> \ingroup lassq
 !
 !  =====================================================================
-subroutine SLASSQ( n, x, incx, scl, sumsq )
+subroutine SLASSQ( n, x, incx, scale, sumsq )
    use LA_CONSTANTS, &
       only: wp=>sp, zero=>szero, one=>sone, &
             sbig=>ssbig, ssml=>sssml, tbig=>stbig, tsml=>stsml
@@ -145,7 +132,7 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
 !
 !  .. Scalar Arguments ..
    integer :: incx, n
-   real(wp) :: scl, sumsq
+   real(wp) :: scale, sumsq
 !  ..
 !  .. Array Arguments ..
    real(wp) :: x(*)
@@ -158,10 +145,10 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
 !
 !  Quick return if possible
 !
-   if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return
-   if( sumsq == zero ) scl = one
-   if( scl == zero ) then
-      scl = one
+   if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return
+   if( sumsq == zero ) scale = one
+   if( scale == zero ) then
+      scale = one
       sumsq = zero
    end if
    if (n <= 0) then
@@ -198,15 +185,27 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
 !  Put the existing sum of squares into one of the accumulators
 !
    if( sumsq > zero ) then
-      ax = scl*sqrt( sumsq )
+      ax = scale*sqrt( sumsq )
       if (ax > tbig) then
-!        We assume scl >= sqrt( TINY*EPS ) / sbig
-         abig = abig + (scl*sbig)**2 * sumsq
+         if (scale > one) then
+            scale = scale * sbig
+            abig = abig + scale * (scale * sumsq)
+         else
+            ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable
+            abig = abig + scale * (scale * (sbig * (sbig * sumsq)))
+         end if
       else if (ax < tsml) then
-!        We assume scl <= sqrt( HUGE ) / ssml
-         if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+         if (notbig) then
+            if (scale < one) then
+               scale = scale * ssml
+               asml = asml + scale * (scale * sumsq)
+            else
+               ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable
+               asml = asml + scale * (scale * (ssml * (ssml * sumsq)))
+            end if
+         end if
       else
-         amed = amed + scl**2 * sumsq
+         amed = amed + scale * (scale * sumsq)
       end if
    end if
 !
@@ -220,7 +219,7 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
       if (amed > zero .or. LA_ISNAN(amed)) then
          abig = abig + (amed*sbig)*sbig
       end if
-      scl = one / sbig
+      scale = one / sbig
       sumsq = abig
    else if (asml > zero) then
 !
@@ -236,17 +235,17 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
             ymin = asml
             ymax = amed
          end if
-         scl = one
+         scale = one
          sumsq = ymax**2*( one + (ymin/ymax)**2 )
       else
-         scl = one / ssml
+         scale = one / ssml
          sumsq = asml
       end if
    else
 !
 !     Otherwise all values are mid-range or zero
 !
-      scl = one
+      scale = one
       sumsq = amed
    end if
    return
diff --git a/lapack-netlib/SRC/zlassq.f90 b/lapack-netlib/SRC/zlassq.f90
index 9346dacac..c35214766 100644
--- a/lapack-netlib/SRC/zlassq.f90
+++ b/lapack-netlib/SRC/zlassq.f90
@@ -34,28 +34,15 @@
 !>
 !> \verbatim
 !>
-!> ZLASSQ  returns the values  scl  and  smsq  such that
+!> ZLASSQ returns the values scale_out and sumsq_out such that
 !>
-!>    ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+!>    (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq,
 !>
-!> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
 !> assumed to be non-negative.
 !>
 !> scale and sumsq must be supplied in SCALE and SUMSQ and
-!> scl and smsq are overwritten on SCALE and SUMSQ respectively.
-!>
-!> If scale * sqrt( sumsq ) > tbig then
-!>    we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
-!> and if 0 < scale * sqrt( sumsq ) < tsml then
-!>    we require:   scale <= sqrt( HUGE ) / ssml       on entry,
-!> where
-!>    tbig -- upper threshold for values whose square is representable;
-!>    sbig -- scaling constant for big numbers; \see la_constants.f90
-!>    tsml -- lower threshold for values whose square is representable;
-!>    ssml -- scaling constant for small numbers; \see la_constants.f90
-!> and
-!>    TINY*EPS -- tiniest representable number;
-!>    HUGE     -- biggest representable number.
+!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively.
 !>
 !> \endverbatim
 !
@@ -72,7 +59,7 @@
 !> \verbatim
 !>          X is DOUBLE COMPLEX array, dimension (1+(N-1)*abs(INCX))
 !>          The vector for which a scaled sum of squares is computed.
-!>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+!>             x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 !> \endverbatim
 !>
 !> \param[in] INCX
@@ -82,24 +69,24 @@
 !>          If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX = 0, x isn't a vector so there is no need to call
-!>          this subroutine.  If you call it anyway, it will count x(1)
+!>          this subroutine. If you call it anyway, it will count x(1)
 !>          in the vector norm N times.
 !> \endverbatim
 !>
 !> \param[in,out] SCALE
 !> \verbatim
 !>          SCALE is DOUBLE PRECISION
-!>          On entry, the value  scale  in the equation above.
-!>          On exit, SCALE is overwritten with  scl , the scaling factor
+!>          On entry, the value scale in the equation above.
+!>          On exit, SCALE is overwritten by scale_out, the scaling factor
 !>          for the sum of squares.
 !> \endverbatim
 !>
 !> \param[in,out] SUMSQ
 !> \verbatim
 !>          SUMSQ is DOUBLE PRECISION
-!>          On entry, the value  sumsq  in the equation above.
-!>          On exit, SUMSQ is overwritten with  smsq , the basic sum of
-!>          squares from which  scl  has been factored out.
+!>          On entry, the value sumsq in the equation above.
+!>          On exit, SUMSQ is overwritten by sumsq_out, the basic sum of
+!>          squares from which scale_out has been factored out.
 !> \endverbatim
 !
 !  Authors:
@@ -130,10 +117,10 @@
 !>
 !> \endverbatim
 !
-!> \ingroup OTHERauxiliary
+!> \ingroup lassq
 !
 !  =====================================================================
-subroutine ZLASSQ( n, x, incx, scl, sumsq )
+subroutine ZLASSQ( n, x, incx, scale, sumsq )
    use LA_CONSTANTS, &
       only: wp=>dp, zero=>dzero, one=>done, &
             sbig=>dsbig, ssml=>dssml, tbig=>dtbig, tsml=>dtsml
@@ -145,7 +132,7 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
 !
 !  .. Scalar Arguments ..
    integer :: incx, n
-   real(wp) :: scl, sumsq
+   real(wp) :: scale, sumsq
 !  ..
 !  .. Array Arguments ..
    complex(wp) :: x(*)
@@ -158,10 +145,10 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
 !
 !  Quick return if possible
 !
-   if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return
-   if( sumsq == zero ) scl = one
-   if( scl == zero ) then
-      scl = one
+   if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return
+   if( sumsq == zero ) scale = one
+   if( scale == zero ) then
+      scale = one
       sumsq = zero
    end if
    if (n <= 0) then
@@ -207,15 +194,27 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
 !  Put the existing sum of squares into one of the accumulators
 !
    if( sumsq > zero ) then
-      ax = scl*sqrt( sumsq )
+      ax = scale*sqrt( sumsq )
       if (ax > tbig) then
-!        We assume scl >= sqrt( TINY*EPS ) / sbig
-         abig = abig + (scl*sbig)**2 * sumsq
+         if (scale > one) then
+            scale = scale * sbig
+            abig = abig + scale * (scale * sumsq)
+         else
+            ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable
+            abig = abig + scale * (scale * (sbig * (sbig * sumsq)))
+         end if
       else if (ax < tsml) then
-!        We assume scl <= sqrt( HUGE ) / ssml
-         if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+         if (notbig) then
+            if (scale < one) then
+               scale = scale * ssml
+               asml = asml + scale * (scale * sumsq)
+            else
+               ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable
+               asml = asml + scale * (scale * (ssml * (ssml * sumsq)))
+            end if
+         end if
       else
-         amed = amed + scl**2 * sumsq
+         amed = amed + scale * (scale * sumsq)
       end if
    end if
 !
@@ -229,7 +228,7 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
       if (amed > zero .or. LA_ISNAN(amed)) then
          abig = abig + (amed*sbig)*sbig
       end if
-      scl = one / sbig
+      scale = one / sbig
       sumsq = abig
    else if (asml > zero) then
 !
@@ -245,17 +244,17 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
             ymin = asml
             ymax = amed
          end if
-         scl = one
+         scale = one
          sumsq = ymax**2*( one + (ymin/ymax)**2 )
       else
-         scl = one / ssml
+         scale = one / ssml
          sumsq = asml
       end if
    else
 !
 !     Otherwise all values are mid-range or zero
 !
-      scl = one
+      scale = one
       sumsq = amed
    end if
    return

From 176cc6348ed3c0415391c42478301430e9bbe031 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 6 Nov 2023 21:55:19 +0100
Subject: [PATCH 387/718] Correct constant ALPHA to match changed algorithm
 (Reference-LAPACK PR 928)

---
 lapack-netlib/SRC/cunbdb6.f | 4 ++--
 lapack-netlib/SRC/dorbdb6.f | 4 ++--
 lapack-netlib/SRC/sorbdb6.f | 4 ++--
 lapack-netlib/SRC/zunbdb6.f | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f
index b93a389d6..cd14d9295 100644
--- a/lapack-netlib/SRC/cunbdb6.f
+++ b/lapack-netlib/SRC/cunbdb6.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb6
 *
 *  =====================================================================
       SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -174,7 +174,7 @@
 *
 *     .. Parameters ..
       REAL               ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.01E0, REALONE = 1.0E0,
+      PARAMETER          ( ALPHA = 0.1E0, REALONE = 1.0E0,
      $                     REALZERO = 0.0E0 )
       COMPLEX            NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0),
diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f
index 45c8ba8a2..142887684 100644
--- a/lapack-netlib/SRC/dorbdb6.f
+++ b/lapack-netlib/SRC/dorbdb6.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup unbdb6
 *
 *  =====================================================================
       SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -174,7 +174,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.01D0, REALONE = 1.0D0,
+      PARAMETER          ( ALPHA = 0.1D0, REALONE = 1.0D0,
      $                     REALZERO = 0.0D0 )
       DOUBLE PRECISION   NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 )
diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f
index b2449e3be..d320c9e46 100644
--- a/lapack-netlib/SRC/sorbdb6.f
+++ b/lapack-netlib/SRC/sorbdb6.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unbdb6
 *
 *  =====================================================================
       SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -174,7 +174,7 @@
 *
 *     .. Parameters ..
       REAL               ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.01E0, REALONE = 1.0E0,
+      PARAMETER          ( ALPHA = 0.1E0, REALONE = 1.0E0,
      $                     REALZERO = 0.0E0 )
       REAL               NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 )
diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f
index ed666e449..ac7fa4be3 100644
--- a/lapack-netlib/SRC/zunbdb6.f
+++ b/lapack-netlib/SRC/zunbdb6.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup unbdb6
 *
 *  =====================================================================
       SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -174,7 +174,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.01D0, REALONE = 1.0D0,
+      PARAMETER          ( ALPHA = 0.1D0, REALONE = 1.0D0,
      $                     REALZERO = 0.0D0 )
       COMPLEX*16         NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0),

From 206e76187ed1e7e78d3f93258eb90fa753a9d1d9 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <chip.kerchner@ibm.com>
Date: Tue, 7 Nov 2023 18:08:57 -0600
Subject: [PATCH 388/718] Fix FCOMMON_OPT for power.  Error out for certain C
 and Fortran compiler combos in AIX.

---
 Makefile.power  | 23 +++++++++++++++--------
 Makefile.system |  4 ++++
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/Makefile.power b/Makefile.power
index ada51b2e1..95bada34f 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -13,9 +13,9 @@ ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
 CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 ifeq ($(F_COMPILER), IBM)
-FCOMMON_OPT += -O2 -qrecur -qnosave
+FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
 else
-FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
+FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
 endif
 endif
 endif
@@ -38,9 +38,9 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
 endif
 ifneq ($(F_COMPILER), PGI)
 ifeq ($(F_COMPILER), IBM)
-FCOMMON_OPT += -O2 -qrecur -qnosave
+FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr9 -qtune=pwr9 -qfloat=nomaf -qzerosize
 else
-FCOMMON_OPT += -O2 -frecursive -fno-fast-math
+FCOMMON_OPT += -O2 -frecursive -fno-fast-math -mcpu=power9 -mtune=power9
 endif
 
 ifeq ($(F_COMPILER), GFORTRAN)
@@ -65,15 +65,15 @@ endif
 ifneq ($(F_COMPILER), PGI)
 ifeq ($(OSNAME), AIX)
 ifeq ($(F_COMPILER), IBM)
-FCOMMON_OPT += -O2 -qrecur -qnosave
+FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize
 else
-FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8  -fno-fast-math 
+FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
 endif
 else
 ifeq ($(F_COMPILER), IBM)
-FCOMMON_OPT += -O2 -qrecur -qnosave
+FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize
 else
-FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8  -fno-fast-math 
+FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
 endif
 endif
 else
@@ -135,6 +135,13 @@ endif
 ifdef BINARY64
 
 
+ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), GCCIBMAIX)
+$(error Using GCC and XLF on AIX is not a supported combination.)
+endif
+ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), CLANGGFORTRANAIX)
+$(error Using Clang and gFortran on AIX is not a supported combination.)
+endif
+
 ifeq ($(OSNAME), AIX)
 ifeq ($(C_COMPILER), GCC)
 CCOMMON_OPT	+= -mpowerpc64 -maix64
diff --git a/Makefile.system b/Makefile.system
index 30b0ddec2..cb19dea73 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1374,6 +1374,8 @@ ifeq ($(F_COMPILER), SUN)
 FCOMMON_OPT  += -pic
 else ifeq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -PIC
+else ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -qpic=large
 else
 FCOMMON_OPT += -fPIC
 endif
@@ -1626,9 +1628,11 @@ override FPFLAGS    += $(FCOMMON_OPT) $(COMMON_PROF)
 
 ifeq ($(NEED_PIC), 1)
 ifeq (,$(findstring PIC,$(FFLAGS)))
+ifneq ($(F_COMPILER),IBM)
 override FFLAGS += -fPIC
 endif
 endif
+endif
 
 #For LAPACK Fortran codes.
 #Disable -fopenmp for LAPACK Fortran codes on Windows.

From 5e31c5708393b4e086052c0e7b8cbc57d7ebede3 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Tue, 7 Nov 2023 20:58:34 -0600
Subject: [PATCH 389/718] Only define __builtin_cpu_is and
 __builtin_cpu_supports if not present.

---
 driver/others/dynamic_power.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 10a5d64b3..570a87568 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -202,6 +202,7 @@ static int cpuid(void)
 #ifndef __BUILTIN_CPU_SUPPORTS__
 #include <string.h>
 
+#if defined(__has_builtin) && !__has_builtin(__builtin_cpu_is)
 static int __builtin_cpu_is(const char *arg)
 {
     static int ipinfo = -1;
@@ -224,12 +225,15 @@ static int __builtin_cpu_is(const char *arg)
     }
     return 0;
 }
+#endif
 
+#if defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports)
 static int __builtin_cpu_supports(const char *arg)
 {
     return 0;
 }
 #endif
+#endif
 
 static gotoblas_t *get_coretype(void) {
 

From 4eecccd49b251be2cb303b67093c4602afb39aec Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Wed, 8 Nov 2023 07:12:21 -0600
Subject: [PATCH 390/718] Fix __builtin_cpu_is for AIX.

---
 driver/others/dynamic_power.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 570a87568..f0faf2baf 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -202,7 +202,7 @@ static int cpuid(void)
 #ifndef __BUILTIN_CPU_SUPPORTS__
 #include <string.h>
 
-#if defined(__has_builtin) && !__has_builtin(__builtin_cpu_is)
+#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_is))
 static int __builtin_cpu_is(const char *arg)
 {
     static int ipinfo = -1;
@@ -227,7 +227,7 @@ static int __builtin_cpu_is(const char *arg)
 }
 #endif
 
-#if defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports)
+#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports))
 static int __builtin_cpu_supports(const char *arg)
 {
     return 0;

From 778e3b746a7217bbafa099133f956753e8355c4b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 10 Nov 2023 11:55:29 +0100
Subject: [PATCH 391/718] Enable autodetection of current AMD cpus as their
 AVX512 Intel counterparts

---
 cpuid_x86.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index fdcead8bd..6cf4d6503 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1660,7 +1660,13 @@ int get_cpuname(void){
 	  else
 	    return CPUTYPE_BARCELONA;
         }
-      case 10: // Zen3		      
+      case 10: // Zen3/4
+#ifndef NO_AVX512
+          if(support_avx512_bf16())
+            return CPUTYPE_COOPERLAKE;
+          if(support_avx512())
+            return CPUTYPE_SKYLAKEX;
+#endif
 	if(support_avx())
 #ifndef NO_AVX2
 	    return CPUTYPE_ZEN;
@@ -2438,6 +2444,12 @@ int get_coretype(void){
 	  // Ryzen 2
 	default:
 	  // Matisse,Renoir Ryzen2 models		
+#ifndef NO_AVX512
+          if(support_avx512_bf16())
+            return CORE_COOPERLAKE;
+          if(support_avx512())
+            return CORE_SKYLAKEX;
+#endif
 	  if(support_avx())
 #ifndef NO_AVX2
 	    return CORE_ZEN;

From 3ad27007fc71d5acb42c24569792146331b0cba3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 10 Nov 2023 15:19:11 +0100
Subject: [PATCH 392/718] rebase

---
 Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index 5f6643402..b344abcd2 100644
--- a/Makefile
+++ b/Makefile
@@ -35,7 +35,11 @@ export NO_LAPACK
 export C_LAPACK
 endif
 
+ifeq ($(F_COMPILER),CRAY)
+LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS))
+else
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
+endif
 
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
 

From 1a308a006664b20ae9dbce5e3e69d52e09b44829 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 10 Nov 2023 15:27:46 +0100
Subject: [PATCH 393/718] Move OpenMP dependency handling for clang/gfortran
 combo

---
 Makefile.system | 3 +++
 f_check         | 7 -------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 30b0ddec2..c6c1dc02f 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -608,6 +608,9 @@ endif
 
 ifeq ($(C_COMPILER), CLANG)
 CCOMMON_OPT    += -fopenmp
+ifeq ($(F_COMPILER), GFORTRAN)
+FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB))
+endif
 endif
 
 ifeq ($(C_COMPILER), INTEL)
diff --git a/f_check b/f_check
index 31f4376d0..dac34edee 100755
--- a/f_check
+++ b/f_check
@@ -373,13 +373,6 @@ if [ -n "$link" ]; then
     	    ;;
     	esac
 
-    	case "$flag" in *-lgomp*)
-    	    case "$CC" in *clang*)
-    	        flag="-lomp"
-    	        ;;
-    	    esac
-    	esac
-
         case "$flag" in -l*)
             case "$flag" in
                 *ibrary*|*gfortranbegin*|*flangmain*|*frtbegin*|*pathfstart*|\

From 1d4ed20c2f47994cba88432de2c5a5ea3c1db58e Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Fri, 10 Nov 2023 15:58:30 +0100
Subject: [PATCH 394/718] Add conditions to all CI jobs to only run on main
 repo by default

This is a follow-up to gh-4271. At the moment, when a contributor
pushes the latest `develop` to their own branch to bring their own
fork in sync with `main`, or if they push another branch, this triggers
30 CI jobs to run. Most will complete silently and only burn CPU
time unnecessarily. If there's a failure, this may result in unexpected
failure notifications. And the AWS Graviton3 run won't complete at all
and time out, since the Cirun hook will only work when triggered from
the main repo.
---
 .github/workflows/arm64_graviton.yml | 1 +
 .github/workflows/c910v.yml          | 1 +
 .github/workflows/dynamic_arch.yml   | 3 +++
 .github/workflows/loongarch64.yml    | 1 +
 .github/workflows/mips64.yml         | 1 +
 5 files changed, 7 insertions(+)

diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml
index bcb05047c..9dd0ae589 100644
--- a/.github/workflows/arm64_graviton.yml
+++ b/.github/workflows/arm64_graviton.yml
@@ -7,6 +7,7 @@ permissions:
 
 jobs:
   build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}"
 
     strategy:
diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
index 199304fb1..e6ed08f2c 100644
--- a/.github/workflows/c910v.yml
+++ b/.github/workflows/c910v.yml
@@ -7,6 +7,7 @@ permissions:
 
 jobs:
   TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ubuntu-latest
     env:
       xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index 0c39bfddf..02429e317 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -7,6 +7,7 @@ permissions:
 
 jobs:
   build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ${{ matrix.os }}
 
     strategy:
@@ -146,6 +147,7 @@ jobs:
 
 
   msys2:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: windows-latest
 
     strategy:
@@ -312,6 +314,7 @@ jobs:
 
 
   cross_build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ubuntu-22.04
 
     strategy:
diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
index e0236ca86..fa62d0b41 100644
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@@ -4,6 +4,7 @@ on: [push, pull_request]
 
 jobs:
   TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml
index de7c0c0f3..7f09d4fca 100644
--- a/.github/workflows/mips64.yml
+++ b/.github/workflows/mips64.yml
@@ -7,6 +7,7 @@ permissions:
 
 jobs:
   TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false

From 2418a20f1fb89783dc1e198fd0e2bd78ecf43e77 Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Fri, 10 Nov 2023 16:05:52 +0100
Subject: [PATCH 395/718] Cancel running CI jobs when new changes are pushed to
 a PR

The `group` expression ensures that the cancel-in-progress
behavior is to only cancel if a new commit is pushed to the PR for
which the job is running, not other PRs.

This is a fairly standard snippet, used also in CI jobs for NumPy
and other projects.
---
 .github/workflows/arm64_graviton.yml         | 4 ++++
 .github/workflows/c910v.yml                  | 4 ++++
 .github/workflows/dynamic_arch.yml           | 4 ++++
 .github/workflows/loongarch64.yml            | 4 ++++
 .github/workflows/mips64.yml                 | 4 ++++
 .github/workflows/nightly-Homebrew-build.yml | 4 ++++
 6 files changed, 24 insertions(+)

diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml
index 9dd0ae589..4382510df 100644
--- a/.github/workflows/arm64_graviton.yml
+++ b/.github/workflows/arm64_graviton.yml
@@ -2,6 +2,10 @@ name: arm64 graviton cirun
 
 on: [push, pull_request]
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 permissions:
   contents: read # to fetch code (actions/checkout)
 
diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
index e6ed08f2c..30cf32b34 100644
--- a/.github/workflows/c910v.yml
+++ b/.github/workflows/c910v.yml
@@ -2,6 +2,10 @@ name: c910v qemu test
 
 on: [push, pull_request]
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 permissions:
   contents: read # to fetch code (actions/checkout)
 
diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index 02429e317..49721958a 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -2,6 +2,10 @@ name: continuous build
 
 on: [push, pull_request]
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 permissions:
   contents: read # to fetch code (actions/checkout)
 
diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
index fa62d0b41..4a9bf98b6 100644
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@@ -2,6 +2,10 @@ name: loongarch64 qemu test
 
 on: [push, pull_request]
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
   TEST:
     if: "github.repository == 'OpenMathLib/OpenBLAS'"
diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml
index 7f09d4fca..4686ba713 100644
--- a/.github/workflows/mips64.yml
+++ b/.github/workflows/mips64.yml
@@ -2,6 +2,10 @@ name: mips64 qemu test
 
 on: [push, pull_request]
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 permissions:
   contents: read # to fetch code (actions/checkout)
 
diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml
index eb315f1d4..ca57fba70 100644
--- a/.github/workflows/nightly-Homebrew-build.yml
+++ b/.github/workflows/nightly-Homebrew-build.yml
@@ -18,6 +18,10 @@ on:
 
 name: Nightly-Homebrew-Build
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 permissions:
   contents: read # to fetch code (actions/checkout)
 

From 8613632dc53eebc505b56c38af5d46123e2704b3 Mon Sep 17 00:00:00 2001
From: Ralf Gommers <ralf.gommers@gmail.com>
Date: Fri, 10 Nov 2023 16:23:44 +0100
Subject: [PATCH 396/718] Trigger AWS Graviton 3 CI job only for develop and
 release branches

---
 .github/workflows/arm64_graviton.yml | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml
index 4382510df..6928312b5 100644
--- a/.github/workflows/arm64_graviton.yml
+++ b/.github/workflows/arm64_graviton.yml
@@ -1,6 +1,14 @@
 name: arm64 graviton cirun
 
-on: [push, pull_request]
+on:
+  push:
+    branches:
+      - develop
+      - release-**
+  pull_request:
+    branches:
+      - develop
+      - release-**
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

From 5ffbe646e12fe76682410674f5de4146c87aa973 Mon Sep 17 00:00:00 2001
From: Angelika Schwarz <17718454+angsch@users.noreply.github.com>
Date: Sat, 11 Nov 2023 09:19:48 +0100
Subject: [PATCH 397/718] Improve matcopy interface

* rows = 0 or cols = 0 is now a legal input and
  takes quick return path
* Follow BLAS/LAPACK convention that the leading
  dimensions must be at least 1.
---
 interface/imatcopy.c  | 22 ++++++++++++----------
 interface/omatcopy.c  | 22 ++++++++++++----------
 interface/zimatcopy.c | 30 ++++++++++++++++--------------
 interface/zomatcopy.c | 30 ++++++++++++++++--------------
 4 files changed, 56 insertions(+), 48 deletions(-)

diff --git a/interface/imatcopy.c b/interface/imatcopy.c
index 4cf0966cc..6a1ad282c 100644
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@@ -100,27 +100,29 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *rows ) info = 8;
-        	if ( trans == BlasTrans    &&  *ldb < *cols ) info = 8;
+        	if ( trans == BlasNoTrans  &&  *ldb < MAX(1,*rows) ) info = 8;
+        	if ( trans == BlasTrans    &&  *ldb < MAX(1,*cols) ) info = 8;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *cols ) info = 8;
-        	if ( trans == BlasTrans    &&  *ldb < *rows ) info = 8;
+        	if ( trans == BlasNoTrans  &&  *ldb < MAX(1,*cols) ) info = 8;
+        	if ( trans == BlasTrans    &&  *ldb < MAX(1,*rows) ) info = 8;
 	}
 
-	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
-	if ( order == BlasRowMajor &&  *lda < *cols ) info = 7;
-	if ( *cols <= 0 ) info = 4;
-	if ( *rows <= 0 ) info = 3;
-	if ( trans < 0  ) info = 2;
-	if ( order < 0  ) info = 1;
+	if ( order == BlasColMajor &&  *lda < MAX(1,*rows) ) info = 7;
+	if ( order == BlasRowMajor &&  *lda < MAX(1,*cols) ) info = 7;
+	if ( *cols < 0 ) info = 4;
+	if ( *rows < 0 ) info = 3;
+	if ( trans < 0 ) info = 2;
+	if ( order < 0 ) info = 1;
 
 	if (info >= 0) {
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
 
+	if ((*rows == 0) || (*cols == 0)) return;
+
 #ifdef NEW_IMATCOPY
     if ( *lda == *ldb ) {
         if ( order == BlasColMajor )
diff --git a/interface/omatcopy.c b/interface/omatcopy.c
index 59650cfa0..c26446f5c 100644
--- a/interface/omatcopy.c
+++ b/interface/omatcopy.c
@@ -90,27 +90,29 @@ void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 #endif
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTrans    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTrans    &&  *ldb < MAX(1,*cols) ) info = 9;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTrans    &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTrans    &&  *ldb < MAX(1,*rows) ) info = 9;
 	}
 
-	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
-	if ( order == BlasRowMajor &&  *lda < *cols ) info = 7;
-	if ( *cols <= 0 ) info = 4;
-	if ( *rows <= 0 ) info = 3;
-	if ( trans < 0  ) info = 2;
-	if ( order < 0  ) info = 1;
+	if ( order == BlasColMajor &&  *lda < MAX(1,*rows) ) info = 7;
+	if ( order == BlasRowMajor &&  *lda < MAX(1,*cols) ) info = 7;
+	if ( *cols < 0 ) info = 4;
+	if ( *rows < 0 ) info = 3;
+	if ( trans < 0 ) info = 2;
+	if ( order < 0 ) info = 1;
 
 	if (info >= 0) {
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
 
+	if ((*rows == 0) || (*cols == 0)) return;
+
 	if ( order == BlasColMajor )
 	{
 		if ( trans == BlasNoTrans )
diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c
index b0b32dc87..b66489eb7 100644
--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@@ -101,31 +101,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans      &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasConj         &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTrans        &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTransConj    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans      &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasConj         &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTrans        &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTransConj    &&  *ldb < MAX(1,*cols) ) info = 9;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans    &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasConj       &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTrans      &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTransConj  &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans    &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasConj       &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTrans      &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTransConj  &&  *ldb < MAX(1,*rows) ) info = 9;
 	}
 
-	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
-	if ( order == BlasRowMajor &&  *lda < *cols ) info = 7;
-	if ( *cols <= 0 ) info = 4;
-	if ( *rows <= 0 ) info = 3;
-	if ( trans < 0  ) info = 2;
-	if ( order < 0  ) info = 1;
+	if ( order == BlasColMajor &&  *lda < MAX(1,*rows) ) info = 7;
+	if ( order == BlasRowMajor &&  *lda < MAX(1,*cols) ) info = 7;
+	if ( *cols < 0 ) info = 4;
+	if ( *rows < 0 ) info = 3;
+	if ( trans < 0 ) info = 2;
+	if ( order < 0 ) info = 1;
 
 	if (info >= 0) {
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
 
+	if ((*rows == 0) || (*cols == 0)) return;
+
 #ifdef NEW_IMATCOPY
     if (*lda == *ldb ) {
         if ( order == BlasColMajor )
diff --git a/interface/zomatcopy.c b/interface/zomatcopy.c
index 7345633a2..7121711d8 100644
--- a/interface/zomatcopy.c
+++ b/interface/zomatcopy.c
@@ -92,31 +92,33 @@ void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 #endif
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans      &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasConj         &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTrans        &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTransConj    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans      &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasConj         &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTrans        &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTransConj    &&  *ldb < MAX(1,*cols) ) info = 9;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans    &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasConj       &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTrans      &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTransConj  &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans    &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasConj       &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTrans      &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTransConj  &&  *ldb < MAX(1,*rows) ) info = 9;
 	}
 
-	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
-	if ( order == BlasRowMajor &&  *lda < *cols ) info = 7;
-	if ( *cols <= 0 ) info = 4;
-	if ( *rows <= 0 ) info = 3;
-	if ( trans < 0  ) info = 2;
-	if ( order < 0  ) info = 1;
+	if ( order == BlasColMajor &&  *lda < MAX(1,*rows) ) info = 7;
+	if ( order == BlasRowMajor &&  *lda < MAX(1,*cols) ) info = 7;
+	if ( *cols < 0 ) info = 4;
+	if ( *rows < 0 ) info = 3;
+	if ( trans < 0 ) info = 2;
+	if ( order < 0 ) info = 1;
 
 	if (info >= 0) {
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
 
+	if ((*rows == 0) || (*cols == 0)) return;
+
 	if ( order == BlasColMajor )
 	{
 

From ff6437f2d7954530ca8cd74fb4ea98631ff83398 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 11 Nov 2023 21:30:32 +0100
Subject: [PATCH 398/718] Add workaround for omp_get_max_threads hanging on
 FreeBSD with libomp from LLVM14

---
 driver/others/blas_server_omp.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 3e2179373..fcc0d68ad 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -124,9 +124,18 @@ void openblas_set_num_threads(int num_threads) {
 }
 
 int blas_thread_init(void){
-  if(blas_omp_number_max <= 0)
-    blas_omp_number_max = omp_get_max_threads();
-	
+
+#if defined(__FreeBSD__) && defined(__clang__)
+extern int openblas_omp_num_threads_env();
+
+   if(blas_omp_number_max <= 0)
+	   blas_omp_number_max= openblas_omp_num_threads_env();
+   if (blas_omp_number_max <= 0) 
+	   blas_omp_number_max=MAX_CPU_NUMBER;
+#else
+    blas_omp_number_max = /omp_get_max_threads();
+#endif
+
   blas_get_cpu_number();
 
   adjust_thread_buffers();

From 9324520d0ebd8f89507f59f1fe7b5d8b0f758915 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 11 Nov 2023 23:14:58 +0100
Subject: [PATCH 399/718] typo fix

---
 driver/others/blas_server_omp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index fcc0d68ad..f7008fb08 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -133,7 +133,7 @@ extern int openblas_omp_num_threads_env();
    if (blas_omp_number_max <= 0) 
 	   blas_omp_number_max=MAX_CPU_NUMBER;
 #else
-    blas_omp_number_max = /omp_get_max_threads();
+    blas_omp_number_max = omp_get_max_threads();
 #endif
 
   blas_get_cpu_number();

From b6144f70ff0d6f7967c8119b068d460d6c5aaf95 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 11 Nov 2023 23:41:18 +0100
Subject: [PATCH 400/718] Change ?GECON to return INFO=1 if RCOND is NaN
 (Reference-LAPACK PR 926)

---
 lapack-netlib/SRC/cgecon.f | 39 +++++++++++++++++++++++++++++++-------
 lapack-netlib/SRC/dgecon.f | 39 +++++++++++++++++++++++++++++++-------
 lapack-netlib/SRC/sgecon.f | 39 +++++++++++++++++++++++++++++++-------
 lapack-netlib/SRC/zgecon.f | 39 +++++++++++++++++++++++++++++++-------
 4 files changed, 128 insertions(+), 28 deletions(-)

diff --git a/lapack-netlib/SRC/cgecon.f b/lapack-netlib/SRC/cgecon.f
index 6f426c2ab..e018b18bb 100644
--- a/lapack-netlib/SRC/cgecon.f
+++ b/lapack-netlib/SRC/cgecon.f
@@ -105,8 +105,15 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
-*>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          =-5:  if ANORM is NAN or negative.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>                NaNs are illegal values for ANORM, and they propagate to
+*>                the output parameter RCOND.
+*>                Infinity is illegal for ANORM, and it propagates to the output
+*>                parameter RCOND as 0.
+*>          = 1:  if RCOND = NaN, or
+*>                   RCOND = Inf, or
+*>                   the computed norm of the inverse of A is 0.
+*>                In the latter, RCOND = 0 is returned.
 *> \endverbatim
 *
 *  Authors:
@@ -117,7 +124,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gecon
 *
 *  =====================================================================
       SUBROUTINE CGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK,
@@ -147,7 +154,7 @@
       LOGICAL            ONENRM
       CHARACTER          NORMIN
       INTEGER            IX, KASE, KASE1
-      REAL               AINVNM, SCALE, SL, SMLNUM, SU
+      REAL               AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL
       COMPLEX            ZDUM
 *     ..
 *     .. Local Arrays ..
@@ -172,6 +179,8 @@
       CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) )
 *     ..
 *     .. Executable Statements ..
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
 *
 *     Test the input parameters.
 *
@@ -183,7 +192,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN
+      ELSE IF( ANORM.LT.ZERO ) THEN
          INFO = -5
       END IF
       IF( INFO.NE.0 ) THEN
@@ -199,6 +208,13 @@
          RETURN
       ELSE IF( ANORM.EQ.ZERO ) THEN
          RETURN
+      ELSE IF( SISNAN( ANORM ) ) THEN
+         RCOND = ANORM
+         INFO = -5
+         RETURN
+      ELSE IF( ANORM.GT.HUGEVAL ) THEN
+         INFO = -5
+         RETURN
       END IF
 *
       SMLNUM = SLAMCH( 'Safe minimum' )
@@ -256,8 +272,17 @@
 *
 *     Compute the estimate of the reciprocal condition number.
 *
-      IF( AINVNM.NE.ZERO )
-     $   RCOND = ( ONE / AINVNM ) / ANORM
+      IF( AINVNM.NE.ZERO ) THEN
+         RCOND = ( ONE / AINVNM ) / ANORM
+      ELSE
+         INFO = 1
+         RETURN
+      END IF
+*
+*     Check for NaNs and Infs
+*
+      IF( SISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL )
+     $   INFO = 1
 *
    20 CONTINUE
       RETURN
diff --git a/lapack-netlib/SRC/dgecon.f b/lapack-netlib/SRC/dgecon.f
index 1ad302ae3..a543eb03a 100644
--- a/lapack-netlib/SRC/dgecon.f
+++ b/lapack-netlib/SRC/dgecon.f
@@ -105,8 +105,15 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
-*>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          =-5:  if ANORM is NAN or negative.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>                NaNs are illegal values for ANORM, and they propagate to
+*>                the output parameter RCOND.
+*>                Infinity is illegal for ANORM, and it propagates to the output
+*>                parameter RCOND as 0.
+*>          = 1:  if RCOND = NaN, or
+*>                   RCOND = Inf, or
+*>                   the computed norm of the inverse of A is 0.
+*>                In the latter, RCOND = 0 is returned.
 *> \endverbatim
 *
 *  Authors:
@@ -117,7 +124,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gecon
 *
 *  =====================================================================
       SUBROUTINE DGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK,
@@ -147,7 +154,7 @@
       LOGICAL            ONENRM
       CHARACTER          NORMIN
       INTEGER            IX, KASE, KASE1
-      DOUBLE PRECISION   AINVNM, SCALE, SL, SMLNUM, SU
+      DOUBLE PRECISION   AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL
 *     ..
 *     .. Local Arrays ..
       INTEGER            ISAVE( 3 )
@@ -165,6 +172,8 @@
       INTRINSIC          ABS, MAX
 *     ..
 *     .. Executable Statements ..
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
 *
 *     Test the input parameters.
 *
@@ -176,7 +185,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( ANORM.LT.ZERO .OR. DISNAN( ANORM ) ) THEN
+      ELSE IF( ANORM.LT.ZERO ) THEN
          INFO = -5
       END IF
       IF( INFO.NE.0 ) THEN
@@ -192,6 +201,13 @@
          RETURN
       ELSE IF( ANORM.EQ.ZERO ) THEN
          RETURN
+      ELSE IF( DISNAN( ANORM ) ) THEN
+         RCOND = ANORM
+         INFO = -5
+         RETURN
+      ELSE IF( ANORM.GT.HUGEVAL ) THEN
+         INFO = -5
+         RETURN
       END IF
 *
       SMLNUM = DLAMCH( 'Safe minimum' )
@@ -248,8 +264,17 @@
 *
 *     Compute the estimate of the reciprocal condition number.
 *
-      IF( AINVNM.NE.ZERO )
-     $   RCOND = ( ONE / AINVNM ) / ANORM
+      IF( AINVNM.NE.ZERO ) THEN
+         RCOND = ( ONE / AINVNM ) / ANORM
+      ELSE
+         INFO = 1
+         RETURN
+      END IF
+*
+*     Check for NaNs and Infs
+*
+      IF( DISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL )
+     $   INFO = 1
 *
    20 CONTINUE
       RETURN
diff --git a/lapack-netlib/SRC/sgecon.f b/lapack-netlib/SRC/sgecon.f
index 86aeea73b..82f463ebb 100644
--- a/lapack-netlib/SRC/sgecon.f
+++ b/lapack-netlib/SRC/sgecon.f
@@ -105,8 +105,15 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
-*>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          =-5:  if ANORM is NAN or negative.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>                NaNs are illegal values for ANORM, and they propagate to
+*>                the output parameter RCOND.
+*>                Infinity is illegal for ANORM, and it propagates to the output
+*>                parameter RCOND as 0.
+*>          = 1:  if RCOND = NaN, or
+*>                   RCOND = Inf, or
+*>                   the computed norm of the inverse of A is 0.
+*>                In the latter, RCOND = 0 is returned.
 *> \endverbatim
 *
 *  Authors:
@@ -117,7 +124,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gecon
 *
 *  =====================================================================
       SUBROUTINE SGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK,
@@ -147,7 +154,7 @@
       LOGICAL            ONENRM
       CHARACTER          NORMIN
       INTEGER            IX, KASE, KASE1
-      REAL               AINVNM, SCALE, SL, SMLNUM, SU
+      REAL               AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL
 *     ..
 *     .. Local Arrays ..
       INTEGER            ISAVE( 3 )
@@ -165,6 +172,8 @@
       INTRINSIC          ABS, MAX
 *     ..
 *     .. Executable Statements ..
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
 *
 *     Test the input parameters.
 *
@@ -176,7 +185,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN
+      ELSE IF( ANORM.LT.ZERO ) THEN
          INFO = -5
       END IF
       IF( INFO.NE.0 ) THEN
@@ -192,6 +201,13 @@
          RETURN
       ELSE IF( ANORM.EQ.ZERO ) THEN
          RETURN
+      ELSE IF( SISNAN( ANORM ) ) THEN
+         RCOND = ANORM
+         INFO = -5
+         RETURN
+      ELSE IF( ANORM.GT.HUGEVAL ) THEN
+         INFO = -5
+         RETURN
       END IF
 *
       SMLNUM = SLAMCH( 'Safe minimum' )
@@ -248,8 +264,17 @@
 *
 *     Compute the estimate of the reciprocal condition number.
 *
-      IF( AINVNM.NE.ZERO )
-     $   RCOND = ( ONE / AINVNM ) / ANORM
+      IF( AINVNM.NE.ZERO ) THEN
+         RCOND = ( ONE / AINVNM ) / ANORM
+      ELSE
+         INFO = 1
+         RETURN
+      END IF
+*
+*     Check for NaNs and Infs
+*
+      IF( SISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL )
+     $   INFO = 1
 *
    20 CONTINUE
       RETURN
diff --git a/lapack-netlib/SRC/zgecon.f b/lapack-netlib/SRC/zgecon.f
index 9cbfe35bc..ef567d7c2 100644
--- a/lapack-netlib/SRC/zgecon.f
+++ b/lapack-netlib/SRC/zgecon.f
@@ -105,8 +105,15 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
-*>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          =-5:  if ANORM is NAN or negative.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>                NaNs are illegal values for ANORM, and they propagate to
+*>                the output parameter RCOND.
+*>                Infinity is illegal for ANORM, and it propagates to the output
+*>                parameter RCOND as 0.
+*>          = 1:  if RCOND = NaN, or
+*>                   RCOND = Inf, or
+*>                   the computed norm of the inverse of A is 0.
+*>                In the latter, RCOND = 0 is returned.
 *> \endverbatim
 *
 *  Authors:
@@ -117,7 +124,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gecon
 *
 *  =====================================================================
       SUBROUTINE ZGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK,
@@ -147,7 +154,7 @@
       LOGICAL            ONENRM
       CHARACTER          NORMIN
       INTEGER            IX, KASE, KASE1
-      DOUBLE PRECISION   AINVNM, SCALE, SL, SMLNUM, SU
+      DOUBLE PRECISION   AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL
       COMPLEX*16         ZDUM
 *     ..
 *     .. Local Arrays ..
@@ -172,6 +179,8 @@
       CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) )
 *     ..
 *     .. Executable Statements ..
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
 *
 *     Test the input parameters.
 *
@@ -183,7 +192,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( ANORM.LT.ZERO .OR. DISNAN( ANORM ) ) THEN
+      ELSE IF( ANORM.LT.ZERO ) THEN
          INFO = -5
       END IF
       IF( INFO.NE.0 ) THEN
@@ -199,6 +208,13 @@
          RETURN
       ELSE IF( ANORM.EQ.ZERO ) THEN
          RETURN
+      ELSE IF( DISNAN( ANORM ) ) THEN
+         RCOND = ANORM
+         INFO = -5
+         RETURN
+      ELSE IF( ANORM.GT.HUGEVAL ) THEN
+         INFO = -5
+         RETURN
       END IF
 *
       SMLNUM = DLAMCH( 'Safe minimum' )
@@ -256,8 +272,17 @@
 *
 *     Compute the estimate of the reciprocal condition number.
 *
-      IF( AINVNM.NE.ZERO )
-     $   RCOND = ( ONE / AINVNM ) / ANORM
+      IF( AINVNM.NE.ZERO ) THEN
+         RCOND = ( ONE / AINVNM ) / ANORM
+      ELSE
+         INFO = 1
+         RETURN
+      END IF
+*
+*     Check for NaNs and Infs
+*
+      IF( DISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL )
+     $   INFO = 1
 *
    20 CONTINUE
       RETURN

From 58427ff74d25952c82d1d1d8edea1afaa94f9fcd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 10:54:39 +0100
Subject: [PATCH 401/718] Deprecate ?GELQS and ?GEQRS from TESTING/LIN
 (Reference-LAPACK PR 900) (#4307)

* Move ?GELQS and ?GEQRS from TESTING/LIN to DEPRECATED (Reference-LAPACK PR 900)

* Add f2c-converted versions of ?GELQS and ?GEQRS
---
 cmake/lapack.cmake                            |   8 +
 lapack-netlib/SRC/DEPRECATED/cgelqs.c         | 479 +++++++++++++++++
 .../{TESTING/LIN => SRC/DEPRECATED}/cgelqs.f  |   0
 lapack-netlib/SRC/DEPRECATED/cgeqrs.c         | 471 +++++++++++++++++
 .../{TESTING/LIN => SRC/DEPRECATED}/cgeqrs.f  |   0
 lapack-netlib/SRC/DEPRECATED/dgelqs.c         | 480 +++++++++++++++++
 .../{TESTING/LIN => SRC/DEPRECATED}/dgelqs.f  |   0
 lapack-netlib/SRC/DEPRECATED/dgeqrs.c         | 471 +++++++++++++++++
 .../{TESTING/LIN => SRC/DEPRECATED}/dgeqrs.f  |   0
 lapack-netlib/SRC/DEPRECATED/sgelqs.c         | 472 +++++++++++++++++
 .../{TESTING/LIN => SRC/DEPRECATED}/sgelqs.f  |   0
 lapack-netlib/SRC/DEPRECATED/sgeqrs.c         | 470 +++++++++++++++++
 .../{TESTING/LIN => SRC/DEPRECATED}/sgeqrs.f  |   0
 lapack-netlib/SRC/DEPRECATED/zgelqs.c         | 481 ++++++++++++++++++
 .../{TESTING/LIN => SRC/DEPRECATED}/zgelqs.f  |   0
 lapack-netlib/SRC/DEPRECATED/zgeqrs.c         | 472 +++++++++++++++++
 .../{TESTING/LIN => SRC/DEPRECATED}/zgeqrs.f  |   0
 lapack-netlib/SRC/Makefile                    |  12 +-
 lapack-netlib/TESTING/LIN/CMakeLists.txt      |   8 +-
 lapack-netlib/TESTING/LIN/Makefile            |   8 +-
 lapack-netlib/TESTING/LIN/cchklq.f            |  20 +-
 lapack-netlib/TESTING/LIN/cchkqr.f            |  22 +-
 lapack-netlib/TESTING/LIN/cerrlq.f            |  27 +-
 lapack-netlib/TESTING/LIN/cerrqr.f            |  27 +-
 lapack-netlib/TESTING/LIN/dchklq.f            |  20 +-
 lapack-netlib/TESTING/LIN/dchkqr.f            |  22 +-
 lapack-netlib/TESTING/LIN/derrlq.f            |  27 +-
 lapack-netlib/TESTING/LIN/derrqr.f            |  27 +-
 lapack-netlib/TESTING/LIN/schklq.f            |  20 +-
 lapack-netlib/TESTING/LIN/schkqr.f            |  20 +-
 lapack-netlib/TESTING/LIN/serrlq.f            |  27 +-
 lapack-netlib/TESTING/LIN/serrqr.f            |  27 +-
 lapack-netlib/TESTING/LIN/zchklq.f            |  20 +-
 lapack-netlib/TESTING/LIN/zchkqr.f            |  20 +-
 lapack-netlib/TESTING/LIN/zerrlq.f            |  27 +-
 lapack-netlib/TESTING/LIN/zerrqr.f            |  27 +-
 36 files changed, 3934 insertions(+), 278 deletions(-)
 create mode 100644 lapack-netlib/SRC/DEPRECATED/cgelqs.c
 rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/cgelqs.f (100%)
 create mode 100644 lapack-netlib/SRC/DEPRECATED/cgeqrs.c
 rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/cgeqrs.f (100%)
 create mode 100644 lapack-netlib/SRC/DEPRECATED/dgelqs.c
 rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/dgelqs.f (100%)
 create mode 100644 lapack-netlib/SRC/DEPRECATED/dgeqrs.c
 rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/dgeqrs.f (100%)
 create mode 100644 lapack-netlib/SRC/DEPRECATED/sgelqs.c
 rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/sgelqs.f (100%)
 create mode 100644 lapack-netlib/SRC/DEPRECATED/sgeqrs.c
 rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/sgeqrs.f (100%)
 create mode 100644 lapack-netlib/SRC/DEPRECATED/zgelqs.c
 rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/zgelqs.f (100%)
 create mode 100644 lapack-netlib/SRC/DEPRECATED/zgeqrs.c
 rename lapack-netlib/{TESTING/LIN => SRC/DEPRECATED}/zgeqrs.f (100%)

diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 5c6290484..22476f561 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -438,15 +438,19 @@ endif()
 
 if(BUILD_LAPACK_DEPRECATED)
 list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
+  DEPRECATED/sgelqs.f DEPRECATED/sgeqrs.f
   DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
   DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
 list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f
+  DEPRECATED/dgelqs.f DEPRECATED/dgeqrs.f
   DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f
   DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f)
 list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f
+  DEPRECATED/cgelqs.f DEPRECATED/cgeqrs.f
   DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f
   DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f)
 list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f
+  DEPRECATED/zgelqs.f DEPRECATED/zgeqrs.f
   DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
   DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
 message(STATUS "Building deprecated routines")
@@ -935,15 +939,19 @@ endif()
 
 if(BUILD_LAPACK_DEPRECATED)
 list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
+  DEPRECATED/sgelqs.c DEPRECATED/sgeqrs.c
   DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
   DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
 list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c
+  DEPRECATED/dgelqs.c DEPRECATED/dgeqrs.c
   DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c
   DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c)
 list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c
+  DEPRECATED/cgelqs.c DEPRECATED/cgeqrs.c
   DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c
   DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c)
 list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
+  DEPRECATED/zgelqs.c DEPRECATED/zgeqrs.c
   DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
   DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
 message(STATUS "Building deprecated routines")
diff --git a/lapack-netlib/SRC/DEPRECATED/cgelqs.c b/lapack-netlib/SRC/DEPRECATED/cgelqs.c
new file mode 100644
index 000000000..ee6d56119
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/cgelqs.c
@@ -0,0 +1,479 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static complex c_b1 = {0.f,0.f};
+static complex c_b2 = {1.f,0.f};
+
+/* > \brief \b CGELQS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE CGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       COMPLEX            A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Compute a minimum-norm solution */
+/* >     f2cmin || A*X - B || */
+/* > using the LQ factorization */
+/* >     A = L*Q */
+/* > computed by CGELQF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  N >= M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX array, dimension (LDA,N) */
+/* >          Details of the LQ factorization of the original matrix A as */
+/* >          returned by CGELQF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is COMPLEX array, dimension (M) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is COMPLEX array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= N. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is COMPLEX array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int cgelqs_(integer *m, integer *n, integer *nrhs, complex *
+	a, integer *lda, complex *tau, complex *b, integer *ldb, complex *
+	work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *), claset_(char *, 
+	    integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *), cunmlq_(char *, char 
+	    *, integer *, integer *, integer *, complex *, integer *, complex 
+	    *, complex *, integer *, complex *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *m > *n) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*n)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGELQS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Solve L*X = B(1:m,:) */
+
+    ctrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b2, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+/*     Set B(m+1:n,:) to zero */
+
+    if (*m < *n) {
+	i__1 = *n - *m;
+	claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     B := Q' * B */
+
+    cunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, &
+	    tau[1], &b[b_offset], ldb, &work[1], lwork, info);
+
+    return 0;
+
+/*     End of CGELQS */
+
+} /* cgelqs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/cgelqs.f b/lapack-netlib/SRC/DEPRECATED/cgelqs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/cgelqs.f
rename to lapack-netlib/SRC/DEPRECATED/cgelqs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/cgeqrs.c b/lapack-netlib/SRC/DEPRECATED/cgeqrs.c
new file mode 100644
index 000000000..c71b8af67
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/cgeqrs.c
@@ -0,0 +1,471 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+
+/* > \brief \b CGEQRS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE CGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       COMPLEX            A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Solve the least squares problem */
+/* >     f2cmin || A*X - B || */
+/* > using the QR factorization */
+/* >     A = Q*R */
+/* > computed by CGEQRF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  M >= N >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX array, dimension (LDA,N) */
+/* >          Details of the QR factorization of the original matrix A as */
+/* >          returned by CGEQRF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is COMPLEX array, dimension (N) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is COMPLEX array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is COMPLEX array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int cgeqrs_(integer *m, integer *n, integer *nrhs, complex *
+	a, integer *lda, complex *tau, complex *b, integer *ldb, complex *
+	work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *), xerbla_(char *, 
+	    integer *), cunmqr_(char *, char *, integer *, integer *, 
+	    integer *, complex *, integer *, complex *, complex *, integer *, 
+	    complex *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *n > *m) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*m)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGEQRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     B := Q' * B */
+
+    cunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, &
+	    tau[1], &b[b_offset], ldb, &work[1], lwork, info);
+
+/*     Solve R*X = B(1:n,:) */
+
+    ctrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+    return 0;
+
+/*     End of CGEQRS */
+
+} /* cgeqrs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/cgeqrs.f b/lapack-netlib/SRC/DEPRECATED/cgeqrs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/cgeqrs.f
rename to lapack-netlib/SRC/DEPRECATED/cgeqrs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/dgelqs.c b/lapack-netlib/SRC/DEPRECATED/dgelqs.c
new file mode 100644
index 000000000..e3cf1e029
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/dgelqs.c
@@ -0,0 +1,480 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublereal c_b7 = 1.;
+static doublereal c_b9 = 0.;
+
+/* > \brief \b DGELQS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE DGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Compute a minimum-norm solution */
+/* >     f2cmin || A*X - B || */
+/* > using the LQ factorization */
+/* >     A = L*Q */
+/* > computed by DGELQF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  N >= M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is DOUBLE PRECISION array, dimension (LDA,N) */
+/* >          Details of the LQ factorization of the original matrix A as */
+/* >          returned by DGELQF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is DOUBLE PRECISION array, dimension (M) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is DOUBLE PRECISION array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= N. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is DOUBLE PRECISION array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup double_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int dgelqs_(integer *m, integer *n, integer *nrhs, 
+	doublereal *a, integer *lda, doublereal *tau, doublereal *b, integer *
+	ldb, doublereal *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *), dlaset_(
+	    char *, integer *, integer *, doublereal *, doublereal *, 
+	    doublereal *, integer *), xerbla_(char *, integer *), dormlq_(char *, char *, integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *m > *n) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*n)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGELQS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Solve L*X = B(1:m,:) */
+
+    dtrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b7, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+/*     Set B(m+1:n,:) to zero */
+
+    if (*m < *n) {
+	i__1 = *n - *m;
+	dlaset_("Full", &i__1, nrhs, &c_b9, &c_b9, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     B := Q' * B */
+
+    dormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &tau[1], &b[
+	    b_offset], ldb, &work[1], lwork, info);
+
+    return 0;
+
+/*     End of DGELQS */
+
+} /* dgelqs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/dgelqs.f b/lapack-netlib/SRC/DEPRECATED/dgelqs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/dgelqs.f
rename to lapack-netlib/SRC/DEPRECATED/dgelqs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/dgeqrs.c b/lapack-netlib/SRC/DEPRECATED/dgeqrs.c
new file mode 100644
index 000000000..70236738a
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/dgeqrs.c
@@ -0,0 +1,471 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublereal c_b9 = 1.;
+
+/* > \brief \b DGEQRS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE DGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Solve the least squares problem */
+/* >     f2cmin || A*X - B || */
+/* > using the QR factorization */
+/* >     A = Q*R */
+/* > computed by DGEQRF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  M >= N >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is DOUBLE PRECISION array, dimension (LDA,N) */
+/* >          Details of the QR factorization of the original matrix A as */
+/* >          returned by DGEQRF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is DOUBLE PRECISION array, dimension (N) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is DOUBLE PRECISION array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is DOUBLE PRECISION array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup double_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int dgeqrs_(integer *m, integer *n, integer *nrhs, 
+	doublereal *a, integer *lda, doublereal *tau, doublereal *b, integer *
+	ldb, doublereal *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *), xerbla_(
+	    char *, integer *), dormqr_(char *, char *, integer *, 
+	    integer *, integer *, doublereal *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *n > *m) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*m)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEQRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     B := Q' * B */
+
+    dormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &tau[1], &b[
+	    b_offset], ldb, &work[1], lwork, info);
+
+/*     Solve R*X = B(1:n,:) */
+
+    dtrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b9, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+    return 0;
+
+/*     End of DGEQRS */
+
+} /* dgeqrs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/dgeqrs.f b/lapack-netlib/SRC/DEPRECATED/dgeqrs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/dgeqrs.f
rename to lapack-netlib/SRC/DEPRECATED/dgeqrs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/sgelqs.c b/lapack-netlib/SRC/DEPRECATED/sgelqs.c
new file mode 100644
index 000000000..03034b0dc
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/sgelqs.c
@@ -0,0 +1,472 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/* Table of constant values */
+
+static real c_b7 = 1.f;
+static real c_b9 = 0.f;
+
+/* > \brief \b SGELQS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE SGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       REAL               A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Compute a minimum-norm solution */
+/* >     f2cmin || A*X - B || */
+/* > using the LQ factorization */
+/* >     A = L*Q */
+/* > computed by SGELQF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  N >= M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is REAL array, dimension (LDA,N) */
+/* >          Details of the LQ factorization of the original matrix A as */
+/* >          returned by SGELQF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is REAL array, dimension (M) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is REAL array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= N. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is REAL array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup single_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int sgelqs_(integer *m, integer *n, integer *nrhs, real *a, 
+	integer *lda, real *tau, real *b, integer *ldb, real *work, integer *
+	lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int strsm_(char *, char *, char *, char *, 
+	    integer *, integer *, real *, real *, integer *, real *, integer *
+	    ), xerbla_(char *, integer *), slaset_(char *, integer *, integer *, real *, real *, 
+	    real *, integer *), sormlq_(char *, char *, integer *, 
+	    integer *, integer *, real *, integer *, real *, real *, integer *
+	    , real *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *m > *n) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*n)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGELQS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Solve L*X = B(1:m,:) */
+
+    strsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b7, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+/*     Set B(m+1:n,:) to zero */
+
+    if (*m < *n) {
+	i__1 = *n - *m;
+	slaset_("Full", &i__1, nrhs, &c_b9, &c_b9, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     B := Q' * B */
+
+    sormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &tau[1], &b[
+	    b_offset], ldb, &work[1], lwork, info);
+
+    return 0;
+
+/*     End of SGELQS */
+
+} /* sgelqs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/sgelqs.f b/lapack-netlib/SRC/DEPRECATED/sgelqs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/sgelqs.f
rename to lapack-netlib/SRC/DEPRECATED/sgelqs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/sgeqrs.c b/lapack-netlib/SRC/DEPRECATED/sgeqrs.c
new file mode 100644
index 000000000..b593d0dc9
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/sgeqrs.c
@@ -0,0 +1,470 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static real c_b9 = 1.f;
+
+/* > \brief \b SGEQRS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE SGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       REAL               A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Solve the least squares problem */
+/* >     f2cmin || A*X - B || */
+/* > using the QR factorization */
+/* >     A = Q*R */
+/* > computed by SGEQRF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  M >= N >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is REAL array, dimension (LDA,N) */
+/* >          Details of the QR factorization of the original matrix A as */
+/* >          returned by SGEQRF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is REAL array, dimension (N) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is REAL array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is REAL array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup single_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int sgeqrs_(integer *m, integer *n, integer *nrhs, real *a, 
+	integer *lda, real *tau, real *b, integer *ldb, real *work, integer *
+	lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int strsm_(char *, char *, char *, char *, 
+	    integer *, integer *, real *, real *, integer *, real *, integer *
+	    ), xerbla_(char *, integer *), sormqr_(char *, char *, integer *, integer *, integer *, 
+	    real *, integer *, real *, real *, integer *, real *, integer *, 
+	    integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *n > *m) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*m)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGEQRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     B := Q' * B */
+
+    sormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &tau[1], &b[
+	    b_offset], ldb, &work[1], lwork, info);
+
+/*     Solve R*X = B(1:n,:) */
+
+    strsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b9, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+    return 0;
+
+/*     End of SGEQRS */
+
+} /* sgeqrs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/sgeqrs.f b/lapack-netlib/SRC/DEPRECATED/sgeqrs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/sgeqrs.f
rename to lapack-netlib/SRC/DEPRECATED/sgeqrs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/zgelqs.c b/lapack-netlib/SRC/DEPRECATED/zgelqs.c
new file mode 100644
index 000000000..b77ba906a
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/zgelqs.c
@@ -0,0 +1,481 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {0.,0.};
+static doublecomplex c_b2 = {1.,0.};
+
+/* > \brief \b ZGELQS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE ZGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       COMPLEX*16         A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Compute a minimum-norm solution */
+/* >     f2cmin || A*X - B || */
+/* > using the LQ factorization */
+/* >     A = L*Q */
+/* > computed by ZGELQF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  N >= M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX*16 array, dimension (LDA,N) */
+/* >          Details of the LQ factorization of the original matrix A as */
+/* >          returned by ZGELQF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is COMPLEX*16 array, dimension (M) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is COMPLEX*16 array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= N. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is COMPLEX*16 array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int zgelqs_(integer *m, integer *n, integer *nrhs, 
+	doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b, 
+	integer *ldb, doublecomplex *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+	     doublecomplex *, integer *), 
+	    xerbla_(char *, integer *), zlaset_(char *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	    integer *), zunmlq_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *m > *n) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*n)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("ZGELQS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Solve L*X = B(1:m,:) */
+
+    ztrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b2, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+/*     Set B(m+1:n,:) to zero */
+
+    if (*m < *n) {
+	i__1 = *n - *m;
+	zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     B := Q' * B */
+
+    zunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, &
+	    tau[1], &b[b_offset], ldb, &work[1], lwork, info);
+
+    return 0;
+
+/*     End of ZGELQS */
+
+} /* zgelqs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/zgelqs.f b/lapack-netlib/SRC/DEPRECATED/zgelqs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/zgelqs.f
rename to lapack-netlib/SRC/DEPRECATED/zgelqs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/zgeqrs.c b/lapack-netlib/SRC/DEPRECATED/zgeqrs.c
new file mode 100644
index 000000000..3e8f3cce7
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/zgeqrs.c
@@ -0,0 +1,472 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+
+/* > \brief \b ZGEQRS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE ZGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       COMPLEX*16         A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Solve the least squares problem */
+/* >     f2cmin || A*X - B || */
+/* > using the QR factorization */
+/* >     A = Q*R */
+/* > computed by ZGEQRF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  M >= N >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX*16 array, dimension (LDA,N) */
+/* >          Details of the QR factorization of the original matrix A as */
+/* >          returned by ZGEQRF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is COMPLEX*16 array, dimension (N) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is COMPLEX*16 array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is COMPLEX*16 array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int zgeqrs_(integer *m, integer *n, integer *nrhs, 
+	doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b, 
+	integer *ldb, doublecomplex *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+	     doublecomplex *, integer *), 
+	    xerbla_(char *, integer *), zunmqr_(char *, char *, 
+	    integer *, integer *, integer *, doublecomplex *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+	    integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *n > *m) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*m)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("ZGEQRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     B := Q' * B */
+
+    zunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, &
+	    tau[1], &b[b_offset], ldb, &work[1], lwork, info);
+
+/*     Solve R*X = B(1:n,:) */
+
+    ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+    return 0;
+
+/*     End of ZGEQRS */
+
+} /* zgeqrs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/zgeqrs.f b/lapack-netlib/SRC/DEPRECATED/zgeqrs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/zgeqrs.f
rename to lapack-netlib/SRC/DEPRECATED/zgeqrs.f
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 26314c4df..448fbd8df 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -544,26 +544,30 @@ endif
 ifeq ($(BUILD_COMPLEX),1)
 CDEPRECSRC = DEPRECATED/cgegs.o DEPRECATED/cgegv.o DEPRECATED/cgelsx.o \
    DEPRECATED/cgeqpf.o DEPRECATED/cggsvd.o DEPRECATED/cggsvp.o \
-   DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o 
+   DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o \
+   DEPRECATED/cgelqs.o DEPRECATED/cgeqrs.o
 endif
 
 ifeq ($(BUILD_DOUBLE),1)
 DDEPRECSRC = \
    DEPRECATED/dgegs.o  DEPRECATED/dgegv.o  DEPRECATED/dgelsx.o \
    DEPRECATED/dgeqpf.o DEPRECATED/dggsvd.o DEPRECATED/dggsvp.o \
-   DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o 
+   DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o \
+   DEPRECATED/dgelqs.o DEPRECATED/dgeqrs.o
 endif
 ifeq ($(BUILD_SINGLE),1)
 SDEPRECSRC = \
    DEPRECATED/sgegs.o  DEPRECATED/sgegv.o  DEPRECATED/sgelsx.o \
    DEPRECATED/sgeqpf.o DEPRECATED/sggsvd.o DEPRECATED/sggsvp.o \
-   DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o
+   DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o \
+   DEPRECATED/sgelqs.o DEPRECATED/sgeqrs.o
 endif
 ifeq ($(BUILD_COMPLEX16),1)
 ZDEPRECSRC = \
    DEPRECATED/zgegs.o  DEPRECATED/zgegv.o  DEPRECATED/zgelsx.o \
    DEPRECATED/zgeqpf.o DEPRECATED/zggsvd.o DEPRECATED/zggsvp.o \
-   DEPRECATED/zlahrd.o DEPRECATED/zlatzm.o DEPRECATED/ztzrqf.o
+   DEPRECATED/zlahrd.o DEPRECATED/zlatzm.o DEPRECATED/ztzrqf.o \
+   DEPRECATED/zgelqs.o DEPRECATED/zgeqrs.o
 endif
 
 # filter out optimized codes from OpenBLAS
diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index fc55b8a96..676857a80 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -20,7 +20,7 @@ set(SLINTST schkaa.F
    serrgt.f serrlq.f serrls.f
    serrps.f serrql.f serrqp.f serrqr.f
    serrrq.f serrtr.f serrtz.f
-   sgbt01.f sgbt02.f sgbt05.f sgelqs.f sgeqls.f sgeqrs.f
+   sgbt01.f sgbt02.f sgbt05.f sgeqls.f
    sgerqs.f sget01.f sget02.f
    sget03.f sget04.f sget06.f sget07.f sgtt01.f sgtt02.f
    sgtt05.f slaptm.f slarhs.f slatb4.f slatb5.f slattb.f slattp.f
@@ -70,7 +70,7 @@ set(CLINTST cchkaa.F
    cerrgt.f cerrlq.f
    cerrls.f cerrps.f cerrql.f cerrqp.f
    cerrqr.f cerrrq.f cerrtr.f cerrtz.f
-   cgbt01.f cgbt02.f cgbt05.f cgelqs.f cgeqls.f cgeqrs.f
+   cgbt01.f cgbt02.f cgbt05.f cgeqls.f
    cgerqs.f cget01.f cget02.f
    cget03.f cget04.f cget07.f cgtt01.f cgtt02.f
    cgtt05.f chet01.f chet01_rook.f chet01_3.f
@@ -121,7 +121,7 @@ set(DLINTST dchkaa.F
    derrgt.f derrlq.f derrls.f
    derrps.f derrql.f derrqp.f derrqr.f
    derrrq.f derrtr.f derrtz.f
-   dgbt01.f dgbt02.f dgbt05.f dgelqs.f dgeqls.f dgeqrs.f
+   dgbt01.f dgbt02.f dgbt05.f dgeqls.f
    dgerqs.f dget01.f dget02.f
    dget03.f dget04.f dget06.f dget07.f dgtt01.f dgtt02.f
    dgtt05.f dlaptm.f dlarhs.f dlatb4.f dlatb5.f dlattb.f dlattp.f
@@ -172,7 +172,7 @@ set(ZLINTST zchkaa.F
    zerrgt.f zerrlq.f
    zerrls.f zerrps.f zerrql.f zerrqp.f
    zerrqr.f zerrrq.f zerrtr.f zerrtz.f
-   zgbt01.f zgbt02.f zgbt05.f zgelqs.f zgeqls.f zgeqrs.f
+   zgbt01.f zgbt02.f zgbt05.f zgeqls.f
    zgerqs.f zget01.f zget02.f
    zget03.f zget04.f zget07.f zgtt01.f zgtt02.f
    zgtt05.f zhet01.f zhet01_rook.f zhet01_3.f
diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile
index 54b26455e..64abc4dba 100644
--- a/lapack-netlib/TESTING/LIN/Makefile
+++ b/lapack-netlib/TESTING/LIN/Makefile
@@ -55,7 +55,7 @@ SLINTST = schkaa.o \
    serrgt.o serrlq.o serrls.o \
    serrps.o serrql.o serrqp.o serrqr.o \
    serrrq.o serrtr.o serrtz.o \
-   sgbt01.o sgbt02.o sgbt05.o sgelqs.o sgeqls.o sgeqrs.o \
+   sgbt01.o sgbt02.o sgbt05.o sgeqls.o \
    sgerqs.o sget01.o sget02.o \
    sget03.o sget04.o sget06.o sget07.o sgtt01.o sgtt02.o \
    sgtt05.o slaptm.o slarhs.o slatb4.o slatb5.o slattb.o slattp.o \
@@ -100,7 +100,7 @@ CLINTST = cchkaa.o \
    cerrgt.o cerrlq.o \
    cerrls.o cerrps.o cerrql.o cerrqp.o \
    cerrqr.o cerrrq.o cerrtr.o cerrtz.o \
-   cgbt01.o cgbt02.o cgbt05.o cgelqs.o cgeqls.o cgeqrs.o \
+   cgbt01.o cgbt02.o cgbt05.o cgeqls.o \
    cgerqs.o cget01.o cget02.o \
    cget03.o cget04.o cget07.o cgtt01.o cgtt02.o \
    cgtt05.o chet01.o chet01_rook.o chet01_3.o chet01_aa.o \
@@ -147,7 +147,7 @@ DLINTST = dchkaa.o \
    derrgt.o derrlq.o derrls.o \
    derrps.o derrql.o derrqp.o derrqr.o \
    derrrq.o derrtr.o derrtz.o \
-   dgbt01.o dgbt02.o dgbt05.o dgelqs.o dgeqls.o dgeqrs.o \
+   dgbt01.o dgbt02.o dgbt05.o dgeqls.o \
    dgerqs.o dget01.o dget02.o \
    dget03.o dget04.o dget06.o dget07.o dgtt01.o dgtt02.o \
    dgtt05.o dlaptm.o dlarhs.o dlatb4.o dlatb5.o dlattb.o dlattp.o \
@@ -192,7 +192,7 @@ ZLINTST = zchkaa.o \
    zerrgt.o zerrlq.o \
    zerrls.o zerrps.o zerrql.o zerrqp.o \
    zerrqr.o zerrrq.o zerrtr.o zerrtz.o \
-   zgbt01.o zgbt02.o zgbt05.o zgelqs.o zgeqls.o zgeqrs.o \
+   zgbt01.o zgbt02.o zgbt05.o zgeqls.o \
    zgerqs.o zget01.o zget02.o \
    zget03.o zget04.o zget07.o zgtt01.o zgtt02.o \
    zgtt05.o zhet01.o zhet01_rook.o zhet01_3.o zhet01_aa.o \
diff --git a/lapack-netlib/TESTING/LIN/cchklq.f b/lapack-netlib/TESTING/LIN/cchklq.f
index 54107d047..4499de36f 100644
--- a/lapack-netlib/TESTING/LIN/cchklq.f
+++ b/lapack-netlib/TESTING/LIN/cchklq.f
@@ -235,7 +235,7 @@
       REAL               RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRLQ, CGELQS, CGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRLQ, CGELS, CGET02,
      $                   CLACPY, CLARHS, CLATB4, CLATMS, CLQT01, CLQT02,
      $                   CLQT03, XLAENV
 *     ..
@@ -370,7 +370,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call CGELQS to solve a system
+*                       If M<=N and K=M, call CGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -387,14 +387,20 @@
 *
                            CALL CLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'CGELQS'
-                           CALL CGELQS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from CGELQS.
+*                          Reset AF to the original matrix. CGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL CLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'CGELS'
+                           CALL CGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from CGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'CGELQS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'CGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/cchkqr.f b/lapack-netlib/TESTING/LIN/cchkqr.f
index 7ea178eaf..4fa7413f9 100644
--- a/lapack-netlib/TESTING/LIN/cchkqr.f
+++ b/lapack-netlib/TESTING/LIN/cchkqr.f
@@ -244,7 +244,7 @@
       EXTERNAL           CGENND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRQR, CGEQRS, CGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRQR, CGELS, CGET02,
      $                   CLACPY, CLARHS, CLATB4, CLATMS, CQRT01,
      $                   CQRT01P, CQRT02, CQRT03, XLAENV
 *     ..
@@ -371,7 +371,7 @@
                          IF( .NOT. CGENND( M, N, AF, LDA ) )
      $                       RESULT( 9 ) = 2*THRESH
                         NT = NT + 1
-                    ELSE IF( M.GE.N ) THEN
+                     ELSE IF( M.GE.N ) THEN
 *
 *                       Test CUNGQR, using factorization
 *                       returned by CQRT01
@@ -388,7 +388,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call CGEQRS to solve a system
+*                       If M>=N and K=N, call CGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -405,14 +405,20 @@
 *
                            CALL CLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'CGEQRS'
-                           CALL CGEQRS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from CGEQRS.
+*                          Reset AF to the original matrix. CGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL CLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'CGELS'
+                           CALL CGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from CGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'CGEQRS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'CGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/cerrlq.f b/lapack-netlib/TESTING/LIN/cerrlq.f
index 1036835b4..495adac0d 100644
--- a/lapack-netlib/TESTING/LIN/cerrlq.f
+++ b/lapack-netlib/TESTING/LIN/cerrlq.f
@@ -76,7 +76,7 @@
      $                   W( NMAX ), X( NMAX )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAESM, CGELQ2, CGELQF, CGELQS, CHKXER, CUNGL2,
+      EXTERNAL           ALAESM, CGELQ2, CGELQF, CHKXER, CUNGL2,
      $                   CUNGLQ, CUNML2, CUNMLQ
 *     ..
 *     .. Scalars in Common ..
@@ -140,31 +140,6 @@
       CALL CGELQ2( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'CGELQ2', INFOT, NOUT, LERR, OK )
 *
-*     CGELQS
-*
-      SRNAMT = 'CGELQS'
-      INFOT = 1
-      CALL CGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL CGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL CGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL CGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL CGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL CGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL CGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-*
 *     CUNGLQ
 *
       SRNAMT = 'CUNGLQ'
diff --git a/lapack-netlib/TESTING/LIN/cerrqr.f b/lapack-netlib/TESTING/LIN/cerrqr.f
index 21cf22936..30ce001eb 100644
--- a/lapack-netlib/TESTING/LIN/cerrqr.f
+++ b/lapack-netlib/TESTING/LIN/cerrqr.f
@@ -77,7 +77,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAESM, CGEQR2, CGEQR2P, CGEQRF, CGEQRFP,
-     $                   CGEQRS, CHKXER, CUNG2R, CUNGQR, CUNM2R,
+     $                   CHKXER, CUNG2R, CUNGQR, CUNM2R,
      $                   CUNMQR
 *     ..
 *     .. Scalars in Common ..
@@ -170,31 +170,6 @@
       CALL CGEQR2P( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'CGEQR2P', INFOT, NOUT, LERR, OK )
 *
-*     CGEQRS
-*
-      SRNAMT = 'CGEQRS'
-      INFOT = 1
-      CALL CGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL CGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL CGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL CGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL CGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL CGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL CGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-*
 *     CUNGQR
 *
       SRNAMT = 'CUNGQR'
diff --git a/lapack-netlib/TESTING/LIN/dchklq.f b/lapack-netlib/TESTING/LIN/dchklq.f
index 70af41fe0..a207e0056 100644
--- a/lapack-netlib/TESTING/LIN/dchklq.f
+++ b/lapack-netlib/TESTING/LIN/dchklq.f
@@ -235,7 +235,7 @@
       DOUBLE PRECISION   RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, DERRLQ, DGELQS, DGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, DERRLQ, DGELS, DGET02,
      $                   DLACPY, DLARHS, DLATB4, DLATMS, DLQT01, DLQT02,
      $                   DLQT03, XLAENV
 *     ..
@@ -373,7 +373,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call DGELQS to solve a system
+*                       If M<=N and K=M, call DGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -390,14 +390,20 @@
 *
                            CALL DLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'DGELQS'
-                           CALL DGELQS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from DGELQS.
+*                          Reset AF to the original matrix. DGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL DLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'DGELS'
+                           CALL DGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from DGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'DGELQS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'DGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/dchkqr.f b/lapack-netlib/TESTING/LIN/dchkqr.f
index c729e61a9..8188d7a00 100644
--- a/lapack-netlib/TESTING/LIN/dchkqr.f
+++ b/lapack-netlib/TESTING/LIN/dchkqr.f
@@ -244,7 +244,7 @@
       EXTERNAL           DGENND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, DERRQR, DGEQRS, DGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, DERRQR, DGELS, DGET02,
      $                   DLACPY, DLARHS, DLATB4, DLATMS, DQRT01,
      $                   DQRT01P, DQRT02, DQRT03, XLAENV
 *     ..
@@ -372,7 +372,7 @@
                          IF( .NOT. DGENND( M, N, AF, LDA ) )
      $                       RESULT( 9 ) = 2*THRESH
                         NT = NT + 1
-                    ELSE IF( M.GE.N ) THEN
+                     ELSE IF( M.GE.N ) THEN
 *
 *                       Test DORGQR, using factorization
 *                       returned by DQRT01
@@ -389,7 +389,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call DGEQRS to solve a system
+*                       If M>=N and K=N, call DGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -406,14 +406,20 @@
 *
                            CALL DLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'DGEQRS'
-                           CALL DGEQRS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from DGEQRS.
+*                          Reset AF. DGELS overwrites the matrix with
+*                          its factorization.
+*
+                           CALL DLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'DGELS'
+                           CALL DGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from DGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'DGEQRS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'DGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/derrlq.f b/lapack-netlib/TESTING/LIN/derrlq.f
index d3cfcddd0..76ff4709e 100644
--- a/lapack-netlib/TESTING/LIN/derrlq.f
+++ b/lapack-netlib/TESTING/LIN/derrlq.f
@@ -76,7 +76,7 @@
      $                   W( NMAX ), X( NMAX )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAESM, CHKXER, DGELQ2, DGELQF, DGELQS, DORGL2,
+      EXTERNAL           ALAESM, CHKXER, DGELQ2, DGELQF, DORGL2,
      $                   DORGLQ, DORML2, DORMLQ
 *     ..
 *     .. Scalars in Common ..
@@ -140,31 +140,6 @@
       CALL DGELQ2( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'DGELQ2', INFOT, NOUT, LERR, OK )
 *
-*     DGELQS
-*
-      SRNAMT = 'DGELQS'
-      INFOT = 1
-      CALL DGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL DGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL DGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL DGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL DGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL DGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL DGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-*
 *     DORGLQ
 *
       SRNAMT = 'DORGLQ'
diff --git a/lapack-netlib/TESTING/LIN/derrqr.f b/lapack-netlib/TESTING/LIN/derrqr.f
index 03155b133..f7e850b80 100644
--- a/lapack-netlib/TESTING/LIN/derrqr.f
+++ b/lapack-netlib/TESTING/LIN/derrqr.f
@@ -77,7 +77,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAESM, CHKXER, DGEQR2, DGEQR2P, DGEQRF,
-     $                   DGEQRFP, DGEQRS, DORG2R, DORGQR, DORM2R,
+     $                   DGEQRFP, DORG2R, DORGQR, DORM2R,
      $                   DORMQR
 *     ..
 *     .. Scalars in Common ..
@@ -170,31 +170,6 @@
       CALL DGEQR2P( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'DGEQR2P', INFOT, NOUT, LERR, OK )
 *
-*     DGEQRS
-*
-      SRNAMT = 'DGEQRS'
-      INFOT = 1
-      CALL DGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL DGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL DGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL DGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL DGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL DGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL DGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-*
 *     DORGQR
 *
       SRNAMT = 'DORGQR'
diff --git a/lapack-netlib/TESTING/LIN/schklq.f b/lapack-netlib/TESTING/LIN/schklq.f
index cd66e8d10..9335503f9 100644
--- a/lapack-netlib/TESTING/LIN/schklq.f
+++ b/lapack-netlib/TESTING/LIN/schklq.f
@@ -235,7 +235,7 @@
       REAL               RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, SERRLQ, SGELQS, SGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, SERRLQ, SGET02,
      $                   SLACPY, SLARHS, SLATB4, SLATMS, SLQT01, SLQT02,
      $                   SLQT03, XLAENV
 *     ..
@@ -370,7 +370,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call SGELQS to solve a system
+*                       If M<=N and K=M, call SGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -387,14 +387,20 @@
 *
                            CALL SLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'SGELQS'
-                           CALL SGELQS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from SGELQS.
+*                          Reset AF to the original matrix. SGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL SLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'SGELS'
+                           CALL SGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from SGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'SGELQS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'SGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/schkqr.f b/lapack-netlib/TESTING/LIN/schkqr.f
index 5c45ede9b..f72c8f1eb 100644
--- a/lapack-netlib/TESTING/LIN/schkqr.f
+++ b/lapack-netlib/TESTING/LIN/schkqr.f
@@ -244,7 +244,7 @@
       EXTERNAL           SGENND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, SERRQR, SGEQRS, SGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, SERRQR, SGELS, SGET02,
      $                   SLACPY, SLARHS, SLATB4, SLATMS, SQRT01,
      $                   SQRT01P, SQRT02, SQRT03, XLAENV
 *     ..
@@ -388,7 +388,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call SGEQRS to solve a system
+*                       If M>=N and K=N, call SGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -405,14 +405,20 @@
 *
                            CALL SLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'SGEQRS'
-                           CALL SGEQRS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from SGEQRS.
+*                          Reset AF to the original matrix. SGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL SLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'SGELS'
+                           CALL SGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from SGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'SGEQRS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'SGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/serrlq.f b/lapack-netlib/TESTING/LIN/serrlq.f
index 5bb0fe201..e5df8ce52 100644
--- a/lapack-netlib/TESTING/LIN/serrlq.f
+++ b/lapack-netlib/TESTING/LIN/serrlq.f
@@ -76,7 +76,7 @@
      $                   W( NMAX ), X( NMAX )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAESM, CHKXER, SGELQ2, SGELQF, SGELQS, SORGL2,
+      EXTERNAL           ALAESM, CHKXER, SGELQ2, SGELQF, SORGL2,
      $                   SORGLQ, SORML2, SORMLQ
 *     ..
 *     .. Scalars in Common ..
@@ -140,31 +140,6 @@
       CALL SGELQ2( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'SGELQ2', INFOT, NOUT, LERR, OK )
 *
-*     SGELQS
-*
-      SRNAMT = 'SGELQS'
-      INFOT = 1
-      CALL SGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL SGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL SGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL SGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL SGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL SGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL SGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-*
 *     SORGLQ
 *
       SRNAMT = 'SORGLQ'
diff --git a/lapack-netlib/TESTING/LIN/serrqr.f b/lapack-netlib/TESTING/LIN/serrqr.f
index 1ad40b7aa..e228813f7 100644
--- a/lapack-netlib/TESTING/LIN/serrqr.f
+++ b/lapack-netlib/TESTING/LIN/serrqr.f
@@ -77,7 +77,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAESM, CHKXER, SGEQR2, SGEQR2P, SGEQRF,
-     $                   SGEQRFP, SGEQRS, SORG2R, SORGQR, SORM2R,
+     $                   SGEQRFP, SORG2R, SORGQR, SORM2R,
      $                   SORMQR
 *     ..
 *     .. Scalars in Common ..
@@ -170,31 +170,6 @@
       CALL SGEQR2P( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'SGEQR2P', INFOT, NOUT, LERR, OK )
 *
-*     SGEQRS
-*
-      SRNAMT = 'SGEQRS'
-      INFOT = 1
-      CALL SGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL SGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL SGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL SGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL SGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL SGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL SGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-*
 *     SORGQR
 *
       SRNAMT = 'SORGQR'
diff --git a/lapack-netlib/TESTING/LIN/zchklq.f b/lapack-netlib/TESTING/LIN/zchklq.f
index 371bb946b..ccef7b803 100644
--- a/lapack-netlib/TESTING/LIN/zchklq.f
+++ b/lapack-netlib/TESTING/LIN/zchklq.f
@@ -235,7 +235,7 @@
       DOUBLE PRECISION   RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, XLAENV, ZERRLQ, ZGELQS,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, XLAENV, ZERRLQ, ZGELS,
      $                   ZGET02, ZLACPY, ZLARHS, ZLATB4, ZLATMS, ZLQT01,
      $                   ZLQT02, ZLQT03
 *     ..
@@ -370,7 +370,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call ZGELQS to solve a system
+*                       If M<=N and K=M, call ZGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -387,14 +387,20 @@
 *
                            CALL ZLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'ZGELQS'
-                           CALL ZGELQS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from ZGELQS.
+*                          Reset AF to the original matrix. ZGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL ZLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'ZGELS'
+                           CALL ZGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from ZGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'ZGELQS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'ZGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkqr.f b/lapack-netlib/TESTING/LIN/zchkqr.f
index a240d2da5..c088bacc9 100644
--- a/lapack-netlib/TESTING/LIN/zchkqr.f
+++ b/lapack-netlib/TESTING/LIN/zchkqr.f
@@ -244,7 +244,7 @@
       EXTERNAL           ZGENND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, XLAENV, ZERRQR, ZGEQRS,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, XLAENV, ZERRQR, ZGELS,
      $                   ZGET02, ZLACPY, ZLARHS, ZLATB4, ZLATMS, ZQRT01,
      $                   ZQRT01P, ZQRT02, ZQRT03
 *     ..
@@ -388,7 +388,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call ZGEQRS to solve a system
+*                       If M>=N and K=N, call ZGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -405,14 +405,20 @@
 *
                            CALL ZLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'ZGEQRS'
-                           CALL ZGEQRS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from ZGEQRS.
+*                          Reset AF to the original matrix. ZGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL ZLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'ZGELS'
+                           CALL ZGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from ZGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'ZGEQRS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'ZGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/zerrlq.f b/lapack-netlib/TESTING/LIN/zerrlq.f
index d8e5a8fe8..d91b4e4b3 100644
--- a/lapack-netlib/TESTING/LIN/zerrlq.f
+++ b/lapack-netlib/TESTING/LIN/zerrlq.f
@@ -76,7 +76,7 @@
      $                   W( NMAX ), X( NMAX )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAESM, CHKXER, ZGELQ2, ZGELQF, ZGELQS, ZUNGL2,
+      EXTERNAL           ALAESM, CHKXER, ZGELQ2, ZGELQF, ZUNGL2,
      $                   ZUNGLQ, ZUNML2, ZUNMLQ
 *     ..
 *     .. Scalars in Common ..
@@ -142,31 +142,6 @@
       CALL ZGELQ2( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'ZGELQ2', INFOT, NOUT, LERR, OK )
 *
-*     ZGELQS
-*
-      SRNAMT = 'ZGELQS'
-      INFOT = 1
-      CALL ZGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL ZGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL ZGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL ZGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL ZGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL ZGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL ZGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-*
 *     ZUNGLQ
 *
       SRNAMT = 'ZUNGLQ'
diff --git a/lapack-netlib/TESTING/LIN/zerrqr.f b/lapack-netlib/TESTING/LIN/zerrqr.f
index 114453d4c..3542c7a04 100644
--- a/lapack-netlib/TESTING/LIN/zerrqr.f
+++ b/lapack-netlib/TESTING/LIN/zerrqr.f
@@ -77,7 +77,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAESM, CHKXER, ZGEQR2, ZGEQR2P, ZGEQRF,
-     $                   ZGEQRFP, ZGEQRS, ZUNG2R, ZUNGQR, ZUNM2R,
+     $                   ZGEQRFP, ZUNG2R, ZUNGQR, ZUNM2R,
      $                   ZUNMQR
 *     ..
 *     .. Scalars in Common ..
@@ -172,31 +172,6 @@
       CALL ZGEQR2P( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'ZGEQR2P', INFOT, NOUT, LERR, OK )
 *
-*     ZGEQRS
-*
-      SRNAMT = 'ZGEQRS'
-      INFOT = 1
-      CALL ZGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL ZGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL ZGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL ZGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL ZGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL ZGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL ZGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-*
 *     ZUNGQR
 *
       SRNAMT = 'ZUNGQR'

From 225036fd92fb0093280dbfdafd295a4f40678917 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 13:43:22 +0100
Subject: [PATCH 402/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904)

---
 lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f | 9 +++++----
 lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f | 9 +++++----
 lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f | 9 +++++----
 lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f | 9 +++++----
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
index 743731a00..d3f78b8be 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
@@ -176,7 +176,8 @@ C>
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 
@@ -225,13 +226,13 @@ C>
 *         Optimal workspace for dlarfb = MAX(1,N)*NT
 *
           LWKOPT = (LBWORK+LLWORK)*NB
-          WORK( 1 ) = (LWKOPT+NT*NT)
+          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT+NT*NT)
 
       ELSE
 
           LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
-          WORK( 1 ) = LWKOPT
+          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 
       END IF
 
@@ -413,7 +414,7 @@ C>
 
       END IF
 
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGEQRF
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
index bbdd46113..dd8baab4d 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
@@ -176,7 +176,8 @@ C>
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      DOUBLE PRECISION   DROUNDUP_LWORK
+      EXTERNAL           ILAENV, DROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 
@@ -225,13 +226,13 @@ C>
 *         Optimal workspace for dlarfb = MAX(1,N)*NT
 *
           LWKOPT = (LBWORK+LLWORK)*NB
-          WORK( 1 ) = (LWKOPT+NT*NT)
+          WORK( 1 ) = DROUNDUP_LWORK(LWKOPT+NT*NT)
 
       ELSE
 
           LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
-          WORK( 1 ) = LWKOPT
+          WORK( 1 ) = DROUNDUP_LWORK(LWKOPT)
 
       END IF
 
@@ -413,7 +414,7 @@ C>
 
       END IF
 
-      WORK( 1 ) = IWS
+      WORK( 1 ) = DROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of DGEQRF
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
index bf68d635b..93dc48fa5 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
@@ -176,7 +176,8 @@ C>
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      DOUBLE PRECISION   DROUNDUP_LWORK
+      EXTERNAL           ILAENV, DROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 
@@ -225,13 +226,13 @@ C>
 *         Optimal workspace for dlarfb = MAX(1,N)*NT
 *
           LWKOPT = (LBWORK+LLWORK)*NB
-          WORK( 1 ) = (LWKOPT+NT*NT)
+          WORK( 1 ) = DROUNDUP_LWORK(LWKOPT+NT*NT)
 
       ELSE
 
           LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
-          WORK( 1 ) = LWKOPT
+          WORK( 1 ) = DROUNDUP_LWORK(LWKOPT)
 
       END IF
 
@@ -413,7 +414,7 @@ C>
 
       END IF
 
-      WORK( 1 ) = IWS
+      WORK( 1 ) = DROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQRF
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
index 06918568e..3ef07bfc7 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
@@ -176,7 +176,8 @@ C>
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 
@@ -225,13 +226,13 @@ C>
 *         Optimal workspace for dlarfb = MAX(1,N)*NT
 *
           LWKOPT = (LBWORK+LLWORK)*NB
-          WORK( 1 ) = (LWKOPT+NT*NT)
+          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT+NT*NT)
 
       ELSE
 
           LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
-          WORK( 1 ) = LWKOPT
+          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 
       END IF
 
@@ -413,7 +414,7 @@ C>
 
       END IF
 
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of ZGEQRF

From c9378badd929615aadca9120a664ab1e4bf83d11 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 13:56:06 +0100
Subject: [PATCH 403/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904)

---
 lapack-netlib/SRC/cgees.f           |  13 ++--
 lapack-netlib/SRC/cgeesx.f          |  13 ++--
 lapack-netlib/SRC/cgeev.f           |  14 ++--
 lapack-netlib/SRC/cgeevx.f          |  14 ++--
 lapack-netlib/SRC/cgehrd.f          |   9 +--
 lapack-netlib/SRC/cgelq.f           |  11 +--
 lapack-netlib/SRC/cgelqf.f          |   9 +--
 lapack-netlib/SRC/cgelsd.f          |  13 ++--
 lapack-netlib/SRC/cgelss.f          |   8 +--
 lapack-netlib/SRC/cgelst.f          |  19 +++--
 lapack-netlib/SRC/cgeqlf.f          |   9 +--
 lapack-netlib/SRC/cgeqrf.f          |   9 +--
 lapack-netlib/SRC/cgeqrfp.f         |   9 +--
 lapack-netlib/SRC/cgerqf.f          |   9 +--
 lapack-netlib/SRC/cgesvd.f          |  10 +--
 lapack-netlib/SRC/cgetri.f          |   9 +--
 lapack-netlib/SRC/cgetsls.f         |  15 ++--
 lapack-netlib/SRC/cgges.f           |  14 ++--
 lapack-netlib/SRC/cggesx.f          |  14 ++--
 lapack-netlib/SRC/cggev.f           |  14 ++--
 lapack-netlib/SRC/cggevx.f          |  13 ++--
 lapack-netlib/SRC/cggglm.f          |   7 +-
 lapack-netlib/SRC/cgglse.f          |   7 +-
 lapack-netlib/SRC/cggqrf.f          |   7 +-
 lapack-netlib/SRC/cggrqf.f          |   7 +-
 lapack-netlib/SRC/chbev_2stage.f    |  23 +++---
 lapack-netlib/SRC/chbevd.f          |  10 +--
 lapack-netlib/SRC/chbevx_2stage.f   |  29 ++++----
 lapack-netlib/SRC/chbgvd.f          |   9 +--
 lapack-netlib/SRC/cheev.f           |  10 +--
 lapack-netlib/SRC/cheev_2stage.f    |  25 +++----
 lapack-netlib/SRC/cheevd.f          |  10 +--
 lapack-netlib/SRC/cheevr.f          |  10 +--
 lapack-netlib/SRC/cheevx.f          |  10 +--
 lapack-netlib/SRC/cheevx_2stage.f   |  29 ++++----
 lapack-netlib/SRC/chegv.f           |   9 +--
 lapack-netlib/SRC/chegv_2stage.f    |  23 +++---
 lapack-netlib/SRC/chegvd.f          |   9 +--
 lapack-netlib/SRC/chegvx.f          |   9 +--
 lapack-netlib/SRC/chesv.f           |   9 +--
 lapack-netlib/SRC/chesv_aa.f        |   9 +--
 lapack-netlib/SRC/chesv_aa_2stage.f |   7 +-
 lapack-netlib/SRC/chesv_rk.f        |   9 +--
 lapack-netlib/SRC/chesv_rook.f      |   9 +--
 lapack-netlib/SRC/chesvx.f          |  10 +--
 lapack-netlib/SRC/chetrd_hb2st.F    | 107 ++++++++++++++--------------
 46 files changed, 334 insertions(+), 317 deletions(-)

diff --git a/lapack-netlib/SRC/cgees.f b/lapack-netlib/SRC/cgees.f
index 71acfdba3..2085dc49b 100644
--- a/lapack-netlib/SRC/cgees.f
+++ b/lapack-netlib/SRC/cgees.f
@@ -189,7 +189,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup gees
 *
 *  =====================================================================
       SUBROUTINE CGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, W, VS,
@@ -230,13 +230,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CGEBAK, CGEBAL, CGEHRD, CHSEQR, CLACPY,
-     $                   CLASCL, CTRSEN, CUNGHR, SLABAD, XERBLA
+     $                   CLASCL, CTRSEN, CUNGHR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -292,7 +292,7 @@
                MAXWRK = MAX( MAXWRK, HSWORK )
             END IF
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -12
@@ -318,7 +318,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -413,7 +412,7 @@
          CALL CCOPY( N, A, LDA+1, W, 1 )
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGEES
diff --git a/lapack-netlib/SRC/cgeesx.f b/lapack-netlib/SRC/cgeesx.f
index 782e36747..036ae90c2 100644
--- a/lapack-netlib/SRC/cgeesx.f
+++ b/lapack-netlib/SRC/cgeesx.f
@@ -230,7 +230,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup geesx
 *
 *  =====================================================================
       SUBROUTINE CGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, W,
@@ -274,13 +274,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CGEBAK, CGEBAL, CGEHRD, CHSEQR, CLACPY,
-     $                   CLASCL, CTRSEN, CUNGHR, SLABAD, SLASCL, XERBLA
+     $                   CLASCL, CTRSEN, CUNGHR, SLASCL, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -350,7 +350,7 @@
             IF( .NOT.WANTSN )
      $         LWRK = MAX( LWRK, ( N*N )/2 )
          END IF
-         WORK( 1 ) = LWRK
+         WORK( 1 ) = SROUNDUP_LWORK(LWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -15
@@ -376,7 +376,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -488,7 +487,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGEESX
diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f
index a77525ef8..bb41599d1 100644
--- a/lapack-netlib/SRC/cgeev.f
+++ b/lapack-netlib/SRC/cgeev.f
@@ -172,7 +172,7 @@
 *
 *  @generated from zgeev.f, fortran z -> c, Tue Apr 19 01:47:44 2016
 *
-*> \ingroup complexGEeigen
+*> \ingroup geev
 *
 *  =====================================================================
       SUBROUTINE CGEEV( JOBVL, JOBVR, N, A, LDA, W, VL, LDVL, VR, LDVR,
@@ -212,14 +212,15 @@
       REAL   DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, XERBLA, CSSCAL, CGEBAK, CGEBAL, CGEHRD,
+      EXTERNAL           XERBLA, CSSCAL, CGEBAK, CGEBAL, CGEHRD,
      $                   CHSEQR, CLACPY, CLASCL, CSCAL, CTREVC3, CUNGHR
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX, ILAENV
-      REAL   SLAMCH, SCNRM2, CLANGE
-      EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE
+      REAL               SLAMCH, SCNRM2, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL, CMPLX, CONJG, AIMAG, MAX, SQRT
@@ -291,7 +292,7 @@
             HSWORK = INT( WORK(1) )
             MAXWRK = MAX( MAXWRK, HSWORK, MINWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -12
@@ -315,7 +316,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -493,7 +493,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGEEV
diff --git a/lapack-netlib/SRC/cgeevx.f b/lapack-netlib/SRC/cgeevx.f
index 2388f5acc..5dbc394e9 100644
--- a/lapack-netlib/SRC/cgeevx.f
+++ b/lapack-netlib/SRC/cgeevx.f
@@ -279,7 +279,7 @@
 *
 *  @generated from zgeevx.f, fortran z -> c, Tue Apr 19 01:47:44 2016
 *
-*> \ingroup complexGEeigen
+*> \ingroup geevx
 *
 *  =====================================================================
       SUBROUTINE CGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, W, VL,
@@ -323,15 +323,16 @@
       REAL   DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, SLASCL, XERBLA, CSSCAL, CGEBAK, CGEBAL,
+      EXTERNAL           SLASCL, XERBLA, CSSCAL, CGEBAK, CGEBAL,
      $                   CGEHRD, CHSEQR, CLACPY, CLASCL, CSCAL, CTREVC3,
      $                   CTRSNA, CUNGHR
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX, ILAENV
-      REAL   SLAMCH, SCNRM2, CLANGE
-      EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE
+      REAL               SLAMCH, SCNRM2, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL, CMPLX, CONJG, AIMAG, MAX, SQRT
@@ -434,7 +435,7 @@
             END IF
             MAXWRK = MAX( MAXWRK, MINWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -458,7 +459,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -657,7 +657,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGEEVX
diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f
index d9c050267..f407f931a 100644
--- a/lapack-netlib/SRC/cgehrd.f
+++ b/lapack-netlib/SRC/cgehrd.f
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gehrd
 *
 *> \par Further Details:
 *  =====================
@@ -201,7 +201,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -227,7 +228,7 @@
 *
          NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) )
          LWKOPT = N*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -344,7 +345,7 @@
 *     Use unblocked code to reduce the rest of the matrix
 *
       CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO )
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f
index d66033166..ff482bc42 100644
--- a/lapack-netlib/SRC/cgelq.f
+++ b/lapack-netlib/SRC/cgelq.f
@@ -166,6 +166,8 @@
 *>  the LQ factorization.
 *> \endverbatim
 *>
+*> \ingroup gelq
+*>
 *  =====================================================================
       SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -190,7 +192,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGELQT, CLASWLQ, XERBLA
@@ -292,9 +295,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = LWMIN
+          WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
         ELSE
-          WORK( 1 ) = LWREQ
+          WORK( 1 ) = SROUNDUP_LWORK(LWREQ)
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -319,7 +322,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LWREQ
+      WORK( 1 ) = SROUNDUP_LWORK(LWREQ)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f
index 37ef13a27..75f5bc960 100644
--- a/lapack-netlib/SRC/cgelqf.f
+++ b/lapack-netlib/SRC/cgelqf.f
@@ -118,7 +118,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gelqf
 *
 *> \par Further Details:
 *  =====================
@@ -167,7 +167,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -176,7 +177,7 @@
       INFO = 0
       NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 )
       LWKOPT = M*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -266,7 +267,7 @@
      $   CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGELQF
diff --git a/lapack-netlib/SRC/cgelsd.f b/lapack-netlib/SRC/cgelsd.f
index c3c77bf63..5d7eec68d 100644
--- a/lapack-netlib/SRC/cgelsd.f
+++ b/lapack-netlib/SRC/cgelsd.f
@@ -204,7 +204,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup gelsd
 *
 *> \par Contributors:
 *  ==================
@@ -249,13 +249,13 @@
 *     .. External Subroutines ..
       EXTERNAL           CGEBRD, CGELQF, CGEQRF, CLACPY,
      $                   CLALSD, CLASCL, CLASET, CUNMBR,
-     $                   CUNMLQ, CUNMQR, SLABAD, SLASCL,
+     $                   CUNMLQ, CUNMQR, SLASCL,
      $                   SLASET, XERBLA
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           CLANGE, SLAMCH, ILAENV
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           CLANGE, SLAMCH, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, LOG, MAX, MIN, REAL
@@ -367,7 +367,7 @@
             END IF
          END IF
          MINWRK = MIN( MINWRK, MAXWRK )
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
          IWORK( 1 ) = LIWORK
          RWORK( 1 ) = LRWORK
 *
@@ -396,7 +396,6 @@
       SFMIN = SLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max entry outside range [SMLNUM,BIGNUM].
 *
@@ -647,7 +646,7 @@
       END IF
 *
    10 CONTINUE
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IWORK( 1 ) = LIWORK
       RWORK( 1 ) = LRWORK
       RETURN
diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f
index d1e38c504..00d7f596a 100644
--- a/lapack-netlib/SRC/cgelss.f
+++ b/lapack-netlib/SRC/cgelss.f
@@ -218,8 +218,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -361,7 +361,7 @@
             END IF
             MAXWRK = MAX( MINWRK, MAXWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY )
      $      INFO = -12
@@ -758,7 +758,7 @@
          CALL CLASCL( 'G', 0, 0, BIGNUM, BNRM, N, NRHS, B, LDB, INFO )
       END IF
    70 CONTINUE
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGELSS
diff --git a/lapack-netlib/SRC/cgelst.f b/lapack-netlib/SRC/cgelst.f
index 7d8e44ddf..b69626934 100644
--- a/lapack-netlib/SRC/cgelst.f
+++ b/lapack-netlib/SRC/cgelst.f
@@ -176,7 +176,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup gelst
 *
 *> \par Contributors:
 *  ==================
@@ -224,15 +224,15 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGELQT, CGEQRT, CGEMLQT, CGEMQRT, SLABAD,
+      EXTERNAL           CGELQT, CGEQRT, CGEMLQT, CGEMQRT,
      $                   CLASCL, CLASET, CTRTRS, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -270,7 +270,7 @@
 *
          MNNRHS = MAX( MN, NRHS )
          LWOPT = MAX( 1, (MN+MNNRHS)*NB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
 *
       END IF
 *
@@ -285,7 +285,7 @@
 *
       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
          CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
          RETURN
       END IF
 *
@@ -311,7 +311,6 @@
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -334,7 +333,7 @@
 *        Matrix all zero. Return zero solution.
 *
          CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
          RETURN
       END IF
 *
@@ -524,7 +523,7 @@
      $                INFO )
       END IF
 *
-      WORK( 1 ) = REAL( LWOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f
index d2c11c269..918bbddad 100644
--- a/lapack-netlib/SRC/cgeqlf.f
+++ b/lapack-netlib/SRC/cgeqlf.f
@@ -113,7 +113,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup geqlf
 *
 *> \par Further Details:
 *  =====================
@@ -162,7 +162,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -186,7 +187,7 @@
             NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
             INFO = -7
@@ -276,7 +277,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGEQLF
diff --git a/lapack-netlib/SRC/cgeqrf.f b/lapack-netlib/SRC/cgeqrf.f
index d71bd5b33..bf22a2cd3 100644
--- a/lapack-netlib/SRC/cgeqrf.f
+++ b/lapack-netlib/SRC/cgeqrf.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup geqrf
 *
 *> \par Further Details:
 *  =====================
@@ -170,7 +170,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -199,7 +200,7 @@
          ELSE
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -274,7 +275,7 @@
      $   CALL CGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGEQRF
diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f
index 995404f43..eaf98ddf3 100644
--- a/lapack-netlib/SRC/cgeqrfp.f
+++ b/lapack-netlib/SRC/cgeqrfp.f
@@ -122,7 +122,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup geqrfp
 *
 *> \par Further Details:
 *  =====================
@@ -173,7 +173,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -182,7 +183,7 @@
       INFO = 0
       NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 )
       LWKOPT = N*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -272,7 +273,7 @@
      $   CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGEQRFP
diff --git a/lapack-netlib/SRC/cgerqf.f b/lapack-netlib/SRC/cgerqf.f
index d2247844c..6f914c892 100644
--- a/lapack-netlib/SRC/cgerqf.f
+++ b/lapack-netlib/SRC/cgerqf.f
@@ -114,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gerqf
 *
 *> \par Further Details:
 *  =====================
@@ -163,7 +163,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -187,7 +188,7 @@
             NB = ILAENV( 1, 'CGERQF', ' ', M, N, -1, -1 )
             LWKOPT = M*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF ( .NOT.LQUERY ) THEN
             IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
@@ -278,7 +279,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL CGERQ2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGERQF
diff --git a/lapack-netlib/SRC/cgesvd.f b/lapack-netlib/SRC/cgesvd.f
index 239b13431..6165a6acf 100644
--- a/lapack-netlib/SRC/cgesvd.f
+++ b/lapack-netlib/SRC/cgesvd.f
@@ -206,7 +206,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsing
+*> \ingroup gesvd
 *
 *  =====================================================================
       SUBROUTINE CGESVD( JOBU, JOBVT, M, N, A, LDA, S, U, LDU, VT, LDVT,
@@ -259,8 +259,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -615,7 +615,7 @@
             END IF
          END IF
          MAXWRK = MAX( MINWRK, MAXWRK )
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -13
@@ -3694,7 +3694,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f
index bd7fc286c..2060d1444 100644
--- a/lapack-netlib/SRC/cgetri.f
+++ b/lapack-netlib/SRC/cgetri.f
@@ -107,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup getri
 *
 *  =====================================================================
       SUBROUTINE CGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -138,7 +138,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEMM, CGEMV, CSWAP, CTRSM, CTRTRI, XERBLA
@@ -153,7 +154,7 @@
       INFO = 0
       NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 )
       LWKOPT = N*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -251,7 +252,7 @@
      $      CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 )
    60 CONTINUE
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGETRI
diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f
index 8a4d02224..b4bb7562f 100644
--- a/lapack-netlib/SRC/cgetsls.f
+++ b/lapack-netlib/SRC/cgetsls.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup getsls
 *
 *  =====================================================================
       SUBROUTINE CGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
@@ -191,15 +191,15 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, SLABAD, SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQR, CGEMQR, CLASCL, CLASET,
      $                   CTRTRS, XERBLA, CGELQ, CGEMLQ
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN, INT
+      INTRINSIC          MAX, MIN, INT
 *     ..
 *     .. Executable Statements ..
 *
@@ -265,7 +265,7 @@
           INFO = -10
        END IF
 *
-       WORK( 1 ) = REAL( WSIZEO )
+       WORK( 1 ) = SROUNDUP_LWORK( WSIZEO )
 *
       END IF
 *
@@ -274,7 +274,7 @@
         RETURN
       END IF
       IF( LQUERY ) THEN
-        IF( LWORK.EQ.-2 ) WORK( 1 ) = REAL( WSIZEM )
+        IF( LWORK.EQ.-2 ) WORK( 1 ) = SROUNDUP_LWORK( WSIZEM )
         RETURN
       END IF
       IF( LWORK.LT.WSIZEO ) THEN
@@ -297,7 +297,6 @@
 *
        SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
        BIGNUM = ONE / SMLNUM
-       CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -485,7 +484,7 @@
       END IF
 *
    50 CONTINUE
-      WORK( 1 ) = REAL( TSZO + LWO )
+      WORK( 1 ) = SROUNDUP_LWORK( TSZO + LWO )
       RETURN
 *
 *     End of CGETSLS
diff --git a/lapack-netlib/SRC/cgges.f b/lapack-netlib/SRC/cgges.f
index c54174da4..0ff848735 100644
--- a/lapack-netlib/SRC/cgges.f
+++ b/lapack-netlib/SRC/cgges.f
@@ -261,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup gges
 *
 *  =====================================================================
       SUBROUTINE CGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB,
@@ -312,14 +312,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY,
-     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -390,7 +389,7 @@
             LWKOPT = MAX( LWKOPT, N +
      $                    N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, -1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -18
@@ -415,7 +414,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -587,7 +585,7 @@
 *
    30 CONTINUE
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cggesx.f b/lapack-netlib/SRC/cggesx.f
index 6385a74c1..3bf460fac 100644
--- a/lapack-netlib/SRC/cggesx.f
+++ b/lapack-netlib/SRC/cggesx.f
@@ -320,7 +320,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup ggesx
 *
 *  =====================================================================
       SUBROUTINE CGGESX( JOBVSL, JOBVSR, SORT, SELCTG, SENSE, N, A, LDA,
@@ -373,14 +373,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY,
-     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -476,7 +475,7 @@
             MAXWRK = 1
             LWRK   = 1
          END IF
-         WORK( 1 ) = LWRK
+         WORK( 1 ) = SROUNDUP_LWORK(LWRK)
          IF( WANTSN .OR. N.EQ.0 ) THEN
             LIWMIN = 1
          ELSE
@@ -510,7 +509,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -705,7 +703,7 @@
 *
    40 CONTINUE
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/cggev.f b/lapack-netlib/SRC/cggev.f
index c1c28a180..cf16e3079 100644
--- a/lapack-netlib/SRC/cggev.f
+++ b/lapack-netlib/SRC/cggev.f
@@ -209,7 +209,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup ggev
 *
 *  =====================================================================
       SUBROUTINE CGGEV( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA,
@@ -254,14 +254,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY,
-     $                   CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, MAX, REAL, SQRT
@@ -336,7 +335,7 @@
             LWKOPT = MAX( LWKOPT, N +
      $                 N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, -1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -15
@@ -359,7 +358,6 @@
       EPS = SLAMCH( 'E' )*SLAMCH( 'B' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -547,7 +545,7 @@
       IF( ILBSCL )
      $   CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CGGEV
diff --git a/lapack-netlib/SRC/cggevx.f b/lapack-netlib/SRC/cggevx.f
index 405c9c3b5..fa4e92682 100644
--- a/lapack-netlib/SRC/cggevx.f
+++ b/lapack-netlib/SRC/cggevx.f
@@ -335,7 +335,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup ggevx
 *
 *> \par Further Details:
 *  =====================
@@ -416,13 +416,13 @@
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY,
      $                   CLASCL, CLASET, CTGEVC, CTGSNA, CUNGQR, CUNMQR,
-     $                   SLABAD, SLASCL, XERBLA
+     $                   SLASCL, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, MAX, REAL, SQRT
@@ -521,7 +521,7 @@
      $                       N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, 0 ) )
             END IF
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -25
@@ -545,7 +545,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -793,7 +792,7 @@
       IF( ILBSCL )
      $   CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGGEVX
diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f
index fb384b651..0d36deca6 100644
--- a/lapack-netlib/SRC/cggglm.f
+++ b/lapack-netlib/SRC/cggglm.f
@@ -177,7 +177,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup ggglm
 *
 *  =====================================================================
       SUBROUTINE CGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK,
@@ -213,7 +213,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -252,7 +253,7 @@
             LWKMIN = M + N + P
             LWKOPT = M + NP + MAX( N, P )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
diff --git a/lapack-netlib/SRC/cgglse.f b/lapack-netlib/SRC/cgglse.f
index cca20dfed..b1c562385 100644
--- a/lapack-netlib/SRC/cgglse.f
+++ b/lapack-netlib/SRC/cgglse.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERsolve
+*> \ingroup gglse
 *
 *  =====================================================================
       SUBROUTINE CGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK,
@@ -207,7 +207,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -246,7 +247,7 @@
             LWKMIN = M + N + P
             LWKOPT = P + MN + MAX( M, N )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f
index 0185f4e0d..29b0bf4af 100644
--- a/lapack-netlib/SRC/cggqrf.f
+++ b/lapack-netlib/SRC/cggqrf.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ggqrf
 *
 *> \par Further Details:
 *  =====================
@@ -236,7 +236,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -251,7 +252,7 @@
       NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
       LWKOPT = MAX( N, M, P)*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f
index 5227100da..273ab3ef7 100644
--- a/lapack-netlib/SRC/cggrqf.f
+++ b/lapack-netlib/SRC/cggrqf.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ggrqf
 *
 *> \par Further Details:
 *  =====================
@@ -235,7 +235,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -250,7 +251,7 @@
       NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
       LWKOPT = MAX( N, M, P)*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
diff --git a/lapack-netlib/SRC/chbev_2stage.f b/lapack-netlib/SRC/chbev_2stage.f
index 123d84729..f84d8d3d4 100644
--- a/lapack-netlib/SRC/chbev_2stage.f
+++ b/lapack-netlib/SRC/chbev_2stage.f
@@ -132,7 +132,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS
@@ -171,7 +171,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hbev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -189,7 +189,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -197,11 +197,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -240,8 +240,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, CLANHB
-      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV2STAGE
+      REAL               SLAMCH, CLANHB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR,
@@ -276,7 +277,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             IB    = ILAENV2STAGE( 2, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, -1, -1 )
@@ -285,7 +286,7 @@
             LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          ENDIF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -351,7 +352,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
-     $                    RWORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    RWORK( INDE ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, call CSTEQR.
@@ -377,7 +378,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chbevd.f b/lapack-netlib/SRC/chbevd.f
index de33c9039..a5afe6b76 100644
--- a/lapack-netlib/SRC/chbevd.f
+++ b/lapack-netlib/SRC/chbevd.f
@@ -201,7 +201,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hbevd
 *
 *  =====================================================================
       SUBROUTINE CHBEVD( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, WORK,
@@ -239,8 +239,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANHB, SLAMCH
-      EXTERNAL           LSAME, CLANHB, SLAMCH
+      REAL               CLANHB, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANHB, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEMM, CHBTRD, CLACPY, CLASCL, CSTEDC, SSCAL,
@@ -288,7 +288,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -382,7 +382,7 @@
          CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
       RETURN
diff --git a/lapack-netlib/SRC/chbevx_2stage.f b/lapack-netlib/SRC/chbevx_2stage.f
index 22bced45f..1d609dfbd 100644
--- a/lapack-netlib/SRC/chbevx_2stage.f
+++ b/lapack-netlib/SRC/chbevx_2stage.f
@@ -22,7 +22,7 @@
 *
 *       SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
 *                                 Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
-*                                 Z, LDZ, WORK, LWORK, RWORK, IWORK, 
+*                                 Z, LDZ, WORK, LWORK, RWORK, IWORK,
 *                                 IFAIL, INFO )
 *
 *       IMPLICIT NONE
@@ -233,7 +233,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS
@@ -285,7 +285,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hbevx_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -303,7 +303,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -311,18 +311,18 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
 *  =====================================================================
       SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
      $                          Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
-     $                          Z, LDZ, WORK, LWORK, RWORK, IWORK, 
+     $                          Z, LDZ, WORK, LWORK, RWORK, IWORK,
      $                          IFAIL, INFO )
 *
       IMPLICIT NONE
@@ -367,8 +367,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, CLANHB
-      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV2STAGE
+      REAL               SLAMCH, CLANHB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CCOPY,
@@ -424,16 +425,16 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             IB    = ILAENV2STAGE( 2, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, -1, -1 )
-            LHTRD = ILAENV2STAGE( 3, 'CHETRD_HB2ST', JOBZ, 
+            LHTRD = ILAENV2STAGE( 3, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, IB, -1 )
-            LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ, 
+            LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          ENDIF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -637,7 +638,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chbgvd.f b/lapack-netlib/SRC/chbgvd.f
index 655006370..00fb2b5f5 100644
--- a/lapack-netlib/SRC/chbgvd.f
+++ b/lapack-netlib/SRC/chbgvd.f
@@ -232,7 +232,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hbgvd
 *
 *> \par Contributors:
 *  ==================
@@ -275,7 +275,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSTERF, XERBLA, CGEMM, CHBGST, CHBTRD, CLACPY,
@@ -322,7 +323,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -388,7 +389,7 @@
          CALL CLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
       RETURN
diff --git a/lapack-netlib/SRC/cheev.f b/lapack-netlib/SRC/cheev.f
index fb8e451df..60df7d8b8 100644
--- a/lapack-netlib/SRC/cheev.f
+++ b/lapack-netlib/SRC/cheev.f
@@ -132,7 +132,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heev
 *
 *  =====================================================================
       SUBROUTINE CHEEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK,
@@ -169,8 +169,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANHE, SLAMCH
-      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH
+      REAL               CLANHE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRD, CLASCL, CSTEQR, CUNGTR, SSCAL, SSTERF,
@@ -201,7 +201,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, ( NB+1 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, 2*N-1 ) .AND. .NOT.LQUERY )
      $      INFO = -8
@@ -286,7 +286,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cheev_2stage.f b/lapack-netlib/SRC/cheev_2stage.f
index fb7989d9f..4e1cecc64 100644
--- a/lapack-netlib/SRC/cheev_2stage.f
+++ b/lapack-netlib/SRC/cheev_2stage.f
@@ -106,12 +106,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -149,7 +149,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -167,7 +167,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -175,11 +175,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -220,8 +220,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, CLANHE
-      EXTERNAL           LSAME, SLAMCH, CLANHE, ILAENV2STAGE
+      REAL               SLAMCH, CLANHE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANHE, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR,
@@ -255,7 +256,7 @@
          LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
          LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
          LWMIN = N + LHTRD + LWTRD
-         WORK( 1 )  = LWMIN
+         WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
      $      INFO = -8
@@ -314,7 +315,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
-     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, first call
@@ -343,7 +344,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f
index dce0b2083..b5ca804eb 100644
--- a/lapack-netlib/SRC/cheevd.f
+++ b/lapack-netlib/SRC/cheevd.f
@@ -180,7 +180,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevd
 *
 *> \par Further Details:
 *  =====================
@@ -230,8 +230,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANHE, SLAMCH
-      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH
+      REAL               CLANHE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRD, CLACPY, CLASCL, CSTEDC, CUNMTR, SSCAL,
@@ -282,7 +282,7 @@
             LROPT = LRWMIN
             LIOPT = LIWMIN
          END IF
-         WORK( 1 ) = LOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LOPT)
          RWORK( 1 ) = LROPT
          IWORK( 1 ) = LIOPT
 *
@@ -378,7 +378,7 @@
          CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
       END IF
 *
-      WORK( 1 ) = LOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LOPT)
       RWORK( 1 ) = LROPT
       IWORK( 1 ) = LIOPT
 *
diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f
index b8854b182..05c5e66be 100644
--- a/lapack-netlib/SRC/cheevr.f
+++ b/lapack-netlib/SRC/cheevr.f
@@ -338,7 +338,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevr
 *
 *> \par Contributors:
 *  ==================
@@ -392,8 +392,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANSY, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANSY, SLAMCH
+      REAL               CLANSY, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANSY, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRD, CSSCAL, CSTEMR, CSTEIN, CSWAP, CUNMTR,
@@ -454,7 +454,7 @@
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) )
          LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -710,7 +710,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
 *
diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f
index 1cec902aa..e91599a44 100644
--- a/lapack-netlib/SRC/cheevx.f
+++ b/lapack-netlib/SRC/cheevx.f
@@ -250,7 +250,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevx
 *
 *  =====================================================================
       SUBROUTINE CHEEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU,
@@ -294,8 +294,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANHE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANHE
+      REAL               SLAMCH, CLANHE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANHE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
@@ -354,7 +354,7 @@
             NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
             NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) )
             LWKOPT = MAX( 1, ( NB + 1 )*N )
-            WORK( 1 ) = LWKOPT
+            WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          END IF
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
@@ -552,7 +552,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cheevx_2stage.f b/lapack-netlib/SRC/cheevx_2stage.f
index 04f6e30e8..70a681ec4 100644
--- a/lapack-netlib/SRC/cheevx_2stage.f
+++ b/lapack-netlib/SRC/cheevx_2stage.f
@@ -209,12 +209,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 8*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -265,7 +265,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevx_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -283,7 +283,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -291,11 +291,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -335,7 +335,7 @@
       CHARACTER          ORDER
       INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
      $                   INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE,
-     $                   ITMP1, J, JJ, LLWORK, 
+     $                   ITMP1, J, JJ, LLWORK,
      $                   NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
       REAL               ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
      $                   SIGMA, SMLNUM, TMP1, VLL, VUU
@@ -343,8 +343,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, CLANHE
-      EXTERNAL           LSAME, SLAMCH, CLANHE, ILAENV2STAGE
+      REAL               SLAMCH, CLANHE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANHE, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
@@ -397,7 +398,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             KD    = ILAENV2STAGE( 1, 'CHETRD_2STAGE', JOBZ,
      $                            N, -1, -1, -1 )
@@ -408,7 +409,7 @@
             LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = N + LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          END IF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -499,7 +500,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ),
-     $                    RWORK( INDE ), WORK( INDTAU ), 
+     $                    RWORK( INDE ), WORK( INDTAU ),
      $                    WORK( INDHOUS ), LHTRD, WORK( INDWRK ),
      $                    LLWORK, IINFO )
 *
@@ -610,7 +611,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chegv.f b/lapack-netlib/SRC/chegv.f
index 198e5d102..53f9d5196 100644
--- a/lapack-netlib/SRC/chegv.f
+++ b/lapack-netlib/SRC/chegv.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup hegv
 *
 *  =====================================================================
       SUBROUTINE CHEGV( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK,
@@ -206,7 +206,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHEEV, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA
@@ -240,7 +241,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, ( NB + 1 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, 2*N-1 ) .AND. .NOT.LQUERY ) THEN
             INFO = -11
@@ -309,7 +310,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chegv_2stage.f b/lapack-netlib/SRC/chegv_2stage.f
index d2b8fc795..8de1f7f06 100644
--- a/lapack-netlib/SRC/chegv_2stage.f
+++ b/lapack-netlib/SRC/chegv_2stage.f
@@ -144,12 +144,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -192,7 +192,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup hegv_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -210,7 +210,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -218,11 +218,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -259,7 +259,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHEGST, CPOTRF, CTRMM, CTRSM,
@@ -297,7 +298,7 @@
          LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
          LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
          LWMIN = N + LHTRD + LWTRD
-         WORK( 1 )  = LWMIN
+         WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -11
@@ -327,7 +328,7 @@
 *     Transform problem to standard eigenvalue problem and solve.
 *
       CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
-      CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, 
+      CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W,
      $                   WORK, LWORK, RWORK, INFO )
 *
       IF( WANTZ ) THEN
@@ -367,7 +368,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f
index 4edc36f2a..d2dc941e6 100644
--- a/lapack-netlib/SRC/chegvd.f
+++ b/lapack-netlib/SRC/chegvd.f
@@ -219,7 +219,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup hegvd
 *
 *> \par Further Details:
 *  =====================
@@ -268,7 +268,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHEEVD, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA
@@ -316,7 +317,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LOPT)
          RWORK( 1 ) = LROPT
          IWORK( 1 ) = LIOPT
 *
@@ -392,7 +393,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LOPT)
       RWORK( 1 ) = LROPT
       IWORK( 1 ) = LIOPT
 *
diff --git a/lapack-netlib/SRC/chegvx.f b/lapack-netlib/SRC/chegvx.f
index 8e565222d..172d0571e 100644
--- a/lapack-netlib/SRC/chegvx.f
+++ b/lapack-netlib/SRC/chegvx.f
@@ -293,7 +293,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup hegvx
 *
 *> \par Contributors:
 *  ==================
@@ -335,7 +335,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHEEVX, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA
@@ -390,7 +391,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, ( NB + 1 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -464,7 +465,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv.f b/lapack-netlib/SRC/chesv.f
index 238fb0a94..cea1235b7 100644
--- a/lapack-netlib/SRC/chesv.f
+++ b/lapack-netlib/SRC/chesv.f
@@ -163,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesv
 *
 *  =====================================================================
       SUBROUTINE CHESV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -191,7 +191,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHETRF, CHETRS, CHETRS2
@@ -226,7 +227,7 @@
             NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -259,7 +260,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f
index c9b97e09c..53ecc0a16 100644
--- a/lapack-netlib/SRC/chesv_aa.f
+++ b/lapack-netlib/SRC/chesv_aa.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE CHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -182,7 +182,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHETRF_AA, CHETRS_AA
@@ -217,7 +218,7 @@
      $                   -1, INFO )
          LWKOPT_HETRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -239,7 +240,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f
index 36970a329..12950c4af 100644
--- a/lapack-netlib/SRC/chesv_aa_2stage.f
+++ b/lapack-netlib/SRC/chesv_aa_2stage.f
@@ -177,7 +177,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE CHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -207,7 +207,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRF_AA_2STAGE, CHETRS_AA_2STAGE,
@@ -267,7 +268,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_rk.f b/lapack-netlib/SRC/chesv_rk.f
index e123fa299..268a55e23 100644
--- a/lapack-netlib/SRC/chesv_rk.f
+++ b/lapack-netlib/SRC/chesv_rk.f
@@ -205,7 +205,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesv_rk
 *
 *> \par Contributors:
 *  ==================
@@ -247,7 +247,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHETRF_RK, CHETRS_3
@@ -282,7 +283,7 @@
             CALL CHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -304,7 +305,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_rook.f b/lapack-netlib/SRC/chesv_rook.f
index 8e0b1a88f..2a0d3fdaf 100644
--- a/lapack-netlib/SRC/chesv_rook.f
+++ b/lapack-netlib/SRC/chesv_rook.f
@@ -184,7 +184,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesv_rook
 *>
 *> \verbatim
 *>
@@ -225,7 +225,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHETRF_ROOK, CHETRS_ROOK
@@ -260,7 +261,7 @@
             NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -283,7 +284,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f
index 6da49bdcf..c23a35ce7 100644
--- a/lapack-netlib/SRC/chesvx.f
+++ b/lapack-netlib/SRC/chesvx.f
@@ -276,7 +276,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE CHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -313,8 +313,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANHE, SLAMCH
-      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH
+      REAL               CLANHE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHECON, CHERFS, CHETRF, CHETRS, CLACPY, XERBLA
@@ -356,7 +356,7 @@
             NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -405,7 +405,7 @@
       IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
      $   INFO = N + 1
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F
index 30b01ed83..3688e40a3 100644
--- a/lapack-netlib/SRC/chetrd_hb2st.F
+++ b/lapack-netlib/SRC/chetrd_hb2st.F
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
 *                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *       #if defined(_OPENMP)
@@ -53,12 +53,12 @@
 *> \param[in] STAGE1
 *> \verbatim
 *>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
 *>                  from dense to band using the chetrd_he2hb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone. 
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the chetrd_he2hb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other term this routine is called as standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the chetrd_he2hb
 *>                  routine has been called to produce AB (e.g., AB is
 *>                  the output of chetrd_he2hb.
 *> \endverbatim
@@ -66,10 +66,10 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Housholder representation,
 *>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
 *>                  then LHOUS is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -147,7 +147,7 @@
 *>          message related to LHOUS is issued by XERBLA.
 *>          LHOUS = MAX(1, dimension) where
 *>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'     
+*>          not available now if VECT='H'
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -188,7 +188,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup hetrd_hb2st
 *
 *> \par Further Details:
 *  =====================
@@ -208,7 +208,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -216,16 +216,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+      SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
      $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *
@@ -259,11 +259,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
      $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SICEV, SIZETAU, LDV, LHMIN, LWMIN
       REAL               ABSTMP
@@ -277,8 +277,9 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
-      EXTERNAL           LSAME, ILAENV2STAGE
+      INTEGER            ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -318,7 +319,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          HOUS( 1 ) = LHMIN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -358,7 +359,7 @@
           ABDPOS   = KD + 1
           ABOFDPOS = KD
       ELSE
-          APOS     = INDA 
+          APOS     = INDA
           AWPOS    = INDA + KD + 1
           DPOS     = APOS
           OFDPOS   = DPOS + 1
@@ -366,11 +367,11 @@
           ABOFDPOS = 2
 
       ENDIF
-*      
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     complex because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     complex because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
 *     in a parallel environment it might need two cores for D and E
 *
       IF( KD.EQ.0 ) THEN
@@ -385,17 +386,17 @@
           WORK( 1 ) = 1
           RETURN
       END IF
-*      
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already Tridiagonal. We have to make diagonal
 *     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is  
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the hermetian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
 *     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-*      
+*     or not this might complicate the story.
+*
       IF( KD.EQ.1 ) THEN
           DO 50 I = 1, N
               D( I ) = REAL( AB( ABDPOS, I ) )
@@ -444,7 +445,7 @@ C                 END IF
           RETURN
       END IF
 *
-*     Main code start here. 
+*     Main code start here.
 *     Reduce the hermitian band of A to a tridiagonal matrix.
 *
       THGRSIZ   = N
@@ -453,7 +454,7 @@ C                 END IF
       NBTILES   = CEILING( REAL(N)/REAL(KD) )
       STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
       THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
-*      
+*
       CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
       CALL CLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
 *
@@ -462,7 +463,7 @@ C                 END IF
 *
 #if defined(_OPENMP)
 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
-!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
 !$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
 !$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
 !$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
@@ -471,7 +472,7 @@ C                 END IF
 #endif
 *
 *     main bulge chasing loop
-*      
+*
       DO 100 THGRID = 1, THGRNB
           STT  = (THGRID-1)*THGRSIZ+1
           THED = MIN( (STT + THGRSIZ -1), (N-1))
@@ -482,7 +483,7 @@ C                 END IF
                   ST = STT
                   DO 130 SWEEPID = ST, ED
                       DO 140 K = 1, GRSIZ
-                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
      $                           + (M-1)*GRSIZ + K
                           IF ( MYID.EQ.1 ) THEN
                               TTYPE = 1
@@ -508,16 +509,16 @@ C                 END IF
                           ENDIF
 *
 *                         Call the kernel
-*                             
+*
 #if defined(_OPENMP) && _OPENMP >= 201307
-                          IF( TTYPE.NE.1 ) THEN      
+                          IF( TTYPE.NE.1 ) THEN
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(in:WORK(MYID-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
@@ -525,20 +526,20 @@ C                 END IF
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
                           ENDIF
 #else
-                          CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                          CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
-     $                         WORK ( INDA ), LDA, 
+     $                         WORK ( INDA ), LDA,
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                         WORK( INDW ) )
-#endif 
+#endif
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1
                               EXIT
@@ -553,14 +554,14 @@ C                 END IF
 !$OMP END MASTER
 !$OMP END PARALLEL
 #endif
-*      
+*
 *     Copy the diagonal from A to D. Note that D is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
       DO 150 I = 1, N
           D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) )
   150 CONTINUE
-*      
+*
 *     Copy the off diagonal from A to E. Note that E is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
@@ -575,10 +576,10 @@ C                 END IF
       ENDIF
 *
       HOUS( 1 ) = LHMIN
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of CHETRD_HB2ST
 *
       END
-      
+

From 71fbdd908d59087e8d809dd123b83f68850ec122 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 14:10:16 +0100
Subject: [PATCH 404/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904)

---
 lapack-netlib/SRC/chetrd_he2hb.f     |  9 +++++----
 lapack-netlib/SRC/chetrf.f           |  9 +++++----
 lapack-netlib/SRC/chetrf_aa.f        |  9 +++++----
 lapack-netlib/SRC/chetrf_rk.f        |  9 +++++----
 lapack-netlib/SRC/chetrf_rook.f      |  9 +++++----
 lapack-netlib/SRC/chetri_3.f         |  9 +++++----
 lapack-netlib/SRC/chetrs_aa.f        |  7 ++++---
 lapack-netlib/SRC/chpevd.f           | 10 +++++-----
 lapack-netlib/SRC/chpgvd.f           |  9 +++++----
 lapack-netlib/SRC/chseqr.f           |  5 +++--
 lapack-netlib/SRC/clamswlq.f         | 11 +++++++----
 lapack-netlib/SRC/clamtsqr.f         |  9 ++++++---
 lapack-netlib/SRC/claswlq.f          | 14 +++++++-------
 lapack-netlib/SRC/clatsqr.f          |  9 ++++++---
 lapack-netlib/SRC/cstedc.f           | 10 +++++-----
 lapack-netlib/SRC/cstemr.f           |  8 ++++----
 lapack-netlib/SRC/csysv.f            |  9 +++++----
 lapack-netlib/SRC/csysv_aa.f         |  9 +++++----
 lapack-netlib/SRC/csysv_aa_2stage.f  |  7 ++++---
 lapack-netlib/SRC/csysv_rk.f         |  9 +++++----
 lapack-netlib/SRC/csysv_rook.f       |  9 +++++----
 lapack-netlib/SRC/csysvx.f           | 10 +++++-----
 lapack-netlib/SRC/csytrf.f           |  9 +++++----
 lapack-netlib/SRC/csytrf_aa.f        |  9 +++++----
 lapack-netlib/SRC/csytrf_aa_2stage.f |  7 ++++---
 lapack-netlib/SRC/csytrf_rk.f        |  9 +++++----
 lapack-netlib/SRC/csytrf_rook.f      |  9 +++++----
 lapack-netlib/SRC/csytri_3.f         |  9 +++++----
 lapack-netlib/SRC/csytrs_aa.f        |  7 ++++---
 lapack-netlib/SRC/ctgsen.f           | 10 +++++++---
 lapack-netlib/SRC/ctgsna.f           | 14 +++++++-------
 lapack-netlib/SRC/ctgsyl.f           |  9 +++++----
 lapack-netlib/SRC/ctrevc3.f          | 12 ++++++------
 lapack-netlib/SRC/ctrsen.f           | 10 +++++-----
 lapack-netlib/SRC/ctzrzf.f           |  9 +++++----
 lapack-netlib/SRC/cunbdb.f           |  8 ++++----
 lapack-netlib/SRC/cunbdb1.f          |  8 ++++----
 lapack-netlib/SRC/cunbdb2.f          |  8 ++++----
 lapack-netlib/SRC/cunbdb3.f          |  8 ++++----
 lapack-netlib/SRC/cunbdb4.f          |  8 ++++----
 lapack-netlib/SRC/cuncsd.f           |  8 +++++---
 lapack-netlib/SRC/cuncsd2by1.f       |  7 ++++---
 lapack-netlib/SRC/cungbr.f           |  9 +++++----
 lapack-netlib/SRC/cunghr.f           |  9 +++++----
 lapack-netlib/SRC/cunglq.f           |  9 +++++----
 lapack-netlib/SRC/cungql.f           |  7 ++++---
 lapack-netlib/SRC/cungqr.f           |  9 +++++----
 lapack-netlib/SRC/cungrq.f           |  9 +++++----
 lapack-netlib/SRC/cungtr.f           |  9 +++++----
 lapack-netlib/SRC/cunmbr.f           |  9 +++++----
 lapack-netlib/SRC/cunmhr.f           |  9 +++++----
 lapack-netlib/SRC/cunmlq.f           |  9 +++++----
 lapack-netlib/SRC/cunmql.f           |  9 +++++----
 lapack-netlib/SRC/cunmqr.f           |  9 +++++----
 lapack-netlib/SRC/cunmrq.f           |  9 +++++----
 lapack-netlib/SRC/cunmrz.f           |  9 +++++----
 lapack-netlib/SRC/cunmtr.f           |  9 +++++----
 57 files changed, 282 insertions(+), 228 deletions(-)

diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f
index 904555c10..090f02100 100644
--- a/lapack-netlib/SRC/chetrd_he2hb.f
+++ b/lapack-netlib/SRC/chetrd_he2hb.f
@@ -158,7 +158,7 @@
 *> \author Univ. of Colorado Denver 
 *> \author NAG Ltd. 
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrd_he2hb
 *
 *> \par Further Details:
 *  =====================
@@ -283,7 +283,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE 
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -313,7 +314,7 @@
          CALL XERBLA( 'CHETRD_HE2HB', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RETURN
       END IF
 *
@@ -506,7 +507,7 @@
 
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of CHETRD_HE2HB
diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f
index 484e76256..0c596ffe7 100644
--- a/lapack-netlib/SRC/chetrf.f
+++ b/lapack-netlib/SRC/chetrf.f
@@ -130,7 +130,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -197,7 +197,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETF2, CLAHEF, XERBLA
@@ -228,7 +229,7 @@
 *
          NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
          LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -346,7 +347,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CHETRF
diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f
index d9e4fbd19..0547a4eab 100644
--- a/lapack-netlib/SRC/chetrf_aa.f
+++ b/lapack-netlib/SRC/chetrf_aa.f
@@ -125,7 +125,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
       SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
@@ -159,7 +159,8 @@
 *     .. External Functions ..
       LOGICAL      LSAME
       INTEGER      ILAENV
-      EXTERNAL     LSAME, ILAENV
+      REAL         SROUNDUP_LWORK
+      EXTERNAL     LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL     CLAHEF_AA, CGEMM, CCOPY, CSWAP, CSCAL, XERBLA
@@ -190,7 +191,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          LWKOPT = (NB+1)*N
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -459,7 +460,7 @@
       END IF
 *
    20 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CHETRF_AA
diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f
index 3f60f4370..ef442c937 100644
--- a/lapack-netlib/SRC/chetrf_rk.f
+++ b/lapack-netlib/SRC/chetrf_rk.f
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -280,7 +280,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLAHEF_RK, CHETF2_RK, CSWAP, XERBLA
@@ -311,7 +312,7 @@
 *
          NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 )
          LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -487,7 +488,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CHETRF_RK
diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f
index 805e0f4cb..1593c2edc 100644
--- a/lapack-netlib/SRC/chetrf_rook.f
+++ b/lapack-netlib/SRC/chetrf_rook.f
@@ -150,7 +150,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
@@ -232,7 +232,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLAHEF_ROOK, CHETF2_ROOK, XERBLA
@@ -263,7 +264,7 @@
 *
          NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -386,7 +387,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CHETRF_ROOK
diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f
index cc2318b22..deda63598 100644
--- a/lapack-netlib/SRC/chetri_3.f
+++ b/lapack-netlib/SRC/chetri_3.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetri_3
 *
 *> \par Contributors:
 *  ==================
@@ -190,7 +190,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRI_3X, XERBLA
@@ -225,7 +226,7 @@
          CALL XERBLA( 'CHETRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -236,7 +237,7 @@
 *
       CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f
index 2546adb2d..879549106 100644
--- a/lapack-netlib/SRC/chetrs_aa.f
+++ b/lapack-netlib/SRC/chetrs_aa.f
@@ -123,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -155,7 +155,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME,SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA
@@ -186,7 +187,7 @@
          RETURN
       ELSE IF( LQUERY ) THEN
          LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/chpevd.f b/lapack-netlib/SRC/chpevd.f
index 06d01064d..2449783a2 100644
--- a/lapack-netlib/SRC/chpevd.f
+++ b/lapack-netlib/SRC/chpevd.f
@@ -186,7 +186,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hpevd
 *
 *  =====================================================================
       SUBROUTINE CHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK,
@@ -223,8 +223,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANHP, SLAMCH
-      EXTERNAL           LSAME, CLANHP, SLAMCH
+      REAL               CLANHP, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANHP, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHPTRD, CSSCAL, CSTEDC, CUPMTR, SSCAL, SSTERF,
@@ -268,7 +268,7 @@
                LIWMIN = 1
             END IF
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -359,7 +359,7 @@
          CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
       RETURN
diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f
index c24ca1360..57ac4fc72 100644
--- a/lapack-netlib/SRC/chpgvd.f
+++ b/lapack-netlib/SRC/chpgvd.f
@@ -212,7 +212,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hpgvd
 *
 *> \par Contributors:
 *  ==================
@@ -246,7 +246,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHPEVD, CHPGST, CPPTRF, CTPMV, CTPSV, XERBLA
@@ -292,7 +293,7 @@
             END IF
          END IF
 *
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -374,7 +375,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
       RETURN
diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f
index 007f72f59..56ff01fc6 100644
--- a/lapack-netlib/SRC/chseqr.f
+++ b/lapack-netlib/SRC/chseqr.f
@@ -216,7 +216,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup hseqr
 *
 *> \par Contributors:
 *  ==================
@@ -343,7 +343,8 @@
 *     .. External Functions ..
       INTEGER            ILAENV
       LOGICAL            LSAME
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CLACPY, CLAHQR, CLAQR0, CLASET, XERBLA
diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f
index 1606cc611..5daf60bf6 100644
--- a/lapack-netlib/SRC/clamswlq.f
+++ b/lapack-netlib/SRC/clamswlq.f
@@ -189,6 +189,8 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamswlq
+*>
 *  =====================================================================
       SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
      $    LDT, C, LDC, WORK, LWORK, INFO )
@@ -215,7 +217,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     .. External Subroutines ..
       EXTERNAL    CTPMLQT, CGEMLQT, XERBLA
 *     ..
@@ -259,10 +262,10 @@
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLAMSWLQ', -INFO )
-        WORK(1) = LW
+        WORK(1) = SROUNDUP_LWORK(LW)
         RETURN
       ELSE IF (LQUERY) THEN
-        WORK(1) = LW
+        WORK(1) = SROUNDUP_LWORK(LW)
         RETURN
       END IF
 *
@@ -401,7 +404,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK(1) = SROUNDUP_LWORK(LW)
       RETURN
 *
 *     End of CLAMSWLQ
diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f
index 5677420ac..05021e642 100644
--- a/lapack-netlib/SRC/clamtsqr.f
+++ b/lapack-netlib/SRC/clamtsqr.f
@@ -191,6 +191,8 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamtsqr
+*>
 *  =====================================================================
       SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
      $        LDT, C, LDC, WORK, LWORK, INFO )
@@ -217,7 +219,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     .. External Subroutines ..
       EXTERNAL   CGEMQRT, CTPMQRT, XERBLA
 *     ..
@@ -264,7 +267,7 @@
 *     Determine the block size if it is tall skinny or short and wide
 *
       IF( INFO.EQ.0)  THEN
-          WORK(1) = LW
+          WORK(1) = SROUNDUP_LWORK(LW)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -409,7 +412,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK(1) = SROUNDUP_LWORK(LW)
       RETURN
 *
 *     End of CLAMTSQR
diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f
index 1a09b8305..12e8373df 100644
--- a/lapack-netlib/SRC/claswlq.f
+++ b/lapack-netlib/SRC/claswlq.f
@@ -159,6 +159,8 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup laswlq
+*>
 *  =====================================================================
       SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
      $                  INFO)
@@ -183,16 +185,14 @@
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      INTEGER            ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           CGELQT, CTPLQT, XERBLA
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
-*     .. EXTERNAL FUNCTIONS ..
-      INTEGER            ILAENV
-      EXTERNAL           ILAENV
-*     ..
 *     .. EXECUTABLE STATEMENTS ..
 *
 *     TEST THE INPUT ARGUMENTS
@@ -217,7 +217,7 @@
         INFO = -10
       END IF
       IF( INFO.EQ.0)  THEN
-      WORK(1) = MB*M
+      WORK(1) = SROUNDUP_LWORK(MB*M)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -266,7 +266,7 @@
      $                  WORK, INFO )
        END IF
 *
-      WORK( 1 ) = M * MB
+      WORK( 1 ) = SROUNDUP_LWORK(M * MB)
       RETURN
 *
 *     End of CLASWLQ
diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f
index 377190081..cd2cb4aa7 100644
--- a/lapack-netlib/SRC/clatsqr.f
+++ b/lapack-netlib/SRC/clatsqr.f
@@ -161,6 +161,8 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup latsqr
+*>
 *  =====================================================================
       SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
      $                    LWORK, INFO)
@@ -185,7 +187,8 @@
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL    CGEQRT, CTPQRT, XERBLA
 *     .. INTRINSIC FUNCTIONS ..
@@ -215,7 +218,7 @@
         INFO = -10
       END IF
       IF( INFO.EQ.0)  THEN
-        WORK(1) = NB*N
+        WORK(1) = SROUNDUP_LWORK(NB*N)
       END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLATSQR', -INFO )
@@ -262,7 +265,7 @@
      $                  WORK, INFO )
        END IF
 *
-      work( 1 ) = N*NB
+      WORK( 1 ) = SROUNDUP_LWORK(N*NB)
       RETURN
 *
 *     End of CLATSQR
diff --git a/lapack-netlib/SRC/cstedc.f b/lapack-netlib/SRC/cstedc.f
index 77a4ec3be..d7db591b3 100644
--- a/lapack-netlib/SRC/cstedc.f
+++ b/lapack-netlib/SRC/cstedc.f
@@ -192,7 +192,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup stedc
 *
 *> \par Contributors:
 *  ==================
@@ -233,8 +233,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANST
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CLACPY, CLACRM, CLAED0, CSTEQR, CSWAP,
@@ -295,7 +295,7 @@
             LRWMIN = 1 + 4*N + 2*N**2
             LIWMIN = 3 + 5*N
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -466,7 +466,7 @@
       END IF
 *
    70 CONTINUE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
 *
diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f
index 9d47450e3..46b20d880 100644
--- a/lapack-netlib/SRC/cstemr.f
+++ b/lapack-netlib/SRC/cstemr.f
@@ -376,8 +376,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANST
-      EXTERNAL           LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARRV, CSWAP, SCOPY, SLAE2, SLAEV2, SLARRC,
@@ -462,7 +462,7 @@
       RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( WANTZ .AND. ALLEIG ) THEN
@@ -801,7 +801,7 @@
       ENDIF
 *
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv.f b/lapack-netlib/SRC/csysv.f
index 4ddabf62f..a2d1e7cbe 100644
--- a/lapack-netlib/SRC/csysv.f
+++ b/lapack-netlib/SRC/csysv.f
@@ -163,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesv
 *
 *  =====================================================================
       SUBROUTINE CSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -190,7 +190,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CSYTRF, CSYTRS, CSYTRS2
@@ -225,7 +226,7 @@
             CALL CSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -258,7 +259,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f
index 8548c2789..571a91123 100644
--- a/lapack-netlib/SRC/csysv_aa.f
+++ b/lapack-netlib/SRC/csysv_aa.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE CSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -182,7 +182,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CSYTRF_AA, CSYTRS_AA
@@ -217,7 +218,7 @@
      $                   -1, INFO )
          LWKOPT_SYTRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -239,7 +240,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f
index 22227505c..10119d8ba 100644
--- a/lapack-netlib/SRC/csysv_aa_2stage.f
+++ b/lapack-netlib/SRC/csysv_aa_2stage.f
@@ -177,7 +177,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE CSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -207,7 +207,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CSYTRF_AA_2STAGE, 
@@ -267,7 +268,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv_rk.f b/lapack-netlib/SRC/csysv_rk.f
index ef5334dcd..cb98ab1dc 100644
--- a/lapack-netlib/SRC/csysv_rk.f
+++ b/lapack-netlib/SRC/csysv_rk.f
@@ -205,7 +205,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesv_rk
 *
 *> \par Contributors:
 *  ==================
@@ -247,7 +247,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CSYTRF_RK, CSYTRS_3
@@ -282,7 +283,7 @@
             CALL CSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -304,7 +305,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv_rook.f b/lapack-netlib/SRC/csysv_rook.f
index aad594e21..8798ddfb2 100644
--- a/lapack-netlib/SRC/csysv_rook.f
+++ b/lapack-netlib/SRC/csysv_rook.f
@@ -181,7 +181,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesv_rook
 *
 *> \par Contributors:
 *  ==================
@@ -223,7 +223,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CSYTRF_ROOK, CSYTRS_ROOK
@@ -258,7 +259,7 @@
             CALL CSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -281,7 +282,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysvx.f b/lapack-netlib/SRC/csysvx.f
index 2afa082a9..3c7a37889 100644
--- a/lapack-netlib/SRC/csysvx.f
+++ b/lapack-netlib/SRC/csysvx.f
@@ -276,7 +276,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE CSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -313,8 +313,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANSY, SLAMCH
-      EXTERNAL           ILAENV, LSAME, CLANSY, SLAMCH
+      REAL               CLANSY, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, CLANSY, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACPY, CSYCON, CSYRFS, CSYTRF, CSYTRS, XERBLA
@@ -356,7 +356,7 @@
             NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -405,7 +405,7 @@
       IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
      $   INFO = N + 1
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f
index 951196b83..519e78490 100644
--- a/lapack-netlib/SRC/csytrf.f
+++ b/lapack-netlib/SRC/csytrf.f
@@ -135,7 +135,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +202,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLASYF, CSYTF2, XERBLA
@@ -233,7 +234,7 @@
 *
          NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -351,7 +352,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CSYTRF
diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f
index c5467bf01..cf994913d 100644
--- a/lapack-netlib/SRC/csytrf_aa.f
+++ b/lapack-netlib/SRC/csytrf_aa.f
@@ -125,7 +125,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
       SUBROUTINE CSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
@@ -159,7 +159,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLASYF_AA, CGEMM, CGEMV, CSCAL, CSWAP, CCOPY,
@@ -191,7 +192,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          LWKOPT = (NB+1)*N
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -457,7 +458,7 @@
       END IF
 *
    20 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CSYTRF_AA
diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f
index b21df8cd3..e56aedaf6 100644
--- a/lapack-netlib/SRC/csytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/csytrf_aa_2stage.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -188,7 +188,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CGBTRF, CGEMM, CGETRF, CLACPY,  
@@ -230,7 +231,7 @@
             TB( 1 ) = (3*NB+1)*N
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = SROUNDUP_LWORK(N*NB)
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
diff --git a/lapack-netlib/SRC/csytrf_rk.f b/lapack-netlib/SRC/csytrf_rk.f
index 996801e7d..de39bda41 100644
--- a/lapack-netlib/SRC/csytrf_rk.f
+++ b/lapack-netlib/SRC/csytrf_rk.f
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -280,7 +280,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLASYF_RK, CSYTF2_RK, CSWAP, XERBLA
@@ -311,7 +312,7 @@
 *
          NB = ILAENV( 1, 'CSYTRF_RK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -487,7 +488,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CSYTRF_RK
diff --git a/lapack-netlib/SRC/csytrf_rook.f b/lapack-netlib/SRC/csytrf_rook.f
index ce7c1e586..72fe0629f 100644
--- a/lapack-netlib/SRC/csytrf_rook.f
+++ b/lapack-netlib/SRC/csytrf_rook.f
@@ -146,7 +146,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
@@ -228,7 +228,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLASYF_ROOK, CSYTF2_ROOK, XERBLA
@@ -259,7 +260,7 @@
 *
          NB = ILAENV( 1, 'CSYTRF_ROOK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -382,7 +383,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CSYTRF_ROOK
diff --git a/lapack-netlib/SRC/csytri_3.f b/lapack-netlib/SRC/csytri_3.f
index 279f62853..604d84b21 100644
--- a/lapack-netlib/SRC/csytri_3.f
+++ b/lapack-netlib/SRC/csytri_3.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetri_3
 *
 *> \par Contributors:
 *  ==================
@@ -190,7 +190,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CSYTRI_3X, XERBLA
@@ -225,7 +226,7 @@
          CALL XERBLA( 'CSYTRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -236,7 +237,7 @@
 *
       CALL CSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csytrs_aa.f b/lapack-netlib/SRC/csytrs_aa.f
index 1f6ea40af..7f63539a6 100644
--- a/lapack-netlib/SRC/csytrs_aa.f
+++ b/lapack-netlib/SRC/csytrs_aa.f
@@ -123,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE CSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -155,7 +155,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACPY, CGTSV, CSWAP, CTRSM, XERBLA
@@ -186,7 +187,7 @@
          RETURN
       ELSE IF( LQUERY ) THEN
          LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/ctgsen.f b/lapack-netlib/SRC/ctgsen.f
index ffd638099..180e96b32 100644
--- a/lapack-netlib/SRC/ctgsen.f
+++ b/lapack-netlib/SRC/ctgsen.f
@@ -290,7 +290,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup tgsen
 *
 *> \par Further Details:
 *  =====================
@@ -467,6 +467,10 @@
 *     .. Local Arrays ..
       INTEGER            ISAVE( 3 )
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       REAL               SLAMCH
       EXTERNAL           CLACN2, CLACPY, CLASSQ, CSCAL, CTGEXC, CTGSYL,
@@ -537,7 +541,7 @@
          LIWMIN = 1
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) =  SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -771,7 +775,7 @@
 *
    70 CONTINUE
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) =  SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ctgsna.f b/lapack-netlib/SRC/ctgsna.f
index 2295dc5cc..50498c413 100644
--- a/lapack-netlib/SRC/ctgsna.f
+++ b/lapack-netlib/SRC/ctgsna.f
@@ -213,7 +213,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup tgsna
 *
 *> \par Further Details:
 *  =====================
@@ -343,12 +343,13 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SCNRM2, SLAMCH, SLAPY2
+      REAL               SCNRM2, SLAMCH, SLAPY2, SROUNDUP_LWORK
       COMPLEX            CDOTC
-      EXTERNAL           LSAME, SCNRM2, SLAMCH, SLAPY2, CDOTC
+      EXTERNAL           LSAME, SCNRM2, SLAMCH, SLAPY2, SROUNDUP_LWORK,
+     $                   CDOTC
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGEMV, CLACPY, CTGEXC, CTGSYL, SLABAD, XERBLA
+      EXTERNAL           CGEMV, CLACPY, CTGEXC, CTGSYL, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, CMPLX, MAX
@@ -402,7 +403,7 @@
          ELSE
             LWMIN = N
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( MM.LT.M ) THEN
             INFO = -15
@@ -428,7 +429,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' ) / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       KS = 0
       DO 20 K = 1, N
 *
@@ -508,7 +508,7 @@
          END IF
 *
    20 CONTINUE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of CTGSNA
diff --git a/lapack-netlib/SRC/ctgsyl.f b/lapack-netlib/SRC/ctgsyl.f
index ae1437125..620556399 100644
--- a/lapack-netlib/SRC/ctgsyl.f
+++ b/lapack-netlib/SRC/ctgsyl.f
@@ -260,7 +260,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup tgsyl
 *
 *> \par Contributors:
 *  ==================
@@ -329,7 +329,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEMM, CLACPY, CLASET, CSCAL, CTGSY2, XERBLA
@@ -382,7 +383,7 @@
          ELSE
             LWMIN = 1
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -683,7 +684,7 @@
   210    CONTINUE
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f
index 11b32104d..13cbf553f 100644
--- a/lapack-netlib/SRC/ctrevc3.f
+++ b/lapack-netlib/SRC/ctrevc3.f
@@ -222,7 +222,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup trevc3
 *
 *> \par Further Details:
 *  =====================
@@ -278,12 +278,13 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV, ICAMAX
-      REAL               SLAMCH, SCASUM
-      EXTERNAL           LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM
+      REAL               SLAMCH, SCASUM, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CCOPY, CLASET, CSSCAL, CGEMM, CGEMV,
-     $                   CLATRS, CLACPY, SLABAD
+     $                   CLATRS, CLACPY
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, CMPLX, CONJG, AIMAG, MAX
@@ -322,7 +323,7 @@
       INFO = 0
       NB = ILAENV( 1, 'CTREVC', SIDE // HOWMNY, N, -1, -1, -1 )
       MAXWRK = MAX( 1, N + 2*N*NB )
-      WORK(1) = MAXWRK
+      WORK(1) = SROUNDUP_LWORK(MAXWRK)
       RWORK(1) = MAX( 1, N )
       LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
@@ -371,7 +372,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Precision' )
       SMLNUM = UNFL*( N / ULP )
 *
diff --git a/lapack-netlib/SRC/ctrsen.f b/lapack-netlib/SRC/ctrsen.f
index d93b97be6..9d59f6bf2 100644
--- a/lapack-netlib/SRC/ctrsen.f
+++ b/lapack-netlib/SRC/ctrsen.f
@@ -182,7 +182,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup trsen
 *
 *> \par Further Details:
 *  =====================
@@ -293,8 +293,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANGE
-      EXTERNAL           LSAME, CLANGE
+      REAL               CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACN2, CLACPY, CTREXC, CTRSYL, XERBLA
@@ -350,7 +350,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -444,7 +444,7 @@
          W( K ) = T( K, K )
    50 CONTINUE
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ctzrzf.f b/lapack-netlib/SRC/ctzrzf.f
index b21f092ce..ac3f59400 100644
--- a/lapack-netlib/SRC/ctzrzf.f
+++ b/lapack-netlib/SRC/ctzrzf.f
@@ -116,7 +116,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup tzrzf
 *
 *> \par Contributors:
 *  ==================
@@ -179,7 +179,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -207,7 +208,7 @@
             LWKOPT = M*NB
             LWKMIN = MAX( 1, M )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -7
@@ -301,7 +302,7 @@
       IF( MU.GT.0 )
      $   CALL CLATRZ( MU, N, N-M, A, LDA, TAU, WORK )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cunbdb.f b/lapack-netlib/SRC/cunbdb.f
index a41895dc8..b45dcfde6 100644
--- a/lapack-netlib/SRC/cunbdb.f
+++ b/lapack-netlib/SRC/cunbdb.f
@@ -255,7 +255,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb
 *
 *> \par Further Details:
 *  =====================
@@ -320,9 +320,9 @@
 *
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
       LOGICAL            LSAME
-      EXTERNAL           SCNRM2, LSAME
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK, LSAME
 *     ..
 *     .. Intrinsic Functions
       INTRINSIC          ATAN2, COS, MAX, MIN, SIN
@@ -377,7 +377,7 @@
       IF( INFO .EQ. 0 ) THEN
          LWORKOPT = M - Q
          LWORKMIN = M - Q
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT. LQUERY ) THEN
             INFO = -21
          END IF
diff --git a/lapack-netlib/SRC/cunbdb1.f b/lapack-netlib/SRC/cunbdb1.f
index 80faa8808..a4875ab5b 100644
--- a/lapack-netlib/SRC/cunbdb1.f
+++ b/lapack-netlib/SRC/cunbdb1.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb1
 *
 *> \par Further Details:
 *  =====================
@@ -230,8 +230,8 @@
       EXTERNAL           CLACGV
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          ATAN2, COS, MAX, SIN, SQRT
@@ -264,7 +264,7 @@
          LORBDB5 = Q-2
          LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 )
          LWORKMIN = LWORKOPT
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
            INFO = -14
          END IF
diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f
index 94b9fdbf9..6399964f8 100644
--- a/lapack-netlib/SRC/cunbdb2.f
+++ b/lapack-netlib/SRC/cunbdb2.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb2
 *
 *> \par Further Details:
 *  =====================
@@ -231,8 +231,8 @@
      $                   XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          ATAN2, COS, MAX, SIN, SQRT
@@ -265,7 +265,7 @@
          LORBDB5 = Q-1
          LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 )
          LWORKMIN = LWORKOPT
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
            INFO = -14
          END IF
diff --git a/lapack-netlib/SRC/cunbdb3.f b/lapack-netlib/SRC/cunbdb3.f
index f942bc698..d02460597 100644
--- a/lapack-netlib/SRC/cunbdb3.f
+++ b/lapack-netlib/SRC/cunbdb3.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb3
 *
 *> \par Further Details:
 *  =====================
@@ -229,8 +229,8 @@
       EXTERNAL           CLARF, CLARFGP, CUNBDB5, CSROT, CLACGV, XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          ATAN2, COS, MAX, SIN, SQRT
@@ -263,7 +263,7 @@
          LORBDB5 = Q-1
          LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 )
          LWORKMIN = LWORKOPT
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
            INFO = -14
          END IF
diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f
index a551c184e..33acc1ee5 100644
--- a/lapack-netlib/SRC/cunbdb4.f
+++ b/lapack-netlib/SRC/cunbdb4.f
@@ -183,7 +183,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb4
 *
 *> \par Further Details:
 *  =====================
@@ -242,8 +242,8 @@
      $                   XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          ATAN2, COS, MAX, SIN, SQRT
@@ -277,7 +277,7 @@
          LWORKOPT = ILARF + LLARF - 1
          LWORKOPT = MAX( LWORKOPT, IORBDB5 + LORBDB5 - 1 )
          LWORKMIN = LWORKOPT
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
            INFO = -14
          END IF
diff --git a/lapack-netlib/SRC/cuncsd.f b/lapack-netlib/SRC/cuncsd.f
index 3653a396a..003daaab4 100644
--- a/lapack-netlib/SRC/cuncsd.f
+++ b/lapack-netlib/SRC/cuncsd.f
@@ -308,7 +308,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup uncsd
 *
 *  =====================================================================
       RECURSIVE SUBROUTINE CUNCSD( JOBU1, JOBU2, JOBV1T, JOBV2T, TRANS,
@@ -365,7 +365,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions
       INTRINSIC          INT, MAX, MIN
@@ -504,7 +505,8 @@
      $              IORBDB + LORBDBWORKOPT ) - 1
          LWORKMIN = MAX( IORGQR + LORGQRWORKMIN, IORGLQ + LORGLQWORKMIN,
      $              IORBDB + LORBDBWORKMIN ) - 1
-         WORK(1) = MAX(LWORKOPT,LWORKMIN)
+         LWORKOPT = MAX(LWORKOPT,LWORKMIN)
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
 *
          IF( LWORK .LT. LWORKMIN
      $       .AND. .NOT. ( LQUERY .OR. LRQUERY ) ) THEN
diff --git a/lapack-netlib/SRC/cuncsd2by1.f b/lapack-netlib/SRC/cuncsd2by1.f
index f0c44f670..128e82cec 100644
--- a/lapack-netlib/SRC/cuncsd2by1.f
+++ b/lapack-netlib/SRC/cuncsd2by1.f
@@ -247,7 +247,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup uncsd2by1
 *
 *  =====================================================================
       SUBROUTINE CUNCSD2BY1( JOBU1, JOBU2, JOBV1T, M, P, Q, X11, LDX11,
@@ -299,7 +299,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          INT, MAX, MIN
@@ -508,7 +509,7 @@
          LWORKOPT = MAX( IORBDB+LORBDB-1,
      $                   IORGQR+LORGQROPT-1,
      $                   IORGLQ+LORGLQOPT-1 )
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -19
          END IF
diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f
index a31a53d79..2f0208fdb 100644
--- a/lapack-netlib/SRC/cungbr.f
+++ b/lapack-netlib/SRC/cungbr.f
@@ -150,7 +150,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGBcomputational
+*> \ingroup ungbr
 *
 *  =====================================================================
       SUBROUTINE CUNGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -180,7 +180,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNGLQ, CUNGQR, XERBLA
@@ -241,7 +242,7 @@
          CALL XERBLA( 'CUNGBR', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -327,7 +328,7 @@
             END IF
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNGBR
diff --git a/lapack-netlib/SRC/cunghr.f b/lapack-netlib/SRC/cunghr.f
index 4f8a0a263..3aa3fb1ae 100644
--- a/lapack-netlib/SRC/cunghr.f
+++ b/lapack-netlib/SRC/cunghr.f
@@ -119,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unghr
 *
 *  =====================================================================
       SUBROUTINE CUNGHR( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO )
@@ -151,7 +151,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -178,7 +179,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'CUNGQR', ' ', NH, NH, NH, -1 )
          LWKOPT = MAX( 1, NH )*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -230,7 +231,7 @@
          CALL CUNGQR( NH, NH, NH, A( ILO+1, ILO+1 ), LDA, TAU( ILO ),
      $                WORK, LWORK, IINFO )
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNGHR
diff --git a/lapack-netlib/SRC/cunglq.f b/lapack-netlib/SRC/cunglq.f
index e250e036c..353715054 100644
--- a/lapack-netlib/SRC/cunglq.f
+++ b/lapack-netlib/SRC/cunglq.f
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unglq
 *
 *  =====================================================================
       SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -155,7 +155,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -164,7 +165,7 @@
       INFO = 0
       NB = ILAENV( 1, 'CUNGLQ', ' ', M, N, K, -1 )
       LWKOPT = MAX( 1, M )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -278,7 +279,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CUNGLQ
diff --git a/lapack-netlib/SRC/cungql.f b/lapack-netlib/SRC/cungql.f
index d3b812a62..ed2f6803c 100644
--- a/lapack-netlib/SRC/cungql.f
+++ b/lapack-netlib/SRC/cungql.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ungql
 *
 *  =====================================================================
       SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,7 +182,7 @@
             NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
diff --git a/lapack-netlib/SRC/cungqr.f b/lapack-netlib/SRC/cungqr.f
index 5010ae0df..b6e8cc59a 100644
--- a/lapack-netlib/SRC/cungqr.f
+++ b/lapack-netlib/SRC/cungqr.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ungqr
 *
 *  =====================================================================
       SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -165,7 +166,7 @@
       INFO = 0
       NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 )
       LWKOPT = MAX( 1, N )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -279,7 +280,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CUNGQR
diff --git a/lapack-netlib/SRC/cungrq.f b/lapack-netlib/SRC/cungrq.f
index 1593ff938..aceaac0b8 100644
--- a/lapack-netlib/SRC/cungrq.f
+++ b/lapack-netlib/SRC/cungrq.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ungrq
 *
 *  =====================================================================
       SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,7 +182,7 @@
             NB = ILAENV( 1, 'CUNGRQ', ' ', M, N, K, -1 )
             LWKOPT = M*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
@@ -286,7 +287,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CUNGRQ
diff --git a/lapack-netlib/SRC/cungtr.f b/lapack-netlib/SRC/cungtr.f
index 26ff0428e..27f197340 100644
--- a/lapack-netlib/SRC/cungtr.f
+++ b/lapack-netlib/SRC/cungtr.f
@@ -116,7 +116,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ungtr
 *
 *  =====================================================================
       SUBROUTINE CUNGTR( UPLO, N, A, LDA, TAU, WORK, LWORK, INFO )
@@ -147,7 +147,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNGQL, CUNGQR, XERBLA
@@ -179,7 +180,7 @@
            NB = ILAENV( 1, 'CUNGQR', ' ', N-1, N-1, N-1, -1 )
          END IF
          LWKOPT = MAX( 1, N-1 )*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -245,7 +246,7 @@
      $                   LWORK, IINFO )
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNGTR
diff --git a/lapack-netlib/SRC/cunmbr.f b/lapack-netlib/SRC/cunmbr.f
index cef6025b0..a21c486e9 100644
--- a/lapack-netlib/SRC/cunmbr.f
+++ b/lapack-netlib/SRC/cunmbr.f
@@ -189,7 +189,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmbr
 *
 *  =====================================================================
       SUBROUTINE CUNMBR( VECT, SIDE, TRANS, M, N, K, A, LDA, TAU, C,
@@ -218,7 +218,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNMLQ, CUNMQR, XERBLA
@@ -290,7 +291,7 @@
          ELSE
             LWKOPT = 1
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -367,7 +368,7 @@
      $                   TAU, C( I1, I2 ), LDC, WORK, LWORK, IINFO )
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMBR
diff --git a/lapack-netlib/SRC/cunmhr.f b/lapack-netlib/SRC/cunmhr.f
index af3140d5f..29bb631f1 100644
--- a/lapack-netlib/SRC/cunmhr.f
+++ b/lapack-netlib/SRC/cunmhr.f
@@ -171,7 +171,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmhr
 *
 *  =====================================================================
       SUBROUTINE CUNMHR( SIDE, TRANS, M, N, ILO, IHI, A, LDA, TAU, C,
@@ -199,7 +199,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNMQR, XERBLA
@@ -253,7 +254,7 @@
             NB = ILAENV( 1, 'CUNMQR', SIDE // TRANS, M, NH, NH, -1 )
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -285,7 +286,7 @@
       CALL CUNMQR( SIDE, TRANS, MI, NI, NH, A( ILO+1, ILO ), LDA,
      $             TAU( ILO ), C( I1, I2 ), LDC, WORK, LWORK, IINFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMHR
diff --git a/lapack-netlib/SRC/cunmlq.f b/lapack-netlib/SRC/cunmlq.f
index 25a410770..4da1af1d5 100644
--- a/lapack-netlib/SRC/cunmlq.f
+++ b/lapack-netlib/SRC/cunmlq.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmlq
 *
 *  =====================================================================
       SUBROUTINE CUNMLQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -195,7 +195,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFB, CLARFT, CUNML2, XERBLA
@@ -250,7 +251,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -343,7 +344,7 @@
      $                   C( IC, JC ), LDC, WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMLQ
diff --git a/lapack-netlib/SRC/cunmql.f b/lapack-netlib/SRC/cunmql.f
index 3c7166066..84fc29d32 100644
--- a/lapack-netlib/SRC/cunmql.f
+++ b/lapack-netlib/SRC/cunmql.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmql
 *
 *  =====================================================================
       SUBROUTINE CUNMQL( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -194,7 +194,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFB, CLARFT, CUNM2L, XERBLA
@@ -249,7 +250,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -332,7 +333,7 @@
      $                   WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMQL
diff --git a/lapack-netlib/SRC/cunmqr.f b/lapack-netlib/SRC/cunmqr.f
index 7e59d7129..7d85a861f 100644
--- a/lapack-netlib/SRC/cunmqr.f
+++ b/lapack-netlib/SRC/cunmqr.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmqr
 *
 *  =====================================================================
       SUBROUTINE CUNMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -194,7 +194,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFB, CLARFT, CUNM2R, XERBLA
@@ -245,7 +246,7 @@
          NB = MIN( NBMAX, ILAENV( 1, 'CUNMQR', SIDE // TRANS, M, N, K,
      $        -1 ) )
          LWKOPT = NW*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -331,7 +332,7 @@
      $                   C( IC, JC ), LDC, WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMQR
diff --git a/lapack-netlib/SRC/cunmrq.f b/lapack-netlib/SRC/cunmrq.f
index 5a233f604..f02cfd9a9 100644
--- a/lapack-netlib/SRC/cunmrq.f
+++ b/lapack-netlib/SRC/cunmrq.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmrq
 *
 *  =====================================================================
       SUBROUTINE CUNMRQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -195,7 +195,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFB, CLARFT, CUNMR2, XERBLA
@@ -250,7 +251,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -337,7 +338,7 @@
      $                   WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMRQ
diff --git a/lapack-netlib/SRC/cunmrz.f b/lapack-netlib/SRC/cunmrz.f
index 8e06f2329..9ccf1878b 100644
--- a/lapack-netlib/SRC/cunmrz.f
+++ b/lapack-netlib/SRC/cunmrz.f
@@ -168,7 +168,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmrz
 *
 *> \par Contributors:
 *  ==================
@@ -213,7 +213,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARZB, CLARZT, CUNMR3, XERBLA
@@ -271,7 +272,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -371,7 +372,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cunmtr.f b/lapack-netlib/SRC/cunmtr.f
index 097dba91e..6eafc15c4 100644
--- a/lapack-netlib/SRC/cunmtr.f
+++ b/lapack-netlib/SRC/cunmtr.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmtr
 *
 *  =====================================================================
       SUBROUTINE CUNMTR( SIDE, UPLO, TRANS, M, N, A, LDA, TAU, C, LDC,
@@ -192,7 +192,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNMQL, CUNMQR, XERBLA
@@ -256,7 +257,7 @@
             END IF
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -301,7 +302,7 @@
          CALL CUNMQR( SIDE, TRANS, MI, NI, NQ-1, A( 2, 1 ), LDA, TAU,
      $                C( I1, I2 ), LDC, WORK, LWORK, IINFO )
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMTR

From f5664740cd492d9f7c614c4876a9204c40fdf777 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 14:29:04 +0100
Subject: [PATCH 405/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904)

---
 lapack-netlib/SRC/sgebrd.f     | 11 ++++++-----
 lapack-netlib/SRC/sgees.f      | 15 +++++++--------
 lapack-netlib/SRC/sgeesx.f     | 13 ++++++-------
 lapack-netlib/SRC/sgeev.f      | 16 +++++++---------
 lapack-netlib/SRC/sgeevx.f     | 13 ++++++-------
 lapack-netlib/SRC/sgehrd.f     |  9 +++++----
 lapack-netlib/SRC/sgelq.f      | 11 +++++++----
 lapack-netlib/SRC/sgelqf.f     |  9 +++++----
 lapack-netlib/SRC/sgels.f      | 15 +++++++--------
 lapack-netlib/SRC/sgelsd.f     | 15 +++++++--------
 lapack-netlib/SRC/sgelss.f     |  8 ++++----
 lapack-netlib/SRC/sgelst.f     | 19 +++++++++----------
 lapack-netlib/SRC/sgelsy.f     | 13 ++++++-------
 lapack-netlib/SRC/sgemlq.f     | 16 +++++++++++-----
 lapack-netlib/SRC/sgemqr.f     | 15 +++++++++------
 lapack-netlib/SRC/sgeqlf.f     |  9 +++++----
 lapack-netlib/SRC/sgeqp3.f     | 10 +++++-----
 lapack-netlib/SRC/sgeqrf.f     |  9 +++++----
 lapack-netlib/SRC/sgeqrfp.f    |  9 +++++----
 lapack-netlib/SRC/sgerqf.f     |  9 +++++----
 lapack-netlib/SRC/sgesvd.f     | 10 +++++-----
 lapack-netlib/SRC/sgesvdx.f    | 10 +++++-----
 lapack-netlib/SRC/sgetri.f     |  9 +++++----
 lapack-netlib/SRC/sgetsls.f    | 15 +++++++--------
 lapack-netlib/SRC/sgetsqrhrt.f | 12 ++++++++----
 lapack-netlib/SRC/sgges.f      | 16 +++++++---------
 lapack-netlib/SRC/sgges3.f     | 16 +++++++---------
 lapack-netlib/SRC/sggesx.f     | 16 +++++++---------
 lapack-netlib/SRC/sggev.f      | 16 +++++++---------
 lapack-netlib/SRC/sggev3.f     | 16 +++++++---------
 lapack-netlib/SRC/sggevx.f     | 17 ++++++++---------
 lapack-netlib/SRC/sggglm.f     |  7 ++++---
 lapack-netlib/SRC/sgghd3.f     | 11 ++++++-----
 lapack-netlib/SRC/sgglse.f     |  7 ++++---
 lapack-netlib/SRC/sggqrf.f     | 10 ++++++----
 lapack-netlib/SRC/sggrqf.f     | 10 ++++++----
 lapack-netlib/SRC/sggsvd3.f    | 10 +++++-----
 lapack-netlib/SRC/sggsvp3.f    |  9 +++++----
 lapack-netlib/SRC/shgeqz.f     |  9 +++++----
 lapack-netlib/SRC/shseqr.f     |  7 ++++---
 lapack-netlib/SRC/slaqr2.f     | 13 ++++++-------
 lapack-netlib/SRC/slaqr3.f     | 16 +++++++---------
 lapack-netlib/SRC/slaqr4.f     | 11 ++++++-----
 lapack-netlib/SRC/slaqz0.f     |  6 +++---
 lapack-netlib/SRC/slaqz3.f     |  9 ++++-----
 lapack-netlib/SRC/slaqz4.f     |  5 +++--
 lapack-netlib/SRC/slaswlq.f    |  7 +++++--
 lapack-netlib/SRC/sorgbr.f     |  9 +++++----
 48 files changed, 284 insertions(+), 269 deletions(-)

diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f
index 08701164c..2d0c6d651 100644
--- a/lapack-netlib/SRC/sgebrd.f
+++ b/lapack-netlib/SRC/sgebrd.f
@@ -147,7 +147,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gebrd
 *
 *> \par Further Details:
 *  =====================
@@ -230,11 +230,12 @@
       EXTERNAL           SGEBD2, SGEMM, SLABRD, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN, REAL
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -243,7 +244,7 @@
       INFO = 0
       NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) )
       LWKOPT = ( M+N )*NB
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -341,7 +342,7 @@
 *
       CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ),
      $             TAUQ( I ), TAUP( I ), WORK, IINFO )
-      WORK( 1 ) = WS
+      WORK( 1 ) = SROUNDUP_LWORK(WS)
       RETURN
 *
 *     End of SGEBRD
diff --git a/lapack-netlib/SRC/sgees.f b/lapack-netlib/SRC/sgees.f
index 6febd549c..4418ea064 100644
--- a/lapack-netlib/SRC/sgees.f
+++ b/lapack-netlib/SRC/sgees.f
@@ -208,7 +208,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup gees
 *
 *  =====================================================================
       SUBROUTINE SGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, WR, WI,
@@ -250,14 +250,14 @@
       REAL               DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD,
-     $                   SLACPY, SLASCL, SORGHR, SSWAP, STRSEN, XERBLA
+      EXTERNAL           SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY,
+     $                   SLASCL, SORGHR, SSWAP, STRSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -312,7 +312,7 @@
                MAXWRK = MAX( MAXWRK, N + HSWORK )
             END IF
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -13
@@ -338,7 +338,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -524,7 +523,7 @@
    30    CONTINUE
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGEES
diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f
index 6810fe7c8..cabe9f1f7 100644
--- a/lapack-netlib/SRC/sgeesx.f
+++ b/lapack-netlib/SRC/sgeesx.f
@@ -272,7 +272,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup geesx
 *
 *  =====================================================================
       SUBROUTINE SGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM,
@@ -317,14 +317,14 @@
       REAL               DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD,
+      EXTERNAL           SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR,
      $                   SLACPY, SLASCL, SORGHR, SSWAP, STRSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -398,7 +398,7 @@
      $         LIWRK = ( N*N )/4
          END IF
          IWORK( 1 ) = LIWRK
-         WORK( 1 ) = LWRK
+         WORK( 1 ) = SROUNDUP_LWORK(LWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -16
@@ -426,7 +426,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -634,7 +633,7 @@
    30    CONTINUE
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IF( WANTSV .OR. WANTSB ) THEN
          IWORK( 1 ) = SDIM*(N-SDIM)
       ELSE
diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f
index ed1724721..93f993265 100644
--- a/lapack-netlib/SRC/sgeev.f
+++ b/lapack-netlib/SRC/sgeev.f
@@ -184,7 +184,7 @@
 *
 *  @generated from dgeev.f, fortran d -> s, Tue Apr 19 01:47:44 2016
 *
-*> \ingroup realGEeigen
+*> \ingroup geev
 *
 *  =====================================================================
       SUBROUTINE SGEEV( JOBVL, JOBVR, N, A, LDA, WR, WI, VL, LDVL, VR,
@@ -223,16 +223,15 @@
       REAL   DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, SLACPY,
-     $                   SLARTG, SLASCL, SORGHR, SROT, SSCAL, STREVC3,
-     $                   XERBLA
+      EXTERNAL           SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY, SLARTG,
+     $                   SLASCL, SORGHR, SROT, SSCAL, STREVC3, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX, ILAENV
-      REAL   SLAMCH, SLANGE, SLAPY2, SNRM2
+      REAL               SLAMCH, SLANGE, SLAPY2, SNRM2, SROUNDUP_LWORK
       EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SLANGE, SLAPY2,
-     $                   SNRM2
+     $                   SNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -312,7 +311,7 @@
             END IF
             MAXWRK = MAX( MAXWRK, MINWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -13
@@ -336,7 +335,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -519,7 +517,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGEEV
diff --git a/lapack-netlib/SRC/sgeevx.f b/lapack-netlib/SRC/sgeevx.f
index ed1ea1cb9..b0af78605 100644
--- a/lapack-netlib/SRC/sgeevx.f
+++ b/lapack-netlib/SRC/sgeevx.f
@@ -297,7 +297,7 @@
 *
 *  @generated from dgeevx.f, fortran d -> s, Tue Apr 19 01:47:44 2016
 *
-*> \ingroup realGEeigen
+*> \ingroup geevx
 *
 *  =====================================================================
       SUBROUTINE SGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, WR, WI,
@@ -341,16 +341,16 @@
       REAL               DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, SLACPY,
+      EXTERNAL           SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY,
      $                   SLARTG, SLASCL, SORGHR, SROT, SSCAL, STREVC3,
      $                   STRSNA, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX, ILAENV
-      REAL               SLAMCH, SLANGE, SLAPY2, SNRM2
+      REAL               SLAMCH, SLANGE, SLAPY2, SNRM2, SROUNDUP_LWORK
       EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SLANGE, SLAPY2,
-     $                   SNRM2
+     $                   SNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -453,7 +453,7 @@
             END IF
             MAXWRK = MAX( MAXWRK, MINWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -21
@@ -477,7 +477,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -684,7 +683,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGEEVX
diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f
index 41b9aa78e..47733d947 100644
--- a/lapack-netlib/SRC/sgehrd.f
+++ b/lapack-netlib/SRC/sgehrd.f
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gehrd
 *
 *> \par Further Details:
 *  =====================
@@ -201,7 +201,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -227,7 +228,7 @@
 *
          NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) )
          LWKOPT = N*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -344,7 +345,7 @@
 *     Use unblocked code to reduce the rest of the matrix
 *
       CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO )
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f
index 9209f918e..74c7cc267 100644
--- a/lapack-netlib/SRC/sgelq.f
+++ b/lapack-netlib/SRC/sgelq.f
@@ -166,6 +166,8 @@
 *>  the LQ factorization.
 *> \endverbatim
 *>
+*> \ingroup gelq
+*>
 *  =====================================================================
       SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -190,7 +192,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGELQT, SLASWLQ, XERBLA
@@ -292,9 +295,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = LWMIN
+          WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
         ELSE
-          WORK( 1 ) = LWREQ
+          WORK( 1 ) = SROUNDUP_LWORK(LWREQ)
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -319,7 +322,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LWREQ
+      WORK( 1 ) = SROUNDUP_LWORK(LWREQ)
       RETURN
 *
 *     End of SGELQ
diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f
index 24d8ab19c..1ceec4742 100644
--- a/lapack-netlib/SRC/sgelqf.f
+++ b/lapack-netlib/SRC/sgelqf.f
@@ -118,7 +118,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gelqf
 *
 *> \par Further Details:
 *  =====================
@@ -167,7 +167,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -176,7 +177,7 @@
       INFO = 0
       NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 )
       LWKOPT = M*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -266,7 +267,7 @@
      $   CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGELQF
diff --git a/lapack-netlib/SRC/sgels.f b/lapack-netlib/SRC/sgels.f
index ea02c3318..b58f70c9e 100644
--- a/lapack-netlib/SRC/sgels.f
+++ b/lapack-netlib/SRC/sgels.f
@@ -175,7 +175,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gels
 *
 *  =====================================================================
       SUBROUTINE SGELS( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK,
@@ -210,15 +210,15 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGELQF, SGEQRF, SLABAD, SLASCL, SLASET, SORMLQ,
+      EXTERNAL           SGELQF, SGEQRF, SLASCL, SLASET, SORMLQ,
      $                   SORMQR, STRTRS, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN, REAL
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -273,7 +273,7 @@
          END IF
 *
          WSIZE = MAX( 1, MN + MAX( MN, NRHS )*NB )
-         WORK( 1 ) = REAL( WSIZE )
+         WORK( 1 ) = SROUNDUP_LWORK( WSIZE )
 *
       END IF
 *
@@ -295,7 +295,6 @@
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -492,7 +491,7 @@
       END IF
 *
    50 CONTINUE
-      WORK( 1 ) = REAL( WSIZE )
+      WORK( 1 ) = SROUNDUP_LWORK( WSIZE )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgelsd.f b/lapack-netlib/SRC/sgelsd.f
index 9fda7b593..2818213f4 100644
--- a/lapack-netlib/SRC/sgelsd.f
+++ b/lapack-netlib/SRC/sgelsd.f
@@ -189,7 +189,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelsd
 *
 *> \par Contributors:
 *  ==================
@@ -229,13 +229,13 @@
       REAL               ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEBRD, SGELQF, SGEQRF, SLABAD, SLACPY, SLALSD,
-     $                   SLASCL, SLASET, SORMBR, SORMLQ, SORMQR, XERBLA
+      EXTERNAL           SGEBRD, SGELQF, SGEQRF, SLACPY, SLALSD, SLASCL,
+     $                   SLASET, SORMBR, SORMLQ, SORMQR, XERBLA
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           SLAMCH, SLANGE, ILAENV
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SLANGE, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, LOG, MAX, MIN, REAL
@@ -348,7 +348,7 @@
             END IF
          END IF
          MINWRK = MIN( MINWRK, MAXWRK )
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
          IWORK( 1 ) = LIWORK
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
@@ -376,7 +376,6 @@
       SFMIN = SLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max entry outside range [SMLNUM,BIGNUM].
 *
@@ -615,7 +614,7 @@
       END IF
 *
    10 CONTINUE
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IWORK( 1 ) = LIWORK
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f
index 89d3a6e4f..2e4b0cdd5 100644
--- a/lapack-netlib/SRC/sgelss.f
+++ b/lapack-netlib/SRC/sgelss.f
@@ -207,8 +207,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -355,7 +355,7 @@
             END IF
             MAXWRK = MAX( MINWRK, MAXWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY )
      $      INFO = -12
@@ -731,7 +731,7 @@
       END IF
 *
    70 CONTINUE
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGELSS
diff --git a/lapack-netlib/SRC/sgelst.f b/lapack-netlib/SRC/sgelst.f
index 5377bc720..b89918656 100644
--- a/lapack-netlib/SRC/sgelst.f
+++ b/lapack-netlib/SRC/sgelst.f
@@ -176,7 +176,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelst
 *
 *> \par Contributors:
 *  ==================
@@ -222,15 +222,15 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGELQT, SGEQRT, SGEMLQT, SGEMQRT, SLABAD,
+      EXTERNAL           SGELQT, SGEQRT, SGEMLQT, SGEMQRT,
      $                   SLASCL, SLASET, STRTRS, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -268,7 +268,7 @@
 *
          MNNRHS = MAX( MN, NRHS )
          LWOPT = MAX( 1, (MN+MNNRHS)*NB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
 *
       END IF
 *
@@ -283,7 +283,7 @@
 *
       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
          CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
          RETURN
       END IF
 *
@@ -309,7 +309,6 @@
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -332,7 +331,7 @@
 *        Matrix all zero. Return zero solution.
 *
          CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
          RETURN
       END IF
 *
@@ -522,7 +521,7 @@
      $                INFO )
       END IF
 *
-      WORK( 1 ) = REAL( LWOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgelsy.f b/lapack-netlib/SRC/sgelsy.f
index 89dd39e80..c7f5069de 100644
--- a/lapack-netlib/SRC/sgelsy.f
+++ b/lapack-netlib/SRC/sgelsy.f
@@ -191,7 +191,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelsy
 *
 *> \par Contributors:
 *  ==================
@@ -234,11 +234,11 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEQP3, SLABAD, SLAIC1, SLASCL, SLASET,
+      EXTERNAL           SCOPY, SGEQP3, SLAIC1, SLASCL, SLASET,
      $                   SORMQR, SORMRZ, STRSM, STZRZF, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@@ -282,7 +282,7 @@
             LWKOPT = MAX( LWKMIN,
      $                    MN + 2*N + NB*( N + 1 ), 2*MN + NB*NRHS )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
@@ -307,7 +307,6 @@
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max entries outside range [SMLNUM,BIGNUM]
 *
@@ -469,7 +468,7 @@
       END IF
 *
    70 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f
index 536abf184..83536825c 100644
--- a/lapack-netlib/SRC/sgemlq.f
+++ b/lapack-netlib/SRC/sgemlq.f
@@ -119,7 +119,7 @@
 *>          The dimension of the array WORK.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -143,7 +143,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -159,11 +159,13 @@
 *>  block sizes MB and NB returned by ILAENV, SGELQ will use either
 *>  SLASWLQ (if the matrix is wide-and-short) or SGELQT to compute
 *>  the LQ factorization.
-*>  This version of SGEMLQ will use either SLAMSWLQ or SGEMLQT to 
+*>  This version of SGEMLQ will use either SLAMSWLQ or SGEMLQT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in SLAMSWLQ or SGEMLQT.
 *> \endverbatim
 *>
+*> \ingroup gemlq
+*>
 *  =====================================================================
       SUBROUTINE SGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -191,6 +193,10 @@
       LOGICAL            LSAME
       EXTERNAL           LSAME
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SLAMSWLQ, SGEMLQT, XERBLA
 *     ..
@@ -249,7 +255,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = REAL( LW )
+        WORK( 1 ) = SROUNDUP_LWORK( LW )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -274,7 +280,7 @@
      $                 MB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = REAL( LW )
+      WORK( 1 ) = SROUNDUP_LWORK( LW )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f
index 2a9257459..3207f8bfd 100644
--- a/lapack-netlib/SRC/sgemqr.f
+++ b/lapack-netlib/SRC/sgemqr.f
@@ -120,7 +120,7 @@
 *>          The dimension of the array WORK.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +144,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -160,12 +160,14 @@
 *>  block sizes MB and NB returned by ILAENV, SGEQR will use either
 *>  SLATSQR (if the matrix is tall-and-skinny) or SGEQRT to compute
 *>  the QR factorization.
-*>  This version of SGEMQR will use either SLAMTSQR or SGEMQRT to 
+*>  This version of SGEMQR will use either SLAMTSQR or SGEMQRT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in SLAMTSQR or SGEMQRT.
 *>
 *> \endverbatim
 *>
+*> \ingroup gemqr
+*>
 *  =====================================================================
       SUBROUTINE SGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -191,7 +193,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMQRT, SLAMTSQR, XERBLA
@@ -251,7 +254,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = SROUNDUP_LWORK(LW)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -276,7 +279,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = SROUNDUP_LWORK(LW)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f
index efecfbb3c..b1266c89e 100644
--- a/lapack-netlib/SRC/sgeqlf.f
+++ b/lapack-netlib/SRC/sgeqlf.f
@@ -113,7 +113,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup geqlf
 *
 *> \par Further Details:
 *  =====================
@@ -162,7 +162,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -186,7 +187,7 @@
             NB = ILAENV( 1, 'SGEQLF', ' ', M, N, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
             INFO = -7
@@ -276,7 +277,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL SGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQLF
diff --git a/lapack-netlib/SRC/sgeqp3.f b/lapack-netlib/SRC/sgeqp3.f
index 493bdae6a..9f2f40b2e 100644
--- a/lapack-netlib/SRC/sgeqp3.f
+++ b/lapack-netlib/SRC/sgeqp3.f
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup geqp3
 *
 *> \par Further Details:
 *  =====================
@@ -177,8 +177,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SNRM2
-      EXTERNAL           ILAENV, SNRM2
+      REAL               SNRM2, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -205,7 +205,7 @@
             NB = ILAENV( INB, 'SGEQRF', ' ', M, N, -1, -1 )
             LWKOPT = 2*N + ( N + 1 )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
@@ -347,7 +347,7 @@
 *
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQP3
diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f
index b24615f7a..689fe1aea 100644
--- a/lapack-netlib/SRC/sgeqrf.f
+++ b/lapack-netlib/SRC/sgeqrf.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup geqrf
 *
 *> \par Further Details:
 *  =====================
@@ -170,7 +170,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -199,7 +200,7 @@
          ELSE
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -274,7 +275,7 @@
      $   CALL SGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQRF
diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f
index 03d33654b..d1ee2a828 100644
--- a/lapack-netlib/SRC/sgeqrfp.f
+++ b/lapack-netlib/SRC/sgeqrfp.f
@@ -122,7 +122,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup geqrfp
 *
 *> \par Further Details:
 *  =====================
@@ -173,7 +173,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -182,7 +183,7 @@
       INFO = 0
       NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 )
       LWKOPT = N*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -272,7 +273,7 @@
      $   CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQRFP
diff --git a/lapack-netlib/SRC/sgerqf.f b/lapack-netlib/SRC/sgerqf.f
index 037cd5345..1d3400a1f 100644
--- a/lapack-netlib/SRC/sgerqf.f
+++ b/lapack-netlib/SRC/sgerqf.f
@@ -114,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gerqf
 *
 *> \par Further Details:
 *  =====================
@@ -163,7 +163,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -187,7 +188,7 @@
             NB = ILAENV( 1, 'SGERQF', ' ', M, N, -1, -1 )
             LWKOPT = M*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF ( .NOT.LQUERY ) THEN
             IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
@@ -278,7 +279,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL SGERQ2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGERQF
diff --git a/lapack-netlib/SRC/sgesvd.f b/lapack-netlib/SRC/sgesvd.f
index 83321ffaa..d3fa94582 100644
--- a/lapack-netlib/SRC/sgesvd.f
+++ b/lapack-netlib/SRC/sgesvd.f
@@ -203,7 +203,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsing
+*> \ingroup gesvd
 *
 *  =====================================================================
       SUBROUTINE SGESVD( JOBU, JOBVT, M, N, A, LDA, S, U, LDU, VT, LDVT,
@@ -251,8 +251,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -628,7 +628,7 @@
             END IF
          END IF
          MAXWRK = MAX( MAXWRK, MINWRK )
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -13
@@ -3493,7 +3493,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgesvdx.f b/lapack-netlib/SRC/sgesvdx.f
index b6495dbd4..8b55b9b2e 100644
--- a/lapack-netlib/SRC/sgesvdx.f
+++ b/lapack-netlib/SRC/sgesvdx.f
@@ -254,7 +254,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsing
+*> \ingroup gesvdx
 *
 *  =====================================================================
       SUBROUTINE SGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU,
@@ -301,8 +301,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -456,7 +456,7 @@
             END IF
          END IF
          MAXWRK = MAX( MAXWRK, MINWRK )
-         WORK( 1 ) = REAL( MAXWRK )
+         WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
              INFO = -19
@@ -822,7 +822,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = REAL( MAXWRK )
+      WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f
index 749ede9a7..fe71bc4a5 100644
--- a/lapack-netlib/SRC/sgetri.f
+++ b/lapack-netlib/SRC/sgetri.f
@@ -107,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup getri
 *
 *  =====================================================================
       SUBROUTINE SGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -137,7 +137,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA
@@ -152,7 +153,7 @@
       INFO = 0
       NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 )
       LWKOPT = N*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -250,7 +251,7 @@
      $      CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 )
    60 CONTINUE
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGETRI
diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f
index e6ce705fa..d89c6a4e6 100644
--- a/lapack-netlib/SRC/sgetsls.f
+++ b/lapack-netlib/SRC/sgetsls.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup getsls
 *
 *  =====================================================================
       SUBROUTINE SGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
@@ -188,15 +188,15 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, SLABAD, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEQR, SGEMQR, SLASCL, SLASET,
      $                   STRTRS, XERBLA, SGELQ, SGEMLQ
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN, INT
+      INTRINSIC          MAX, MIN, INT
 *     ..
 *     .. Executable Statements ..
 *
@@ -262,7 +262,7 @@
           INFO = -10
        END IF
 *
-       WORK( 1 ) = REAL( WSIZEO )
+       WORK( 1 ) = SROUNDUP_LWORK( WSIZEO )
 *
       END IF
 *
@@ -271,7 +271,7 @@
         RETURN
       END IF
       IF( LQUERY ) THEN
-        IF( LWORK.EQ.-2 ) WORK( 1 ) = REAL( WSIZEM )
+        IF( LWORK.EQ.-2 ) WORK( 1 ) = SROUNDUP_LWORK( WSIZEM )
         RETURN
       END IF
       IF( LWORK.LT.WSIZEO ) THEN
@@ -294,7 +294,6 @@
 *
        SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
        BIGNUM = ONE / SMLNUM
-       CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -482,7 +481,7 @@
       END IF
 *
    50 CONTINUE
-      WORK( 1 ) = REAL( TSZO + LWO )
+      WORK( 1 ) = SROUNDUP_LWORK( TSZO + LWO )
       RETURN
 *
 *     End of SGETSLS
diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f
index f9580da7b..d80ff4da8 100644
--- a/lapack-netlib/SRC/sgetsqrhrt.f
+++ b/lapack-netlib/SRC/sgetsqrhrt.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleOTHERcomputational
+*> \ingroup getsqrhrt
 *
 *> \par Contributors:
 *  ==================
@@ -200,6 +200,10 @@
       INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
      $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL,
      $                   XERBLA
@@ -277,14 +281,14 @@
          CALL XERBLA( 'SGETSQRHRT', -INFO )
          RETURN
       ELSE IF ( LQUERY ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
       IF( MIN( M, N ).EQ.0 ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
@@ -341,7 +345,7 @@
          END IF
       END DO
 *
-      WORK( 1 ) = REAL( LWORKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
       RETURN
 *
 *     End of SGETSQRHRT
diff --git a/lapack-netlib/SRC/sgges.f b/lapack-netlib/SRC/sgges.f
index 3834aea00..8f42882dd 100644
--- a/lapack-netlib/SRC/sgges.f
+++ b/lapack-netlib/SRC/sgges.f
@@ -275,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup gges
 *
 *  =====================================================================
       SUBROUTINE SGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB,
@@ -321,15 +321,14 @@
       REAL               DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGSEN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -406,7 +405,7 @@
             MINWRK = 1
             MAXWRK = 1
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY )
      $      INFO = -19
@@ -431,7 +430,6 @@
       EPS = SLAMCH( 'P' )
       SAFMIN = SLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -668,7 +666,7 @@
 *
    40 CONTINUE
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f
index b27704ff5..e35d4955a 100644
--- a/lapack-netlib/SRC/sgges3.f
+++ b/lapack-netlib/SRC/sgges3.f
@@ -273,7 +273,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup gges3
 *
 *  =====================================================================
       SUBROUTINE SGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B,
@@ -318,14 +318,13 @@
       REAL               DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -408,7 +407,7 @@
      $                   IERR )
             LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -430,7 +429,6 @@
       EPS = SLAMCH( 'P' )
       SAFMIN = SLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -659,7 +657,7 @@
 *
    40 CONTINUE
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sggesx.f b/lapack-netlib/SRC/sggesx.f
index a6c0443ba..e5a14fc19 100644
--- a/lapack-netlib/SRC/sggesx.f
+++ b/lapack-netlib/SRC/sggesx.f
@@ -337,7 +337,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup ggesx
 *
 *> \par Further Details:
 *  =====================
@@ -405,15 +405,14 @@
       REAL               DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -510,7 +509,7 @@
             MAXWRK = 1
             LWRK   = 1
          END IF
-         WORK( 1 ) = LWRK
+         WORK( 1 ) = SROUNDUP_LWORK(LWRK)
          IF( WANTSN .OR. N.EQ.0 ) THEN
             LIWMIN = 1
          ELSE
@@ -544,7 +543,6 @@
       EPS = SLAMCH( 'P' )
       SAFMIN = SLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -807,7 +805,7 @@
 *
    50 CONTINUE
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sggev.f b/lapack-netlib/SRC/sggev.f
index 69744b72b..cacad7cac 100644
--- a/lapack-netlib/SRC/sggev.f
+++ b/lapack-netlib/SRC/sggev.f
@@ -218,7 +218,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup ggev
 *
 *  =====================================================================
       SUBROUTINE SGGEV( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, ALPHAI,
@@ -257,15 +257,14 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGEVC, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -335,7 +334,7 @@
             MAXWRK = MAX( MAXWRK, N*( 7 +
      $                 ILAENV( 1, 'SORGQR', ' ', N, 1, N, -1 ) ) )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY )
      $      INFO = -16
@@ -358,7 +357,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -581,7 +579,7 @@
          CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGGEV
diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f
index 945c3a017..c82d2187f 100644
--- a/lapack-netlib/SRC/sggev3.f
+++ b/lapack-netlib/SRC/sggev3.f
@@ -217,7 +217,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup ggev3
 *
 *  =====================================================================
       SUBROUTINE SGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR,
@@ -256,14 +256,13 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGEVC
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -341,7 +340,7 @@
      $                   WORK, -1, 0, IERR )
             LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) )
          END IF
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       END IF
 *
@@ -362,7 +361,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -578,7 +576,7 @@
          CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
       END IF
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SGGEV3
diff --git a/lapack-netlib/SRC/sggevx.f b/lapack-netlib/SRC/sggevx.f
index bb05f499a..63164a021 100644
--- a/lapack-netlib/SRC/sggevx.f
+++ b/lapack-netlib/SRC/sggevx.f
@@ -352,7 +352,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup ggevx
 *
 *> \par Further Details:
 *  =====================
@@ -427,15 +427,15 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC,
-     $                   STGSNA, XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGEVC, STGSNA,
+     $                   XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -532,7 +532,7 @@
      $                       N*ILAENV( 1, 'SORGQR', ' ', N, 1, N, 0 ) )
             END IF
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -26
@@ -557,7 +557,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -855,7 +854,7 @@
          CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGGEVX
diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f
index 56b4dba52..37094e4f2 100644
--- a/lapack-netlib/SRC/sggglm.f
+++ b/lapack-netlib/SRC/sggglm.f
@@ -177,7 +177,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup ggglm
 *
 *  =====================================================================
       SUBROUTINE SGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK,
@@ -212,7 +212,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -251,7 +252,7 @@
             LWKMIN = M + N + P
             LWKOPT = M + NP + MAX( N, P )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f
index 23acf6ec5..9c5858b5a 100644
--- a/lapack-netlib/SRC/sgghd3.f
+++ b/lapack-netlib/SRC/sgghd3.f
@@ -211,7 +211,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup gghd3
 *
 *> \par Further Details:
 *  =====================
@@ -260,14 +260,15 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGGHRD, SLARTG, SLASET, SORM22, SROT, SGEMM,
      $                   SGEMV, STRMV, SLACPY, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX
+      INTRINSIC          MAX
 *     ..
 *     .. Executable Statements ..
 *
@@ -276,7 +277,7 @@
       INFO = 0
       NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 )
       LWKOPT = MAX( 6*N*NB, 1 )
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
       INITZ = LSAME( COMPZ, 'I' )
@@ -885,7 +886,7 @@
       IF ( JCOL.LT.IHI )
      $   CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgglse.f b/lapack-netlib/SRC/sgglse.f
index 59addc3f4..53e3f8e45 100644
--- a/lapack-netlib/SRC/sgglse.f
+++ b/lapack-netlib/SRC/sgglse.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERsolve
+*> \ingroup gglse
 *
 *  =====================================================================
       SUBROUTINE SGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK,
@@ -207,7 +207,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -246,7 +247,7 @@
             LWKMIN = M + N + P
             LWKOPT = P + MN + MAX( M, N )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f
index 59b498da5..ebb42a899 100644
--- a/lapack-netlib/SRC/sggqrf.f
+++ b/lapack-netlib/SRC/sggqrf.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ggqrf
 *
 *> \par Further Details:
 *  =====================
@@ -236,7 +236,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -251,7 +252,7 @@
       NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
       LWKOPT = MAX( N, M, P )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -287,7 +288,8 @@
 *     RQ factorization of N-by-P matrix B: B = T*Z.
 *
       CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO )
-      WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
+      LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f
index 8b7d4786a..2163f1ef8 100644
--- a/lapack-netlib/SRC/sggrqf.f
+++ b/lapack-netlib/SRC/sggrqf.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ggrqf
 *
 *> \par Further Details:
 *  =====================
@@ -235,7 +235,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -250,7 +251,7 @@
       NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
       LWKOPT = MAX( N, M, P)*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -287,7 +288,8 @@
 *     QR factorization of P-by-N matrix B: B = Z*T
 *
       CALL SGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO )
-      WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
+      LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f
index 9077f2ea8..053fff5de 100644
--- a/lapack-netlib/SRC/sggsvd3.f
+++ b/lapack-netlib/SRC/sggsvd3.f
@@ -328,7 +328,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsing
+*> \ingroup ggsvd3
 *
 *> \par Contributors:
 *  ==================
@@ -372,8 +372,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SGGSVP3, STGSJA, XERBLA
@@ -429,7 +429,7 @@
          LWKOPT = N + INT( WORK( 1 ) )
          LWKOPT = MAX( 2*N, LWKOPT )
          LWKOPT = MAX( 1, LWKOPT )
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -492,7 +492,7 @@
          END IF
    20 CONTINUE
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SGGSVD3
diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f
index 4f76b32bc..a463b9064 100644
--- a/lapack-netlib/SRC/sggsvp3.f
+++ b/lapack-netlib/SRC/sggsvp3.f
@@ -250,7 +250,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ggsvp3
 *
 *> \par Further Details:
 *  =====================
@@ -300,7 +300,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT,
@@ -365,7 +366,7 @@
          CALL SGEQP3( M, N, A, LDA, IWORK, TAU, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK ( 1 ) ) )
          LWKOPT = MAX( 1, LWKOPT )
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -560,7 +561,7 @@
 *
       END IF
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SGGSVP3
diff --git a/lapack-netlib/SRC/shgeqz.f b/lapack-netlib/SRC/shgeqz.f
index 6543f8cb1..9ad64d2bf 100644
--- a/lapack-netlib/SRC/shgeqz.f
+++ b/lapack-netlib/SRC/shgeqz.f
@@ -282,7 +282,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup hgeqz
 *
 *> \par Further Details:
 *  =====================
@@ -346,8 +346,9 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANHS, SLAPY2, SLAPY3
-      EXTERNAL           LSAME, SLAMCH, SLANHS, SLAPY2, SLAPY3
+      REAL               SLAMCH, SLANHS, SLAPY2, SLAPY3, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANHS, SLAPY2, SLAPY3,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLAG2, SLARFG, SLARTG, SLASET, SLASV2, SROT,
@@ -1364,7 +1365,7 @@
 *     Exit (other than argument error) -- return optimal workspace size
 *
   420 CONTINUE
-      WORK( 1 ) = REAL( N )
+      WORK( 1 ) = SROUNDUP_LWORK( N )
       RETURN
 *
 *     End of SHGEQZ
diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f
index 3b8d4c4d8..68b9fe6bd 100644
--- a/lapack-netlib/SRC/shseqr.f
+++ b/lapack-netlib/SRC/shseqr.f
@@ -233,7 +233,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup hseqr
 *
 *> \par Contributors:
 *  ==================
@@ -358,7 +358,8 @@
 *     .. External Functions ..
       INTEGER            ILAENV
       LOGICAL            LSAME
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLACPY, SLAHQR, SLAQR0, SLASET, XERBLA
@@ -373,7 +374,7 @@
       WANTT = LSAME( JOB, 'S' )
       INITZ = LSAME( COMPZ, 'I' )
       WANTZ = INITZ .OR. LSAME( COMPZ, 'V' )
-      WORK( 1 ) = REAL( MAX( 1, N ) )
+      WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N ) )
       LQUERY = LWORK.EQ.-1
 *
       INFO = 0
diff --git a/lapack-netlib/SRC/slaqr2.f b/lapack-netlib/SRC/slaqr2.f
index 62c4ef5eb..caf79fd1c 100644
--- a/lapack-netlib/SRC/slaqr2.f
+++ b/lapack-netlib/SRC/slaqr2.f
@@ -263,7 +263,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup laqr2
 *
 *> \par Contributors:
 *  ==================
@@ -305,11 +305,11 @@
       LOGICAL            BULGE, SORTED
 *     ..
 *     .. External Functions ..
-      REAL               SLAMCH
-      EXTERNAL           SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEHRD, SGEMM, SLABAD, SLACPY, SLAHQR,
+      EXTERNAL           SCOPY, SGEHRD, SGEMM, SLACPY, SLAHQR,
      $                   SLANV2, SLARF, SLARFG, SLASET, SORMHR, STREXC
 *     ..
 *     .. Intrinsic Functions ..
@@ -343,7 +343,7 @@
 *     ==== Quick return in case of workspace query. ====
 *
       IF( LWORK.EQ.-1 ) THEN
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -362,7 +362,6 @@
 *
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N ) / ULP )
 *
@@ -674,7 +673,7 @@
 *
 *      ==== Return optimal workspace. ====
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
 *     ==== End of SLAQR2 ====
 *
diff --git a/lapack-netlib/SRC/slaqr3.f b/lapack-netlib/SRC/slaqr3.f
index 519ccd6ed..d3ffb0f96 100644
--- a/lapack-netlib/SRC/slaqr3.f
+++ b/lapack-netlib/SRC/slaqr3.f
@@ -260,7 +260,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup laqr3
 *
 *> \par Contributors:
 *  ==================
@@ -302,14 +302,13 @@
       LOGICAL            BULGE, SORTED
 *     ..
 *     .. External Functions ..
-      REAL               SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
       INTEGER            ILAENV
-      EXTERNAL           SLAMCH, ILAENV
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK, ILAENV
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEHRD, SGEMM, SLABAD, SLACPY, SLAHQR,
-     $                   SLANV2, SLAQR4, SLARF, SLARFG, SLASET, SORMHR,
-     $                   STREXC
+      EXTERNAL           SCOPY, SGEHRD, SGEMM, SLACPY, SLAHQR, SLANV2,
+     $                   SLAQR4, SLARF, SLARFG, SLASET, SORMHR, STREXC
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, INT, MAX, MIN, REAL, SQRT
@@ -348,7 +347,7 @@
 *     ==== Quick return in case of workspace query. ====
 *
       IF( LWORK.EQ.-1 ) THEN
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -367,7 +366,6 @@
 *
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N ) / ULP )
 *
@@ -685,7 +683,7 @@
 *
 *      ==== Return optimal workspace. ====
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
 *     ==== End of SLAQR3 ====
 *
diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f
index 1f0a51c85..d6721df97 100644
--- a/lapack-netlib/SRC/slaqr4.f
+++ b/lapack-netlib/SRC/slaqr4.f
@@ -239,7 +239,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup laqr4
 *
 *> \par Contributors:
 *  ==================
@@ -316,7 +316,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Local Arrays ..
       REAL               ZDUM( 1, 1 )
@@ -325,7 +326,7 @@
       EXTERNAL           SLACPY, SLAHQR, SLANV2, SLAQR2, SLAQR5
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          ABS, INT, MAX, MIN, MOD, REAL
+      INTRINSIC          ABS, INT, MAX, MIN, MOD
 *     ..
 *     .. Executable Statements ..
       INFO = 0
@@ -401,7 +402,7 @@
 *        ==== Quick return in case of workspace query. ====
 *
          IF( LWORK.EQ.-1 ) THEN
-            WORK( 1 ) = REAL( LWKOPT )
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
             RETURN
          END IF
 *
@@ -732,7 +733,7 @@
 *
 *     ==== Return the optimal value of LWORK. ====
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
 *     ==== End of SLAQR4 ====
 *
diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f
index 8b2d3286e..c128093e4 100644
--- a/lapack-netlib/SRC/slaqz0.f
+++ b/lapack-netlib/SRC/slaqz0.f
@@ -294,7 +294,7 @@
 *
 *> \date May 2020
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup laqz0
 *>
 *  =====================================================================
       RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A,
@@ -331,7 +331,7 @@
 *     External Functions
       EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET,
      $            SLARTG, SROT
-      REAL, EXTERNAL :: SLAMCH, SLANHS
+      REAL, EXTERNAL :: SLAMCH, SLANHS, SROUNDUP_LWORK
       LOGICAL, EXTERNAL :: LSAME
       INTEGER, EXTERNAL :: ILAENV
 
@@ -461,7 +461,7 @@
 
       LWORKREQ = MAX( ITEMP1+2*NW**2, ITEMP2+2*NBR**2 )
       IF ( LWORK .EQ.-1 ) THEN
-         WORK( 1 ) = REAL( LWORKREQ )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKREQ )
          RETURN
       ELSE IF ( LWORK .LT. LWORKREQ ) THEN
          INFO = -19
diff --git a/lapack-netlib/SRC/slaqz3.f b/lapack-netlib/SRC/slaqz3.f
index edb8a6012..979381364 100644
--- a/lapack-netlib/SRC/slaqz3.f
+++ b/lapack-netlib/SRC/slaqz3.f
@@ -228,7 +228,7 @@
 *
 *> \date May 2020
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup laqz3
 *>
 *  =====================================================================
       RECURSIVE SUBROUTINE SLAQZ3( ILSCHUR, ILQ, ILZ, N, ILO, IHI, NW,
@@ -258,9 +258,9 @@
       REAL :: S, SMLNUM, ULP, SAFMIN, SAFMAX, C1, S1, TEMP
 
 *     External Functions
-      EXTERNAL :: XERBLA, STGEXC, SLABAD, SLAQZ0, SLACPY, SLASET,
+      EXTERNAL :: XERBLA, STGEXC, SLAQZ0, SLACPY, SLASET,
      $            SLAQZ2, SROT, SLARTG, SLAG2, SGEMM
-      REAL, EXTERNAL :: SLAMCH
+      REAL, EXTERNAL :: SLAMCH, SROUNDUP_LWORK
 
       INFO = 0
 
@@ -286,7 +286,7 @@
       LWORKREQ = MAX( LWORKREQ, N*NW, 2*NW**2+N )
       IF ( LWORK .EQ.-1 ) THEN
 *        workspace query, quick return
-         WORK( 1 ) = LWORKREQ
+         WORK( 1 ) = SROUNDUP_LWORK(LWORKREQ)
          RETURN
       ELSE IF ( LWORK .LT. LWORKREQ ) THEN
          INFO = -26
@@ -300,7 +300,6 @@
 *     Get machine constants
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N )/ULP )
 
diff --git a/lapack-netlib/SRC/slaqz4.f b/lapack-netlib/SRC/slaqz4.f
index 3c307dd47..95b2784c5 100644
--- a/lapack-netlib/SRC/slaqz4.f
+++ b/lapack-netlib/SRC/slaqz4.f
@@ -204,7 +204,7 @@
 *
 *> \date May 2020
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup laqz4
 *>
 *  =====================================================================
       SUBROUTINE SLAQZ4( ILSCHUR, ILQ, ILZ, N, ILO, IHI, NSHIFTS,
@@ -236,6 +236,7 @@
 *     External functions
       EXTERNAL :: XERBLA, SGEMM, SLAQZ1, SLAQZ2, SLASET, SLARTG, SROT,
      $            SLACPY
+      REAL, EXTERNAL :: SROUNDUP_LWORK
 
       INFO = 0
       IF ( NBLOCK_DESIRED .LT. NSHIFTS+1 ) THEN
@@ -243,7 +244,7 @@
       END IF
       IF ( LWORK .EQ.-1 ) THEN
 *        workspace query, quick return
-         WORK( 1 ) = N*NBLOCK_DESIRED
+         WORK( 1 ) = SROUNDUP_LWORK(N*NBLOCK_DESIRED)
          RETURN
       ELSE IF ( LWORK .LT. N*NBLOCK_DESIRED ) THEN
          INFO = -25
diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f
index 95e0ddcce..685f823a0 100644
--- a/lapack-netlib/SRC/slaswlq.f
+++ b/lapack-netlib/SRC/slaswlq.f
@@ -159,6 +159,8 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup laswlq
+*>
 *  =====================================================================
       SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
      $                  INFO)
@@ -183,7 +185,8 @@
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA
 *     .. INTRINSIC FUNCTIONS ..
@@ -262,7 +265,7 @@
      $                  WORK, INFO )
        END IF
 *
-      WORK( 1 ) = M * MB
+      WORK( 1 ) = SROUNDUP_LWORK(M * MB)
       RETURN
 *
 *     End of SLASWLQ
diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f
index b1a5c03a2..46f4ab130 100644
--- a/lapack-netlib/SRC/sorgbr.f
+++ b/lapack-netlib/SRC/sorgbr.f
@@ -150,7 +150,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGBcomputational
+*> \ingroup ungbr
 *
 *  =====================================================================
       SUBROUTINE SORGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -179,7 +179,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORGLQ, SORGQR, XERBLA
@@ -240,7 +241,7 @@
          CALL XERBLA( 'SORGBR', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -326,7 +327,7 @@
             END IF
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORGBR

From 2ce67e2ada4f4d5033f83e860c2857db71eb3965 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 14:42:52 +0100
Subject: [PATCH 406/718] Apply ROUNDUP_LWORK (Reference-LAPACK PR 904)

---
 lapack-netlib/SRC/sorghr.f           |   9 ++-
 lapack-netlib/SRC/sorglq.f           |   9 ++-
 lapack-netlib/SRC/sorgql.f           |   9 ++-
 lapack-netlib/SRC/sorgqr.f           |   9 ++-
 lapack-netlib/SRC/sorgrq.f           |   9 ++-
 lapack-netlib/SRC/sorgtr.f           |   9 ++-
 lapack-netlib/SRC/sorgtsqr.f         |  14 ++--
 lapack-netlib/SRC/sorgtsqr_row.f     |  14 ++--
 lapack-netlib/SRC/sorm22.f           |  11 +--
 lapack-netlib/SRC/sormbr.f           |   9 ++-
 lapack-netlib/SRC/sormhr.f           |   9 ++-
 lapack-netlib/SRC/sormlq.f           |   9 ++-
 lapack-netlib/SRC/sormql.f           |   9 ++-
 lapack-netlib/SRC/sormqr.f           |   9 ++-
 lapack-netlib/SRC/sormrq.f           |   9 ++-
 lapack-netlib/SRC/sormrz.f           |   9 ++-
 lapack-netlib/SRC/sormtr.f           |   9 ++-
 lapack-netlib/SRC/ssbev_2stage.f     |  25 ++++---
 lapack-netlib/SRC/ssbevd.f           |  10 +--
 lapack-netlib/SRC/ssbevd_2stage.f    |  21 +++---
 lapack-netlib/SRC/ssbevx_2stage.f    |  23 +++---
 lapack-netlib/SRC/ssbgvd.f           |   9 ++-
 lapack-netlib/SRC/sspevd.f           |  10 +--
 lapack-netlib/SRC/sspgvd.f           |   9 ++-
 lapack-netlib/SRC/sstedc.f           |  10 +--
 lapack-netlib/SRC/sstemr.f           |   8 +-
 lapack-netlib/SRC/sstevd.f           |  10 +--
 lapack-netlib/SRC/sstevr.f           |  10 +--
 lapack-netlib/SRC/ssyev.f            |  10 +--
 lapack-netlib/SRC/ssyev_2stage.f     |  27 +++----
 lapack-netlib/SRC/ssyevd.f           |  10 +--
 lapack-netlib/SRC/ssyevr.f           |  10 +--
 lapack-netlib/SRC/ssyevr_2stage.f    |  27 +++----
 lapack-netlib/SRC/ssyevx.f           |  12 +--
 lapack-netlib/SRC/ssyevx_2stage.f    |  25 ++++---
 lapack-netlib/SRC/ssygv.f            |   9 ++-
 lapack-netlib/SRC/ssygv_2stage.f     |  19 ++---
 lapack-netlib/SRC/ssygvd.f           |   9 ++-
 lapack-netlib/SRC/ssygvx.f           |   9 ++-
 lapack-netlib/SRC/ssysv.f            |   9 ++-
 lapack-netlib/SRC/ssysv_aa.f         |   9 ++-
 lapack-netlib/SRC/ssysv_aa_2stage.f  |   7 +-
 lapack-netlib/SRC/ssysv_rk.f         |   9 ++-
 lapack-netlib/SRC/ssysv_rook.f       |   9 ++-
 lapack-netlib/SRC/ssysvx.f           |  10 +--
 lapack-netlib/SRC/ssytrd.f           |   9 ++-
 lapack-netlib/SRC/ssytrd_sb2st.F     | 107 ++++++++++++++-------------
 lapack-netlib/SRC/ssytrd_sy2sb.f     |   9 ++-
 lapack-netlib/SRC/ssytrf.f           |   9 ++-
 lapack-netlib/SRC/ssytrf_aa.f        |   9 ++-
 lapack-netlib/SRC/ssytrf_aa_2stage.f |   7 +-
 lapack-netlib/SRC/ssytrf_rk.f        |   9 ++-
 lapack-netlib/SRC/ssytrf_rook.f      |   9 ++-
 lapack-netlib/SRC/ssytri_3.f         |   9 ++-
 lapack-netlib/SRC/ssytrs_aa.f        |   7 +-
 lapack-netlib/SRC/stgexc.f           |   8 +-
 lapack-netlib/SRC/stgsen.f           |  10 +--
 lapack-netlib/SRC/stgsna.f           |  11 +--
 lapack-netlib/SRC/stgsyl.f           |   9 ++-
 lapack-netlib/SRC/strsen.f           |  10 +--
 lapack-netlib/SRC/stzrzf.f           |   9 ++-
 61 files changed, 414 insertions(+), 357 deletions(-)

diff --git a/lapack-netlib/SRC/sorghr.f b/lapack-netlib/SRC/sorghr.f
index f65cd898c..624ede282 100644
--- a/lapack-netlib/SRC/sorghr.f
+++ b/lapack-netlib/SRC/sorghr.f
@@ -119,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unghr
 *
 *  =====================================================================
       SUBROUTINE SORGHR( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO )
@@ -150,7 +150,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -177,7 +178,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'SORGQR', ' ', NH, NH, NH, -1 )
          LWKOPT = MAX( 1, NH )*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -229,7 +230,7 @@
          CALL SORGQR( NH, NH, NH, A( ILO+1, ILO+1 ), LDA, TAU( ILO ),
      $                WORK, LWORK, IINFO )
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORGHR
diff --git a/lapack-netlib/SRC/sorglq.f b/lapack-netlib/SRC/sorglq.f
index b1d107964..30f6d5d48 100644
--- a/lapack-netlib/SRC/sorglq.f
+++ b/lapack-netlib/SRC/sorglq.f
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unglq
 *
 *  =====================================================================
       SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -155,7 +155,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -164,7 +165,7 @@
       INFO = 0
       NB = ILAENV( 1, 'SORGLQ', ' ', M, N, K, -1 )
       LWKOPT = MAX( 1, M )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -278,7 +279,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SORGLQ
diff --git a/lapack-netlib/SRC/sorgql.f b/lapack-netlib/SRC/sorgql.f
index 34ab5edef..f104e64b2 100644
--- a/lapack-netlib/SRC/sorgql.f
+++ b/lapack-netlib/SRC/sorgql.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ungql
 *
 *  =====================================================================
       SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,7 +182,7 @@
             NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
@@ -285,7 +286,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SORGQL
diff --git a/lapack-netlib/SRC/sorgqr.f b/lapack-netlib/SRC/sorgqr.f
index 056de54d7..a87ea6c65 100644
--- a/lapack-netlib/SRC/sorgqr.f
+++ b/lapack-netlib/SRC/sorgqr.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ungqr
 *
 *  =====================================================================
       SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -165,7 +166,7 @@
       INFO = 0
       NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 )
       LWKOPT = MAX( 1, N )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -279,7 +280,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SORGQR
diff --git a/lapack-netlib/SRC/sorgrq.f b/lapack-netlib/SRC/sorgrq.f
index d9b6ccbe6..331f20904 100644
--- a/lapack-netlib/SRC/sorgrq.f
+++ b/lapack-netlib/SRC/sorgrq.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ungrq
 *
 *  =====================================================================
       SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,7 +182,7 @@
             NB = ILAENV( 1, 'SORGRQ', ' ', M, N, K, -1 )
             LWKOPT = M*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
@@ -285,7 +286,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SORGRQ
diff --git a/lapack-netlib/SRC/sorgtr.f b/lapack-netlib/SRC/sorgtr.f
index 67bde00cb..6a1dc3034 100644
--- a/lapack-netlib/SRC/sorgtr.f
+++ b/lapack-netlib/SRC/sorgtr.f
@@ -116,7 +116,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ungtr
 *
 *  =====================================================================
       SUBROUTINE SORGTR( UPLO, N, A, LDA, TAU, WORK, LWORK, INFO )
@@ -146,7 +146,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORGQL, SORGQR, XERBLA
@@ -178,7 +179,7 @@
            NB = ILAENV( 1, 'SORGQR', ' ', N-1, N-1, N-1, -1 )
          END IF
          LWKOPT = MAX( 1, N-1 )*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -244,7 +245,7 @@
      $                   LWORK, IINFO )
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORGTR
diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f
index 692eba1d9..0be27af77 100644
--- a/lapack-netlib/SRC/sorgtsqr.f
+++ b/lapack-netlib/SRC/sorgtsqr.f
@@ -157,7 +157,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleOTHERcomputational
+*> \ingroup ungtsqr
 *
 *> \par Contributors:
 *  ==================
@@ -196,11 +196,15 @@
       LOGICAL            LQUERY
       INTEGER            IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLAMTSQR, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -257,14 +261,14 @@
          CALL XERBLA( 'SORGTSQR', -INFO )
          RETURN
       ELSE IF ( LQUERY ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
       IF( MIN( M, N ).EQ.0 ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
@@ -297,7 +301,7 @@
          CALL SCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 )
       END DO
 *
-      WORK( 1 ) = REAL( LWORKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
       RETURN
 *
 *     End of SORGTSQR
diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f
index d2a2150cd..5a1e1ff07 100644
--- a/lapack-netlib/SRC/sorgtsqr_row.f
+++ b/lapack-netlib/SRC/sorgtsqr_row.f
@@ -169,7 +169,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup sigleOTHERcomputational
+*> \ingroup ungtsqr_row
 *
 *> \par Contributors:
 *  ==================
@@ -213,11 +213,15 @@
 *     .. Local Arrays ..
       REAL               DUMMY( 1, 1 )
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB_GETT, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -255,14 +259,14 @@
          CALL XERBLA( 'SORGTSQR_ROW', -INFO )
          RETURN
       ELSE IF ( LQUERY ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
       IF( MIN( M, N ).EQ.0 ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
@@ -371,7 +375,7 @@
 *
       END DO
 *
-      WORK( 1 ) = REAL( LWORKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
       RETURN
 *
 *     End of SORGTSQR_ROW
diff --git a/lapack-netlib/SRC/sorm22.f b/lapack-netlib/SRC/sorm22.f
index 15096870a..886adb2cf 100644
--- a/lapack-netlib/SRC/sorm22.f
+++ b/lapack-netlib/SRC/sorm22.f
@@ -155,7 +155,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unm22
 *
 *  =====================================================================
       SUBROUTINE SORM22( SIDE, TRANS, M, N, N1, N2, Q, LDQ, C, LDC,
@@ -187,13 +187,14 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, STRMM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -237,7 +238,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          LWKOPT = M*N
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -430,7 +431,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SORM22
diff --git a/lapack-netlib/SRC/sormbr.f b/lapack-netlib/SRC/sormbr.f
index efe5be41a..e2dccc363 100644
--- a/lapack-netlib/SRC/sormbr.f
+++ b/lapack-netlib/SRC/sormbr.f
@@ -188,7 +188,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmbr
 *
 *  =====================================================================
       SUBROUTINE SORMBR( VECT, SIDE, TRANS, M, N, K, A, LDA, TAU, C,
@@ -217,7 +217,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORMLQ, SORMQR, XERBLA
@@ -285,7 +286,7 @@
             END IF
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -363,7 +364,7 @@
      $                   TAU, C( I1, I2 ), LDC, WORK, LWORK, IINFO )
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMBR
diff --git a/lapack-netlib/SRC/sormhr.f b/lapack-netlib/SRC/sormhr.f
index 2d2053af4..e033feb38 100644
--- a/lapack-netlib/SRC/sormhr.f
+++ b/lapack-netlib/SRC/sormhr.f
@@ -171,7 +171,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmhr
 *
 *  =====================================================================
       SUBROUTINE SORMHR( SIDE, TRANS, M, N, ILO, IHI, A, LDA, TAU, C,
@@ -199,7 +199,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORMQR, XERBLA
@@ -253,7 +254,7 @@
             NB = ILAENV( 1, 'SORMQR', SIDE // TRANS, M, NH, NH, -1 )
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -285,7 +286,7 @@
       CALL SORMQR( SIDE, TRANS, MI, NI, NH, A( ILO+1, ILO ), LDA,
      $             TAU( ILO ), C( I1, I2 ), LDC, WORK, LWORK, IINFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMHR
diff --git a/lapack-netlib/SRC/sormlq.f b/lapack-netlib/SRC/sormlq.f
index ee996e560..1a32568b6 100644
--- a/lapack-netlib/SRC/sormlq.f
+++ b/lapack-netlib/SRC/sormlq.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmlq
 *
 *  =====================================================================
       SUBROUTINE SORMLQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -195,7 +195,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB, SLARFT, SORML2, XERBLA
@@ -246,7 +247,7 @@
          NB = MIN( NBMAX, ILAENV( 1, 'SORMLQ', SIDE // TRANS, M, N, K,
      $             -1 ) )
          LWKOPT = NW*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -338,7 +339,7 @@
      $                   C( IC, JC ), LDC, WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMLQ
diff --git a/lapack-netlib/SRC/sormql.f b/lapack-netlib/SRC/sormql.f
index 72a8d22ee..9564d4141 100644
--- a/lapack-netlib/SRC/sormql.f
+++ b/lapack-netlib/SRC/sormql.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmql
 *
 *  =====================================================================
       SUBROUTINE SORMQL( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -194,7 +194,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB, SLARFT, SORM2L, XERBLA
@@ -249,7 +250,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -330,7 +331,7 @@
      $                   WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMQL
diff --git a/lapack-netlib/SRC/sormqr.f b/lapack-netlib/SRC/sormqr.f
index 5d4256f09..adb1203df 100644
--- a/lapack-netlib/SRC/sormqr.f
+++ b/lapack-netlib/SRC/sormqr.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmqr
 *
 *  =====================================================================
       SUBROUTINE SORMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -194,7 +194,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB, SLARFT, SORM2R, XERBLA
@@ -245,7 +246,7 @@
          NB = MIN( NBMAX, ILAENV( 1, 'SORMQR', SIDE // TRANS, M, N, K,
      $        -1 ) )
          LWKOPT = NW*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -331,7 +332,7 @@
      $                   C( IC, JC ), LDC, WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMQR
diff --git a/lapack-netlib/SRC/sormrq.f b/lapack-netlib/SRC/sormrq.f
index 62fcdacdb..f091f0507 100644
--- a/lapack-netlib/SRC/sormrq.f
+++ b/lapack-netlib/SRC/sormrq.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmrq
 *
 *  =====================================================================
       SUBROUTINE SORMRQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -195,7 +195,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB, SLARFT, SORMR2, XERBLA
@@ -250,7 +251,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -337,7 +338,7 @@
      $                   WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMRQ
diff --git a/lapack-netlib/SRC/sormrz.f b/lapack-netlib/SRC/sormrz.f
index cdadd62b5..b037a984b 100644
--- a/lapack-netlib/SRC/sormrz.f
+++ b/lapack-netlib/SRC/sormrz.f
@@ -168,7 +168,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmrz
 *
 *> \par Contributors:
 *  ==================
@@ -213,7 +213,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARZB, SLARZT, SORMR3, XERBLA
@@ -271,7 +272,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -367,7 +368,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sormtr.f b/lapack-netlib/SRC/sormtr.f
index 3ba749fee..1bc87768f 100644
--- a/lapack-netlib/SRC/sormtr.f
+++ b/lapack-netlib/SRC/sormtr.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmtr
 *
 *  =====================================================================
       SUBROUTINE SORMTR( SIDE, UPLO, TRANS, M, N, A, LDA, TAU, C, LDC,
@@ -192,7 +192,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORMQL, SORMQR, XERBLA
@@ -256,7 +257,7 @@
             END IF
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -301,7 +302,7 @@
          CALL SORMQR( SIDE, TRANS, MI, NI, NQ-1, A( 2, 1 ), LDA, TAU,
      $                C( I1, I2 ), LDC, WORK, LWORK, IINFO )
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMTR
diff --git a/lapack-netlib/SRC/ssbev_2stage.f b/lapack-netlib/SRC/ssbev_2stage.f
index 5752c1ecc..71ace4e27 100644
--- a/lapack-netlib/SRC/ssbev_2stage.f
+++ b/lapack-netlib/SRC/ssbev_2stage.f
@@ -131,7 +131,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS + N
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -182,7 +182,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -190,11 +190,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -232,12 +232,13 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSB
-      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE
+      REAL               SLAMCH, SLANSB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASCL, SSCAL, SSTEQR, SSTERF, XERBLA,
-     $                   SSYTRD_SB2ST 
+     $                   SSYTRD_SB2ST
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          SQRT
@@ -268,7 +269,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             IB    = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', JOBZ,
      $                            N, KD, -1, -1 )
@@ -277,7 +278,7 @@
             LWTRD = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = N + LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          ENDIF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -343,7 +344,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
-     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, call SSTEQR.
@@ -368,7 +369,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssbevd.f b/lapack-netlib/SRC/ssbevd.f
index e87f9a030..e4118dbed 100644
--- a/lapack-netlib/SRC/ssbevd.f
+++ b/lapack-netlib/SRC/ssbevd.f
@@ -179,7 +179,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbevd
 *
 *  =====================================================================
       SUBROUTINE SSBEVD( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, WORK,
@@ -213,8 +213,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANSB
-      EXTERNAL           LSAME, SLAMCH, SLANSB
+      REAL               SLAMCH, SLANSB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSB, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SLASCL, SSBTRD, SSCAL, SSTEDC,
@@ -259,7 +259,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -342,7 +342,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssbevd_2stage.f b/lapack-netlib/SRC/ssbevd_2stage.f
index 014bade48..de3f1c010 100644
--- a/lapack-netlib/SRC/ssbevd_2stage.f
+++ b/lapack-netlib/SRC/ssbevd_2stage.f
@@ -134,7 +134,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS + N
@@ -188,7 +188,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbevd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -206,7 +206,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -214,11 +214,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -258,8 +258,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSB
-      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE
+      REAL               SLAMCH, SLANSB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SLASCL, SSCAL, SSTEDC,
@@ -307,7 +308,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 )  = LWMIN
+         WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -374,7 +375,7 @@
       LLWRK2  = LWORK - INDWK2 + 1
 *
       CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
-     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, call SSTEDC.
@@ -394,7 +395,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssbevx_2stage.f b/lapack-netlib/SRC/ssbevx_2stage.f
index 224b676b7..d25d3639a 100644
--- a/lapack-netlib/SRC/ssbevx_2stage.f
+++ b/lapack-netlib/SRC/ssbevx_2stage.f
@@ -235,7 +235,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 7*N, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS + 2*N
@@ -281,7 +281,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbevx_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -299,7 +299,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -307,11 +307,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -357,8 +357,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSB
-      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE
+      REAL               SLAMCH, SLANSB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SGEMV, SLACPY, SLASCL, SSCAL,
@@ -414,7 +415,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             IB    = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', JOBZ,
      $                            N, KD, -1, -1 )
@@ -423,7 +424,7 @@
             LWTRD = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = 2*N + LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          ENDIF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -513,7 +514,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ),
-     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     If all eigenvalues are desired and ABSTOL is less than or equal
@@ -624,7 +625,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssbgvd.f b/lapack-netlib/SRC/ssbgvd.f
index 7c21ee455..f872e5464 100644
--- a/lapack-netlib/SRC/ssbgvd.f
+++ b/lapack-netlib/SRC/ssbgvd.f
@@ -208,7 +208,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbgvd
 *
 *> \par Contributors:
 *  ==================
@@ -247,7 +247,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SPBSTF, SSBGST, SSBTRD, SSTEDC,
@@ -292,7 +293,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -353,7 +354,7 @@
          CALL SLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sspevd.f b/lapack-netlib/SRC/sspevd.f
index 0872e95ac..1aae48d1d 100644
--- a/lapack-netlib/SRC/sspevd.f
+++ b/lapack-netlib/SRC/sspevd.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hpevd
 *
 *  =====================================================================
       SUBROUTINE SSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK,
@@ -198,8 +198,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANSP
-      EXTERNAL           LSAME, SLAMCH, SLANSP
+      REAL               SLAMCH, SLANSP, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSP, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SOPMTR, SSCAL, SSPTRD, SSTEDC, SSTERF, XERBLA
@@ -240,7 +240,7 @@
             END IF
          END IF
          IWORK( 1 ) = LIWMIN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -9
@@ -319,7 +319,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f
index 1a88365f2..c1e14594b 100644
--- a/lapack-netlib/SRC/sspgvd.f
+++ b/lapack-netlib/SRC/sspgvd.f
@@ -191,7 +191,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hpgvd
 *
 *> \par Contributors:
 *  ==================
@@ -225,7 +225,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPPTRF, SSPEVD, SSPGST, STPMV, STPSV, XERBLA
@@ -267,7 +268,7 @@
                LWMIN = 2*N
             END IF
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -11
@@ -345,7 +346,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sstedc.f b/lapack-netlib/SRC/sstedc.f
index 61e3c2fda..0e1cb4258 100644
--- a/lapack-netlib/SRC/sstedc.f
+++ b/lapack-netlib/SRC/sstedc.f
@@ -167,7 +167,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup auxOTHERcomputational
+*> \ingroup stedc
 *
 *> \par Contributors:
 *  ==================
@@ -208,8 +208,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANST
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SLAED0, SLASCL, SLASET, SLASRT,
@@ -268,7 +268,7 @@
                LIWMIN = 3 + 5*N
             END IF
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT. LQUERY ) THEN
@@ -463,7 +463,7 @@
       END IF
 *
    50 CONTINUE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f
index 2ed697b69..62cfa3d4d 100644
--- a/lapack-netlib/SRC/sstemr.f
+++ b/lapack-netlib/SRC/sstemr.f
@@ -359,8 +359,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANST
-      EXTERNAL           LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLAE2, SLAEV2, SLARRC, SLARRE, SLARRJ,
@@ -443,7 +443,7 @@
       RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( WANTZ .AND. ALLEIG ) THEN
@@ -782,7 +782,7 @@
       ENDIF
 *
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/sstevd.f b/lapack-netlib/SRC/sstevd.f
index 218af8c76..4fc2a6311 100644
--- a/lapack-netlib/SRC/sstevd.f
+++ b/lapack-netlib/SRC/sstevd.f
@@ -149,7 +149,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup stevd
 *
 *  =====================================================================
       SUBROUTINE SSTEVD( JOBZ, N, D, E, Z, LDZ, WORK, LWORK, IWORK,
@@ -182,8 +182,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANST
-      EXTERNAL           LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSCAL, SSTEDC, SSTERF, XERBLA
@@ -215,7 +215,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -283,7 +283,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, D, 1 )
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sstevr.f b/lapack-netlib/SRC/sstevr.f
index 2ab63eb3e..42f49b11b 100644
--- a/lapack-netlib/SRC/sstevr.f
+++ b/lapack-netlib/SRC/sstevr.f
@@ -287,7 +287,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup stevr
 *
 *> \par Contributors:
 *  ==================
@@ -336,8 +336,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANST
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTEMR, SSTEIN, SSTERF,
@@ -389,7 +389,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -570,7 +570,7 @@
 *      IF (wantz .and. INDEIG ) Z( 1,1) = Z(1,1) / 1.002 + .002
 *
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssyev.f b/lapack-netlib/SRC/ssyev.f
index 03ed326b8..638445f04 100644
--- a/lapack-netlib/SRC/ssyev.f
+++ b/lapack-netlib/SRC/ssyev.f
@@ -125,7 +125,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heev
 *
 *  =====================================================================
       SUBROUTINE SSYEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO )
@@ -158,8 +158,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF, SSYTRD,
@@ -190,7 +190,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, ( NB+2 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, 3*N-1 ) .AND. .NOT.LQUERY )
      $      INFO = -8
@@ -274,7 +274,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssyev_2stage.f b/lapack-netlib/SRC/ssyev_2stage.f
index a6fa30cc8..519ee334d 100644
--- a/lapack-netlib/SRC/ssyev_2stage.f
+++ b/lapack-netlib/SRC/ssyev_2stage.f
@@ -20,7 +20,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+*       SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
 *                                INFO )
 *
 *       IMPLICIT NONE
@@ -105,12 +105,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 2*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -143,7 +143,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -161,7 +161,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -169,16 +169,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
 *  =====================================================================
-      SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+      SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
      $                         INFO )
 *
       IMPLICIT NONE
@@ -211,8 +211,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV2STAGE
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF,
@@ -305,7 +306,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
-     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, first call
@@ -336,7 +337,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f
index ee0e33384..a5e4638d6 100644
--- a/lapack-netlib/SRC/ssyevd.f
+++ b/lapack-netlib/SRC/ssyevd.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevd
 *
 *> \par Contributors:
 *  ==================
@@ -204,8 +204,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLACPY, SLASCL, SORMTR, SSCAL, SSTEDC, SSTERF,
@@ -251,7 +251,7 @@
      $                  N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) )
             LIOPT = LIWMIN
          END IF
-         WORK( 1 ) = LOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LOPT)
          IWORK( 1 ) = LIOPT
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -335,7 +335,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = LOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LOPT)
       IWORK( 1 ) = LIOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f
index d8e4ce3ea..47e4d7cbf 100644
--- a/lapack-netlib/SRC/ssyevr.f
+++ b/lapack-netlib/SRC/ssyevr.f
@@ -317,7 +317,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevr
 *
 *> \par Contributors:
 *  ==================
@@ -368,8 +368,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN,
@@ -428,7 +428,7 @@
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
          LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -677,7 +677,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f
index 8ab2844c6..a2d6a6231 100644
--- a/lapack-netlib/SRC/ssyevr_2stage.f
+++ b/lapack-netlib/SRC/ssyevr_2stage.f
@@ -263,7 +263,7 @@
 *>          indicating the nonzero elements in Z. The i-th eigenvector
 *>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
 *>          ISUPPZ( 2*i ). This is an output of SSTEMR (tridiagonal
-*>          matrix). The support of the eigenvectors of A is typically 
+*>          matrix). The support of the eigenvectors of A is typically
 *>          1:N because of the orthogonal transformations applied by SORMTR.
 *>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
 *> \endverbatim
@@ -277,12 +277,12 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  
+*>          The dimension of the array WORK.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 5*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 5*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -330,7 +330,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevr_2stage
 *
 *> \par Contributors:
 *  ==================
@@ -358,7 +358,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -366,11 +366,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -416,8 +416,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV, ILAENV2STAGE
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV, ILAENV2STAGE
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK, ILAENV,
+     $                   ILAENV2STAGE
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN,
@@ -484,7 +485,7 @@
 *         NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
 *         NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
 *         LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
       END IF
 *
@@ -608,7 +609,7 @@
 *     Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
 *
 *
-      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
      $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
      $                    LHTRD, WORK( INDWK ), LLWORK, IINFO )
 *
@@ -732,7 +733,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f
index 11776e8c5..2204aa39b 100644
--- a/lapack-netlib/SRC/ssyevx.f
+++ b/lapack-netlib/SRC/ssyevx.f
@@ -244,7 +244,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevx
 *
 *  =====================================================================
       SUBROUTINE SSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU,
@@ -285,8 +285,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ,
@@ -338,13 +338,13 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWKMIN = 1
-            WORK( 1 ) = LWKMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWKMIN)
          ELSE
             LWKMIN = 8*N
             NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
             NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
             LWKOPT = MAX( LWKMIN, ( NB + 3 )*N )
-            WORK( 1 ) = LWKOPT
+            WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          END IF
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
@@ -542,7 +542,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssyevx_2stage.f b/lapack-netlib/SRC/ssyevx_2stage.f
index 1a2225c87..a8585e5f7 100644
--- a/lapack-netlib/SRC/ssyevx_2stage.f
+++ b/lapack-netlib/SRC/ssyevx_2stage.f
@@ -208,12 +208,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 8*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 3*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 3*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -259,7 +259,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevx_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -277,7 +277,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -285,11 +285,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -334,8 +334,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV2STAGE
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ,
@@ -388,7 +389,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             KD    = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', JOBZ,
      $                            N, -1, -1, -1 )
@@ -487,7 +488,7 @@
       INDWRK  = INDHOUS + LHTRD
       LLWORK  = LWORK - INDWRK + 1
 *
-      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
      $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
      $                    LHTRD, WORK( INDWRK ), LLWORK, IINFO )
 *
@@ -600,7 +601,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssygv.f b/lapack-netlib/SRC/ssygv.f
index f39947d92..3a79f5431 100644
--- a/lapack-netlib/SRC/ssygv.f
+++ b/lapack-netlib/SRC/ssygv.f
@@ -167,7 +167,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup hegv
 *
 *  =====================================================================
       SUBROUTINE SSYGV( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK,
@@ -199,7 +199,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPOTRF, SSYEV, SSYGST, STRMM, STRSM, XERBLA
@@ -234,7 +235,7 @@
          LWKMIN = MAX( 1, 3*N - 1 )
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( LWKMIN, ( NB + 2 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -11
@@ -303,7 +304,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SSYGV
diff --git a/lapack-netlib/SRC/ssygv_2stage.f b/lapack-netlib/SRC/ssygv_2stage.f
index 3d9a44b5e..8719d8c7a 100644
--- a/lapack-netlib/SRC/ssygv_2stage.f
+++ b/lapack-netlib/SRC/ssygv_2stage.f
@@ -143,12 +143,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 2*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -186,7 +186,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup hegv_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -204,7 +204,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -212,11 +212,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -252,7 +252,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPOTRF, SSYGST, STRMM, STRSM, XERBLA,
@@ -359,7 +360,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of SSYGV_2STAGE
diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f
index 3c8bd2a0e..a90d1afb7 100644
--- a/lapack-netlib/SRC/ssygvd.f
+++ b/lapack-netlib/SRC/ssygvd.f
@@ -197,7 +197,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup hegvd
 *
 *> \par Further Details:
 *  =====================
@@ -245,7 +245,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPOTRF, SSYEVD, SSYGST, STRMM, STRSM, XERBLA
@@ -289,7 +290,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LOPT)
          IWORK( 1 ) = LIOPT
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -361,7 +362,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LOPT)
       IWORK( 1 ) = LIOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssygvx.f b/lapack-netlib/SRC/ssygvx.f
index 344075c9f..16adefa22 100644
--- a/lapack-netlib/SRC/ssygvx.f
+++ b/lapack-netlib/SRC/ssygvx.f
@@ -283,7 +283,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup hegvx
 *
 *> \par Contributors:
 *  ==================
@@ -324,7 +324,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPOTRF, SSYEVX, SSYGST, STRMM, STRSM, XERBLA
@@ -380,7 +381,7 @@
          LWKMIN = MAX( 1, 8*N )
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( LWKMIN, ( NB + 3 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -453,7 +454,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv.f b/lapack-netlib/SRC/ssysv.f
index 06a42dfb7..523ea66c1 100644
--- a/lapack-netlib/SRC/ssysv.f
+++ b/lapack-netlib/SRC/ssysv.f
@@ -163,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesv
 *
 *  =====================================================================
       SUBROUTINE SSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -190,7 +190,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRF, SSYTRS, SSYTRS2
@@ -225,7 +226,7 @@
             CALL SSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -258,7 +259,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f
index 5661332c5..e43d4de7f 100644
--- a/lapack-netlib/SRC/ssysv_aa.f
+++ b/lapack-netlib/SRC/ssysv_aa.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE SSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -181,7 +181,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRS_AA, SSYTRF_AA
@@ -216,7 +217,7 @@
      $                   -1, INFO )
          LWKOPT_SYTRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -238,7 +239,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f
index aa862f14b..3d88e068e 100644
--- a/lapack-netlib/SRC/ssysv_aa_2stage.f
+++ b/lapack-netlib/SRC/ssysv_aa_2stage.f
@@ -178,7 +178,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE SSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -208,7 +208,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE,
@@ -268,7 +269,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_rk.f b/lapack-netlib/SRC/ssysv_rk.f
index 9a7dfa4bb..abf862d66 100644
--- a/lapack-netlib/SRC/ssysv_rk.f
+++ b/lapack-netlib/SRC/ssysv_rk.f
@@ -205,7 +205,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleSYsolve
+*> \ingroup hesv_rk
 *
 *> \par Contributors:
 *  ==================
@@ -247,7 +247,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRF_RK, SSYTRS_3
@@ -282,7 +283,7 @@
             CALL SSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -305,7 +306,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_rook.f b/lapack-netlib/SRC/ssysv_rook.f
index fb7ba8c53..c5c77e562 100644
--- a/lapack-netlib/SRC/ssysv_rook.f
+++ b/lapack-netlib/SRC/ssysv_rook.f
@@ -181,7 +181,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesv_rook
 *
 *> \par Contributors:
 *  ==================
@@ -223,7 +223,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRF_ROOK, SSYTRS_ROOK
@@ -258,7 +259,7 @@
             CALL SSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -281,7 +282,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f
index b19ce2641..0d72217eb 100644
--- a/lapack-netlib/SRC/ssysvx.f
+++ b/lapack-netlib/SRC/ssysvx.f
@@ -275,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE SSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -311,8 +311,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLACPY, SSYCON, SSYRFS, SSYTRF, SSYTRS, XERBLA
@@ -354,7 +354,7 @@
             NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -404,7 +404,7 @@
       IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
      $   INFO = N + 1
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssytrd.f b/lapack-netlib/SRC/ssytrd.f
index f09ad9ab4..f4fbecdc9 100644
--- a/lapack-netlib/SRC/ssytrd.f
+++ b/lapack-netlib/SRC/ssytrd.f
@@ -139,7 +139,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrd
 *
 *> \par Further Details:
 *  =====================
@@ -223,7 +223,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -248,7 +249,7 @@
 *
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -365,7 +366,7 @@
      $                TAU( I ), IINFO )
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SSYTRD
diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F
index b8386670a..32bae26dc 100644
--- a/lapack-netlib/SRC/ssytrd_sb2st.F
+++ b/lapack-netlib/SRC/ssytrd_sb2st.F
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
 *                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *       #if defined(_OPENMP)
@@ -53,12 +53,12 @@
 *> \param[in] STAGE1
 *> \verbatim
 *>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
 *>                  from dense to band using the ssytrd_sy2sb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone. 
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the ssytrd_sy2sb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other term this routine is called as standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the ssytrd_sy2sb
 *>                  routine has been called to produce AB (e.g., AB is
 *>                  the output of ssytrd_sy2sb.
 *> \endverbatim
@@ -66,10 +66,10 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Housholder representation,
 *>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
 *>                  then LHOUS is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -147,7 +147,7 @@
 *>          message related to LHOUS is issued by XERBLA.
 *>          LHOUS = MAX(1, dimension) where
 *>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'     
+*>          not available now if VECT='H'
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -188,7 +188,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup real16OTHERcomputational
+*> \ingroup hetrd_hb2st
 *
 *> \par Further Details:
 *  =====================
@@ -208,7 +208,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -216,16 +216,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+      SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
      $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 #if defined(_OPENMP)
@@ -258,11 +258,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
      $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SISEV, SIZETAU, LDV, LHMIN, LWMIN
 *     ..
@@ -274,8 +274,9 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
-      EXTERNAL           LSAME, ILAENV2STAGE
+      INTEGER            ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -315,7 +316,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          HOUS( 1 ) = LHMIN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -355,7 +356,7 @@
           ABDPOS   = KD + 1
           ABOFDPOS = KD
       ELSE
-          APOS     = INDA 
+          APOS     = INDA
           AWPOS    = INDA + KD + 1
           DPOS     = APOS
           OFDPOS   = DPOS + 1
@@ -363,11 +364,11 @@
           ABOFDPOS = 2
 
       ENDIF
-*      
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     real because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     real because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
 *     in a parallel environment it might need two cores for D and E
 *
       IF( KD.EQ.0 ) THEN
@@ -382,17 +383,17 @@
           WORK( 1 ) = 1
           RETURN
       END IF
-*      
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already Tridiagonal. We have to make diagonal
 *     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is  
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the hermetian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
 *     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-*      
+*     or not this might complicate the story.
+*
       IF( KD.EQ.1 ) THEN
           DO 50 I = 1, N
               D( I ) = ( AB( ABDPOS, I ) )
@@ -413,7 +414,7 @@
           RETURN
       END IF
 *
-*     Main code start here. 
+*     Main code start here.
 *     Reduce the symmetric band of A to a tridiagonal matrix.
 *
       THGRSIZ   = N
@@ -422,7 +423,7 @@
       NBTILES   = CEILING( REAL(N)/REAL(KD) )
       STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
       THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
-*      
+*
       CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
       CALL SLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
 *
@@ -431,7 +432,7 @@
 *
 #if defined(_OPENMP)
 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
-!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
 !$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
 !$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
 !$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
@@ -440,7 +441,7 @@
 #endif
 *
 *     main bulge chasing loop
-*      
+*
       DO 100 THGRID = 1, THGRNB
           STT  = (THGRID-1)*THGRSIZ+1
           THED = MIN( (STT + THGRSIZ -1), (N-1))
@@ -451,7 +452,7 @@
                   ST = STT
                   DO 130 SWEEPID = ST, ED
                       DO 140 K = 1, GRSIZ
-                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
      $                           + (M-1)*GRSIZ + K
                           IF ( MYID.EQ.1 ) THEN
                               TTYPE = 1
@@ -477,16 +478,16 @@
                           ENDIF
 *
 *                         Call the kernel
-*                             
+*
 #if defined(_OPENMP) && _OPENMP >= 201307
-                          IF( TTYPE.NE.1 ) THEN      
+                          IF( TTYPE.NE.1 ) THEN
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(in:WORK(MYID-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
@@ -494,20 +495,20 @@
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
                           ENDIF
 #else
-                          CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                          CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
-     $                         WORK ( INDA ), LDA, 
+     $                         WORK ( INDA ), LDA,
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                         WORK( INDW ) )
-#endif 
+#endif
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1
                               EXIT
@@ -522,14 +523,14 @@
 !$OMP END MASTER
 !$OMP END PARALLEL
 #endif
-*      
+*
 *     Copy the diagonal from A to D. Note that D is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
       DO 150 I = 1, N
           D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
   150 CONTINUE
-*      
+*
 *     Copy the off diagonal from A to E. Note that E is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
@@ -544,10 +545,10 @@
       ENDIF
 *
       HOUS( 1 ) = LHMIN
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of SSYTRD_SB2ST
 *
       END
-      
+
diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f
index 2c92cd14a..4efc43630 100644
--- a/lapack-netlib/SRC/ssytrd_sy2sb.f
+++ b/lapack-netlib/SRC/ssytrd_sy2sb.f
@@ -158,7 +158,7 @@
 *> \author Univ. of Colorado Denver 
 *> \author NAG Ltd. 
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrd_he2hb
 *
 *> \par Further Details:
 *  =====================
@@ -283,7 +283,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE 
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -313,7 +314,7 @@
          CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RETURN
       END IF
 *
@@ -506,7 +507,7 @@
 
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of SSYTRD_SY2SB
diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f
index 31e38e466..a788fbcf0 100644
--- a/lapack-netlib/SRC/ssytrf.f
+++ b/lapack-netlib/SRC/ssytrf.f
@@ -135,7 +135,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +202,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASYF, SSYTF2, XERBLA
@@ -233,7 +234,7 @@
 *
          NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -352,7 +353,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SSYTRF
diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f
index 4ba026fc8..d6408a978 100644
--- a/lapack-netlib/SRC/ssytrf_aa.f
+++ b/lapack-netlib/SRC/ssytrf_aa.f
@@ -125,7 +125,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
       SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
@@ -159,7 +159,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASYF_AA, SGEMV, SSCAL, SCOPY, SSWAP, SGEMM,
@@ -191,7 +192,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          LWKOPT = (NB+1)*N
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -457,7 +458,7 @@
       END IF
 *
    20 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SSYTRF_AA
diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f
index 07357f2ab..abe6564c5 100644
--- a/lapack-netlib/SRC/ssytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -187,7 +187,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SCOPY, SLACPY,
@@ -230,7 +231,7 @@
             TB( 1 ) = (3*NB+1)*N
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = SROUNDUP_LWORK(N*NB)
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f
index 8e1ef460a..72830543c 100644
--- a/lapack-netlib/SRC/ssytrf_rk.f
+++ b/lapack-netlib/SRC/ssytrf_rk.f
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleSYcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -280,7 +280,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASYF_RK, SSYTF2_RK, SSWAP,  XERBLA
@@ -311,7 +312,7 @@
 *
          NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -487,7 +488,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SSYTRF_RK
diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f
index 653289e2b..339a229e7 100644
--- a/lapack-netlib/SRC/ssytrf_rook.f
+++ b/lapack-netlib/SRC/ssytrf_rook.f
@@ -146,7 +146,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
@@ -228,7 +228,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASYF_ROOK, SSYTF2_ROOK, XERBLA
@@ -259,7 +260,7 @@
 *
          NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -382,7 +383,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SSYTRF_ROOK
diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f
index 58d5df92a..bca01105d 100644
--- a/lapack-netlib/SRC/ssytri_3.f
+++ b/lapack-netlib/SRC/ssytri_3.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleSYcomputational
+*> \ingroup hetri_3
 *
 *> \par Contributors:
 *  ==================
@@ -190,7 +190,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSYTRI_3X, XERBLA
@@ -225,7 +226,7 @@
          CALL XERBLA( 'SSYTRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -236,7 +237,7 @@
 *
       CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f
index 3cfa2a206..12fca0c71 100644
--- a/lapack-netlib/SRC/ssytrs_aa.f
+++ b/lapack-netlib/SRC/ssytrs_aa.f
@@ -123,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -155,7 +155,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGTSV, SSWAP, SLACPY, STRSM, XERBLA
@@ -186,7 +187,7 @@
          RETURN
       ELSE IF( LQUERY ) THEN
          LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/stgexc.f b/lapack-netlib/SRC/stgexc.f
index d1ad79936..d68eb5fc7 100644
--- a/lapack-netlib/SRC/stgexc.f
+++ b/lapack-netlib/SRC/stgexc.f
@@ -195,7 +195,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup tgexc
 *
 *> \par Contributors:
 *  ==================
@@ -241,6 +241,10 @@
       LOGICAL            LQUERY
       INTEGER            HERE, LWMIN, NBF, NBL, NBNEXT
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           STGEX2, XERBLA
 *     ..
@@ -533,7 +537,7 @@
      $      GO TO 20
       END IF
       ILST = HERE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of STGEXC
diff --git a/lapack-netlib/SRC/stgsen.f b/lapack-netlib/SRC/stgsen.f
index f1103d740..ac9c4677a 100644
--- a/lapack-netlib/SRC/stgsen.f
+++ b/lapack-netlib/SRC/stgsen.f
@@ -304,7 +304,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup tgsen
 *
 *> \par Further Details:
 *  =====================
@@ -490,8 +490,8 @@
      $                   XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SLAMCH
-      EXTERNAL           SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SIGN, SQRT
@@ -571,7 +571,7 @@
          LIWMIN = 1
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -852,7 +852,7 @@
          END IF
    70 CONTINUE
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/stgsna.f b/lapack-netlib/SRC/stgsna.f
index 430f3c4b7..e8cb28b95 100644
--- a/lapack-netlib/SRC/stgsna.f
+++ b/lapack-netlib/SRC/stgsna.f
@@ -230,7 +230,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup tgsna
 *
 *> \par Further Details:
 *  =====================
@@ -416,8 +416,9 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SDOT, SLAMCH, SLAPY2, SNRM2
-      EXTERNAL           LSAME, SDOT, SLAMCH, SLAPY2, SNRM2
+      REAL               SDOT, SLAMCH, SLAPY2, SNRM2, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SDOT, SLAMCH, SLAPY2, SNRM2,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMV, SLACPY, SLAG2, STGEXC, STGSYL, XERBLA
@@ -490,7 +491,7 @@
          ELSE
             LWMIN = N
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( MM.LT.M ) THEN
             INFO = -15
@@ -689,7 +690,7 @@
      $      KS = KS + 1
 *
    20 CONTINUE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of STGSNA
diff --git a/lapack-netlib/SRC/stgsyl.f b/lapack-netlib/SRC/stgsyl.f
index 733c8ab9c..07a82e380 100644
--- a/lapack-netlib/SRC/stgsyl.f
+++ b/lapack-netlib/SRC/stgsyl.f
@@ -261,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup tgsyl
 *
 *> \par Contributors:
 *  ==================
@@ -331,7 +331,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SLASET, SSCAL, STGSY2, XERBLA
@@ -384,7 +385,7 @@
          ELSE
             LWMIN = 1
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -670,7 +671,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/strsen.f b/lapack-netlib/SRC/strsen.f
index c0f75fb12..f7a05ae8b 100644
--- a/lapack-netlib/SRC/strsen.f
+++ b/lapack-netlib/SRC/strsen.f
@@ -231,7 +231,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup trsen
 *
 *> \par Further Details:
 *  =====================
@@ -346,8 +346,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLANGE
-      EXTERNAL           LSAME, SLANGE
+      REAL               SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLACN2, SLACPY, STREXC, STRSYL, XERBLA
@@ -427,7 +427,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
       END IF
 *
@@ -558,7 +558,7 @@
          END IF
    60 CONTINUE
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/stzrzf.f b/lapack-netlib/SRC/stzrzf.f
index e8cbb56b6..516bea5d4 100644
--- a/lapack-netlib/SRC/stzrzf.f
+++ b/lapack-netlib/SRC/stzrzf.f
@@ -116,7 +116,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup tzrzf
 *
 *> \par Contributors:
 *  ==================
@@ -179,7 +179,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -207,7 +208,7 @@
             LWKOPT = M*NB
             LWKMIN = MAX( 1, M )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -7
@@ -301,7 +302,7 @@
       IF( MU.GT.0 )
      $   CALL SLATRZ( MU, N, N-M, A, LDA, TAU, WORK )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *

From 3d38da2bc401d56162a528547430b0b69664ea38 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 16:50:52 +0100
Subject: [PATCH 407/718] Make vector orthogonalization more reliable
 (Reference-LAPACK PR 930)

---
 lapack-netlib/SRC/clarfgp.f |  7 +++---
 lapack-netlib/SRC/cunbdb5.f | 50 ++++++++++++++++++++++++++-----------
 lapack-netlib/SRC/cunbdb6.f | 21 ++++++++--------
 lapack-netlib/SRC/dlarfgp.f |  9 ++++---
 lapack-netlib/SRC/dorbdb5.f | 50 ++++++++++++++++++++++++++-----------
 lapack-netlib/SRC/dorbdb6.f | 21 ++++++++--------
 lapack-netlib/SRC/slarfgp.f |  7 +++---
 lapack-netlib/SRC/sorbdb5.f | 50 ++++++++++++++++++++++++++-----------
 lapack-netlib/SRC/sorbdb6.f | 21 ++++++++--------
 lapack-netlib/SRC/zlarfgp.f |  7 +++---
 lapack-netlib/SRC/zunbdb5.f | 50 ++++++++++++++++++++++++++-----------
 lapack-netlib/SRC/zunbdb6.f | 21 ++++++++--------
 12 files changed, 201 insertions(+), 113 deletions(-)

diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f
index b584484c7..47b5e47b0 100644
--- a/lapack-netlib/SRC/clarfgp.f
+++ b/lapack-netlib/SRC/clarfgp.f
@@ -97,7 +97,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERauxiliary
+*> \ingroup larfgp
 *
 *  =====================================================================
       SUBROUTINE CLARFGP( N, ALPHA, X, INCX, TAU )
@@ -122,7 +122,7 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            J, KNT
-      REAL               ALPHI, ALPHR, BETA, BIGNUM, SMLNUM, XNORM
+      REAL               ALPHI, ALPHR, BETA, BIGNUM, EPS, SMLNUM, XNORM
       COMPLEX            SAVEALPHA
 *     ..
 *     .. External Functions ..
@@ -143,11 +143,12 @@
          RETURN
       END IF
 *
+      EPS = SLAMCH( 'Precision' )
       XNORM = SCNRM2( N-1, X, INCX )
       ALPHR = REAL( ALPHA )
       ALPHI = AIMAG( ALPHA )
 *
-      IF( XNORM.EQ.ZERO ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN
 *
 *        H  =  [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0.
 *
diff --git a/lapack-netlib/SRC/cunbdb5.f b/lapack-netlib/SRC/cunbdb5.f
index d2ff4e700..22513cf8b 100644
--- a/lapack-netlib/SRC/cunbdb5.f
+++ b/lapack-netlib/SRC/cunbdb5.f
@@ -148,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb5
 *
 *  =====================================================================
       SUBROUTINE CUNBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -169,18 +169,21 @@
 *  =====================================================================
 *
 *     .. Parameters ..
+      REAL               REALZERO
+      PARAMETER          ( REALZERO = 0.0E0 )
       COMPLEX            ONE, ZERO
       PARAMETER          ( ONE = (1.0E0,0.0E0), ZERO = (0.0E0,0.0E0) )
 *     ..
 *     .. Local Scalars ..
       INTEGER            CHILDINFO, I, J
+      REAL               EPS, NORM, SCL, SSQ
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CUNBDB6, XERBLA
+      EXTERNAL           CLASSQ, CUNBDB6, CSCAL, XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SLAMCH, SCNRM2
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          MAX
@@ -213,16 +216,33 @@
          RETURN
       END IF
 *
-*     Project X onto the orthogonal complement of Q
+      EPS = SLAMCH( 'Precision' )
 *
-      CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2,
-     $              WORK, LWORK, CHILDINFO )
+*     Project X onto the orthogonal complement of Q if X is nonzero
 *
-*     If the projection is nonzero, then return
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL CLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL CLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
 *
-      IF( SCNRM2(M1,X1,INCX1) .NE. ZERO
-     $    .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
-         RETURN
+      IF( NORM .GT. N * EPS ) THEN
+*        Scale vector to unit norm to avoid problems in the caller code.
+*        Computing the reciprocal is undesirable but
+*         * xLASCL cannot be used because of the vector increments and
+*         * the round-off error has a negligible impact on
+*           orthogonalization.
+         CALL CSCAL( M1, ONE / NORM, X1, INCX1 )
+         CALL CSCAL( M2, ONE / NORM, X2, INCX2 )
+         CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
+     $              LDQ2, WORK, LWORK, CHILDINFO )
+*
+*        If the projection is nonzero, then return
+*
+         IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Project each standard basis vector e_1,...,e_M1 in turn, stopping
@@ -238,8 +258,8 @@
          END DO
          CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( SCNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
@@ -257,8 +277,8 @@
          X2(I) = ONE
          CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( SCNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f
index cd14d9295..566fd76b7 100644
--- a/lapack-netlib/SRC/cunbdb6.f
+++ b/lapack-netlib/SRC/cunbdb6.f
@@ -41,9 +41,8 @@
 *> with respect to the columns of
 *>      Q = [ Q1 ] .
 *>          [ Q2 ]
-*> The Euclidean norm of X must be one and the columns of Q must be
-*> orthonormal. The orthogonalized vector will be zero if and only if it
-*> lies entirely in the range of Q.
+*> The columns of Q must be orthonormal. The orthogonalized vector will
+*> be zero if and only if it lies entirely in the range of Q.
 *>
 *> The projection is computed with at most two iterations of the
 *> classical Gram-Schmidt algorithm, see
@@ -174,7 +173,7 @@
 *
 *     .. Parameters ..
       REAL               ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.1E0, REALONE = 1.0E0,
+      PARAMETER          ( ALPHA = 0.83E0, REALONE = 1.0E0,
      $                     REALZERO = 0.0E0 )
       COMPLEX            NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0),
@@ -223,14 +222,16 @@
 *
       EPS = SLAMCH( 'Precision' )
 *
+*     Compute the Euclidean norm of X
+*
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL CLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL CLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
+*
 *     First, project X onto the orthogonal complement of Q's column
 *     space
-*
-*     Christoph Conrads: In debugging mode the norm should be computed
-*     and an assertion added comparing the norm with one. Alas, Fortran
-*     never made it into 1989 when assert() was introduced into the C
-*     programming language.
-      NORM = REALONE
 *
       IF( M1 .EQ. 0 ) THEN
          DO I = 1, N
diff --git a/lapack-netlib/SRC/dlarfgp.f b/lapack-netlib/SRC/dlarfgp.f
index 69845056d..a8cf1b31e 100644
--- a/lapack-netlib/SRC/dlarfgp.f
+++ b/lapack-netlib/SRC/dlarfgp.f
@@ -97,7 +97,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup larfgp
 *
 *  =====================================================================
       SUBROUTINE DLARFGP( N, ALPHA, X, INCX, TAU )
@@ -122,7 +122,7 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            J, KNT
-      DOUBLE PRECISION   BETA, BIGNUM, SAVEALPHA, SMLNUM, XNORM
+      DOUBLE PRECISION   BETA, BIGNUM, EPS, SAVEALPHA, SMLNUM, XNORM
 *     ..
 *     .. External Functions ..
       DOUBLE PRECISION   DLAMCH, DLAPY2, DNRM2
@@ -141,11 +141,12 @@
          RETURN
       END IF
 *
+      EPS = DLAMCH( 'Precision' )
       XNORM = DNRM2( N-1, X, INCX )
 *
-      IF( XNORM.EQ.ZERO ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN
 *
-*        H  =  [+/-1, 0; I], sign chosen so ALPHA >= 0
+*        H  =  [+/-1, 0; I], sign chosen so ALPHA >= 0.
 *
          IF( ALPHA.GE.ZERO ) THEN
 *           When TAU.eq.ZERO, the vector is special-cased to be
diff --git a/lapack-netlib/SRC/dorbdb5.f b/lapack-netlib/SRC/dorbdb5.f
index 6e057a05f..cbd58ae54 100644
--- a/lapack-netlib/SRC/dorbdb5.f
+++ b/lapack-netlib/SRC/dorbdb5.f
@@ -148,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup unbdb5
 *
 *  =====================================================================
       SUBROUTINE DORBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -169,18 +169,21 @@
 *  =====================================================================
 *
 *     .. Parameters ..
+      DOUBLE PRECISION   REALZERO
+      PARAMETER          ( REALZERO = 0.0D0 )
       DOUBLE PRECISION   ONE, ZERO
       PARAMETER          ( ONE = 1.0D0, ZERO = 0.0D0 )
 *     ..
 *     .. Local Scalars ..
       INTEGER            CHILDINFO, I, J
+      DOUBLE PRECISION   EPS, NORM, SCL, SSQ
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DORBDB6, XERBLA
+      EXTERNAL           DLASSQ, DORBDB6, DSCAL, XERBLA
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DNRM2
-      EXTERNAL           DNRM2
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DLAMCH, DNRM2
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          MAX
@@ -213,16 +216,33 @@
          RETURN
       END IF
 *
-*     Project X onto the orthogonal complement of Q
+      EPS = DLAMCH( 'Precision' )
 *
-      CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2,
-     $              WORK, LWORK, CHILDINFO )
+*     Project X onto the orthogonal complement of Q if X is nonzero
 *
-*     If the projection is nonzero, then return
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL DLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL DLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
 *
-      IF( DNRM2(M1,X1,INCX1) .NE. ZERO
-     $    .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
-         RETURN
+      IF( NORM .GT. N * EPS ) THEN
+*        Scale vector to unit norm to avoid problems in the caller code.
+*        Computing the reciprocal is undesirable but
+*         * xLASCL cannot be used because of the vector increments and
+*         * the round-off error has a negligible impact on
+*           orthogonalization.
+         CALL DSCAL( M1, ONE / NORM, X1, INCX1 )
+         CALL DSCAL( M2, ONE / NORM, X2, INCX2 )
+         CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
+     $              LDQ2, WORK, LWORK, CHILDINFO )
+*
+*        If the projection is nonzero, then return
+*
+         IF( DNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Project each standard basis vector e_1,...,e_M1 in turn, stopping
@@ -238,8 +258,8 @@
          END DO
          CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( DNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( DNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
@@ -257,8 +277,8 @@
          X2(I) = ONE
          CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( DNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( DNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f
index 142887684..3e356d001 100644
--- a/lapack-netlib/SRC/dorbdb6.f
+++ b/lapack-netlib/SRC/dorbdb6.f
@@ -41,9 +41,8 @@
 *> with respect to the columns of
 *>      Q = [ Q1 ] .
 *>          [ Q2 ]
-*> The Euclidean norm of X must be one and the columns of Q must be
-*> orthonormal. The orthogonalized vector will be zero if and only if it
-*> lies entirely in the range of Q.
+*> The columns of Q must be orthonormal. The orthogonalized vector will
+*> be zero if and only if it lies entirely in the range of Q.
 *>
 *> The projection is computed with at most two iterations of the
 *> classical Gram-Schmidt algorithm, see
@@ -174,7 +173,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.1D0, REALONE = 1.0D0,
+      PARAMETER          ( ALPHA = 0.83D0, REALONE = 1.0D0,
      $                     REALZERO = 0.0D0 )
       DOUBLE PRECISION   NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 )
@@ -222,14 +221,16 @@
 *
       EPS = DLAMCH( 'Precision' )
 *
+*     Compute the Euclidean norm of X
+*
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL DLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL DLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
+*
 *     First, project X onto the orthogonal complement of Q's column
 *     space
-*
-*     Christoph Conrads: In debugging mode the norm should be computed
-*     and an assertion added comparing the norm with one. Alas, Fortran
-*     never made it into 1989 when assert() was introduced into the C
-*     programming language.
-      NORM = REALONE
 *
       IF( M1 .EQ. 0 ) THEN
          DO I = 1, N
diff --git a/lapack-netlib/SRC/slarfgp.f b/lapack-netlib/SRC/slarfgp.f
index df42980c4..c28274c2c 100644
--- a/lapack-netlib/SRC/slarfgp.f
+++ b/lapack-netlib/SRC/slarfgp.f
@@ -97,7 +97,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup larfgp
 *
 *  =====================================================================
       SUBROUTINE SLARFGP( N, ALPHA, X, INCX, TAU )
@@ -122,7 +122,7 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            J, KNT
-      REAL               BETA, BIGNUM, SAVEALPHA, SMLNUM, XNORM
+      REAL               BETA, BIGNUM, EPS, SAVEALPHA, SMLNUM, XNORM
 *     ..
 *     .. External Functions ..
       REAL               SLAMCH, SLAPY2, SNRM2
@@ -141,9 +141,10 @@
          RETURN
       END IF
 *
+      EPS = SLAMCH( 'Precision' )
       XNORM = SNRM2( N-1, X, INCX )
 *
-      IF( XNORM.EQ.ZERO ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN
 *
 *        H  =  [+/-1, 0; I], sign chosen so ALPHA >= 0.
 *
diff --git a/lapack-netlib/SRC/sorbdb5.f b/lapack-netlib/SRC/sorbdb5.f
index 8c67aedfb..8fb88876f 100644
--- a/lapack-netlib/SRC/sorbdb5.f
+++ b/lapack-netlib/SRC/sorbdb5.f
@@ -148,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unbdb5
 *
 *  =====================================================================
       SUBROUTINE SORBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -169,18 +169,21 @@
 *  =====================================================================
 *
 *     .. Parameters ..
+      REAL               REALZERO
+      PARAMETER          ( REALZERO = 0.0E0 )
       REAL               ONE, ZERO
       PARAMETER          ( ONE = 1.0E0, ZERO = 0.0E0 )
 *     ..
 *     .. Local Scalars ..
       INTEGER            CHILDINFO, I, J
+      REAL               EPS, NORM, SCL, SSQ
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SORBDB6, XERBLA
+      EXTERNAL           SLASSQ, SORBDB6, SSCAL, XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SNRM2
-      EXTERNAL           SNRM2
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SLAMCH, SNRM2
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          MAX
@@ -213,16 +216,33 @@
          RETURN
       END IF
 *
-*     Project X onto the orthogonal complement of Q
+      EPS = SLAMCH( 'Precision' )
 *
-      CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2,
-     $              WORK, LWORK, CHILDINFO )
+*     Project X onto the orthogonal complement of Q if X is nonzero
 *
-*     If the projection is nonzero, then return
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL SLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL SLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
 *
-      IF( SNRM2(M1,X1,INCX1) .NE. ZERO
-     $    .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
-         RETURN
+      IF( NORM .GT. N * EPS ) THEN
+*        Scale vector to unit norm to avoid problems in the caller code.
+*        Computing the reciprocal is undesirable but
+*         * xLASCL cannot be used because of the vector increments and
+*         * the round-off error has a negligible impact on
+*           orthogonalization.
+         CALL SSCAL( M1, ONE / NORM, X1, INCX1 )
+         CALL SSCAL( M2, ONE / NORM, X2, INCX2 )
+         CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
+     $              LDQ2, WORK, LWORK, CHILDINFO )
+*
+*        If the projection is nonzero, then return
+*
+         IF( SNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Project each standard basis vector e_1,...,e_M1 in turn, stopping
@@ -238,8 +258,8 @@
          END DO
          CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( SNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( SNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
@@ -257,8 +277,8 @@
          X2(I) = ONE
          CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( SNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( SNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f
index d320c9e46..eac177722 100644
--- a/lapack-netlib/SRC/sorbdb6.f
+++ b/lapack-netlib/SRC/sorbdb6.f
@@ -41,9 +41,8 @@
 *> with respect to the columns of
 *>      Q = [ Q1 ] .
 *>          [ Q2 ]
-*> The Euclidean norm of X must be one and the columns of Q must be
-*> orthonormal. The orthogonalized vector will be zero if and only if it
-*> lies entirely in the range of Q.
+*> The columns of Q must be orthonormal. The orthogonalized vector will
+*> be zero if and only if it lies entirely in the range of Q.
 *>
 *> The projection is computed with at most two iterations of the
 *> classical Gram-Schmidt algorithm, see
@@ -174,7 +173,7 @@
 *
 *     .. Parameters ..
       REAL               ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.1E0, REALONE = 1.0E0,
+      PARAMETER          ( ALPHA = 0.83E0, REALONE = 1.0E0,
      $                     REALZERO = 0.0E0 )
       REAL               NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 )
@@ -222,14 +221,16 @@
 *
       EPS = SLAMCH( 'Precision' )
 *
+*     Compute the Euclidean norm of X
+*
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL SLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL SLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
+*
 *     First, project X onto the orthogonal complement of Q's column
 *     space
-*
-*     Christoph Conrads: In debugging mode the norm should be computed
-*     and an assertion added comparing the norm with one. Alas, Fortran
-*     never made it into 1989 when assert() was introduced into the C
-*     programming language.
-      NORM = REALONE
 *
       IF( M1 .EQ. 0 ) THEN
          DO I = 1, N
diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f
index 77eba8e86..6c9efb04c 100644
--- a/lapack-netlib/SRC/zlarfgp.f
+++ b/lapack-netlib/SRC/zlarfgp.f
@@ -97,7 +97,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERauxiliary
+*> \ingroup larfgp
 *
 *  =====================================================================
       SUBROUTINE ZLARFGP( N, ALPHA, X, INCX, TAU )
@@ -122,7 +122,7 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            J, KNT
-      DOUBLE PRECISION   ALPHI, ALPHR, BETA, BIGNUM, SMLNUM, XNORM
+      DOUBLE PRECISION   ALPHI, ALPHR, BETA, BIGNUM, EPS, SMLNUM, XNORM
       COMPLEX*16         SAVEALPHA
 *     ..
 *     .. External Functions ..
@@ -143,11 +143,12 @@
          RETURN
       END IF
 *
+      EPS = DLAMCH( 'Precision' )
       XNORM = DZNRM2( N-1, X, INCX )
       ALPHR = DBLE( ALPHA )
       ALPHI = DIMAG( ALPHA )
 *
-      IF( XNORM.EQ.ZERO ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN
 *
 *        H  =  [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0.
 *
diff --git a/lapack-netlib/SRC/zunbdb5.f b/lapack-netlib/SRC/zunbdb5.f
index 23174fe50..c451ae921 100644
--- a/lapack-netlib/SRC/zunbdb5.f
+++ b/lapack-netlib/SRC/zunbdb5.f
@@ -148,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup unbdb5
 *
 *  =====================================================================
       SUBROUTINE ZUNBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -169,18 +169,21 @@
 *  =====================================================================
 *
 *     .. Parameters ..
+      DOUBLE PRECISION   REALZERO
+      PARAMETER          ( REALZERO = 0.0D0 )
       COMPLEX*16         ONE, ZERO
       PARAMETER          ( ONE = (1.0D0,0.0D0), ZERO = (0.0D0,0.0D0) )
 *     ..
 *     .. Local Scalars ..
       INTEGER            CHILDINFO, I, J
+      DOUBLE PRECISION   EPS, NORM, SCL, SSQ
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZUNBDB6, XERBLA
+      EXTERNAL           ZLASSQ, ZUNBDB6, ZSCAL, XERBLA
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DZNRM2
-      EXTERNAL           DZNRM2
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DLAMCH, DZNRM2
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          MAX
@@ -213,16 +216,33 @@
          RETURN
       END IF
 *
-*     Project X onto the orthogonal complement of Q
+      EPS = DLAMCH( 'Precision' )
 *
-      CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2,
-     $              WORK, LWORK, CHILDINFO )
+*     Project X onto the orthogonal complement of Q if X is nonzero
 *
-*     If the projection is nonzero, then return
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
 *
-      IF( DZNRM2(M1,X1,INCX1) .NE. ZERO
-     $    .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
-         RETURN
+      IF( NORM .GT. N * EPS ) THEN
+*        Scale vector to unit norm to avoid problems in the caller code.
+*        Computing the reciprocal is undesirable but
+*         * xLASCL cannot be used because of the vector increments and
+*         * the round-off error has a negligible impact on
+*           orthogonalization.
+         CALL ZSCAL( M1, ONE / NORM, X1, INCX1 )
+         CALL ZSCAL( M2, ONE / NORM, X2, INCX2 )
+         CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
+     $              LDQ2, WORK, LWORK, CHILDINFO )
+*
+*        If the projection is nonzero, then return
+*
+         IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Project each standard basis vector e_1,...,e_M1 in turn, stopping
@@ -238,8 +258,8 @@
          END DO
          CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( DZNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
@@ -257,8 +277,8 @@
          X2(I) = ONE
          CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( DZNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f
index ac7fa4be3..ddc9dfc61 100644
--- a/lapack-netlib/SRC/zunbdb6.f
+++ b/lapack-netlib/SRC/zunbdb6.f
@@ -41,9 +41,8 @@
 *> with respect to the columns of
 *>      Q = [ Q1 ] .
 *>          [ Q2 ]
-*> The Euclidean norm of X must be one and the columns of Q must be
-*> orthonormal. The orthogonalized vector will be zero if and only if it
-*> lies entirely in the range of Q.
+*> The columns of Q must be orthonormal. The orthogonalized vector will
+*> be zero if and only if it lies entirely in the range of Q.
 *>
 *> The projection is computed with at most two iterations of the
 *> classical Gram-Schmidt algorithm, see
@@ -174,7 +173,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.1D0, REALONE = 1.0D0,
+      PARAMETER          ( ALPHA = 0.83D0, REALONE = 1.0D0,
      $                     REALZERO = 0.0D0 )
       COMPLEX*16         NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0),
@@ -223,14 +222,16 @@
 *
       EPS = DLAMCH( 'Precision' )
 *
+*     Compute the Euclidean norm of X
+*
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
+*
 *     First, project X onto the orthogonal complement of Q's column
 *     space
-*
-*     Christoph Conrads: In debugging mode the norm should be computed
-*     and an assertion added comparing the norm with one. Alas, Fortran
-*     never made it into 1989 when assert() was introduced into the C
-*     programming language.
-      NORM = REALONE
 *
       IF( M1 .EQ. 0 ) THEN
          DO I = 1, N

From 9b5f8eb33a263afafb26746a9d188018bba3b3b2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 19:35:53 +0100
Subject: [PATCH 408/718] Fix empty function prototypes

---
 driver/others/blas_server_omp.c | 2 +-
 driver/others/dynamic_zarch.c   | 2 +-
 driver/others/memory_qalloc.c   | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index f7008fb08..213531057 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -126,7 +126,7 @@ void openblas_set_num_threads(int num_threads) {
 int blas_thread_init(void){
 
 #if defined(__FreeBSD__) && defined(__clang__)
-extern int openblas_omp_num_threads_env();
+extern int openblas_omp_num_threads_env(void);
 
    if(blas_omp_number_max <= 0)
 	   blas_omp_number_max= openblas_omp_num_threads_env();
diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c
index 5b45aae2f..dd26c8e80 100644
--- a/driver/others/dynamic_zarch.c
+++ b/driver/others/dynamic_zarch.c
@@ -13,7 +13,7 @@ extern gotoblas_t gotoblas_Z14;
 
 #define NUM_CORETYPES 4
 
-extern int openblas_verbose();
+extern int openblas_verbose(void);
 extern void openblas_warning(int verbose, const char* msg);
 
 char* gotoblas_corename(void) {
diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c
index 6174d9b75..a2593e01f 100644
--- a/driver/others/memory_qalloc.c
+++ b/driver/others/memory_qalloc.c
@@ -288,7 +288,7 @@ int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
 
-void openblas_fork_handler()
+void openblas_fork_handler(void)
 {
   // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
   // built with "make USE_OPENMP=0".
@@ -305,9 +305,9 @@ void openblas_fork_handler()
 #endif
 }
 
-extern int openblas_num_threads_env();
-extern int openblas_goto_num_threads_env();
-extern int openblas_omp_num_threads_env();
+extern int openblas_num_threads_env(void);
+extern int openblas_goto_num_threads_env(void);
+extern int openblas_omp_num_threads_env(void);
 
 int blas_get_cpu_number(void){
 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)

From c245c12dc232f8474e97b9ceda7a6e276f5c73b0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 22:17:39 +0100
Subject: [PATCH 409/718] Update Changelog for 0.3.25 (#4314)

* Update Changelog.txt for 0.3.25
---
 Changelog.txt | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index 3937ef08c..e0fe0ca5a 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,50 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.25
+ 12-Nov-2023
+
+general:
+- improved the error message shown on exceeding the maximum thread count
+- improved the code to add supplementary thread buffers in case of overflow
+- fixed a potential division by zero in ?ROTG
+- improved the ?MATCOPY functions to accept zero-sized rows or columns
+- corrected empty prototypes in function declarations
+- cleaned up unused declarations in the f2c-converted versions of the LAPACK sources
+- fixed compilation with the Cray CCE Compiler suite
+- improved link line rewriting to avoid mixed libgomp/libomp builds with clang&gfortran
+- worked around OPENMP builds with LLVM14's libomp hanging on FreeBSD
+- improved the Makefiles to require less option duplication on "make install"
+- imported the following changes from the upcoming release 3.12 of Reference-LAPACK
+  - deprecate utility functions ?GELQS and ?GEQRS (LAPACK PR 900)
+  - apply rounding up to workspace calculations done in floating point (LAPACK PR 904)
+  - avoid overflow in STGEX2/DTGEX2 (LAPACK PR 907)
+  - fix accumulation in ?LASSQ (LAPACK PR 909)
+  - fix handling of NaN values in ?GECON (LAPACK PR 926)
+  - avoid overflow in CBDSQR/ZBDSQR (LAPACK PR 927)
+  - fix poor vector orthogonalizations in ?ORBDB5/?UNBDB5 (LAPACK PR 928 & 930)
+
+x86-64:
+- fixed compile-time autodetection of AMD Ryzen3 and Ryzen4 cpus
+- fixed capability-based fallback selection for unknown cpus in DYNAMIC_ARCH
+- added AVX512 optimizations for ?ASUM on Sapphire Rapids and Cooper Lake
+
+ARM64:
+- fixed building on Apple with homebrew gcc
+- fixed building with XCODE 15
+- fixed building on A64FX and Cortex A710/X1/X2
+- increased the default buffer size for recent ARM server cpus 
+
+POWER:
+- fixed building with the IBM xlf 16.1.1 compiler
+- fixed building with IBM XL C
+- added support for DYNAMIC_ARCH builds with clang
+- fixed union declaration in the BFLOAT16 test case
+- enable optimizations for the AIX assembler on POWER10
+
+LOONGARCH64:
+- added an optimized SGEMV kernel
+- added an optimized DTRSM kernel
+
 ====================================================================
 Version 0.3.24
  03-Sep-2023

From e1f529d0247a4bc1dd9d1f86f4be10c2cfbd2990 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 22:37:11 +0100
Subject: [PATCH 410/718] Add OSX hw.cpufamily value for Apple M3

---
 cpuid_arm64.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index e586f9a3c..8c5d04c14 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -270,6 +270,7 @@ int detect(void)
 	sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
 	if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
 	if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
+	if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
 #endif
 	return CPU_ARMV8;	
 #endif

From f1940010e4f219cb7859f2fd495dc27a3afa3425 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 22:51:26 +0100
Subject: [PATCH 411/718] Update version to 0.3.25

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 444baa114..6b65871f8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 24)
+set(OpenBLAS_PATCH_VERSION 25)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

From 0e54cbd18c09afb66254157100d59cdb5da92c00 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 22:52:05 +0100
Subject: [PATCH 412/718] Update version to 0.3.25

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index 80060a0fc..fd44b1b12 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.24
+VERSION = 0.3.25
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From dff686a86c1a4b59ea4604251c9f8c6648047807 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 23:06:46 +0100
Subject: [PATCH 413/718] Update version to 0.3.25.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b65871f8..69077322a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 25)
+set(OpenBLAS_PATCH_VERSION 25.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

From f4cc1b7a6f976ddb985dcfe00f352b519ab209a5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Nov 2023 23:07:19 +0100
Subject: [PATCH 414/718] Update version to 0.3.25.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index fd44b1b12..58f02358e 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.25
+VERSION = 0.3.25.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From d99aad8ee308600832da39105a6511275cfe32ad Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Tue, 14 Nov 2023 11:07:08 -0600
Subject: [PATCH 415/718] Fix older version of gcc - missing __has_builtin,
 cpuid and no support of P10.

---
 Makefile.power                | 12 ++++++++++++
 driver/others/dynamic_power.c | 17 ++++++++++++-----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/Makefile.power b/Makefile.power
index 95bada34f..aa1ca080a 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -11,7 +11,19 @@ endif
 
 ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
+ifeq ($(C_COMPILER), GCC))
+ifeq ($(GCCVERSIONGTEQ10), 1)
 CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+else ifneq ($(GCCVERSIONGT4), 1)
+$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+else
+$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+endif
+else
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+endif
 ifeq ($(F_COMPILER), IBM)
 FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
 else
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index f0faf2baf..0454f186c 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -66,8 +66,7 @@ static int cpuid(void)
 #endif
     return CPU_UNKNOWN;
 }
-#else
-#if defined(C_PGI) || defined(__clang__)
+#elif defined(C_PGI) || defined(__clang__)
 /*
  * NV HPC compilers do not yet implement __builtin_cpu_is().
  * Fake a version here for use in the CPU detection code below.
@@ -196,13 +195,21 @@ static int cpuid(void)
 	cpu_type = pvrPOWER[i].cpu_type;
 	return (int)(cpu_type);
 }
-#endif  /* C_PGI */
+#elif !defined(__BUILTIN_CPU_SUPPORTS__)
+static int cpuid(void)
+{
+    return CPU_UNKNOWN;
+}
 #endif  /* _AIX */
 
 #ifndef __BUILTIN_CPU_SUPPORTS__
 #include <string.h>
 
-#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_is))
+#ifndef __has_builtin
+#define __has_builtin(x)   0
+#endif
+
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_is)
 static int __builtin_cpu_is(const char *arg)
 {
     static int ipinfo = -1;
@@ -227,7 +234,7 @@ static int __builtin_cpu_is(const char *arg)
 }
 #endif
 
-#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports))
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_supports)
 static int __builtin_cpu_supports(const char *arg)
 {
     return 0;

From 23cda457fb0306d0aba39c3e19a1ced3fa0116c9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 09:48:23 +0100
Subject: [PATCH 416/718] Implement truncated QR with pivoting
 (Reference-LAPACK PR 891)

---
 lapack-netlib/SRC/cgeqp3rk.f | 1091 ++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/claqp2rk.f |  726 ++++++++++++++++++++++
 lapack-netlib/SRC/claqp3rk.f |  947 +++++++++++++++++++++++++++++
 lapack-netlib/SRC/dgeqp3rk.f | 1081 +++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/dlaqp2rk.f |  713 ++++++++++++++++++++++
 lapack-netlib/SRC/dlaqp3rk.f |  935 +++++++++++++++++++++++++++++
 lapack-netlib/SRC/ilaenv.f   |   21 +-
 lapack-netlib/SRC/sgeqp3rk.f | 1081 +++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/slaqp2rk.f |  713 ++++++++++++++++++++++
 lapack-netlib/SRC/slaqp3rk.f |  935 +++++++++++++++++++++++++++++
 lapack-netlib/SRC/zgeqp3rk.f | 1091 ++++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zlaqp2rk.f |  726 ++++++++++++++++++++++
 lapack-netlib/SRC/zlaqp3rk.f |  947 +++++++++++++++++++++++++++++
 13 files changed, 11006 insertions(+), 1 deletion(-)
 create mode 100644 lapack-netlib/SRC/cgeqp3rk.f
 create mode 100644 lapack-netlib/SRC/claqp2rk.f
 create mode 100644 lapack-netlib/SRC/claqp3rk.f
 create mode 100644 lapack-netlib/SRC/dgeqp3rk.f
 create mode 100644 lapack-netlib/SRC/dlaqp2rk.f
 create mode 100644 lapack-netlib/SRC/dlaqp3rk.f
 create mode 100644 lapack-netlib/SRC/sgeqp3rk.f
 create mode 100644 lapack-netlib/SRC/slaqp2rk.f
 create mode 100644 lapack-netlib/SRC/slaqp3rk.f
 create mode 100644 lapack-netlib/SRC/zgeqp3rk.f
 create mode 100644 lapack-netlib/SRC/zlaqp2rk.f
 create mode 100644 lapack-netlib/SRC/zlaqp3rk.f

diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f
new file mode 100644
index 000000000..70789e64f
--- /dev/null
+++ b/lapack-netlib/SRC/cgeqp3rk.f
@@ -0,0 +1,1091 @@
+*> \brief \b CGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGEQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgeqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgeqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgeqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, RWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       REAL               ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       REAL               RWORK( * )
+*       COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*> Householder QR factorization with column pivoting of a complex
+*> M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*> that were factorized, i.e. factorization rank of the
+*> factor R, K <= min(M,N).
+*>
+*>  A * P(K) = Q(K) * R(K)  =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * (   R(K)_approx    )
+*>                 ( 0      R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>  P(K)            is an N-by-N permutation matrix;
+*>  Q(K)            is an M-by-M orthogonal matrix;
+*>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                    full rank factor R with K-by-K upper-triangular
+*>                    R11(K) and K-by-N rectangular R12(K). The diagonal
+*>                    entries of R11(K) appear in non-increasing order
+*>                    of absolute value, and absolute values of all of
+*>                    them exceed the maximum column 2-norm of R22(K)
+*>                    up to roundoff error.
+*>  R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                    of the full rank factor R. It is a
+*>                    an (M-K)-by-(N-K) rectangular matrix;
+*>  0               is a an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS
+*> matrix B with  Q(K)**H * B  using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                                  N     NRHS
+*>             array_A   =   M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less or
+*>      equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm matrix of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL or RELTOL are not used,
+*>      and when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>  The algorithm stops when any of these conditions is first
+*>  satisfied, otherwise the whole matrix A is factorized.
+*>
+*>  To factorize the whole matrix A, use the values
+*>  KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*>  The routine returns:
+*>     a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>        R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>        of the factorization; P(K) is represented by JPIV,
+*>        ( if K = min(M,N), R(K)_approx is the full factor R,
+*>        and there is no residual matrix R(K)_residual);
+*>     b) K, the number of columns that were factorized,
+*>        i.e. factorization rank;
+*>     c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>        matrix R(K)_residual = R22(K),
+*>        ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>        column 2-norm of the original matrix A, which is equal
+*>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>     e) Q(K)**H * B, the matrix B with the orthogonal
+*>        transformation Q(K)**H applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV. For 1 <= j <= N, column j
+*> of the matrix A was interchanged with column JPIV(j).
+*>
+*> The M-by-M orthogonal matrix Q is represented as a product
+*> of elementary Householder reflectors
+*>
+*>     Q(K) = H(1) *  H(2) * . . . * H(K),
+*>
+*> where K is the number of columns that were factorized.
+*>
+*> Each H(j) has the form
+*>
+*>     H(j) = I - tau * v * v**H,
+*>
+*> where 1 <= j <= K and
+*>   I    is an M-by-M identity matrix,
+*>   tau  is a complex scalar,
+*>   v    is a complex vector with v(1:j-1) = 0 and v(j) = 1.
+*>
+*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j).
+*>
+*> See the Further Details section for more information.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e. the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M,N), then this stopping criterion
+*>                is not used, the routine factorizes columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B are not modified, and
+*>                the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>
+*>          The second factorization stopping criterion, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix R22(K).
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix R22(K)
+*>          is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S').
+*>
+*>          a) If ABSTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -5 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN
+*>                is used. This includes the case ABSTOL = -0.0.
+*>
+*>          d) If 2*SAFMIN <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If ABSTOL chosen above is >= MAXC2NRM, then this
+*>          stopping criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed. The routine
+*>          returns MAXC2NRM in MAXC2NORMK,
+*>          and 1.0 in RELMAXC2NORMK.
+*>          This includes the case ABSTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL
+*>
+*>          The third factorization stopping criterion, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio
+*>          abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of
+*>          the residual matrix R22(K) to the maximum column 2-norm of
+*>          the original matrix A. The algorithm converges (stops the
+*>          factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less
+*>          than or equal to RELTOL. Let EPS = DLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -6 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>                This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>                is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NORMK,
+*>          and 1.0 in RELMAXC2NORMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>            factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R_approx(K) is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K)
+*>                 rectangular matrix R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**H * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>                RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>         NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>               abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>               step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine, for example, even  when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*.          LWORK >= N+NRHS-1
+*>          For optimal performance LWORK >= NB*( N+NRHS+1 ),
+*>          where NB is the optimal block size for CGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the availbale workspace WORK, but also also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine CLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> CGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in CGEQP3 routine which uses
+*> CLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (     0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0              0 ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0        0 ) * P(K)**T
+*>                          ( 0   R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+     $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     WORK, LWORK, RWORK, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS
+      REAL               ABSTOL,  MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               RWORK( * )
+      COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            INB, INBMIN, IXOVER
+      PARAMETER          ( INB = 1, INBMIN = 2, IXOVER = 3 )
+      REAL               ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 )
+      COMPLEX            CZERO
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, DONE
+      INTEGER            IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX,
+     $                   JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB,
+     $                   NBMIN, NX
+      REAL               EPS, HUGEVAL, MAXC2NRM, SAFMIN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLAQP2RK, CLAQP3RK, XERBLA
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX, ILAENV
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test input arguments
+*     ====================
+*
+      INFO = 0
+      LQUERY = ( LWORK.EQ.-1 )
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KMAX.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( SISNAN( ABSTOL ) ) THEN
+         INFO = -5
+      ELSE IF( SISNAN( RELTOL ) ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -8
+      END IF
+*
+*     If the input parameters M, N, NRHS, KMAX, LDA are valid:
+*       a) Test the input workspace size LWORK for the minimum
+*          size requirement IWS.
+*       b) Determine the optimal block size NB and optimal
+*          workspace size LWKOPT to be returned in WORK(1)
+*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE.,
+*          (3) when routine exits.
+*     Here, IWS is the miminum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in CLAQP2RK.
+*           1) CLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in CLARF subroutine inside CLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'CGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in CLAQP2RK and blocked BLAS 3 code
+*           in CLAQP3RK.
+*           1) CGEQP3RK, CLAQP2RK, CLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) CLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in CLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) CLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) CLAQP3RK: NB to use in the auxilixary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = CMPLX( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*      NOTE: The optimal workspace size is returned in WORK(1), if
+*            the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = CMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements WORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements WORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         RWORK( J ) = SCNRM2( M, A( 1, J ), 1 )
+         RWORK( N+J ) = RWORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = ISAMAX( N, RWORK( 1 ), 1 )
+*
+*     ==================================================================.
+*
+      IF( SISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = CMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check is the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = CMPLX( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf  is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+         WORK( 1 ) = CMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = SLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = SLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satified,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = CMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'CGEQP3RK', ' ', M, N, -1, -1 ) )
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'CGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to rerpresent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block, when the
+*     blocked code is used, is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+*        J   is the column index of a column block;
+*        JB  is the column block size to pass to block factorization
+*            routine in a loop step;
+*        JBF is the number of columns that were actually factorized
+*            that was returned by the block factorization routine
+*            in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL CLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     RWORK( J ), RWORK( N+J ),
+     $                     WORK( 1 ), WORK( JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block, we can
+*              return from the routine. Perform the following before
+*              returning:
+*                a) Set the number of factorized columns K,
+*                   K = IOFFSET + JBF from the last call of blocked
+*                   routine.
+*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                         by the block factorization routine;
+*                      2) The remaining TAUs are set to ZERO by the
+*                         block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              prcedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = CMPLX( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns, that is in ELSE clause we need to compute
+*     the MAXC2NORM and RELMAXC2NORM to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL CLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), RWORK( J ), RWORK( N+J ),
+     $                  WORK( 1 ), IINFO )
+*
+*        ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller then the  number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine, we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurence.
+*
+*        Set INFO on the first exception occurence of Inf or NaN,
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+            K = JMAX
+*
+*        If there exits a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+*
+         IF( K.LT.MINMN ) THEN
+            JMAXC2NRM = K + ISAMAX( N-K, RWORK( K+1 ), 1 )
+            MAXC2NRMK = RWORK( JMAXC2NRM )
+            IF( K.EQ.0 ) THEN
+               RELMAXC2NRMK = ONE
+            ELSE
+               RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+            END IF
+*
+            DO J = K + 1, MINMN
+               TAU( J ) = CZERO
+            END DO
+*
+         ELSE
+            MAXC2NRMK = ZERO
+            RELMAXC2NRMK = ZERO
+*
+         END IF
+*
+*     END IF( J.LE.JMAX ) THEN
+*
+      END IF
+*
+      WORK( 1 ) = CMPLX( LWKOPT )
+*
+      RETURN
+*
+*     End of CGEQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f
new file mode 100644
index 000000000..073ad0f88
--- /dev/null
+++ b/lapack-netlib/SRC/claqp2rk.f
@@ -0,0 +1,726 @@
+*> \brief \b CLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CLAQP2RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/claqp2rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/claqp2rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/claqp2rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+*     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+*     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+*     $                     INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            JPIV( * )
+*      REAL               VN1( * ), VN2( * )
+*      COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*     $
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLAQP2RK computes a truncated (rank K) or full rank Householder QR
+*> factorization with column pivoting of the complex matrix
+*> block A(IOFFSET+1:M,1:N) as
+*>
+*>   A * P(K) = Q(K) * R(K).
+*>
+*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N)
+*> is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix block B
+*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>             satisfied on input and the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, IPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          d) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine CGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine CGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (N-1)
+*>          Used in CLARF subroutine to apply an elementary
+*>          reflector from the left.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp2rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      REAL               VN1( * ), VN2( * )
+      COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      REAL               HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX            AIKK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLARF, CLARFG, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SCNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the lomn loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK  will be computed later, after somecondition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+                K = KK - 1
+                INFO = K + KP
+*
+*               Set RELMAXC2NRMK to NaN.
+*
+                RELMAXC2NRMK = MAXC2NRMK
+*
+*               Array TAU(K+1:MINMNFACT) is not set and contains
+*               undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,KK:N):
+*        1) swap the KK-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) copy the KK-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than KK in the next loop step.)
+*        3) Save the pivot interchange with the indices relative to the
+*           the original matrix A, not the block A(1:M,1:N).
+*
+         IF( KP.NE.KK ) THEN
+            CALL CSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = CZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL CLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = CZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since CLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by CLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by CLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( SISNAN( REAL( TAU(KK) ) ) ) THEN
+            TAUNAN = REAL( TAU(KK) )
+         ELSE IF( SISNAN( IMAG( TAU(KK) ) ) ) THEN
+            TAUNAN = IMAG( TAU(KK) )
+         ELSE
+            TAUNAN = ZERO
+         END IF
+*
+         IF( SISNAN( TAUNAN ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAUNAN
+            RELMAXC2NRMK = TAUNAN
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*         i.e. no columns of A to update, only columns of B.
+*         If M < N, then at KK = M-IOFFSET, I = M and we have a
+*         one-row residual matrix in A and the elementary
+*         reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update
+*         is needed for the residual matrix in A and the
+*         right-hand-side-matrix in B.
+*         Therefore, we update only if
+*         KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*         condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = CONE
+            CALL CLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  CONJG( TAU( KK ) ), A( I, KK+1 ), LDA,
+     $                  WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = SCNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all colunms have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = CZERO
+      END DO
+*
+      RETURN
+*
+*     End of CLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f
new file mode 100644
index 000000000..af5e85645
--- /dev/null
+++ b/lapack-netlib/SRC/claqp3rk.f
@@ -0,0 +1,947 @@
+*> \brief \b CLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CLAQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/claqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/claqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/claqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      REAL               VN1( * ), VN2( * )
+*      COMPLEX*16         A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> is smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**H * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and need to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL or RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, IPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          d) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine CGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine CGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out]
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R.  0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is COMPLEX array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is COMPLEX array, dimension (LDF,NB)
+*>          Matrix F**H = L*(Y**H)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))     is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the actorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               VN1( * ), VN2( * )
+      COMPLEX            A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      REAL               HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX            AIK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGEMM, CGEMV, CLARFG, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SCNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KF+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+*              Return from the routine.
+*
+               RETURN
+            END IF
+*
+*           Quick return, if the submatrix A(I:M,K:N) is
+*           a zero matrix. We need to check it only if the column index
+*           (same as row index) is larger than 1, since the condition
+*           for the whole original matrix A_orig is checked in the main
+*           routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               RELMAXC2NRMK = ZERO
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix is zero and we stop the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig;
+*
+                  KB = K - 1
+                  IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and and/or the residual of the right
+*              hand sides exist,  i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when
+*                 KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to the
+*           the original matrix A_orig, not the block A(1:M,1:N).
+*
+         IF( KP.NE.K ) THEN
+            CALL CSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 )
+            CALL CSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF )
+            VN1( KP ) = VN1( K )
+            VN2( KP ) = VN2( K )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( K )
+            JPIV( K ) = ITEMP
+         END IF
+*
+*        Apply previous Householder reflectors to column K:
+*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H.
+*
+         IF( K.GT.1 ) THEN
+            DO J = 1, K - 1
+               F( K, J ) = CONJG( F( K, J ) )
+            END DO
+            CALL CGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ),
+     $                  LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 )
+            DO J = 1, K - 1
+               F( K, J ) = CONJG( F( K, J ) )
+            END DO
+         END IF
+*
+*        Generate elementary reflector H(k) using the column A(I:M,K).
+*
+         IF( I.LT.M ) THEN
+            CALL CLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) )
+         ELSE
+            TAU( K ) = CZERO
+         END IF
+*
+*        Check if TAU(K) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(K) for Inf,
+*        since CLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by CLARFG can contain Inf, which requires
+*        TAU(K) to contain NaN. Therefore, this case of generating Inf
+*        by CLARFG is covered by checking TAU(K) for NaN.
+*
+         IF( SISNAN( REAL( TAU(K) ) ) ) THEN
+            TAUNAN = REAL( TAU(K) )
+         ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN
+            TAUNAN = IMAG( TAU(K) )
+         ELSE
+            TAUNAN = ZERO
+         END IF
+*
+         IF( SISNAN( TAUNAN ) ) THEN
+*
+            DONE = .TRUE.
+*
+*           Set KB, the number of factorized partial columns
+*                   that are non-zero in each step in the block,
+*                   i.e. the rank of the factor R.
+*           Set IF, the number of processed rows in the block, which
+*                   is the same as the number of processed rows in
+*                   the original whole matrix A_orig.
+*
+            KB = K - 1
+            IF = I - 1
+            INFO = K
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAUNAN
+            RELMAXC2NRMK = TAUNAN
+*
+*           There is no need to apply the block reflector to the
+*           residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*           since the submatrix contains NaN and we stop
+*           the computation.
+*           But, we need to apply the block reflector to the residual
+*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*           residual right hand sides exist.  This occurs
+*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+*
+            IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+               CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $               M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $               F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+            END IF
+*
+*           There is no need to recompute the 2-norm of the
+*           difficult columns, since we stop the factorization.
+*
+*           Array TAU(KF+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+*           Return from the routine.
+*
+            RETURN
+         END IF
+*
+*        ===============================================================
+*
+         AIK = A( I, K )
+         A( I, K ) = CONE
+*
+*        ===============================================================
+*
+*        Compute the current K-th column of F:
+*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K).
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL CGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K,
+     $                  TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1,
+     $                  CZERO, F( K+1, K ), 1 )
+         END IF
+*
+*           2) Zero out elements above and on the diagonal of the
+*              column K in matrix F, i.e elements F(1:K,K).
+*
+         DO J = 1, K
+            F( J, K ) = CZERO
+         END DO
+*
+*         3) Incremental updating of the K-th column of F:
+*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H
+*                    * A(I:M,K).
+*
+         IF( K.GT.1 ) THEN
+            CALL CGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ),
+     $                   A( I, 1 ), LDA, A( I, K ), 1, CZERO,
+     $                   AUXV( 1 ), 1 )
+*
+            CALL CGEMV( 'No transpose', N+NRHS, K-1, CONE,
+     $                  F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE,
+     $                  F( 1, K ), 1 )
+         END IF
+*
+*        ===============================================================
+*
+*        Update the current I-th row of A:
+*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS)
+*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**H.
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $            1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA,
+     $            F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA )
+         END IF
+*
+         A( I, K ) = AIK
+*
+*        Update the partial column 2-norms for the residual matrix,
+*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*        when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+         IF( K.LT.MINMNFACT ) THEN
+*
+            DO J = K + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ABS( A( I, J ) ) / VN1( J )
+                  TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                    At J-index, we have a difficult column for the
+*                    update of the 2-norm. Save the index of the previous
+*                    difficult column in IWORK(J-1).
+*                    NOTE: ILSTCC > 1, threfore we can use IWORK only
+*                    with N-1 elements, where the elements are
+*                    shifted by 1 to the left.
+*
+                     IWORK( J-1 ) = LSTICC
+*
+*                    Set the index of the last difficult column LSTICC.
+*
+                     LSTICC = J
+*
+                  ELSE
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End of while loop.
+*
+      END DO
+*
+*     Now, afler the loop:
+*        Set KB, the number of factorized columns in the block;
+*        Set IF, the number of processed rows in the block, which
+*                is the same as the number of processed rows in
+*                the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and and/or the residual of the right hand sides
+*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $         M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA,
+     $         F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC is the index of the last difficult column is greater
+*        than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicilty for the last difficult column and
+*        save it in the partial and exact 2-norm vectors VN1 and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact that
+*        SCNRM2 does not fail on vectors with norm below the value of
+*        SQRT(SLAMCH('S'))
+*
+         VN1( LSTICC ) = SCNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of CLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f
new file mode 100644
index 000000000..ace97b712
--- /dev/null
+++ b/lapack-netlib/SRC/dgeqp3rk.f
@@ -0,0 +1,1081 @@
+*> \brief \b DGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGEQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgeqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgeqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgeqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       DOUBLE PRECISION   ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       DOUBLE PRECISION   A( LDA, * ), TAU( * ), WORK( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*> Householder QR factorization with column pivoting of a real
+*> M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*> that were factorized, i.e. factorization rank of the
+*> factor R, K <= min(M,N).
+*>
+*>  A * P(K) = Q(K) * R(K)  =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * (   R(K)_approx    )
+*>                 ( 0      R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>  P(K)            is an N-by-N permutation matrix;
+*>  Q(K)            is an M-by-M orthogonal matrix;
+*>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                    full rank factor R with K-by-K upper-triangular
+*>                    R11(K) and K-by-N rectangular R12(K). The diagonal
+*>                    entries of R11(K) appear in non-increasing order
+*>                    of absolute value, and absolute values of all of
+*>                    them exceed the maximum column 2-norm of R22(K)
+*>                    up to roundoff error.
+*>  R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                    of the full rank factor R. It is a
+*>                    an (M-K)-by-(N-K) rectangular matrix;
+*>  0               is a an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a real M-by-NRHS
+*> matrix B with  Q(K)**T * B  using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                                  N     NRHS
+*>             array_A   =   M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less or
+*>      equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm matrix of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL or RELTOL are not used,
+*>      and when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>  The algorithm stops when any of these conditions is first
+*>  satisfied, otherwise the whole matrix A is factorized.
+*>
+*>  To factorize the whole matrix A, use the values
+*>  KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*>  The routine returns:
+*>     a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>        R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>        of the factorization; P(K) is represented by JPIV,
+*>        ( if K = min(M,N), R(K)_approx is the full factor R,
+*>        and there is no residual matrix R(K)_residual);
+*>     b) K, the number of columns that were factorized,
+*>        i.e. factorization rank;
+*>     c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>        matrix R(K)_residual = R22(K),
+*>        ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>        column 2-norm of the original matrix A, which is equal
+*>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>     e) Q(K)**T * B, the matrix B with the orthogonal
+*>        transformation Q(K)**T applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV. For 1 <= j <= N, column j
+*> of the matrix A was interchanged with column JPIV(j).
+*>
+*> The M-by-M orthogonal matrix Q is represented as a product
+*> of elementary Householder reflectors
+*>
+*>     Q(K) = H(1) *  H(2) * . . . * H(K),
+*>
+*> where K is the number of columns that were factorized.
+*>
+*> Each H(j) has the form
+*>
+*>     H(j) = I - tau * v * v**T,
+*>
+*> where 1 <= j <= K and
+*>   I    is an M-by-M identity matrix,
+*>   tau  is a real scalar,
+*>   v    is a real vector with v(1:j-1) = 0 and v(j) = 1.
+*>
+*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j).
+*>
+*> See the Further Details section for more information.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e. the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M,N), then this stopping criterion
+*>                is not used, the routine factorizes columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B are not modified, and
+*>                the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>
+*>          The second factorization stopping criterion, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix R22(K).
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix R22(K)
+*>          is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S').
+*>
+*>          a) If ABSTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -5 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN
+*>                is used. This includes the case ABSTOL = -0.0.
+*>
+*>          d) If 2*SAFMIN <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If ABSTOL chosen above is >= MAXC2NRM, then this
+*>          stopping criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed. The routine
+*>          returns MAXC2NRM in MAXC2NORMK,
+*>          and 1.0 in RELMAXC2NORMK.
+*>          This includes the case ABSTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION
+*>
+*>          The third factorization stopping criterion, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio
+*>          abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of
+*>          the residual matrix R22(K) to the maximum column 2-norm of
+*>          the original matrix A. The algorithm converges (stops the
+*>          factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less
+*>          than or equal to RELTOL. Let EPS = DLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -6 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>                This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>                is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NORMK,
+*>          and 1.0 in RELMAXC2NORMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>            factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R_approx(K) is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K)
+*>                 rectangular matrix R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**T * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>                RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>         NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>               abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>               step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine, for example, even  when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*.          LWORK >= (3*N + NRHS - 1)
+*>          For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )),
+*>          where NB is the optimal block size for DGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the availbale workspace WORK, but also also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine DLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> DGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in DGEQP3 routine which uses
+*> DLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (     0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0              0 ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0        0 ) * P(K)**T
+*>                          ( 0   R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+     $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     WORK, LWORK, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL,  MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            INB, INBMIN, IXOVER
+      PARAMETER          ( INB = 1, INBMIN = 2, IXOVER = 3 )
+      DOUBLE PRECISION   ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, DONE
+      INTEGER            IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX,
+     $                   JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB,
+     $                   NBMIN, NX
+      DOUBLE PRECISION   EPS, HUGEVAL, MAXC2NRM, SAFMIN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLAQP2RK, DLAQP3RK, XERBLA
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX, ILAENV
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DISNAN, DLAMCH, DNRM2, IDAMAX, ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test input arguments
+*     ====================
+*
+      INFO = 0
+      LQUERY = ( LWORK.EQ.-1 )
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KMAX.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( DISNAN( ABSTOL ) ) THEN
+         INFO = -5
+      ELSE IF( DISNAN( RELTOL ) ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -8
+      END IF
+*
+*     If the input parameters M, N, NRHS, KMAX, LDA are valid:
+*       a) Test the input workspace size LWORK for the minimum
+*          size requirement IWS.
+*       b) Determine the optimal block size NB and optimal
+*          workspace size LWKOPT to be returned in WORK(1)
+*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE.,
+*          (3) when routine exits.
+*     Here, IWS is the miminum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in DLAQP2RK.
+*           1) DGEQP3RK and DLAQP2RK: 2*N to store full and partial
+*              column 2-norms.
+*           2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in DLARF subroutine inside DLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = 3*N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'DGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in DLAQP2RK and blocked BLAS 3 code
+*           in DLAQP3RK.
+*           1) DGEQP3RK, DLAQP2RK, DLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in DLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) DLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) DLAQP3RK: NB to use in the auxilixary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = DBLE( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*      NOTE: The optimal workspace size is returned in WORK(1), if
+*            the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements WORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements WORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         WORK( J ) = DNRM2( M, A( 1, J ), 1 )
+         WORK( N+J ) = WORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = IDAMAX( N, WORK( 1 ), 1 )
+      MAXC2NRM = WORK( KP1 )
+*
+*     ==================================================================.
+*
+      IF( DISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check is the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf  is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = DLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = DLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satified,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'DGEQP3RK', ' ', M, N, -1, -1 ))
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'DGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to rerpresent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block, when the
+*     blocked code is used, is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+*        J   is the column index of a column block;
+*        JB  is the column block size to pass to block factorization
+*            routine in a loop step;
+*        JBF is the number of columns that were actually factorized
+*            that was returned by the block factorization routine
+*            in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL DLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     WORK( J ), WORK( N+J ),
+     $                     WORK( 2*N+1 ), WORK( 2*N+JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block, we can
+*              return from the routine. Perform the following before
+*              returning:
+*                a) Set the number of factorized columns K,
+*                   K = IOFFSET + JBF from the last call of blocked
+*                   routine.
+*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                         by the block factorization routine;
+*                      2) The remaining TAUs are set to ZERO by the
+*                         block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              prcedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = DBLE( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns, that is in ELSE clause we need to compute
+*     the MAXC2NORM and RELMAXC2NORM to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL DLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), WORK( J ), WORK( N+J ),
+     $                  WORK( 2*N+1 ), IINFO )
+*
+*        ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller then the  number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine, we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurence.
+*
+*        Set INFO on the first exception occurence of Inf or NaN,
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+            K = JMAX
+*
+*        If there exits a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+*
+         IF( K.LT.MINMN ) THEN
+            JMAXC2NRM = K + IDAMAX( N-K, WORK( K+1 ), 1 )
+            MAXC2NRMK = WORK( JMAXC2NRM )
+            IF( K.EQ.0 ) THEN
+               RELMAXC2NRMK = ONE
+            ELSE
+               RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+            END IF
+*
+            DO J = K + 1, MINMN
+               TAU( J ) = ZERO
+            END DO
+*
+         END IF
+*
+*     END IF( J.LE.JMAX ) THEN
+*
+      END IF
+*
+      WORK( 1 ) = DBLE( LWKOPT )
+*
+      RETURN
+*
+*     End of DGEQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/dlaqp2rk.f b/lapack-netlib/SRC/dlaqp2rk.f
new file mode 100644
index 000000000..b5a84d0de
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp2rk.f
@@ -0,0 +1,713 @@
+*> \brief \b DLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAQP2RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlaqp2rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlaqp2rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlaqp2rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+*     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+*     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+*     $                     INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            JPIV( * )
+*      DOUBLE PRECISION   A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+*     $                   WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAQP2RK computes a truncated (rank K) or full rank Householder QR
+*> factorization with column pivoting of a real matrix
+*> block A(IOFFSET+1:M,1:N) as
+*>
+*>   A * P(K) = Q(K) * R(K).
+*>
+*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N)
+*> is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix block B
+*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>             satisfied on input and the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, IPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          d) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine DGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine DGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (N-1)
+*>          Used in DLARF subroutine to apply an elementary
+*>          reflector from the left.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp2rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      DOUBLE PRECISION   AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLARF, DLARFG, DSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the lomn loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK  will be computed later, after somecondition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+                K = KK - 1
+                INFO = K + KP
+*
+*               Set RELMAXC2NRMK to NaN.
+*
+                RELMAXC2NRMK = MAXC2NRMK
+*
+*               Array TAU(K+1:MINMNFACT) is not set and contains
+*               undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,KK:N):
+*        1) swap the KK-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) copy the KK-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than KK in the next loop step.)
+*        3) Save the pivot interchange with the indices relative to the
+*           the original matrix A, not the block A(1:M,1:N).
+*
+         IF( KP.NE.KK ) THEN
+            CALL DSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = ZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL DLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = ZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since DLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by DLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by DLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( DISNAN( TAU(KK) ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( KK )
+            RELMAXC2NRMK = TAU( KK )
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*         i.e. no columns of A to update, only columns of B.
+*         If M < N, then at KK = M-IOFFSET, I = M and we have a
+*         one-row residual matrix in A and the elementary
+*         reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update
+*         is needed for the residual matrix in A and the
+*         right-hand-side-matrix in B.
+*         Therefore, we update only if
+*         KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*         condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = ONE
+            CALL DLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = DNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all colunms have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = ZERO
+      END DO
+*
+      RETURN
+*
+*     End of DLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/dlaqp3rk.f b/lapack-netlib/SRC/dlaqp3rk.f
new file mode 100644
index 000000000..39e617d0e
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp3rk.f
@@ -0,0 +1,935 @@
+*> \brief \b DLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlaqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlaqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlaqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*
+*     .. Scalar Arguments ..
+*      LOGICAL            DONE
+*      INTEGER            KB, LDA, LDF, M, N, NB, NRHS, IOFFSET
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      DOUBLE PRECISION   A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+*     $                   VN1( * ), VN2( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> is smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**T * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and need to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL or RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, IPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          d) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine DGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine DGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out]
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R.  0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is DOUBLE PRECISION array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is DOUBLE PRECISION array, dimension (LDF,NB)
+*>          Matrix F**T = L*(Y**T)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))     is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the actorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      DOUBLE PRECISION   A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+     $                   VN1( * ), VN2( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      DOUBLE PRECISION   AIK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGEMM, DGEMV, DLARFG, DSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KF+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+*              Return from the routine.
+*
+               RETURN
+            END IF
+*
+*           Quick return, if the submatrix A(I:M,K:N) is
+*           a zero matrix. We need to check it only if the column index
+*           (same as row index) is larger than 1, since the condition
+*           for the whole original matrix A_orig is checked in the main
+*           routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               RELMAXC2NRMK = ZERO
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix is zero and we stop the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = ZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig;
+*
+                  KB = K - 1
+                  IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and and/or the residual of the right
+*              hand sides exist,  i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when
+*                 KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = ZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to the
+*           the original matrix A_orig, not the block A(1:M,1:N).
+*
+         IF( KP.NE.K ) THEN
+            CALL DSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 )
+            CALL DSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF )
+            VN1( KP ) = VN1( K )
+            VN2( KP ) = VN2( K )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( K )
+            JPIV( K ) = ITEMP
+         END IF
+*
+*        Apply previous Householder reflectors to column K:
+*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T.
+*
+         IF( K.GT.1 ) THEN
+            CALL DGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ),
+     $                  LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 )
+         END IF
+*
+*        Generate elementary reflector H(k) using the column A(I:M,K).
+*
+         IF( I.LT.M ) THEN
+            CALL DLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) )
+         ELSE
+            TAU( K ) = ZERO
+         END IF
+*
+*        Check if TAU(K) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(K) for Inf,
+*        since DLARFG cannot produce TAU(K) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by DLARFG can contain Inf, which requires
+*        TAU(K) to contain NaN. Therefore, this case of generating Inf
+*        by DLARFG is covered by checking TAU(K) for NaN.
+*
+         IF( DISNAN( TAU(K) ) ) THEN
+*
+            DONE = .TRUE.
+*
+*           Set KB, the number of factorized partial columns
+*                   that are non-zero in each step in the block,
+*                   i.e. the rank of the factor R.
+*           Set IF, the number of processed rows in the block, which
+*                   is the same as the number of processed rows in
+*                   the original whole matrix A_orig.
+*
+            KB = K - 1
+            IF = I - 1
+            INFO = K
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( K )
+            RELMAXC2NRMK = TAU( K )
+*
+*           There is no need to apply the block reflector to the
+*           residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*           since the submatrix contains NaN and we stop
+*           the computation.
+*           But, we need to apply the block reflector to the residual
+*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*           residual right hand sides exist.  This occurs
+*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+            IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+               CALL DGEMM( 'No transpose', 'Transpose',
+     $               M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $               F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+            END IF
+*
+*           There is no need to recompute the 2-norm of the
+*           difficult columns, since we stop the factorization.
+*
+*           Array TAU(KF+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+*           Return from the routine.
+*
+            RETURN
+         END IF
+*
+*        ===============================================================
+*
+         AIK = A( I, K )
+         A( I, K ) = ONE
+*
+*        ===============================================================
+*
+*        Compute the current K-th column of F:
+*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K).
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL DGEMV( 'Transpose', M-I+1, N+NRHS-K,
+     $                  TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1,
+     $                  ZERO, F( K+1, K ), 1 )
+         END IF
+*
+*           2) Zero out elements above and on the diagonal of the
+*              column K in matrix F, i.e elements F(1:K,K).
+*
+         DO J = 1, K
+            F( J, K ) = ZERO
+         END DO
+*
+*         3) Incremental updating of the K-th column of F:
+*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T
+*                    * A(I:M,K).
+*
+         IF( K.GT.1 ) THEN
+            CALL DGEMV( 'Transpose', M-I+1, K-1, -TAU( K ),
+     $                  A( I, 1 ), LDA, A( I, K ), 1, ZERO,
+     $                  AUXV( 1 ), 1 )
+*
+            CALL DGEMV( 'No transpose', N+NRHS, K-1, ONE,
+     $                  F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE,
+     $                  F( 1, K ), 1 )
+         END IF
+*
+*        ===============================================================
+*
+*        Update the current I-th row of A:
+*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS)
+*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**T.
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL DGEMV( 'No transpose', N+NRHS-K, K, -ONE,
+     $                  F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE,
+     $                  A( I, K+1 ), LDA )
+         END IF
+*
+         A( I, K ) = AIK
+*
+*        Update the partial column 2-norms for the residual matrix,
+*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*        when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+         IF( K.LT.MINMNFACT ) THEN
+*
+            DO J = K + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ABS( A( I, J ) ) / VN1( J )
+                  TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                    At J-index, we have a difficult column for the
+*                    update of the 2-norm. Save the index of the previous
+*                    difficult column in IWORK(J-1).
+*                    NOTE: ILSTCC > 1, threfore we can use IWORK only
+*                    with N-1 elements, where the elements are
+*                    shifted by 1 to the left.
+*
+                     IWORK( J-1 ) = LSTICC
+*
+*                    Set the index of the last difficult column LSTICC.
+*
+                     LSTICC = J
+*
+                  ELSE
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End of while loop.
+*
+      END DO
+*
+*     Now, afler the loop:
+*        Set KB, the number of factorized columns in the block;
+*        Set IF, the number of processed rows in the block, which
+*                is the same as the number of processed rows in
+*                the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and and/or the residual of the right hand sides
+*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL DGEMM( 'No transpose', 'Transpose',
+     $         M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA,
+     $         F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC is the index of the last difficult column is greater
+*        than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicilty for the last difficult column and
+*        save it in the partial and exact 2-norm vectors VN1 and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact that
+*        DNRM2 does not fail on vectors with norm below the value of
+*        SQRT(DLAMCH('S'))
+*
+         VN1( LSTICC ) = DNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of DLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f
index a639e0375..e74a2b35e 100644
--- a/lapack-netlib/SRC/ilaenv.f
+++ b/lapack-netlib/SRC/ilaenv.f
@@ -132,7 +132,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup OTHERauxiliary
+*> \ingroup ilaenv
 *
 *> \par Further Details:
 *  =====================
@@ -355,6 +355,12 @@
             ELSE
                NB = 64
             END IF
+         ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN
+            IF( SNAME ) THEN
+               NB = 32
+            ELSE
+               NB = 32
+            END IF
          END IF
       ELSE IF( C2.EQ.'PO' ) THEN
          IF( C3.EQ.'TRF' ) THEN
@@ -541,7 +547,14 @@
             ELSE
                NBMIN = 2
             END IF
+         ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN
+            IF( SNAME ) THEN
+               NBMIN = 2
+            ELSE
+               NBMIN = 2
+            END IF
          END IF
+
       ELSE IF( C2.EQ.'SY' ) THEN
          IF( C3.EQ.'TRF' ) THEN
             IF( SNAME ) THEN
@@ -618,6 +631,12 @@
             ELSE
                NX = 128
             END IF
+         ELSE IF( SUBNAM( 4: 7 ).EQ.'QP3RK' ) THEN
+            IF( SNAME ) THEN
+               NX = 128
+            ELSE
+               NX = 128
+            END IF
          END IF
       ELSE IF( C2.EQ.'SY' ) THEN
          IF( SNAME .AND. C3.EQ.'TRD' ) THEN
diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f
new file mode 100644
index 000000000..17559c7f4
--- /dev/null
+++ b/lapack-netlib/SRC/sgeqp3rk.f
@@ -0,0 +1,1081 @@
+*> \brief \b SGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGEQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgeqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgeqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgeqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       REAL               ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       REAL               A( LDA, * ), TAU( * ), WORK( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*> Householder QR factorization with column pivoting of a real
+*> M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*> that were factorized, i.e. factorization rank of the
+*> factor R, K <= min(M,N).
+*>
+*>  A * P(K) = Q(K) * R(K)  =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * (   R(K)_approx    )
+*>                 ( 0      R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>  P(K)            is an N-by-N permutation matrix;
+*>  Q(K)            is an M-by-M orthogonal matrix;
+*>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                    full rank factor R with K-by-K upper-triangular
+*>                    R11(K) and K-by-N rectangular R12(K). The diagonal
+*>                    entries of R11(K) appear in non-increasing order
+*>                    of absolute value, and absolute values of all of
+*>                    them exceed the maximum column 2-norm of R22(K)
+*>                    up to roundoff error.
+*>  R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                    of the full rank factor R. It is a
+*>                    an (M-K)-by-(N-K) rectangular matrix;
+*>  0               is a an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a real M-by-NRHS
+*> matrix B with  Q(K)**T * B  using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                                  N     NRHS
+*>             array_A   =   M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less or
+*>      equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm matrix of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL or RELTOL are not used,
+*>      and when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>  The algorithm stops when any of these conditions is first
+*>  satisfied, otherwise the whole matrix A is factorized.
+*>
+*>  To factorize the whole matrix A, use the values
+*>  KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*>  The routine returns:
+*>     a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>        R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>        of the factorization; P(K) is represented by JPIV,
+*>        ( if K = min(M,N), R(K)_approx is the full factor R,
+*>        and there is no residual matrix R(K)_residual);
+*>     b) K, the number of columns that were factorized,
+*>        i.e. factorization rank;
+*>     c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>        matrix R(K)_residual = R22(K),
+*>        ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>        column 2-norm of the original matrix A, which is equal
+*>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>     e) Q(K)**T * B, the matrix B with the orthogonal
+*>        transformation Q(K)**T applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV. For 1 <= j <= N, column j
+*> of the matrix A was interchanged with column JPIV(j).
+*>
+*> The M-by-M orthogonal matrix Q is represented as a product
+*> of elementary Householder reflectors
+*>
+*>     Q(K) = H(1) *  H(2) * . . . * H(K),
+*>
+*> where K is the number of columns that were factorized.
+*>
+*> Each H(j) has the form
+*>
+*>     H(j) = I - tau * v * v**T,
+*>
+*> where 1 <= j <= K and
+*>   I    is an M-by-M identity matrix,
+*>   tau  is a real scalar,
+*>   v    is a real vector with v(1:j-1) = 0 and v(j) = 1.
+*>
+*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j).
+*>
+*> See the Further Details section for more information.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e. the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M,N), then this stopping criterion
+*>                is not used, the routine factorizes columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B are not modified, and
+*>                the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>
+*>          The second factorization stopping criterion, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix R22(K).
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix R22(K)
+*>          is less than or equal to ABSTOL. Let SAFMIN = SLAMCH('S').
+*>
+*>          a) If ABSTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -5 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN
+*>                is used. This includes the case ABSTOL = -0.0.
+*>
+*>          d) If 2*SAFMIN <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If ABSTOL chosen above is >= MAXC2NRM, then this
+*>          stopping criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed. The routine
+*>          returns MAXC2NRM in MAXC2NORMK,
+*>          and 1.0 in RELMAXC2NORMK.
+*>          This includes the case ABSTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL
+*>
+*>          The third factorization stopping criterion, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio
+*>          abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of
+*>          the residual matrix R22(K) to the maximum column 2-norm of
+*>          the original matrix A. The algorithm converges (stops the
+*>          factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less
+*>          than or equal to RELTOL. Let EPS = SLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -6 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>                This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>                is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NORMK,
+*>          and 1.0 in RELMAXC2NORMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>            factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R_approx(K) is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K)
+*>                 rectangular matrix R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**T * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>                RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>         NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>               abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>               step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine, for example, even  when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*.          LWORK >= (3*N + NRHS - 1)
+*>          For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )),
+*>          where NB is the optimal block size for SGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the availbale workspace WORK, but also also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine SLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> SGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in SGEQP3 routine which uses
+*> SLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (     0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0              0 ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0        0 ) * P(K)**T
+*>                          ( 0   R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+     $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     WORK, LWORK, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS
+      REAL               ABSTOL,  MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            INB, INBMIN, IXOVER
+      PARAMETER          ( INB = 1, INBMIN = 2, IXOVER = 3 )
+      REAL               ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, DONE
+      INTEGER            IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX,
+     $                   JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB,
+     $                   NBMIN, NX
+      REAL               EPS, HUGEVAL, MAXC2NRM, SAFMIN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLAQP2RK, SLAQP3RK, XERBLA
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX, ILAENV
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test input arguments
+*     ====================
+*
+      INFO = 0
+      LQUERY = ( LWORK.EQ.-1 )
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KMAX.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( SISNAN( ABSTOL ) ) THEN
+         INFO = -5
+      ELSE IF( SISNAN( RELTOL ) ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -8
+      END IF
+*
+*     If the input parameters M, N, NRHS, KMAX, LDA are valid:
+*       a) Test the input workspace size LWORK for the minimum
+*          size requirement IWS.
+*       b) Determine the optimal block size NB and optimal
+*          workspace size LWKOPT to be returned in WORK(1)
+*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE.,
+*          (3) when routine exits.
+*     Here, IWS is the miminum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in SLAQP2RK.
+*           1) SGEQP3RK and SLAQP2RK: 2*N to store full and partial
+*              column 2-norms.
+*           2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in SLARF subroutine inside SLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = 3*N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'SGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in SLAQP2RK and blocked BLAS 3 code
+*           in SLAQP3RK.
+*           1) SGEQP3RK, SLAQP2RK, SLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in SLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) SLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) SLAQP3RK: NB to use in the auxilixary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = REAL( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*      NOTE: The optimal workspace size is returned in WORK(1), if
+*            the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = REAL( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements WORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements WORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         WORK( J ) = SNRM2( M, A( 1, J ), 1 )
+         WORK( N+J ) = WORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = ISAMAX( N, WORK( 1 ), 1 )
+      MAXC2NRM = WORK( KP1 )
+*
+*     ==================================================================.
+*
+      IF( SISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = REAL( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check is the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = REAL( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf  is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+         WORK( 1 ) = REAL( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = SLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = SLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satified,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = REAL( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'SGEQP3RK', ' ', M, N, -1, -1 ))
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'SGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to rerpresent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block, when the
+*     blocked code is used, is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+*        J   is the column index of a column block;
+*        JB  is the column block size to pass to block factorization
+*            routine in a loop step;
+*        JBF is the number of columns that were actually factorized
+*            that was returned by the block factorization routine
+*            in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL SLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     WORK( J ), WORK( N+J ),
+     $                     WORK( 2*N+1 ), WORK( 2*N+JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block, we can
+*              return from the routine. Perform the following before
+*              returning:
+*                a) Set the number of factorized columns K,
+*                   K = IOFFSET + JBF from the last call of blocked
+*                   routine.
+*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                         by the block factorization routine;
+*                      2) The remaining TAUs are set to ZERO by the
+*                         block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              prcedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = REAL( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns, that is in ELSE clause we need to compute
+*     the MAXC2NORM and RELMAXC2NORM to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL SLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), WORK( J ), WORK( N+J ),
+     $                  WORK( 2*N+1 ), IINFO )
+*
+*        ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller then the  number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine, we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurence.
+*
+*        Set INFO on the first exception occurence of Inf or NaN,
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+            K = JMAX
+*
+*        If there exits a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+*
+         IF( K.LT.MINMN ) THEN
+            JMAXC2NRM = K + ISAMAX( N-K, WORK( K+1 ), 1 )
+            MAXC2NRMK = WORK( JMAXC2NRM )
+            IF( K.EQ.0 ) THEN
+               RELMAXC2NRMK = ONE
+            ELSE
+               RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+            END IF
+*
+            DO J = K + 1, MINMN
+               TAU( J ) = ZERO
+            END DO
+*
+         END IF
+*
+*     END IF( J.LE.JMAX ) THEN
+*
+      END IF
+*
+      WORK( 1 ) = REAL( LWKOPT )
+*
+      RETURN
+*
+*     End of SGEQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/slaqp2rk.f b/lapack-netlib/SRC/slaqp2rk.f
new file mode 100644
index 000000000..d3dbb3d7c
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp2rk.f
@@ -0,0 +1,713 @@
+*> \brief \b SLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLAQP2RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slaqp2rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slaqp2rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slaqp2rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+*     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+*     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+*     $                     INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            JPIV( * )
+*      REAL               A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+*     $                   WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAQP2RK computes a truncated (rank K) or full rank Householder QR
+*> factorization with column pivoting of a real matrix
+*> block A(IOFFSET+1:M,1:N) as
+*>
+*>   A * P(K) = Q(K) * R(K).
+*>
+*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N)
+*> is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix block B
+*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>             satisfied on input and the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, IPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          d) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine SGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine SGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (N-1)
+*>          Used in SLARF subroutine to apply an elementary
+*>          reflector from the left.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp2rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      REAL               A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      REAL               AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLARF, SLARFG, SSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the lomn loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK  will be computed later, after somecondition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+                K = KK - 1
+                INFO = K + KP
+*
+*               Set RELMAXC2NRMK to NaN.
+*
+                RELMAXC2NRMK = MAXC2NRMK
+*
+*               Array TAU(K+1:MINMNFACT) is not set and contains
+*               undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,KK:N):
+*        1) swap the KK-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) copy the KK-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than KK in the next loop step.)
+*        3) Save the pivot interchange with the indices relative to the
+*           the original matrix A, not the block A(1:M,1:N).
+*
+         IF( KP.NE.KK ) THEN
+            CALL SSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = ZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL SLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = ZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since SLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by SLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by SLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( SISNAN( TAU(KK) ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( KK )
+            RELMAXC2NRMK = TAU( KK )
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*         i.e. no columns of A to update, only columns of B.
+*         If M < N, then at KK = M-IOFFSET, I = M and we have a
+*         one-row residual matrix in A and the elementary
+*         reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update
+*         is needed for the residual matrix in A and the
+*         right-hand-side-matrix in B.
+*         Therefore, we update only if
+*         KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*         condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = ONE
+            CALL SLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = SNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all colunms have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = ZERO
+      END DO
+*
+      RETURN
+*
+*     End of SLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/slaqp3rk.f b/lapack-netlib/SRC/slaqp3rk.f
new file mode 100644
index 000000000..fa735bb9d
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp3rk.f
@@ -0,0 +1,935 @@
+*> \brief \b SLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLAQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slaqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slaqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slaqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*
+*     .. Scalar Arguments ..
+*      LOGICAL            DONE
+*      INTEGER            KB, LDA, LDF, M, N, NB, NRHS, IOFFSET
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      REAL               A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+*     $                   VN1( * ), VN2( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> is smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**T * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and need to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL or RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, IPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          d) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine SGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine SGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out]
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R.  0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is REAL array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is REAL array, dimension (LDF,NB)
+*>          Matrix F**T = L*(Y**T)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))     is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the actorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+     $                   VN1( * ), VN2( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      REAL               AIK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMM, SGEMV, SLARFG, SSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KF+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+*              Return from the routine.
+*
+               RETURN
+            END IF
+*
+*           Quick return, if the submatrix A(I:M,K:N) is
+*           a zero matrix. We need to check it only if the column index
+*           (same as row index) is larger than 1, since the condition
+*           for the whole original matrix A_orig is checked in the main
+*           routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               RELMAXC2NRMK = ZERO
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix is zero and we stop the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = ZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig;
+*
+                  KB = K - 1
+                  IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and and/or the residual of the right
+*              hand sides exist,  i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when
+*                 KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = ZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to the
+*           the original matrix A_orig, not the block A(1:M,1:N).
+*
+         IF( KP.NE.K ) THEN
+            CALL SSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 )
+            CALL SSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF )
+            VN1( KP ) = VN1( K )
+            VN2( KP ) = VN2( K )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( K )
+            JPIV( K ) = ITEMP
+         END IF
+*
+*        Apply previous Householder reflectors to column K:
+*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T.
+*
+         IF( K.GT.1 ) THEN
+            CALL SGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ),
+     $                  LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 )
+         END IF
+*
+*        Generate elementary reflector H(k) using the column A(I:M,K).
+*
+         IF( I.LT.M ) THEN
+            CALL SLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) )
+         ELSE
+            TAU( K ) = ZERO
+         END IF
+*
+*        Check if TAU(K) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(K) for Inf,
+*        since SLARFG cannot produce TAU(K) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by SLARFG can contain Inf, which requires
+*        TAU(K) to contain NaN. Therefore, this case of generating Inf
+*        by SLARFG is covered by checking TAU(K) for NaN.
+*
+         IF( SISNAN( TAU(K) ) ) THEN
+*
+            DONE = .TRUE.
+*
+*           Set KB, the number of factorized partial columns
+*                   that are non-zero in each step in the block,
+*                   i.e. the rank of the factor R.
+*           Set IF, the number of processed rows in the block, which
+*                   is the same as the number of processed rows in
+*                   the original whole matrix A_orig.
+*
+            KB = K - 1
+            IF = I - 1
+            INFO = K
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( K )
+            RELMAXC2NRMK = TAU( K )
+*
+*           There is no need to apply the block reflector to the
+*           residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*           since the submatrix contains NaN and we stop
+*           the computation.
+*           But, we need to apply the block reflector to the residual
+*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*           residual right hand sides exist.  This occurs
+*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+            IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+               CALL SGEMM( 'No transpose', 'Transpose',
+     $               M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $               F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+            END IF
+*
+*           There is no need to recompute the 2-norm of the
+*           difficult columns, since we stop the factorization.
+*
+*           Array TAU(KF+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+*           Return from the routine.
+*
+            RETURN
+         END IF
+*
+*        ===============================================================
+*
+         AIK = A( I, K )
+         A( I, K ) = ONE
+*
+*        ===============================================================
+*
+*        Compute the current K-th column of F:
+*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K).
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL SGEMV( 'Transpose', M-I+1, N+NRHS-K,
+     $                  TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1,
+     $                  ZERO, F( K+1, K ), 1 )
+         END IF
+*
+*           2) Zero out elements above and on the diagonal of the
+*              column K in matrix F, i.e elements F(1:K,K).
+*
+         DO J = 1, K
+            F( J, K ) = ZERO
+         END DO
+*
+*         3) Incremental updating of the K-th column of F:
+*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T
+*                    * A(I:M,K).
+*
+         IF( K.GT.1 ) THEN
+            CALL SGEMV( 'Transpose', M-I+1, K-1, -TAU( K ),
+     $                  A( I, 1 ), LDA, A( I, K ), 1, ZERO,
+     $                  AUXV( 1 ), 1 )
+*
+            CALL SGEMV( 'No transpose', N+NRHS, K-1, ONE,
+     $                  F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE,
+     $                  F( 1, K ), 1 )
+         END IF
+*
+*        ===============================================================
+*
+*        Update the current I-th row of A:
+*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS)
+*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**T.
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL SGEMV( 'No transpose', N+NRHS-K, K, -ONE,
+     $                  F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE,
+     $                  A( I, K+1 ), LDA )
+         END IF
+*
+         A( I, K ) = AIK
+*
+*        Update the partial column 2-norms for the residual matrix,
+*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*        when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+         IF( K.LT.MINMNFACT ) THEN
+*
+            DO J = K + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ABS( A( I, J ) ) / VN1( J )
+                  TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                    At J-index, we have a difficult column for the
+*                    update of the 2-norm. Save the index of the previous
+*                    difficult column in IWORK(J-1).
+*                    NOTE: ILSTCC > 1, threfore we can use IWORK only
+*                    with N-1 elements, where the elements are
+*                    shifted by 1 to the left.
+*
+                     IWORK( J-1 ) = LSTICC
+*
+*                    Set the index of the last difficult column LSTICC.
+*
+                     LSTICC = J
+*
+                  ELSE
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End of while loop.
+*
+      END DO
+*
+*     Now, afler the loop:
+*        Set KB, the number of factorized columns in the block;
+*        Set IF, the number of processed rows in the block, which
+*                is the same as the number of processed rows in
+*                the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and and/or the residual of the right hand sides
+*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL SGEMM( 'No transpose', 'Transpose',
+     $         M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA,
+     $         F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC is the index of the last difficult column is greater
+*        than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicilty for the last difficult column and
+*        save it in the partial and exact 2-norm vectors VN1 and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact that
+*        SNRM2 does not fail on vectors with norm below the value of
+*        SQRT(SLAMCH('S'))
+*
+         VN1( LSTICC ) = SNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of SLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f
new file mode 100644
index 000000000..f8ef986c7
--- /dev/null
+++ b/lapack-netlib/SRC/zgeqp3rk.f
@@ -0,0 +1,1091 @@
+*> \brief \b ZGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGEQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgeqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgeqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgeqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, RWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       DOUBLE PRECISION   ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       DOUBLE PRECISION   RWORK( * )
+*       COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*> Householder QR factorization with column pivoting of a complex
+*> M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*> that were factorized, i.e. factorization rank of the
+*> factor R, K <= min(M,N).
+*>
+*>  A * P(K) = Q(K) * R(K)  =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * (   R(K)_approx    )
+*>                 ( 0      R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>  P(K)            is an N-by-N permutation matrix;
+*>  Q(K)            is an M-by-M orthogonal matrix;
+*>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                    full rank factor R with K-by-K upper-triangular
+*>                    R11(K) and K-by-N rectangular R12(K). The diagonal
+*>                    entries of R11(K) appear in non-increasing order
+*>                    of absolute value, and absolute values of all of
+*>                    them exceed the maximum column 2-norm of R22(K)
+*>                    up to roundoff error.
+*>  R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                    of the full rank factor R. It is a
+*>                    an (M-K)-by-(N-K) rectangular matrix;
+*>  0               is a an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS
+*> matrix B with  Q(K)**H * B  using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                                  N     NRHS
+*>             array_A   =   M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less or
+*>      equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm matrix of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL or RELTOL are not used,
+*>      and when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>  The algorithm stops when any of these conditions is first
+*>  satisfied, otherwise the whole matrix A is factorized.
+*>
+*>  To factorize the whole matrix A, use the values
+*>  KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*>  The routine returns:
+*>     a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>        R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>        of the factorization; P(K) is represented by JPIV,
+*>        ( if K = min(M,N), R(K)_approx is the full factor R,
+*>        and there is no residual matrix R(K)_residual);
+*>     b) K, the number of columns that were factorized,
+*>        i.e. factorization rank;
+*>     c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>        matrix R(K)_residual = R22(K),
+*>        ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>        column 2-norm of the original matrix A, which is equal
+*>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>     e) Q(K)**H * B, the matrix B with the orthogonal
+*>        transformation Q(K)**H applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV. For 1 <= j <= N, column j
+*> of the matrix A was interchanged with column JPIV(j).
+*>
+*> The M-by-M orthogonal matrix Q is represented as a product
+*> of elementary Householder reflectors
+*>
+*>     Q(K) = H(1) *  H(2) * . . . * H(K),
+*>
+*> where K is the number of columns that were factorized.
+*>
+*> Each H(j) has the form
+*>
+*>     H(j) = I - tau * v * v**H,
+*>
+*> where 1 <= j <= K and
+*>   I    is an M-by-M identity matrix,
+*>   tau  is a complex scalar,
+*>   v    is a complex vector with v(1:j-1) = 0 and v(j) = 1.
+*>
+*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j).
+*>
+*> See the Further Details section for more information.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e. the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M,N), then this stopping criterion
+*>                is not used, the routine factorizes columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B are not modified, and
+*>                the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>
+*>          The second factorization stopping criterion, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix R22(K).
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix R22(K)
+*>          is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S').
+*>
+*>          a) If ABSTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -5 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN
+*>                is used. This includes the case ABSTOL = -0.0.
+*>
+*>          d) If 2*SAFMIN <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If ABSTOL chosen above is >= MAXC2NRM, then this
+*>          stopping criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed. The routine
+*>          returns MAXC2NRM in MAXC2NORMK,
+*>          and 1.0 in RELMAXC2NORMK.
+*>          This includes the case ABSTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION
+*>
+*>          The third factorization stopping criterion, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio
+*>          abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of
+*>          the residual matrix R22(K) to the maximum column 2-norm of
+*>          the original matrix A. The algorithm converges (stops the
+*>          factorization), when abs(R(K+1,K+1))/abs(R(1,1)) A is less
+*>          than or equal to RELTOL. Let EPS = DLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -6 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>                This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>                is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NORMK,
+*>          and 1.0 in RELMAXC2NORMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>            factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R_approx(K) is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K)
+*>                 rectangular matrix R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**H * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>                RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>         NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>               abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>               step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine, for example, even  when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*.          LWORK >= N+NRHS-1
+*>          For optimal performance LWORK >= NB*( N+NRHS+1 ),
+*>          where NB is the optimal block size for ZGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the availbale workspace WORK, but also also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine ZLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> ZGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in ZGEQP3 routine which uses
+*> ZLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (     0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0              0 ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0        0 ) * P(K)**T
+*>                          ( 0   R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+     $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     WORK, LWORK, RWORK, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL,  MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      DOUBLE PRECISION   RWORK( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            INB, INBMIN, IXOVER
+      PARAMETER          ( INB = 1, INBMIN = 2, IXOVER = 3 )
+      DOUBLE PRECISION   ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 )
+      COMPLEX*16         CZERO
+      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, DONE
+      INTEGER            IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX,
+     $                   JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB,
+     $                   NBMIN, NX
+      DOUBLE PRECISION   EPS, HUGEVAL, MAXC2NRM, SAFMIN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLAQP2RK, ZLAQP3RK, XERBLA
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX, ILAENV
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DISNAN, DLAMCH, DZNRM2, IDAMAX, ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test input arguments
+*     ====================
+*
+      INFO = 0
+      LQUERY = ( LWORK.EQ.-1 )
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KMAX.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( DISNAN( ABSTOL ) ) THEN
+         INFO = -5
+      ELSE IF( DISNAN( RELTOL ) ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -8
+      END IF
+*
+*     If the input parameters M, N, NRHS, KMAX, LDA are valid:
+*       a) Test the input workspace size LWORK for the minimum
+*          size requirement IWS.
+*       b) Determine the optimal block size NB and optimal
+*          workspace size LWKOPT to be returned in WORK(1)
+*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE.,
+*          (3) when routine exits.
+*     Here, IWS is the miminum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in ZLAQP2RK.
+*           1) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in ZLARF subroutine inside ZLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'ZGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in ZLAQP2RK and blocked BLAS 3 code
+*           in ZLAQP3RK.
+*           1) ZGEQP3RK, ZLAQP2RK, ZLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in ZLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) ZLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) ZLAQP3RK: NB to use in the auxilixary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = DCMPLX( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*      NOTE: The optimal workspace size is returned in WORK(1), if
+*            the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements WORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements WORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         RWORK( J ) = DZNRM2( M, A( 1, J ), 1 )
+         RWORK( N+J ) = RWORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = IDAMAX( N, RWORK( 1 ), 1 )
+*
+*     ==================================================================.
+*
+      IF( DISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check is the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf  is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = DLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = DLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satified,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'ZGEQP3RK', ' ', M, N, -1, -1 ) )
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'ZGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to rerpresent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block, when the
+*     blocked code is used, is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+*        J   is the column index of a column block;
+*        JB  is the column block size to pass to block factorization
+*            routine in a loop step;
+*        JBF is the number of columns that were actually factorized
+*            that was returned by the block factorization routine
+*            in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL ZLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     RWORK( J ), RWORK( N+J ),
+     $                     WORK( 1 ), WORK( JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block, we can
+*              return from the routine. Perform the following before
+*              returning:
+*                a) Set the number of factorized columns K,
+*                   K = IOFFSET + JBF from the last call of blocked
+*                   routine.
+*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                         by the block factorization routine;
+*                      2) The remaining TAUs are set to ZERO by the
+*                         block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              prcedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = DCMPLX( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns, that is in ELSE clause we need to compute
+*     the MAXC2NORM and RELMAXC2NORM to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL ZLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), RWORK( J ), RWORK( N+J ),
+     $                  WORK( 1 ), IINFO )
+*
+*        ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller then the  number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine, we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurence.
+*
+*        Set INFO on the first exception occurence of Inf or NaN,
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+            K = JMAX
+*
+*        If there exits a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+*
+         IF( K.LT.MINMN ) THEN
+            JMAXC2NRM = K + IDAMAX( N-K, RWORK( K+1 ), 1 )
+            MAXC2NRMK = RWORK( JMAXC2NRM )
+            IF( K.EQ.0 ) THEN
+               RELMAXC2NRMK = ONE
+            ELSE
+               RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+            END IF
+*
+            DO J = K + 1, MINMN
+               TAU( J ) = CZERO
+            END DO
+*
+         ELSE
+            MAXC2NRMK = ZERO
+            RELMAXC2NRMK = ZERO
+*
+         END IF
+*
+*     END IF( J.LE.JMAX ) THEN
+*
+      END IF
+*
+      WORK( 1 ) = DCMPLX( LWKOPT )
+*
+      RETURN
+*
+*     End of ZGEQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/zlaqp2rk.f b/lapack-netlib/SRC/zlaqp2rk.f
new file mode 100644
index 000000000..f1e9f4899
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp2rk.f
@@ -0,0 +1,726 @@
+*> \brief \b ZLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZLAQP2RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlaqp2rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlaqp2rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlaqp2rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+*     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+*     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+*     $                     INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            JPIV( * )
+*      DOUBLE PRECISION   VN1( * ), VN2( * )
+*      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*     $
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLAQP2RK computes a truncated (rank K) or full rank Householder QR
+*> factorization with column pivoting of the complex matrix
+*> block A(IOFFSET+1:M,1:N) as
+*>
+*>   A * P(K) = Q(K) * R(K).
+*>
+*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N)
+*> is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix block B
+*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>             satisfied on input and the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, IPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          d) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine ZGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine ZGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (N-1)
+*>          Used in ZLARF subroutine to apply an elementary
+*>          reflector from the left.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp2rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      DOUBLE PRECISION   VN1( * ), VN2( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+      COMPLEX*16         CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   CONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      DOUBLE PRECISION   HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX*16         AIKK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLARF, ZLARFG, ZSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DZNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the lomn loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK  will be computed later, after somecondition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+                K = KK - 1
+                INFO = K + KP
+*
+*               Set RELMAXC2NRMK to NaN.
+*
+                RELMAXC2NRMK = MAXC2NRMK
+*
+*               Array TAU(K+1:MINMNFACT) is not set and contains
+*               undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,KK:N):
+*        1) swap the KK-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) copy the KK-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than KK in the next loop step.)
+*        3) Save the pivot interchange with the indices relative to the
+*           the original matrix A, not the block A(1:M,1:N).
+*
+         IF( KP.NE.KK ) THEN
+            CALL ZSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = CZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL ZLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = CZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since ZLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by ZLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by ZLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( DISNAN( DBLE( TAU(KK) ) ) ) THEN
+            TAUNAN = DBLE( TAU(KK) )
+         ELSE IF( DISNAN( DIMAG( TAU(KK) ) ) ) THEN
+            TAUNAN = DIMAG( TAU(KK) )
+         ELSE
+            TAUNAN = ZERO
+         END IF
+*
+         IF( DISNAN( TAUNAN ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAUNAN
+            RELMAXC2NRMK = TAUNAN
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*         i.e. no columns of A to update, only columns of B.
+*         If M < N, then at KK = M-IOFFSET, I = M and we have a
+*         one-row residual matrix in A and the elementary
+*         reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update
+*         is needed for the residual matrix in A and the
+*         right-hand-side-matrix in B.
+*         Therefore, we update only if
+*         KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*         condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = CONE
+            CALL ZLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  DCONJG( TAU( KK ) ), A( I, KK+1 ), LDA,
+     $                  WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = DZNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all colunms have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = CZERO
+      END DO
+*
+      RETURN
+*
+*     End of ZLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/zlaqp3rk.f b/lapack-netlib/SRC/zlaqp3rk.f
new file mode 100644
index 000000000..7a9fdfd95
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp3rk.f
@@ -0,0 +1,947 @@
+*> \brief \b ZLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZLAQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlaqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlaqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlaqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      DOUBLE PRECISION   VN1( * ), VN2( * )
+*      COMPLEX*16         A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> is smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,1:N+1:N+NRHS) with Q(KB)**H * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and need to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL or RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, IPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          d) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine ZGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine ZGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out]
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R.  0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is COMPLEX*16 array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is COMPLEX*16 array, dimension (LDF,NB)
+*>          Matrix F**H = L*(Y**H)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))     is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the actorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  Computer Science Division,
+*>                  University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      DOUBLE PRECISION   VN1( * ), VN2( * )
+      COMPLEX*16         A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+      COMPLEX*16         CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   CONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      DOUBLE PRECISION   HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX*16         AIK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMM, ZGEMV, ZLARFG, ZSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DZNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KF+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+*              Return from the routine.
+*
+               RETURN
+            END IF
+*
+*           Quick return, if the submatrix A(I:M,K:N) is
+*           a zero matrix. We need to check it only if the column index
+*           (same as row index) is larger than 1, since the condition
+*           for the whole original matrix A_orig is checked in the main
+*           routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               RELMAXC2NRMK = ZERO
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix is zero and we stop the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig;
+*
+                  KB = K - 1
+                  IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and and/or the residual of the right
+*              hand sides exist,  i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when
+*                 KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to the
+*           the original matrix A_orig, not the block A(1:M,1:N).
+*
+         IF( KP.NE.K ) THEN
+            CALL ZSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 )
+            CALL ZSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF )
+            VN1( KP ) = VN1( K )
+            VN2( KP ) = VN2( K )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( K )
+            JPIV( K ) = ITEMP
+         END IF
+*
+*        Apply previous Householder reflectors to column K:
+*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H.
+*
+         IF( K.GT.1 ) THEN
+            DO J = 1, K - 1
+               F( K, J ) = DCONJG( F( K, J ) )
+            END DO
+            CALL ZGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ),
+     $                  LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 )
+            DO J = 1, K - 1
+               F( K, J ) = DCONJG( F( K, J ) )
+            END DO
+         END IF
+*
+*        Generate elementary reflector H(k) using the column A(I:M,K).
+*
+         IF( I.LT.M ) THEN
+            CALL ZLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) )
+         ELSE
+            TAU( K ) = CZERO
+         END IF
+*
+*        Check if TAU(K) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(K) for Inf,
+*        since ZLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by ZLARFG can contain Inf, which requires
+*        TAU(K) to contain NaN. Therefore, this case of generating Inf
+*        by ZLARFG is covered by checking TAU(K) for NaN.
+*
+         IF( DISNAN( DBLE( TAU(K) ) ) ) THEN
+            TAUNAN = DBLE( TAU(K) )
+         ELSE IF( DISNAN( DIMAG( TAU(K) ) ) ) THEN
+            TAUNAN = DIMAG( TAU(K) )
+         ELSE
+            TAUNAN = ZERO
+         END IF
+*
+         IF( DISNAN( TAUNAN ) ) THEN
+*
+            DONE = .TRUE.
+*
+*           Set KB, the number of factorized partial columns
+*                   that are non-zero in each step in the block,
+*                   i.e. the rank of the factor R.
+*           Set IF, the number of processed rows in the block, which
+*                   is the same as the number of processed rows in
+*                   the original whole matrix A_orig.
+*
+            KB = K - 1
+            IF = I - 1
+            INFO = K
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAUNAN
+            RELMAXC2NRMK = TAUNAN
+*
+*           There is no need to apply the block reflector to the
+*           residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*           since the submatrix contains NaN and we stop
+*           the computation.
+*           But, we need to apply the block reflector to the residual
+*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*           residual right hand sides exist.  This occurs
+*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+*
+            IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+               CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $               M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $               F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+            END IF
+*
+*           There is no need to recompute the 2-norm of the
+*           difficult columns, since we stop the factorization.
+*
+*           Array TAU(KF+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+*           Return from the routine.
+*
+            RETURN
+         END IF
+*
+*        ===============================================================
+*
+         AIK = A( I, K )
+         A( I, K ) = CONE
+*
+*        ===============================================================
+*
+*        Compute the current K-th column of F:
+*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K).
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL ZGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K,
+     $                  TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1,
+     $                  CZERO, F( K+1, K ), 1 )
+         END IF
+*
+*           2) Zero out elements above and on the diagonal of the
+*              column K in matrix F, i.e elements F(1:K,K).
+*
+         DO J = 1, K
+            F( J, K ) = CZERO
+         END DO
+*
+*         3) Incremental updating of the K-th column of F:
+*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H
+*                    * A(I:M,K).
+*
+         IF( K.GT.1 ) THEN
+            CALL ZGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ),
+     $                   A( I, 1 ), LDA, A( I, K ), 1, CZERO,
+     $                   AUXV( 1 ), 1 )
+*
+            CALL ZGEMV( 'No transpose', N+NRHS, K-1, CONE,
+     $                  F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE,
+     $                  F( 1, K ), 1 )
+         END IF
+*
+*        ===============================================================
+*
+*        Update the current I-th row of A:
+*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS)
+*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**H.
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $            1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA,
+     $            F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA )
+         END IF
+*
+         A( I, K ) = AIK
+*
+*        Update the partial column 2-norms for the residual matrix,
+*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*        when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+         IF( K.LT.MINMNFACT ) THEN
+*
+            DO J = K + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ABS( A( I, J ) ) / VN1( J )
+                  TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                    At J-index, we have a difficult column for the
+*                    update of the 2-norm. Save the index of the previous
+*                    difficult column in IWORK(J-1).
+*                    NOTE: ILSTCC > 1, threfore we can use IWORK only
+*                    with N-1 elements, where the elements are
+*                    shifted by 1 to the left.
+*
+                     IWORK( J-1 ) = LSTICC
+*
+*                    Set the index of the last difficult column LSTICC.
+*
+                     LSTICC = J
+*
+                  ELSE
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End of while loop.
+*
+      END DO
+*
+*     Now, afler the loop:
+*        Set KB, the number of factorized columns in the block;
+*        Set IF, the number of processed rows in the block, which
+*                is the same as the number of processed rows in
+*                the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and and/or the residual of the right hand sides
+*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $         M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA,
+     $         F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC is the index of the last difficult column is greater
+*        than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicilty for the last difficult column and
+*        save it in the partial and exact 2-norm vectors VN1 and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact that
+*        DZNRM2 does not fail on vectors with norm below the value of
+*        SQRT(DLAMCH('S'))
+*
+         VN1( LSTICC ) = DZNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of ZLAQP3RK
+*
+      END

From 40109c03927e3b5249f6e34dfe9279b15075fd8a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 09:50:30 +0100
Subject: [PATCH 417/718] Implement truncated QR with pivoting
 (Reference-LAPACK PR 891)

---
 lapack-netlib/TESTING/EIG/alareq.f | 6 +++---
 lapack-netlib/TESTING/EIG/alarqg.f | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/lapack-netlib/TESTING/EIG/alareq.f b/lapack-netlib/TESTING/EIG/alareq.f
index 2585a686a..2cbe6db38 100644
--- a/lapack-netlib/TESTING/EIG/alareq.f
+++ b/lapack-netlib/TESTING/EIG/alareq.f
@@ -28,12 +28,12 @@
 *> to evaluate the input line which requested NMATS matrix types for
 *> PATH.  The flow of control is as follows:
 *>
-*> If NMATS = NTYPES then
+*> IF NMATS = NTYPES THEN
 *>    DOTYPE(1:NTYPES) = .TRUE.
-*> else
+*> ELSE
 *>    Read the next input line for NMATS matrix types
 *>    Set DOTYPE(I) = .TRUE. for each valid type I
-*> endif
+*> END IF
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/TESTING/EIG/alarqg.f b/lapack-netlib/TESTING/EIG/alarqg.f
index 6e2e6e7ec..b9fb88c65 100644
--- a/lapack-netlib/TESTING/EIG/alarqg.f
+++ b/lapack-netlib/TESTING/EIG/alarqg.f
@@ -28,12 +28,12 @@
 *> to evaluate the input line which requested NMATS matrix types for
 *> PATH.  The flow of control is as follows:
 *>
-*> If NMATS = NTYPES then
+*> IF NMATS = NTYPES THEN
 *>    DOTYPE(1:NTYPES) = .TRUE.
-*> else
+*> ELSE
 *>    Read the next input line for NMATS matrix types
 *>    Set DOTYPE(I) = .TRUE. for each valid type I
-*> endif
+*> END IF
 *> \endverbatim
 *
 *  Arguments:

From 387830b9d5df8956483d515fce3025502d325dea Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 09:53:06 +0100
Subject: [PATCH 418/718] Implement truncated QR with pivoting
 (Reference-LAPACK PR 891)

---
 lapack-netlib/TESTING/LIN/CMakeLists.txt |  11 +-
 lapack-netlib/TESTING/LIN/Makefile       |  48 +-
 lapack-netlib/TESTING/LIN/alaerh.f       |  17 +
 lapack-netlib/TESTING/LIN/alahd.f        |  62 +-
 lapack-netlib/TESTING/LIN/alareq.f       |   6 +-
 lapack-netlib/TESTING/LIN/cchkaa.F       |  43 +-
 lapack-netlib/TESTING/LIN/cchkqp3rk.f    | 836 +++++++++++++++++++++++
 lapack-netlib/TESTING/LIN/clatb4.f       | 121 +++-
 lapack-netlib/TESTING/LIN/cqpt01.f       |  23 +-
 lapack-netlib/TESTING/LIN/cqrt11.f       |   4 +-
 lapack-netlib/TESTING/LIN/cqrt12.f       |  19 +-
 lapack-netlib/TESTING/LIN/dchkaa.F       |  48 +-
 lapack-netlib/TESTING/LIN/dchkq3.f       |   2 +-
 lapack-netlib/TESTING/LIN/dchkqp3rk.f    | 832 ++++++++++++++++++++++
 lapack-netlib/TESTING/LIN/dlatb4.f       | 123 +++-
 lapack-netlib/TESTING/LIN/dqpt01.f       |  42 +-
 lapack-netlib/TESTING/LIN/dqrt11.f       |   4 +-
 lapack-netlib/TESTING/LIN/dqrt12.f       |  22 +-
 lapack-netlib/TESTING/LIN/schkaa.F       |  48 +-
 lapack-netlib/TESTING/LIN/schkqp3rk.f    | 831 ++++++++++++++++++++++
 lapack-netlib/TESTING/LIN/slatb4.f       | 121 +++-
 lapack-netlib/TESTING/LIN/sqpt01.f       |  23 +-
 lapack-netlib/TESTING/LIN/sqrt11.f       |   4 +-
 lapack-netlib/TESTING/LIN/sqrt12.f       |  18 +-
 lapack-netlib/TESTING/LIN/zchkaa.F       |  59 +-
 lapack-netlib/TESTING/LIN/zchkqp3rk.f    | 836 +++++++++++++++++++++++
 lapack-netlib/TESTING/LIN/zlatb4.f       | 121 +++-
 lapack-netlib/TESTING/LIN/zqpt01.f       |  22 +-
 lapack-netlib/TESTING/LIN/zqrt11.f       |   4 +-
 lapack-netlib/TESTING/LIN/zqrt12.f       |  20 +-
 30 files changed, 4132 insertions(+), 238 deletions(-)
 create mode 100644 lapack-netlib/TESTING/LIN/cchkqp3rk.f
 create mode 100644 lapack-netlib/TESTING/LIN/dchkqp3rk.f
 create mode 100644 lapack-netlib/TESTING/LIN/schkqp3rk.f
 create mode 100644 lapack-netlib/TESTING/LIN/zchkqp3rk.f

diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index 676857a80..e28818c76 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -9,7 +9,7 @@ set(DZLNTST dlaord.f)
 set(SLINTST schkaa.F
    schkeq.f schkgb.f schkge.f schkgt.f
    schklq.f schkpb.f schkpo.f schkps.f schkpp.f
-   schkpt.f schkq3.f schkql.f schkqr.f schkrq.f
+   schkpt.f schkq3.f schkqp3rk.f schkql.f schkqr.f schkrq.f
    schksp.f schksy.f schksy_rook.f schksy_rk.f 
    schksy_aa.f schksy_aa_2stage.f 
    schktb.f schktp.f schktr.f
@@ -56,7 +56,7 @@ set(CLINTST cchkaa.F
    cchkhe.f cchkhe_rook.f cchkhe_rk.f 
    cchkhe_aa.f cchkhe_aa_2stage.f
    cchkhp.f cchklq.f cchkpb.f
-   cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkql.f
+   cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkqp3rk.f cchkql.f
    cchkqr.f cchkrq.f cchksp.f cchksy.f cchksy_rook.f cchksy_rk.f
    cchksy_aa.f cchksy_aa_2stage.f
    cchktb.f
@@ -110,7 +110,7 @@ endif()
 set(DLINTST dchkaa.F
    dchkeq.f dchkgb.f dchkge.f dchkgt.f
    dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f
-   dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f
+   dchkpt.f dchkq3.f dchkqp3rk.f dchkql.f dchkqr.f dchkrq.f
    dchksp.f dchksy.f dchksy_rook.f dchksy_rk.f 
    dchksy_aa.f dchksy_aa_2stage.f
    dchktb.f dchktp.f dchktr.f
@@ -158,7 +158,7 @@ set(ZLINTST zchkaa.F
    zchkhe.f zchkhe_rook.f zchkhe_rk.f 
    zchkhe_aa.f zchkhe_aa_2stage.f
    zchkhp.f zchklq.f zchkpb.f
-   zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkql.f
+   zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkqp3rk.f zchkql.f
    zchkqr.f zchkrq.f zchksp.f zchksy.f zchksy_rook.f zchksy_rk.f
    zchksy_aa.f  zchksy_aa_2stage.f
    zchktb.f
@@ -239,8 +239,7 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
 
 macro(add_lin_executable name)
   add_executable(${name} ${ARGN})
-  target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
-#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
+  target_link_libraries(${name} ${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 
 if(BUILD_SINGLE)
diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile
index 64abc4dba..46e096c2f 100644
--- a/lapack-netlib/TESTING/LIN/Makefile
+++ b/lapack-netlib/TESTING/LIN/Makefile
@@ -45,7 +45,7 @@ DZLNTST = dlaord.o
 SLINTST = schkaa.o \
    schkeq.o schkgb.o schkge.o schkgt.o \
    schklq.o schkpb.o schkpo.o schkps.o schkpp.o \
-   schkpt.o schkq3.o schkql.o schkqr.o schkrq.o \
+   schkpt.o schkq3.o schkqp3rk.o schkql.o schkqr.o schkrq.o \
    schksp.o schksy.o schksy_rook.o schksy_rk.o  \
    schksy_aa.o schksy_aa_2stage.o schktb.o schktp.o schktr.o \
    schktz.o \
@@ -89,7 +89,7 @@ CLINTST = cchkaa.o \
    cchkeq.o cchkgb.o cchkge.o cchkgt.o \
    cchkhe.o cchkhe_rook.o cchkhe_rk.o  \
    cchkhe_aa.o cchkhe_aa_2stage.o cchkhp.o cchklq.o cchkpb.o \
-   cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkql.o \
+   cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkqp3rk.o cchkql.o \
    cchkqr.o cchkrq.o cchksp.o cchksy.o cchksy_rook.o cchksy_rk.o \
    cchksy_aa.o cchksy_aa_2stage.o cchktb.o \
    cchktp.o cchktr.o cchktz.o \
@@ -137,7 +137,7 @@ endif
 DLINTST = dchkaa.o \
    dchkeq.o dchkgb.o dchkge.o dchkgt.o \
    dchklq.o dchkpb.o dchkpo.o dchkps.o dchkpp.o \
-   dchkpt.o dchkq3.o dchkql.o dchkqr.o dchkrq.o \
+   dchkpt.o dchkq3.o dchkqp3rk.o dchkql.o dchkqr.o dchkrq.o \
    dchksp.o dchksy.o dchksy_rook.o dchksy_rk.o  \
    dchksy_aa.o dchksy_aa_2stage.o dchktb.o dchktp.o dchktr.o \
    dchktz.o \
@@ -182,7 +182,7 @@ ZLINTST = zchkaa.o \
    zchkeq.o zchkgb.o zchkge.o zchkgt.o \
    zchkhe.o zchkhe_rook.o zchkhe_rk.o zchkhe_aa.o zchkhe_aa_2stage.o \
    zchkhp.o zchklq.o zchkpb.o \
-   zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkql.o \
+   zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkqp3rk.o zchkql.o \
    zchkqr.o zchkrq.o zchksp.o zchksy.o zchksy_rook.o zchksy_rk.o \
    zchksy_aa.o zchksy_aa_2stage.o zchktb.o \
    zchktp.o zchktr.o zchktz.o \
@@ -269,35 +269,35 @@ proto-double: xlintstds xlintstrfd
 proto-complex: xlintstrfc
 proto-complex16: xlintstzc xlintstrfz
 
-xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 $(ALINTST): $(FRC)
 $(SCLNTST): $(FRC)
diff --git a/lapack-netlib/TESTING/LIN/alaerh.f b/lapack-netlib/TESTING/LIN/alaerh.f
index 1845888a6..6c8a47f1e 100644
--- a/lapack-netlib/TESTING/LIN/alaerh.f
+++ b/lapack-netlib/TESTING/LIN/alaerh.f
@@ -797,6 +797,18 @@
             WRITE( NOUT, FMT = 9978 )
      $     SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT
          END IF
+*
+      ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN
+*
+*        xQK:  truncated QR factorization with pivoting
+*
+         IF( LSAMEN( 7, SUBNAM( 2: 8 ), 'GEQP3RK' )  ) THEN
+            WRITE( NOUT, FMT = 9930 )
+     $     SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, KL, N5, IMAT
+         ELSE IF( LSAMEN( 5, SUBNAM( 2: 6 ), 'LATMS' ) ) THEN
+            WRITE( NOUT, FMT = 9978 )
+     $     SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT
+         END IF
 *
       ELSE IF( LSAMEN( 2, P2, 'LQ' ) ) THEN
 *
@@ -1147,6 +1159,11 @@
 *     What we do next
 *
  9949 FORMAT( ' ==> Doing only the condition estimate for this case' )
+*
+*     SUBNAM, INFO, M, N, NB, IMAT
+*
+ 9930 FORMAT( ' *** Error code from ', A, '=', I5, / ' ==> M =', I5,
+     $      ', N =', I5, ', NX =', I5, ', NB =', I4, ', type ', I2 )
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f
index dd75394b3..8f966c584 100644
--- a/lapack-netlib/TESTING/LIN/alahd.f
+++ b/lapack-netlib/TESTING/LIN/alahd.f
@@ -584,13 +584,27 @@
 *
 *        QR decomposition with column pivoting
 *
-         WRITE( IOUNIT, FMT = 9986 )PATH
+         WRITE( IOUNIT, FMT = 8006 )PATH
          WRITE( IOUNIT, FMT = 9969 )
          WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' )
          WRITE( IOUNIT, FMT = 9940 )1
          WRITE( IOUNIT, FMT = 9939 )2
          WRITE( IOUNIT, FMT = 9938 )3
          WRITE( IOUNIT, FMT = '( '' Messages:'' )' )
+*
+      ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN
+*
+*        truncated QR decomposition with column pivoting
+*
+         WRITE( IOUNIT, FMT = 8006 )PATH
+         WRITE( IOUNIT, FMT = 9871 )
+         WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' )
+         WRITE( IOUNIT, FMT = 8060 )1
+         WRITE( IOUNIT, FMT = 8061 )2
+         WRITE( IOUNIT, FMT = 8062 )3
+         WRITE( IOUNIT, FMT = 8063 )4
+         WRITE( IOUNIT, FMT = 8064 )5
+         WRITE( IOUNIT, FMT = '( '' Messages:'' )' )
 *
       ELSE IF( LSAMEN( 2, P2, 'TZ' ) ) THEN
 *
@@ -779,6 +793,8 @@
      $       'tall-skinny or short-wide matrices' )
  8005 FORMAT( / 1X, A3, ':  Householder reconstruction from TSQR',
      $       ' factorization output ', /,' for tall-skinny matrices.' )
+ 8006 FORMAT( / 1X, A3, ':  truncated QR factorization',
+     $        ' with column pivoting' )
 *
 *     GE matrix types
 *
@@ -922,6 +938,36 @@
      $      / 4X, '3. Geometric distribution', 10X,
      $      '6. Every second column fixed' )
 *
+*     QK matrix types
+*
+ 9871 FORMAT( 4X, ' 1. Zero matrix', /
+     $        4X, ' 2. Random, Diagonal, CNDNUM = 2', /
+     $        4X, ' 3. Random, Upper triangular, CNDNUM = 2', /
+     $        4X, ' 4. Random, Lower triangular, CNDNUM = 2', /
+     $        4X, ' 5. Random, First column is zero, CNDNUM = 2', /
+     $        4X, ' 6. Random, Last MINMN column is zero, CNDNUM = 2', /
+     $        4X, ' 7. Random, Last N column is zero, CNDNUM = 2', /
+     $        4X, ' 8. Random, Middle column in MINMN is zero,',
+     $               ' CNDNUM = 2', /
+     $        4X, ' 9. Random, First half of MINMN columns are zero,',
+     $                 ' CNDNUM = 2', /
+     $        4X, '10. Random, Last columns are zero starting from',
+     $                 ' MINMN/2+1, CNDNUM = 2', /
+     $        4X, '11. Random, Half MINMN columns in the middle are',
+     $                 ' zero starting from MINMN/2-(MINMN/2)/2+1,'
+     $                 ' CNDNUM = 2', /
+     $        4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', /
+     $        4X, '13. Random, Even columns are ZERO, CNDNUM = 2', /
+     $        4X, '14. Random, CNDNUM = 2', /
+     $        4X, '15. Random, CNDNUM = sqrt(0.1/EPS)', /
+     $        4X, '16. Random, CNDNUM = 0.1/EPS', /
+     $        4X, '17. Random, CNDNUM = 0.1/EPS,',
+     $                 ' one small singular value S(N)=1/CNDNUM', /
+     $        4X, '18. Random, CNDNUM = 2, scaled near underflow,',
+     $                 ' NORM = SMALL = SAFMIN', /
+     $        4X, '19. Random, CNDNUM = 2, scaled near overflow,',
+     $            ' NORM = LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )' )
+*
 *     TZ matrix types
 *
  9968 FORMAT( ' Matrix types (2-3 have condition 1/EPS):', / 4X,
@@ -1030,9 +1076,8 @@
      $      ' * norm(C) * EPS )' )
  9940 FORMAT( 3X, I2, ': norm(svd(A) - svd(R)) / ',
      $      '( M * norm(svd(R)) * EPS )' )
- 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R )     / ( M * norm(A) * EPS )'
-     $       )
- 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q )      / ( M * EPS )' )
+ 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )')
+ 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' )
  9937 FORMAT( 3X, I2, ': norm( A - R*Q )       / ( M * norm(A) * EPS )'
      $       )
  9935 FORMAT( 3X, I2, ': norm( B - A * X )   / ',
@@ -1105,6 +1150,15 @@
  8054 FORMAT(3X,I2,': norm( C*Q - C*Q ) / ( M * norm(C) * EPS )' )
  8055 FORMAT(3X,I2,': norm( C*Q'' - C*Q'' ) / ( M * norm(C) * EPS )')
 
+ 8060 FORMAT( 3X, I2, ': 2-norm(svd(A) - svd(R)) / ',
+     $      '( max(M,N) * 2-norm(svd(R)) * EPS )' )
+ 8061 FORMAT( 3X, I2, ': 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A)',
+     $                ' * EPS )')
+ 8062 FORMAT( 3X, I2, ': 1-norm( I - Q''*Q ) / ( M * EPS )' )
+ 8063 FORMAT( 3X, I2, ': Returns 1.0D+100, if abs(R(K+1,K+1))',
+     $                 ' > abs(R(K,K)), where K=1:KFACT-1' )
+ 8064 FORMAT( 3X, I2, ': 1-norm(Q**T * B - Q**T * B ) / ( M * EPS )')
+
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/LIN/alareq.f b/lapack-netlib/TESTING/LIN/alareq.f
index db18775eb..3f057fa48 100644
--- a/lapack-netlib/TESTING/LIN/alareq.f
+++ b/lapack-netlib/TESTING/LIN/alareq.f
@@ -28,12 +28,12 @@
 *> to evaluate the input line which requested NMATS matrix types for
 *> PATH.  The flow of control is as follows:
 *>
-*> If NMATS = NTYPES then
+*> IF NMATS = NTYPES THEN
 *>    DOTYPE(1:NTYPES) = .TRUE.
-*> else
+*> ELSE
 *>    Read the next input line for NMATS matrix types
 *>    Set DOTYPE(I) = .TRUE. for each valid type I
-*> endif
+*> END IF
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F
index ec1534ed4..474454a51 100644
--- a/lapack-netlib/TESTING/LIN/cchkaa.F
+++ b/lapack-netlib/TESTING/LIN/cchkaa.F
@@ -69,6 +69,7 @@
 *> CLQ    8               List types on next line if 0 < NTYPES <  8
 *> CQL    8               List types on next line if 0 < NTYPES <  8
 *> CQP    6               List types on next line if 0 < NTYPES <  6
+*> ZQK   19               List types on next line if 0 < NTYPES < 19
 *> CTZ    3               List types on next line if 0 < NTYPES <  3
 *> CLS    6               List types on next line if 0 < NTYPES <  6
 *> CEQ
@@ -153,12 +154,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      REAL               S( 2*NMAX )
-      COMPLEX            E( NMAX )
 *     ..
 *     .. Allocatable Arrays ..
       INTEGER AllocateStatus
-      REAL, DIMENSION(:), ALLOCATABLE :: RWORK
+      REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S
+      COMPLEX, DIMENSION(:), ALLOCATABLE :: E
       COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK
 *     ..
 *     .. External Functions ..
@@ -170,14 +170,14 @@
       EXTERNAL           ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE,
      $                   CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP,
      $                   CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS,
-     $                   CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ,
-     $                   CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK,
-     $                   CCHKSY_AA, CCHKTB,  CCHKTP, CCHKTR, CCHKTZ,
-     $                   CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK,
-     $                   CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB,
-     $                   CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY,
-     $                   CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER,
-     $                   CCHKQRT, CCHKQRTP
+     $                   CCHKPP, CCHKPT, CCHKQ3, CCHKQP3RK, CCHKQL,
+     $                   CCHKQR, CCHKRQ, CCHKSP, CCHKSY, CCHKSY_ROOK,
+     $                   CCHKSY_RK, CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR,
+     $                   CCHKTZ, CDRVGB, CDRVGE, CDRVGT, CDRVHE,
+     $                   CDRVHE_ROOK, CDRVHE_RK, CDRVHE_AA, CDRVHP,
+     $                   CDRVLS, CDRVPB, CDRVPO, CDRVPP, CDRVPT, CDRVSP,
+     $                   CDRVSY, CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA,
+     $                   ILAVER, CCHKQRT, CCHKQRTP
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -203,6 +203,10 @@
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
       ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( E( NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus)
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
       ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
 *     ..
@@ -1109,6 +1113,23 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                   NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                   A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                   S( 1 ), B( 1, 4 ),
+     $                   WORK, RWORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN
 *
diff --git a/lapack-netlib/TESTING/LIN/cchkqp3rk.f b/lapack-netlib/TESTING/LIN/cchkqp3rk.f
new file mode 100644
index 000000000..79d6add72
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/cchkqp3rk.f
@@ -0,0 +1,836 @@
+*> \brief \b CCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*     $                      B, COPYB, S, TAU,
+*     $                      WORK, RWORK, IWORK, NOUT )
+*      IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER            NM, NN, NNB, NOUT
+*       REAL               THRESH
+*       ..
+*       .. Array Arguments ..
+*       LOGICAL            DOTYPE( * )
+*       INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ),
+*      $                   NXVAL( * )
+*       REAL               S( * ), RWORK( * )
+*       COMPLEX            A( * ), COPYA( * ), TAU( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CCHKQP3RK tests CGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing.  Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNS
+*> \verbatim
+*>          NNS is INTEGER
+*>          The number of values of NRHS contained in the vector NSVAL.
+*> \endverbatim
+*>
+*> \param[in] NSVAL
+*> \verbatim
+*>          NSVAL is INTEGER array, dimension (NNS)
+*>          The values of the number of right hand sides NRHS.
+*> \endverbatim
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB and NX contained in the
+*>          vectors NBVAL and NXVAL.  The blocking parameters are used
+*>          in pairs (NB,NX).
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NNB)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NXVAL
+*> \verbatim
+*>          NXVAL is INTEGER array, dimension (NNB)
+*>          The values of the crossover point NX.
+*> \endverbatim
+*>
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is REAL
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (MMAX*NMAX)
+*>          where MMAX is the maximum value of M in MVAL and NMAX is the
+*>          maximum value of N in NVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYA
+*> \verbatim
+*>          COPYA is COMPLEX array, dimension (MMAX*NMAX)
+*> \endverbatim
+*>
+*> \param[out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (MMAX*NSMAX)
+*>          where MMAX is the maximum value of M in MVAL and NSMAX is the
+*>          maximum value of NRHS in NSVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYB
+*> \verbatim
+*>          COPYB is COMPLEX array, dimension (MMAX*NSMAX)
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is REAL array, dimension
+*>                      (min(MMAX,NMAX))
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (MMAX)
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension
+*>                      (max(M*max(M,N) + 4*min(M,N) + max(M,N)))
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (4*NMAX)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (2*NMAX)
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex_lin
+*
+*  =====================================================================
+      SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+     $                      B, COPYB, S, TAU,
+     $                      WORK, RWORK, IWORK, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            NM, NN, NNB, NNS, NOUT
+      REAL               THRESH
+*     ..
+*     .. Array Arguments ..
+      LOGICAL            DOTYPE( * )
+      INTEGER            IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ),
+     $                   NSVAL( * ), NXVAL( * )
+      REAL               S( * ), RWORK( * )
+      COMPLEX            A( * ), COPYA( * ), B( * ), COPYB( * ),
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTYPES
+      PARAMETER          ( NTYPES = 19 )
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 5 )
+      REAL               ONE, ZERO, BIGNUM
+      COMPLEX            CONE, CZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0,
+     $                   CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   CONE = ( 1.0E+0, 0.0E+0 ),
+     $                   BIGNUM = 1.0E+38 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          DIST, TYPE
+      CHARACTER*3        PATH
+      INTEGER            I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO,
+     $                   INB, IND_OFFSET_GEN,
+     $                   IND_IN, IND_OUT, INS, INFO,
+     $                   ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO,
+     $                   KFACT, KL, KMAX, KU, LDA, LW, LWORK,
+     $                   LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N,
+     $                   NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS,
+     $                   NRUN, NX, T
+      REAL               ANORM, CNDNUM, EPS, ABSTOL, RELTOL,
+     $                   DTEMP, MAXC2NRMK, RELMAXC2NRMK
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 ), ISEEDY( 4 )
+      REAL               RESULT( NTESTS ), RDUMMY( 1 )
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE
+      EXTERNAL           SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAERH, ALAHD, ALASUM, SLAORD, ICOPY, CAXPY,
+     $                   XLAENV, CGEQP3RK, CLACPY, CLASET, CLATB4,
+     $                   CLATMS, CUNMQR, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, MOD, REAL
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER*32       SRNAMT
+      INTEGER            INFOT, IOUNIT, CUNMQR_LWORK
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, IOUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA               ISEEDY / 1988, 1989, 1990, 1991 /
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants and the random number seed.
+*
+      PATH( 1: 1 ) = 'Complex precision'
+      PATH( 2: 3 ) = 'QK'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+      DO  I = 1, 4
+         ISEED( I ) = ISEEDY( I )
+      END DO
+      EPS = SLAMCH( 'Epsilon' )
+      INFOT = 0
+*
+      DO IM = 1, NM
+*
+*        Do for each value of M in MVAL.
+*
+         M = MVAL( IM )
+         LDA = MAX( 1, M )
+*
+         DO IN = 1, NN
+*
+*           Do for each value of N in NVAL.
+*
+            N = NVAL( IN )
+            MINMN = MIN( M, N )
+            LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ),
+     $                   M*N + 2*MINMN + 4*N )
+*
+            DO INS = 1, NNS
+               NRHS = NSVAL( INS )
+*
+*                 Set up parameters with CLATB4 and generate
+*                 M-by-NRHS B matrix with CLATMS.
+*                 IMAT = 14:
+*                 Random matrix, CNDNUM = 2, NORM = ONE,
+*                 MODE = 3 (geometric distribution of singular values).
+*
+                  CALL CLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+                  CALL CLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYB, LDA, WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M,
+     $                            NRHS, -1, -1, -1, 6, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+               DO IMAT = 1, NTYPES
+*
+*              Do the tests only if DOTYPE( IMAT ) is true.
+*
+               IF( .NOT.DOTYPE( IMAT ) )
+     $            CYCLE
+*
+*              The type of distribution used to generate the random
+*              eigen-/singular values:
+*              ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 )
+*
+*           Do for each type of NON-SYMMETRIC matrix:                               CNDNUM                     NORM                                     MODE
+*            1. Zero matrix
+*            2. Random, Diagonal, CNDNUM = 2                                        CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            3. Random, Upper triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            4. Random, Lower triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            5. Random, First column is zero, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            6. Random, Last MINMN column is zero, CNDNUM = 2                       CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            7. Random, Last N column is zero, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            8. Random, Middle column in MINMN is zero, CNDNUM = 2                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            9. Random, First half of MINMN columns are zero, CNDNUM = 2            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2   CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           11. Random, Half MINMN columns in the middle are zero starting
+*                  from  MINMN/2-(MINMN/2)/2+1, CNDNUM = 2                          CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           12. Random, Odd columns are ZERO, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           13. Random, Even columns are ZERO, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           14. Random, CNDNUM = 2                                                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           15. Random, CNDNUM = sqrt(0.1/EPS)                                      CNDNUM = BADC1 = sqrt(0.1/EPS)  ONE                                      3 ( geometric distribution of singular values )
+*           16. Random, CNDNUM = 0.1/EPS                                            CNDNUM = BADC2 = 0.1/EPS        ONE                                      3 ( geometric distribution of singular values )
+*           17. Random, CNDNUM = 0.1/EPS,                                           CNDNUM = BADC2 = 0.1/EPS        ONE                                      2 ( one small singular value, S(N)=1/CNDNUM )
+*                 one small singular value S(N)=1/CNDNUM
+*           18. Random, CNDNUM = 2, scaled near underflow                           CNDNUM = 2                      SMALL = SAFMIN
+*           19. Random, CNDNUM = 2, scaled near overflow                            CNDNUM = 2                      LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )  3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL CLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-5.
+*
+*                 Set up parameters with DLATB4 and generate a test
+*                 matrix with CLATMS.
+*
+                  CALL CLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+                  CALL CLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL SLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices MINMN >=2.
+*
+*                 JB_ZERO is the column index of ZERO block.
+*                 NB_ZERO is the column block size of ZERO block.
+*                 NB_GEN is the column blcok size of the
+*                 generated block.
+*                 J_INC in the non_zero column index increment
+*                 for matrix 12 and 13.
+*                 J_FIRS_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns is zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero,
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL CLASET( 'Full', M, NB_ZERO, CZERO, CZERO,
+     $                         COPYA, LDA )
+*
+*                    2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                       chosen singular value distribution
+*                       in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL CLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL CLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                        CNDNUM, ANORM, KL, KU, 'No packing',
+     $                        COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                        WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the gererated colums from the right side
+*                 NB_GEN-size block in COPYA into correct column
+*                 positions.
+*
+                  IF( IMAT.EQ.6
+     $                    .OR. IMAT.EQ.7
+     $                    .OR. IMAT.EQ.8
+     $                    .OR. IMAT.EQ.10
+     $                    .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL CSWAP( M,
+     $                        COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                        COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero colums in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero colums in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                            + 1
+                        CALL CSWAP( M,
+     $                              COPYA( IND_OUT ), 1,
+     $                              COPYA( IND_IN), 1 )
+                        END DO
+*
+                  END IF
+*
+*                 5) Order the singular values generated by
+*                    DLAMTS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  CALL SLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                    IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                     CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for DGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We do MIN(M,N)+1 because we need a test for KMAX > N,
+*                 when KMAX is larger than MIN(M,N), KMAX should be
+*                 KMAX = MIN(M,N)
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                 Get a working copy of COPYA into A( 1:M,1:N ).
+*                 Get a working copy of COPYB into A( 1:M, (N+1):NRHS ).
+*                 Get a working copy of COPYB into into B( 1:M, 1:NRHS ).
+*                 Get a working copy of IWORK(1:N) awith zeroes into
+*                 which is going to be used as pivot array IWORK( N+1:2N ).
+*                 NOTE: IWORK(2N+1:3N) is going to be used as a WORK array
+*                 for the routine.
+*
+                  CALL CLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                  CALL CLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         A( LDA*N + 1 ),  LDA )
+                  CALL CLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         B,  LDA )
+                  CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                  ABSTOL = -1.0
+                  RELTOl = -1.0
+*
+*                 Compute the QR factorization with pivoting of A
+*
+                  LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                              3*N + NRHS - 1 ) )
+*
+*                 Compute CGEQP3RK factorization of A.
+*
+                  SRNAMT = 'CGEQP3RK'
+                  CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                           A, LDA, KFACT, MAXC2NRMK,
+     $                           RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                           WORK, LW, RWORK, IWORK( 2*N+1 ),
+     $                           INFO )
+*
+*                 Check error code from CGEQP3RK.
+*
+                  IF( INFO.LT.0 )
+     $               CALL ALAERH( PATH, 'CGEQP3RK', INFO, 0, ' ',
+     $                            M, N, NX, -1, NB, IMAT,
+     $                            NFAIL, NERRS, NOUT )
+*
+                  IF( KFACT.EQ.MINMN ) THEN
+*
+*                 Compute test 1:
+*
+*                 This test in only for the full rank factorization of
+*                 the matrix A.
+*
+*                 Array S(1:min(M,N)) contains svd(A) the sigular values
+*                 of the original matrix A in decreasing absolute value
+*                 order. The test computes svd(R), the vector sigular
+*                 values of the upper trapezoid of A(1:M,1:N) that
+*                 contains the factor R, in decreasing order. The test
+*                 returns the ratio:
+*
+*                 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                     RESULT( 1 ) = CQRT12( M, N, A, LDA, S, WORK,
+     $                                     LWORK , RWORK )
+*
+                     DO T = 1, 1
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                        IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                   End test 1
+*
+                  END IF
+
+*                 Compute test 2:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                  RESULT( 2 ) = CQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                          IWORK( N+1 ), WORK, LWORK )
+*
+*                 Compute test 3:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                  RESULT( 3 ) = CQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                          LWORK )
+*
+*                 Print information about the tests that did not pass
+*                 the threshold.
+*
+                  DO T = 2, 3
+                     IF( RESULT( T ).GE.THRESH ) THEN
+                        IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                     CALL ALAHD( NOUT, PATH )
+                        WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                      NRHS, KMAX, ABSTOL, RELTOL,
+     $                      NB, NX, IMAT, T, RESULT( T )
+                        NFAIL = NFAIL + 1
+                     END IF
+                  END DO
+                  NRUN = NRUN + 2
+*
+*                 Compute test 4:
+*
+*                 This test is only for the factorizations with the
+*                 rank greater than 2.
+*                 The elements on the diagonal of R should be non-
+*                 increasing.
+*
+*                 The test returns the ratio:
+*
+*                 Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                 K=1:KFACT-1
+*
+                  IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                     DO J = 1, KFACT-1, 1
+*
+                        DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                        IF( DTEMP.LT.ZERO ) THEN
+                           RESULT( 4 ) = BIGNUM
+                        END IF
+*
+                     END DO
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 4, 4
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK',
+     $                        M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T,
+     $                        RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                  END IF
+*
+*                 Compute test 5:
+*
+*                 This test in only for matrix A with min(M,N) > 0.
+*
+*                 The test returns the ratio:
+*
+*                 1-norm(Q**T * B - Q**T * B ) /
+*                       ( M * EPS )
+*
+*                 (1) Compute B:=Q**T * B in the matrix B.
+*
+                  IF( MINMN.GT.0 ) THEN
+*
+                     LWORK_MQR = MAX(1, NRHS)
+                     CALL CUNMQR( 'Left', 'Conjugate transpose',
+     $                         M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                         WORK, LWORK_MQR, INFO )
+*
+                     DO I = 1, NRHS
+*
+*                       Compare N+J-th column of A and J-column of B.
+*
+                        CALL CAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                    B( ( I-1 )*LDA+1 ), 1 )
+                     END DO
+*
+                     RESULT( 5 ) =
+     $               ABS(
+     $               CLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $               ( REAL( M )*SLAMCH( 'Epsilon' ) )
+     $               )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 5, 5
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                  END IF
+*
+*                 END DO KMAX = 1, MIN(M,N)+1
+*
+                  END DO
+*
+*                 END DO for INB = 1, NNB
+*
+               END DO
+*
+*              END DO  for IMAT = 1, NTYPES
+*
+               END DO
+*
+*              END DO for INS = 1, NNS
+*
+            END DO
+*
+*           END DO for IN = 1, NN
+*
+         END DO
+*
+*        END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $        ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $        ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $        ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of CCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/clatb4.f b/lapack-netlib/TESTING/LIN/clatb4.f
index eeb0f03a9..233a8631a 100644
--- a/lapack-netlib/TESTING/LIN/clatb4.f
+++ b/lapack-netlib/TESTING/LIN/clatb4.f
@@ -154,9 +154,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -174,11 +171,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = SLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL SLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -233,6 +225,110 @@
          ELSE
             ANORM = ONE
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated.  'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular,  CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*          4. Random, Lower triangular,  CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -517,17 +613,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
             CNDNUM = TWO
          END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/cqpt01.f b/lapack-netlib/TESTING/LIN/cqpt01.f
index 79fc2dc66..149c5bb7c 100644
--- a/lapack-netlib/TESTING/LIN/cqpt01.f
+++ b/lapack-netlib/TESTING/LIN/cqpt01.f
@@ -33,7 +33,8 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) )
+*> where || . || is matrix one norm.
 *> \endverbatim
 *
 *  Arguments:
@@ -172,28 +173,28 @@
 *
       NORMA = CLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL CCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL CUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
 *        Compare i-th column of QR and jpvt(i)-th column of A
 *
          CALL CAXPY( M, CMPLX( -ONE ), A( 1, JPVT( J ) ), 1,
      $               WORK( ( J-1 )*M+1 ), 1 )
-   50 CONTINUE
+      END DO
 *
       CQPT01 = CLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/cqrt11.f b/lapack-netlib/TESTING/LIN/cqrt11.f
index 494d5e9cd..a52084973 100644
--- a/lapack-netlib/TESTING/LIN/cqrt11.f
+++ b/lapack-netlib/TESTING/LIN/cqrt11.f
@@ -157,9 +157,9 @@
       CALL CUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU,
      $             WORK, M, WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       CQRT11 = CLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( REAL( M )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/cqrt12.f b/lapack-netlib/TESTING/LIN/cqrt12.f
index 4c29423ae..0df2d833b 100644
--- a/lapack-netlib/TESTING/LIN/cqrt12.f
+++ b/lapack-netlib/TESTING/LIN/cqrt12.f
@@ -28,7 +28,7 @@
 *> CQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues -s ||/( ||s||*eps*max(M,N) )
 *> \endverbatim
 *
 *  Arguments:
@@ -125,8 +125,8 @@
       EXTERNAL           CLANGE, SASUM, SLAMCH, SNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLABAD,
-     $                   SLASCL, XERBLA
+      EXTERNAL           CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLASCL,
+     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          CMPLX, MAX, MIN, REAL
@@ -153,17 +153,16 @@
 *     Copy upper triangle of A into work
 *
       CALL CLASET( 'Full', M, N, CMPLX( ZERO ), CMPLX( ZERO ), WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -207,9 +206,9 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             RWORK( I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F
index ef9d7808c..74077eb94 100644
--- a/lapack-netlib/TESTING/LIN/dchkaa.F
+++ b/lapack-netlib/TESTING/LIN/dchkaa.F
@@ -63,6 +63,7 @@
 *> DLQ    8               List types on next line if 0 < NTYPES <  8
 *> DQL    8               List types on next line if 0 < NTYPES <  8
 *> DQP    6               List types on next line if 0 < NTYPES <  6
+*> DQK    19              List types on next line if 0 < NTYPES <  19
 *> DTZ    3               List types on next line if 0 < NTYPES <  3
 *> DLS    6               List types on next line if 0 < NTYPES <  6
 *> DEQ
@@ -149,12 +150,12 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      DOUBLE PRECISION   E( NMAX ), S( 2*NMAX )
 *     ..
 *     .. Allocatable Arrays ..
       INTEGER AllocateStatus
-      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK
-      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK      
+      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S
+      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: E
+      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, LSAMEN
@@ -164,13 +165,13 @@
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ,
      $                   DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP,
-     $                   DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP,
-     $                   DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA,
-     $                   DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE,
-     $                   DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT,
-     $                   DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK,
-     $                   DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP,
-     $                   DCHKLQT,DCHKTSQR
+     $                   DCHKPT, DCHKQ3, DCHKQP3RK, DCHKQL, DCHKQR,
+     $                   DCHKRQ, DCHKSP, DCHKSY, DCHKSY_ROOK, DCHKSY_RK,
+     $                   DCHKSY_AA, DCHKTB, DCHKTP, DCHKTR, DCHKTZ,
+     $                   DDRVGB, DDRVGE, DDRVGT, DDRVLS, DDRVPB, DDRVPO,
+     $                   DDRVPP, DDRVPT, DDRVSP, DDRVSY, DDRVSY_ROOK,
+     $                   DDRVSY_RK, DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT,
+     $                   DCHKQRTP, DCHKLQT,DCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -197,6 +198,10 @@
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
       ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( E( NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
       ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
 *
@@ -919,9 +924,26 @@
          CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
 *
          IF( TSTCHK ) THEN
-            CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL,
-     $                   THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ),
-     $                   B( 1, 3 ), WORK, IWORK, NOUT )
+            CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL,
+     $                   NXVAL, THRESH, A( 1, 1 ), A( 1, 2 ),
+     $                   B( 1, 1 ), B( 1, 3 ), WORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                      A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                      B( 1, 3 ), B( 1, 4 ),
+     $                      WORK, IWORK, NOUT )
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
diff --git a/lapack-netlib/TESTING/LIN/dchkq3.f b/lapack-netlib/TESTING/LIN/dchkq3.f
index 1fdf07252..494008fa8 100644
--- a/lapack-netlib/TESTING/LIN/dchkq3.f
+++ b/lapack-netlib/TESTING/LIN/dchkq3.f
@@ -30,7 +30,7 @@
 *>
 *> \verbatim
 *>
-*> DCHKQ3 tests DGEQP3.
+*> DCHKQ3 tests  DGEQP3.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/TESTING/LIN/dchkqp3rk.f b/lapack-netlib/TESTING/LIN/dchkqp3rk.f
new file mode 100644
index 000000000..434d2067e
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/dchkqp3rk.f
@@ -0,0 +1,832 @@
+*> \brief \b DCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*     $                      B, COPYB, S, TAU,
+*     $                      WORK, IWORK, NOUT )
+*      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+*      INTEGER            NM, NN, NNS, NNB, NOUT
+*      DOUBLE PRECISION   THRESH
+*     ..
+*     .. Array Arguments ..
+*      LOGICAL            DOTYPE( * )
+*      INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ),
+*     $                   NVAL( * ), NXVAL( * )
+*      DOUBLE PRECISION   A( * ), COPYA( * ), B( * ), COPYB( * ),
+*     $                   S( * ), TAU( * ), WORK( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DCHKQP3RK tests DGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing.  Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNS
+*> \verbatim
+*>          NNS is INTEGER
+*>          The number of values of NRHS contained in the vector NSVAL.
+*> \endverbatim
+*>
+*> \param[in] NSVAL
+*> \verbatim
+*>          NSVAL is INTEGER array, dimension (NNS)
+*>          The values of the number of right hand sides NRHS.
+*> \endverbatim
+*>
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB and NX contained in the
+*>          vectors NBVAL and NXVAL.  The blocking parameters are used
+*>          in pairs (NB,NX).
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NNB)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NXVAL
+*> \verbatim
+*>          NXVAL is INTEGER array, dimension (NNB)
+*>          The values of the crossover point NX.
+*> \endverbatim
+*>
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is DOUBLE PRECISION
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (MMAX*NMAX)
+*>          where MMAX is the maximum value of M in MVAL and NMAX is the
+*>          maximum value of N in NVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYA
+*> \verbatim
+*>          COPYA is DOUBLE PRECISION array, dimension (MMAX*NMAX)
+*> \endverbatim
+*>
+*> \param[out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (MMAX*NSMAX)
+*>          where MMAX is the maximum value of M in MVAL and NSMAX is the
+*>          maximum value of NRHS in NSVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYB
+*> \verbatim
+*>          COPYB is DOUBLE PRECISION array, dimension (MMAX*NSMAX)
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is DOUBLE PRECISION array, dimension
+*>                      (min(MMAX,NMAX))
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (MMAX)
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension
+*>                      (MMAX*NMAX + 4*NMAX + MMAX)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (2*NMAX)
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup double_lin
+*
+*  =====================================================================
+      SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+     $                      B, COPYB, S, TAU,
+     $                      WORK, IWORK, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            NM, NN, NNB, NNS, NOUT
+      DOUBLE PRECISION   THRESH
+*     ..
+*     .. Array Arguments ..
+      LOGICAL            DOTYPE( * )
+      INTEGER            IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ),
+     $                   NSVAL( * ), NXVAL( * )
+      DOUBLE PRECISION   A( * ), COPYA( * ), B( * ), COPYB( * ),
+     $                   S( * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTYPES
+      PARAMETER          ( NTYPES = 19 )
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 5 )
+      DOUBLE PRECISION   ONE, ZERO, BIGNUM
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0,
+     $                     BIGNUM = 1.0D+38 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          DIST, TYPE
+      CHARACTER*3        PATH
+      INTEGER            I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO,
+     $                   INB, IND_OFFSET_GEN,
+     $                   IND_IN, IND_OUT, INS, INFO,
+     $                   ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO,
+     $                   KFACT, KL, KMAX, KU, LDA, LW, LWORK,
+     $                   LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N,
+     $                   NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS,
+     $                   NRUN, NX, T
+      DOUBLE PRECISION   ANORM, CNDNUM, EPS, ABSTOL, RELTOL,
+     $                   DTEMP, MAXC2NRMK, RELMAXC2NRMK
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 ), ISEEDY( 4 )
+      DOUBLE PRECISION   RESULT( NTESTS ), RDUMMY( 1 )
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE,
+     $                   DLAPY2
+      EXTERNAL           DLAMCH, DQPT01, DQRT11, DQRT12, DLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAERH, ALAHD, ALASUM, DAXPY, DGEQP3RK,
+     $                   DLACPY, DLAORD, DLASET, DLATB4, DLATMS,
+     $                   DORMQR, DSWAP, ICOPY, XLAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, MAX, MIN, MOD
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER*32       SRNAMT
+      INTEGER            INFOT, IOUNIT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, IOUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA               ISEEDY / 1988, 1989, 1990, 1991 /
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants and the random number seed.
+*
+      PATH( 1: 1 ) = 'Double precision'
+      PATH( 2: 3 ) = 'QK'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+      DO I = 1, 4
+         ISEED( I ) = ISEEDY( I )
+      END DO
+      EPS = DLAMCH( 'Epsilon' )
+      INFOT = 0
+*
+      DO IM = 1, NM
+*
+*        Do for each value of M in MVAL.
+*
+         M = MVAL( IM )
+         LDA = MAX( 1, M )
+*
+         DO IN = 1, NN
+*
+*           Do for each value of N in NVAL.
+*
+            N = NVAL( IN )
+            MINMN = MIN( M, N )
+            LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ),
+     $                   M*N + 2*MINMN + 4*N )
+*
+            DO INS = 1, NNS
+               NRHS = NSVAL( INS )
+*
+*                 Set up parameters with DLATB4 and generate
+*                 M-by-NRHS B matrix with DLATMS.
+*                 IMAT = 14:
+*                 Random matrix, CNDNUM = 2, NORM = ONE,
+*                 MODE = 3 (geometric distribution of singular values).
+*
+                  CALL DLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'DLATMS'
+                  CALL DLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYB, LDA, WORK, INFO )
+
+
+*
+*                 Check error code from DLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M,
+     $                            NRHS, -1, -1, -1, 6, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+               DO IMAT = 1, NTYPES
+*
+*              Do the tests only if DOTYPE( IMAT ) is true.
+*
+               IF( .NOT.DOTYPE( IMAT ) )
+     $            CYCLE
+*
+*              The type of distribution used to generate the random
+*              eigen-/singular values:
+*              ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 )
+*
+*           Do for each type of NON-SYMMETRIC matrix:                               CNDNUM                     NORM                                     MODE
+*            1. Zero matrix
+*            2. Random, Diagonal, CNDNUM = 2                                        CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            3. Random, Upper triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            4. Random, Lower triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            5. Random, First column is zero, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            6. Random, Last MINMN column is zero, CNDNUM = 2                       CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            7. Random, Last N column is zero, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            8. Random, Middle column in MINMN is zero, CNDNUM = 2                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            9. Random, First half of MINMN columns are zero, CNDNUM = 2            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2   CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           11. Random, Half MINMN columns in the middle are zero starting
+*                  from  MINMN/2-(MINMN/2)/2+1, CNDNUM = 2                          CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           12. Random, Odd columns are ZERO, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           13. Random, Even columns are ZERO, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           14. Random, CNDNUM = 2                                                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           15. Random, CNDNUM = sqrt(0.1/EPS)                                      CNDNUM = BADC1 = sqrt(0.1/EPS)  ONE                                      3 ( geometric distribution of singular values )
+*           16. Random, CNDNUM = 0.1/EPS                                            CNDNUM = BADC2 = 0.1/EPS        ONE                                      3 ( geometric distribution of singular values )
+*           17. Random, CNDNUM = 0.1/EPS,                                           CNDNUM = BADC2 = 0.1/EPS        ONE                                      2 ( one small singular value, S(N)=1/CNDNUM )
+*                 one small singular value S(N)=1/CNDNUM
+*           18. Random, CNDNUM = 2, scaled near underflow                           CNDNUM = 2                      SMALL = SAFMIN
+*           19. Random, CNDNUM = 2, scaled near overflow                            CNDNUM = 2                      LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )  3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL DLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-5.
+*
+*                 Set up parameters with DLATB4 and generate a test
+*                 matrix with DLATMS.
+*
+                  CALL DLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'DLATMS'
+                  CALL DLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from DLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL DLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices MINMN >=2.
+*
+*                 JB_ZERO is the column index of ZERO block.
+*                 NB_ZERO is the column block size of ZERO block.
+*                 NB_GEN is the column blcok size of the
+*                 generated block.
+*                 J_INC in the non_zero column index increment
+*                 for matrix 12 and 13.
+*                 J_FIRS_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns is zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero,
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL DLASET( 'Full', M, NB_ZERO, ZERO, ZERO,
+     $                         COPYA, LDA )
+*
+*                    2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                       chosen singular value distribution
+*                       in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL DLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'DLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL DLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                        CNDNUM, ANORM, KL, KU, 'No packing',
+     $                        COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                        WORK, INFO )
+*
+*                 Check error code from DLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the gererated colums from the right side
+*                 NB_GEN-size block in COPYA into correct column
+*                 positions.
+*
+                  IF( IMAT.EQ.6
+     $                    .OR. IMAT.EQ.7
+     $                    .OR. IMAT.EQ.8
+     $                    .OR. IMAT.EQ.10
+     $                    .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL DSWAP( M,
+     $                        COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                        COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero colums in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero colums in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                            + 1
+                        CALL DSWAP( M,
+     $                              COPYA( IND_OUT ), 1,
+     $                              COPYA( IND_IN), 1 )
+                        END DO
+*
+                  END IF
+*
+*                 5) Order the singular values generated by
+*                    DLAMTS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                    IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                     CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for DGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We do MIN(M,N)+1 because we need a test for KMAX > N,
+*                 when KMAX is larger than MIN(M,N), KMAX should be
+*                 KMAX = MIN(M,N)
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                 Get a working copy of COPYA into A( 1:M,1:N ).
+*                 Get a working copy of COPYB into A( 1:M, (N+1):NRHS ).
+*                 Get a working copy of COPYB into into B( 1:M, 1:NRHS ).
+*                 Get a working copy of IWORK(1:N) awith zeroes into
+*                 which is going to be used as pivot array IWORK( N+1:2N ).
+*                 NOTE: IWORK(2N+1:3N) is going to be used as a WORK array
+*                 for the routine.
+*
+                  CALL DLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                  CALL DLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         A( LDA*N + 1 ),  LDA )
+                  CALL DLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         B,  LDA )
+                  CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                  ABSTOL = -1.0
+                  RELTOL = -1.0
+*
+*                 Compute the QR factorization with pivoting of A
+*
+                  LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                              3*N + NRHS - 1 ) )
+*
+*                 Compute DGEQP3RK factorization of A.
+*
+                  SRNAMT = 'DGEQP3RK'
+                  CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                           A, LDA, KFACT, MAXC2NRMK,
+     $                           RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                           WORK, LW, IWORK( 2*N+1 ), INFO )
+*
+*                 Check error code from DGEQP3RK.
+*
+                  IF( INFO.LT.0 )
+     $               CALL ALAERH( PATH, 'DGEQP3RK', INFO, 0, ' ',
+     $                            M, N, NX, -1, NB, IMAT,
+     $                            NFAIL, NERRS, NOUT )
+*
+*                 Compute test 1:
+*
+*                 This test in only for the full rank factorization of
+*                 the matrix A.
+*
+*                 Array S(1:min(M,N)) contains svd(A) the sigular values
+*                 of the original matrix A in decreasing absolute value
+*                 order. The test computes svd(R), the vector sigular
+*                 values of the upper trapezoid of A(1:M,1:N) that
+*                 contains the factor R, in decreasing order. The test
+*                 returns the ratio:
+*
+*                 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                  IF( KFACT.EQ.MINMN ) THEN
+*
+                     RESULT( 1 ) = DQRT12( M, N, A, LDA, S, WORK,
+     $                                     LWORK )
+*
+                     DO T = 1, 1
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                        IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                   End test 1
+*
+                  END IF
+*
+*                 Compute test 2:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                  RESULT( 2 ) = DQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                          IWORK( N+1 ), WORK, LWORK )
+*
+*                 Compute test 3:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                  RESULT( 3 ) = DQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                          LWORK )
+*
+*                 Print information about the tests that did not pass
+*                 the threshold.
+*
+                  DO T = 2, 3
+                     IF( RESULT( T ).GE.THRESH ) THEN
+                        IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                     CALL ALAHD( NOUT, PATH )
+                        WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N,
+     $                      NRHS, KMAX, ABSTOL, RELTOL,
+     $                      NB, NX, IMAT, T, RESULT( T )
+                        NFAIL = NFAIL + 1
+                     END IF
+                  END DO
+                  NRUN = NRUN + 2
+*
+*                 Compute test 4:
+*
+*                 This test is only for the factorizations with the
+*                 rank greater than 2.
+*                 The elements on the diagonal of R should be non-
+*                 increasing.
+*
+*                 The test returns the ratio:
+*
+*                 Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                 K=1:KFACT-1
+*
+                  IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                     DO J = 1, KFACT-1, 1
+
+                        DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                        IF( DTEMP.LT.ZERO ) THEN
+                           RESULT( 4 ) = BIGNUM
+                        END IF
+*
+                     END DO
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 4, 4
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK',
+     $                        M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T,
+     $                        RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                  END IF
+*
+*                 Compute test 5:
+*
+*                 This test in only for matrix A with min(M,N) > 0.
+*
+*                 The test returns the ratio:
+*
+*                 1-norm(Q**T * B - Q**T * B ) /
+*                       ( M * EPS )
+*
+*                 (1) Compute B:=Q**T * B in the matrix B.
+*
+                  IF( MINMN.GT.0 ) THEN
+*
+                     LWORK_MQR = MAX(1, NRHS)
+                     CALL DORMQR( 'Left', 'Transpose',
+     $                         M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                         WORK, LWORK_MQR, INFO )
+*
+                     DO I = 1, NRHS
+*
+*                       Compare N+J-th column of A and J-column of B.
+*
+                        CALL DAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                 B( ( I-1 )*LDA+1 ), 1 )
+                     END DO
+*
+                   RESULT( 5 ) =
+     $               ABS(
+     $               DLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $               ( DBLE( M )*DLAMCH( 'Epsilon' ) )
+     $               )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 5, 5
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                  END IF
+*
+*                 END DO KMAX = 1, MIN(M,N)+1
+*
+                  END DO
+*
+*                 END DO for INB = 1, NNB
+*
+               END DO
+*
+*              END DO  for IMAT = 1, NTYPES
+*
+               END DO
+*
+*              END DO for INS = 1, NNS
+*
+            END DO
+*
+*           END DO for IN = 1, NN
+*
+         END DO
+*
+*        END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $        ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $        ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $        ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of DCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/dlatb4.f b/lapack-netlib/TESTING/LIN/dlatb4.f
index 8825d13e7..f3bccd45b 100644
--- a/lapack-netlib/TESTING/LIN/dlatb4.f
+++ b/lapack-netlib/TESTING/LIN/dlatb4.f
@@ -133,7 +133,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   SHRINK, TENTH
-      PARAMETER          ( SHRINK = 0.25D0, TENTH = 0.1D+0 )
+      PARAMETER          ( SHRINK = 0.25D+0, TENTH = 0.1D+0 )
       DOUBLE PRECISION   ONE
       PARAMETER          ( ONE = 1.0D+0 )
       DOUBLE PRECISION   TWO
@@ -153,9 +153,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -173,11 +170,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = DLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL DLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -232,6 +224,110 @@
          ELSE
             ANORM = ONE
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated.  'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular,  CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*          4. Random, Lower triangular,  CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -518,17 +614,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
             CNDNUM = TWO
          END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/dqpt01.f b/lapack-netlib/TESTING/LIN/dqpt01.f
index 8efbdc774..af3f5dd36 100644
--- a/lapack-netlib/TESTING/LIN/dqpt01.f
+++ b/lapack-netlib/TESTING/LIN/dqpt01.f
@@ -28,12 +28,13 @@
 *>
 *> DQPT01 tests the QR-factorization with pivoting of a matrix A.  The
 *> array AF contains the (possibly partial) QR-factorization of A, where
-*> the upper triangle of AF(1:k,1:k) is a partial triangular factor,
-*> the entries below the diagonal in the first k columns are the
+*> the upper triangle of AF(1:K,1:K) is a partial triangular factor,
+*> the entries below the diagonal in the first K columns are the
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ),
+*> where || . || is matrix one norm.
 *> \endverbatim
 *
 *  Arguments:
@@ -172,28 +173,41 @@
 *
       NORMA = DLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+*
+*        Copy the upper triangular part of the factor R stored
+*        in AF(1:K,1:K) into the work array WORK.
+*
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+*
+*        Zero out the elements below the diagonal in the work array.
+*
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+*
+*     Copy columns (K+1,N) from AF into the work array WORK.
+*     AF(1:K,K+1:N) contains the rectangular block of the upper trapezoidal
+*     factor R, AF(K+1:M,K+1:N) contains the partially updated residual
+*     matrix of R.
+*
+      DO J = K + 1, N
          CALL DCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL DORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
-*        Compare i-th column of QR and jpvt(i)-th column of A
+*        Compare J-th column of QR and JPVT(J)-th column of A.
 *
          CALL DAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ),
      $               1 )
-   50 CONTINUE
+      END DO
 *
       DQPT01 = DLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/dqrt11.f b/lapack-netlib/TESTING/LIN/dqrt11.f
index 33c7fab37..38bbeb822 100644
--- a/lapack-netlib/TESTING/LIN/dqrt11.f
+++ b/lapack-netlib/TESTING/LIN/dqrt11.f
@@ -157,9 +157,9 @@
       CALL DORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M,
      $             WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       DQRT11 = DLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( DBLE( M )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/dqrt12.f b/lapack-netlib/TESTING/LIN/dqrt12.f
index 278e01bf0..b8a124c59 100644
--- a/lapack-netlib/TESTING/LIN/dqrt12.f
+++ b/lapack-netlib/TESTING/LIN/dqrt12.f
@@ -26,7 +26,7 @@
 *> DQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s ||/(||s||*eps*max(M,N))
 *> \endverbatim
 *
 *  Arguments:
@@ -113,8 +113,7 @@
       EXTERNAL           DASUM, DLAMCH, DLANGE, DNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DAXPY, DBDSQR, DGEBD2, DLABAD, DLASCL, DLASET,
-     $                   XERBLA
+      EXTERNAL           DAXPY, DBDSQR, DGEBD2, DLASCL, DLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE, MAX, MIN
@@ -145,17 +144,16 @@
 *     Copy upper triangle of A into work
 *
       CALL DLASET( 'Full', M, N, ZERO, ZERO, WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -199,16 +197,18 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             WORK( M*N+I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
 *
       CALL DAXPY( MN, -ONE, S, 1, WORK( M*N+1 ), 1 )
+*
       DQRT12 = DASUM( MN, WORK( M*N+1 ), 1 ) /
-     $         ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) )
+     $  ( DLAMCH('Epsilon') * DBLE( MAX( M, N ) ) )
+*
       IF( NRMSVL.NE.ZERO )
      $   DQRT12 = DQRT12 / NRMSVL
 *
diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F
index a5b826d06..2b9f2ea45 100644
--- a/lapack-netlib/TESTING/LIN/schkaa.F
+++ b/lapack-netlib/TESTING/LIN/schkaa.F
@@ -63,6 +63,7 @@
 *> SLQ    8               List types on next line if 0 < NTYPES <  8
 *> SQL    8               List types on next line if 0 < NTYPES <  8
 *> SQP    6               List types on next line if 0 < NTYPES <  6
+*> DQK    19              List types on next line if 0 < NTYPES <  19
 *> STZ    3               List types on next line if 0 < NTYPES <  3
 *> SLS    6               List types on next line if 0 < NTYPES <  6
 *> SEQ
@@ -147,11 +148,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      REAL               E( NMAX ), S( 2*NMAX )
 *     ..
 *     .. Allocatable Arrays ..
       INTEGER AllocateStatus
-      REAL, DIMENSION(:), ALLOCATABLE :: RWORK
+      REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S
+      REAL, DIMENSION(:), ALLOCATABLE :: E
       REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK
 *     ..
 *     .. External Functions ..
@@ -162,13 +163,13 @@
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ,
      $                   SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP,
-     $                   SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP,
-     $                   SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA,
-     $                   SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE,
-     $                   SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT,
-     $                   SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK,
-     $                   SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP,
-     $                   SCHKLQT, SCHKTSQR
+     $                   SCHKPT, SCHKQ3, SCHKQP3RK, SCHKQL, SCHKQR,
+     $                   SCHKRQ, SCHKSP, SCHKSY, SCHKSY_ROOK, SCHKSY_RK,
+     $                   SCHKSY_AA, SCHKTB, SCHKTP, SCHKTR, SCHKTZ,
+     $                   SDRVGB, SDRVGE, SDRVGT, SDRVLS, SDRVPB, SDRVPO,
+     $                   SDRVPP, SDRVPT, SDRVSP, SDRVSY, SDRVSY_ROOK,
+     $                   SDRVSY_RK, SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT,
+     $                   SCHKQRTP, SCHKLQT, SCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -188,13 +189,17 @@
 *     ..
 *     .. Allocate memory dynamically ..
 *
-      ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus )
+      ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus )
+      ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus )
+      ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus )
+      ALLOCATE ( E( NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" 
 *     ..
 *     .. Executable Statements ..
@@ -920,6 +925,23 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                      A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                      B( 1, 3 ), B( 1, 4 ),
+     $                      WORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN
 *
diff --git a/lapack-netlib/TESTING/LIN/schkqp3rk.f b/lapack-netlib/TESTING/LIN/schkqp3rk.f
new file mode 100644
index 000000000..36cf9370e
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/schkqp3rk.f
@@ -0,0 +1,831 @@
+*> \brief \b SCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*     $                      B, COPYB, S, TAU,
+*     $                      WORK, IWORK, NOUT )
+*      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+*      INTEGER            NM, NN, NNS, NNB, NOUT
+*      REAL               THRESH
+*     ..
+*     .. Array Arguments ..
+*      LOGICAL            DOTYPE( * )
+*      INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ),
+*     $                   NVAL( * ), NXVAL( * )
+*      REAL               A( * ), COPYA( * ), B( * ), COPYB( * ),
+*     $                   S( * ), TAU( * ), WORK( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SCHKQP3RK tests SGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing.  Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNS
+*> \verbatim
+*>          NNS is INTEGER
+*>          The number of values of NRHS contained in the vector NSVAL.
+*> \endverbatim
+*>
+*> \param[in] NSVAL
+*> \verbatim
+*>          NSVAL is INTEGER array, dimension (NNS)
+*>          The values of the number of right hand sides NRHS.
+*> \endverbatim
+*>
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB and NX contained in the
+*>          vectors NBVAL and NXVAL.  The blocking parameters are used
+*>          in pairs (NB,NX).
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NNB)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NXVAL
+*> \verbatim
+*>          NXVAL is INTEGER array, dimension (NNB)
+*>          The values of the crossover point NX.
+*> \endverbatim
+*>
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is REAL
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[out] A
+*> \verbatim
+*>          A is REAL array, dimension (MMAX*NMAX)
+*>          where MMAX is the maximum value of M in MVAL and NMAX is the
+*>          maximum value of N in NVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYA
+*> \verbatim
+*>          COPYA is REAL array, dimension (MMAX*NMAX)
+*> \endverbatim
+*>
+*> \param[out] B
+*> \verbatim
+*>          B is REAL array, dimension (MMAX*NSMAX)
+*>          where MMAX is the maximum value of M in MVAL and NSMAX is the
+*>          maximum value of NRHS in NSVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYB
+*> \verbatim
+*>          COPYB is REAL array, dimension (MMAX*NSMAX)
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is REAL array, dimension
+*>                      (min(MMAX,NMAX))
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (MMAX)
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension
+*>                      (MMAX*NMAX + 4*NMAX + MMAX)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (2*NMAX)
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup single_lin
+*
+*  =====================================================================
+      SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+     $                      B, COPYB, S, TAU,
+     $                      WORK, IWORK, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            NM, NN, NNB, NNS, NOUT
+      REAL               THRESH
+*     ..
+*     .. Array Arguments ..
+      LOGICAL            DOTYPE( * )
+      INTEGER            IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ),
+     $                   NSVAL( * ), NXVAL( * )
+      REAL               A( * ), COPYA( * ), B( * ), COPYB( * ),
+     $                   S( * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTYPES
+      PARAMETER          ( NTYPES = 19 )
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 5 )
+      REAL               ONE, ZERO, BIGNUM
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0,
+     $                     BIGNUM = 1.0E+38 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          DIST, TYPE
+      CHARACTER*3        PATH
+      INTEGER            I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO,
+     $                   INB, IND_OFFSET_GEN,
+     $                   IND_IN, IND_OUT, INS, INFO,
+     $                   ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO,
+     $                   KFACT, KL, KMAX, KU, LDA, LW, LWORK,
+     $                   LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N,
+     $                   NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS,
+     $                   NRUN, NX, T
+      REAL               ANORM, CNDNUM, EPS, ABSTOL, RELTOL,
+     $                   DTEMP, MAXC2NRMK, RELMAXC2NRMK
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 ), ISEEDY( 4 )
+      REAL               RESULT( NTESTS ), RDUMMY( 1 )
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE
+      EXTERNAL           SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAERH, ALAHD, ALASUM, SAXPY, SGEQP3RK,
+     $                   SLACPY, SLAORD, SLASET, SLATB4, SLATMS,
+     $                   SORMQR, SSWAP, ICOPY, XLAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, MOD, REAL
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER*32       SRNAMT
+      INTEGER            INFOT, IOUNIT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, IOUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA               ISEEDY / 1988, 1989, 1990, 1991 /
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants and the random number seed.
+*
+      PATH( 1: 1 ) = 'Single precision'
+      PATH( 2: 3 ) = 'QK'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+      DO I = 1, 4
+         ISEED( I ) = ISEEDY( I )
+      END DO
+      EPS = SLAMCH( 'Epsilon' )
+      INFOT = 0
+*
+      DO IM = 1, NM
+*
+*        Do for each value of M in MVAL.
+*
+         M = MVAL( IM )
+         LDA = MAX( 1, M )
+*
+         DO IN = 1, NN
+*
+*           Do for each value of N in NVAL.
+*
+            N = NVAL( IN )
+            MINMN = MIN( M, N )
+            LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ),
+     $                   M*N + 2*MINMN + 4*N )
+*
+            DO INS = 1, NNS
+               NRHS = NSVAL( INS )
+*
+*                 Set up parameters with SLATB4 and generate
+*                 M-by-NRHS B matrix with SLATMS.
+*                 IMAT = 14:
+*                 Random matrix, CNDNUM = 2, NORM = ONE,
+*                 MODE = 3 (geometric distribution of singular values).
+*
+                  CALL SLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+                  CALL SLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYB, LDA, WORK, INFO )
+
+
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M,
+     $                            NRHS, -1, -1, -1, 6, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+               DO IMAT = 1, NTYPES
+*
+*              Do the tests only if DOTYPE( IMAT ) is true.
+*
+               IF( .NOT.DOTYPE( IMAT ) )
+     $            CYCLE
+*
+*              The type of distribution used to generate the random
+*              eigen-/singular values:
+*              ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 )
+*
+*           Do for each type of NON-SYMMETRIC matrix:                               CNDNUM                     NORM                                     MODE
+*            1. Zero matrix
+*            2. Random, Diagonal, CNDNUM = 2                                        CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            3. Random, Upper triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            4. Random, Lower triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            5. Random, First column is zero, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            6. Random, Last MINMN column is zero, CNDNUM = 2                       CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            7. Random, Last N column is zero, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            8. Random, Middle column in MINMN is zero, CNDNUM = 2                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            9. Random, First half of MINMN columns are zero, CNDNUM = 2            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2   CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           11. Random, Half MINMN columns in the middle are zero starting
+*                  from  MINMN/2-(MINMN/2)/2+1, CNDNUM = 2                          CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           12. Random, Odd columns are ZERO, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           13. Random, Even columns are ZERO, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           14. Random, CNDNUM = 2                                                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           15. Random, CNDNUM = sqrt(0.1/EPS)                                      CNDNUM = BADC1 = sqrt(0.1/EPS)  ONE                                      3 ( geometric distribution of singular values )
+*           16. Random, CNDNUM = 0.1/EPS                                            CNDNUM = BADC2 = 0.1/EPS        ONE                                      3 ( geometric distribution of singular values )
+*           17. Random, CNDNUM = 0.1/EPS,                                           CNDNUM = BADC2 = 0.1/EPS        ONE                                      2 ( one small singular value, S(N)=1/CNDNUM )
+*                 one small singular value S(N)=1/CNDNUM
+*           18. Random, CNDNUM = 2, scaled near underflow                           CNDNUM = 2                      SMALL = SAFMIN
+*           19. Random, CNDNUM = 2, scaled near overflow                            CNDNUM = 2                      LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )  3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL SLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-5.
+*
+*                 Set up parameters with SLATB4 and generate a test
+*                 matrix with SLATMS.
+*
+                  CALL SLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+                  CALL SLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL SLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices MINMN >=2.
+*
+*                 JB_ZERO is the column index of ZERO block.
+*                 NB_ZERO is the column block size of ZERO block.
+*                 NB_GEN is the column blcok size of the
+*                 generated block.
+*                 J_INC in the non_zero column index increment
+*                 for matrix 12 and 13.
+*                 J_FIRS_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns is zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero,
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL SLASET( 'Full', M, NB_ZERO, ZERO, ZERO,
+     $                         COPYA, LDA )
+*
+*                    2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                       chosen singular value distribution
+*                       in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL SLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL SLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                        CNDNUM, ANORM, KL, KU, 'No packing',
+     $                        COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                        WORK, INFO )
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the gererated colums from the right side
+*                 NB_GEN-size block in COPYA into correct column
+*                 positions.
+*
+                  IF( IMAT.EQ.6
+     $                    .OR. IMAT.EQ.7
+     $                    .OR. IMAT.EQ.8
+     $                    .OR. IMAT.EQ.10
+     $                    .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL SSWAP( M,
+     $                        COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                        COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero colums in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero colums in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                            + 1
+                        CALL SSWAP( M,
+     $                              COPYA( IND_OUT ), 1,
+     $                              COPYA( IND_IN), 1 )
+                        END DO
+*
+                  END IF
+*
+*                 5) Order the singular values generated by
+*                    DLAMTS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                    IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                     CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for SGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We do MIN(M,N)+1 because we need a test for KMAX > N,
+*                 when KMAX is larger than MIN(M,N), KMAX should be
+*                 KMAX = MIN(M,N)
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                 Get a working copy of COPYA into A( 1:M,1:N ).
+*                 Get a working copy of COPYB into A( 1:M, (N+1):NRHS ).
+*                 Get a working copy of COPYB into into B( 1:M, 1:NRHS ).
+*                 Get a working copy of IWORK(1:N) awith zeroes into
+*                 which is going to be used as pivot array IWORK( N+1:2N ).
+*                 NOTE: IWORK(2N+1:3N) is going to be used as a WORK array
+*                 for the routine.
+*
+                  CALL SLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                  CALL SLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         A( LDA*N + 1 ),  LDA )
+                  CALL SLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         B,  LDA )
+                  CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                  ABSTOL = -1.0
+                  RELTOL = -1.0
+*
+*                 Compute the QR factorization with pivoting of A
+*
+                  LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                              3*N + NRHS - 1 ) )
+*
+*                 Compute SGEQP3RK factorization of A.
+*
+                  SRNAMT = 'SGEQP3RK'
+                  CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                           A, LDA, KFACT, MAXC2NRMK,
+     $                           RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                           WORK, LW, IWORK( 2*N+1 ), INFO )
+*
+*                 Check error code from SGEQP3RK.
+*
+                  IF( INFO.LT.0 )
+     $               CALL ALAERH( PATH, 'SGEQP3RK', INFO, 0, ' ',
+     $                            M, N, NX, -1, NB, IMAT,
+     $                            NFAIL, NERRS, NOUT )
+*
+*                 Compute test 1:
+*
+*                 This test in only for the full rank factorization of
+*                 the matrix A.
+*
+*                 Array S(1:min(M,N)) contains svd(A) the sigular values
+*                 of the original matrix A in decreasing absolute value
+*                 order. The test computes svd(R), the vector sigular
+*                 values of the upper trapezoid of A(1:M,1:N) that
+*                 contains the factor R, in decreasing order. The test
+*                 returns the ratio:
+*
+*                 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                  IF( KFACT.EQ.MINMN ) THEN
+*
+                     RESULT( 1 ) = SQRT12( M, N, A, LDA, S, WORK,
+     $                                     LWORK )
+*
+                     DO T = 1, 1
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                        IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                   End test 1
+*
+                  END IF
+*
+*                 Compute test 2:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                  RESULT( 2 ) = SQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                          IWORK( N+1 ), WORK, LWORK )
+*
+*                 Compute test 3:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                  RESULT( 3 ) = SQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                          LWORK )
+*
+*                 Print information about the tests that did not pass
+*                 the threshold.
+*
+                  DO T = 2, 3
+                     IF( RESULT( T ).GE.THRESH ) THEN
+                        IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                     CALL ALAHD( NOUT, PATH )
+                        WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N,
+     $                      NRHS, KMAX, ABSTOL, RELTOL,
+     $                      NB, NX, IMAT, T, RESULT( T )
+                        NFAIL = NFAIL + 1
+                     END IF
+                  END DO
+                  NRUN = NRUN + 2
+*
+*                 Compute test 4:
+*
+*                 This test is only for the factorizations with the
+*                 rank greater than 2.
+*                 The elements on the diagonal of R should be non-
+*                 increasing.
+*
+*                 The test returns the ratio:
+*
+*                 Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                 K=1:KFACT-1
+*
+                  IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                     DO J = 1, KFACT-1, 1
+
+                        DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                        IF( DTEMP.LT.ZERO ) THEN
+                           RESULT( 4 ) = BIGNUM
+                        END IF
+*
+                     END DO
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 4, 4
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK',
+     $                        M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T,
+     $                        RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                  END IF
+*
+*                 Compute test 5:
+*
+*                 This test in only for matrix A with min(M,N) > 0.
+*
+*                 The test returns the ratio:
+*
+*                 1-norm(Q**T * B - Q**T * B ) /
+*                       ( M * EPS )
+*
+*                 (1) Compute B:=Q**T * B in the matrix B.
+*
+                  IF( MINMN.GT.0 ) THEN
+*
+                     LWORK_MQR = MAX(1, NRHS)
+                     CALL SORMQR( 'Left', 'Transpose',
+     $                         M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                         WORK, LWORK_MQR, INFO )
+*
+                     DO I = 1, NRHS
+*
+*                       Compare N+J-th column of A and J-column of B.
+*
+                        CALL SAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                 B( ( I-1 )*LDA+1 ), 1 )
+                     END DO
+*
+                   RESULT( 5 ) =
+     $               ABS(
+     $               SLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $               ( REAL( M )*SLAMCH( 'Epsilon' ) )
+     $               )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 5, 5
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                  END IF
+*
+*                 END DO KMAX = 1, MIN(M,N)+1
+*
+                  END DO
+*
+*                 END DO for INB = 1, NNB
+*
+               END DO
+*
+*              END DO  for IMAT = 1, NTYPES
+*
+               END DO
+*
+*              END DO for INS = 1, NNS
+*
+            END DO
+*
+*           END DO for IN = 1, NN
+*
+         END DO
+*
+*        END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $        ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $        ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $        ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of SCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/slatb4.f b/lapack-netlib/TESTING/LIN/slatb4.f
index 94d29db40..72a310727 100644
--- a/lapack-netlib/TESTING/LIN/slatb4.f
+++ b/lapack-netlib/TESTING/LIN/slatb4.f
@@ -153,9 +153,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -173,11 +170,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = SLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL SLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -232,6 +224,110 @@
          ELSE
             ANORM = ONE
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated.  'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular,  CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*          4. Random, Lower triangular,  CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -518,17 +614,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
             CNDNUM = TWO
          END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/sqpt01.f b/lapack-netlib/TESTING/LIN/sqpt01.f
index de0c80e53..f53686a65 100644
--- a/lapack-netlib/TESTING/LIN/sqpt01.f
+++ b/lapack-netlib/TESTING/LIN/sqpt01.f
@@ -33,7 +33,8 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) )
+*> where || . || is matrix one norm.
 *> \endverbatim
 *
 *  Arguments:
@@ -172,28 +173,28 @@
 *
       NORMA = SLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL SCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL SORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
 *        Compare i-th column of QR and jpvt(i)-th column of A
 *
          CALL SAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ),
      $               1 )
-   50 CONTINUE
+      END DO
 *
       SQPT01 = SLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/sqrt11.f b/lapack-netlib/TESTING/LIN/sqrt11.f
index d4422dacb..a3753adcf 100644
--- a/lapack-netlib/TESTING/LIN/sqrt11.f
+++ b/lapack-netlib/TESTING/LIN/sqrt11.f
@@ -157,9 +157,9 @@
       CALL SORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M,
      $             WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       SQRT11 = SLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( REAL( M )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/sqrt12.f b/lapack-netlib/TESTING/LIN/sqrt12.f
index 2eab0ee0d..46b359e07 100644
--- a/lapack-netlib/TESTING/LIN/sqrt12.f
+++ b/lapack-netlib/TESTING/LIN/sqrt12.f
@@ -26,7 +26,7 @@
 *> SQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s ||/(||s||*eps*max(M,N))
 *> \endverbatim
 *
 *  Arguments:
@@ -113,8 +113,7 @@
       EXTERNAL           SASUM, SLAMCH, SLANGE, SNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SAXPY, SBDSQR, SGEBD2, SLABAD, SLASCL, SLASET,
-     $                   XERBLA
+      EXTERNAL           SAXPY, SBDSQR, SGEBD2, SLASCL, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, REAL
@@ -145,17 +144,16 @@
 *     Copy upper triangle of A into work
 *
       CALL SLASET( 'Full', M, N, ZERO, ZERO, WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -199,9 +197,9 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             WORK( M*N+I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F
index a118515a5..57d71833f 100644
--- a/lapack-netlib/TESTING/LIN/zchkaa.F
+++ b/lapack-netlib/TESTING/LIN/zchkaa.F
@@ -69,6 +69,7 @@
 *> ZLQ    8               List types on next line if 0 < NTYPES <  8
 *> ZQL    8               List types on next line if 0 < NTYPES <  8
 *> ZQP    6               List types on next line if 0 < NTYPES <  6
+*> ZQK   19               List types on next line if 0 < NTYPES < 19
 *> ZTZ    3               List types on next line if 0 < NTYPES <  3
 *> ZLS    6               List types on next line if 0 < NTYPES <  6
 *> ZEQ
@@ -153,12 +154,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      DOUBLE PRECISION   S( 2*NMAX )
-      COMPLEX*16         E( NMAX )
-*
-*    .. Allocatable Arrays ..
+*     ..
+*     .. Allocatable Arrays ..
       INTEGER AllocateStatus
-      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE::   RWORK
+      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE::  RWORK, S
+      COMPLEX*16, DIMENSION(:), ALLOCATABLE :: E
       COMPLEX*16, DIMENSION(:,:), ALLOCATABLE::  A, B, WORK
 *     ..
 *     .. External Functions ..
@@ -170,15 +170,16 @@
       EXTERNAL           ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE,
      $                   ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP,
      $                   ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS,
-     $                   ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ,
-     $                   ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK,
-     $                   ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ,
-     $                   ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK,
-     $                   ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP,
-     $                   ZDRVLS, ZDRVPB,  ZDRVPO, ZDRVPP, ZDRVPT,
-     $                   ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK,
-     $                   ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT,
-     $                   ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR
+     $                   ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQP3RK, ZCHKQL,
+     $                   ZCHKQR, ZCHKRQ, ZCHKSP, ZCHKSY, ZCHKSY_ROOK,
+     $                   ZCHKSY_RK, ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR,
+     $                   ZCHKTZ, ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE,
+     $                   ZDRVHE_ROOK, ZDRVHE_RK, ZDRVHE_AA,
+     $                   ZDRVHE_AA_2STAGE, ZDRVHP, ZDRVLS, ZDRVPB,
+     $                   ZDRVPO, ZDRVPP, ZDRVPT, ZDRVSP, ZDRVSY,
+     $                   ZDRVSY_ROOK, ZDRVSY_RK, ZDRVSY_AA,
+     $                   ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, ZCHKQRTP,
+     $                   ZCHKLQT, ZCHKLQTP, ZCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -197,13 +198,18 @@
       DATA               THREQ / 2.0D0 / , INTSTR / '0123456789' /
 *
 *     .. Allocate memory dynamically ..
-      ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus)
+*
+      ALLOCATE ( A ( (KDMAX+1) * NMAX, 7 ), STAT = AllocateStatus)
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( B ( NMAX * MAXRHS, 4 ), STAT = AllocateStatus)
+      IF (AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( WORK ( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus)
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus)
+      ALLOCATE ( E( NMAX ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus)
+      ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus)
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus)
+      ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus)
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
 *     ..
 *     .. Executable Statements ..
@@ -1109,6 +1115,23 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                   NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                   A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                   S( 1 ), B( 1, 4 ),
+     $                   WORK, RWORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkqp3rk.f b/lapack-netlib/TESTING/LIN/zchkqp3rk.f
new file mode 100644
index 000000000..302c7b1a8
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/zchkqp3rk.f
@@ -0,0 +1,836 @@
+*> \brief \b ZCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*     $                      B, COPYB, S, TAU,
+*     $                      WORK, RWORK, IWORK, NOUT )
+*      IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER            NM, NN, NNB, NOUT
+*       DOUBLE PRECISION   THRESH
+*       ..
+*       .. Array Arguments ..
+*       LOGICAL            DOTYPE( * )
+*       INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ),
+*      $                   NXVAL( * )
+*       DOUBLE PRECISION   S( * ), RWORK( * )
+*       COMPLEX*16         A( * ), COPYA( * ), TAU( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZCHKQP3RK tests ZGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing.  Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNS
+*> \verbatim
+*>          NNS is INTEGER
+*>          The number of values of NRHS contained in the vector NSVAL.
+*> \endverbatim
+*>
+*> \param[in] NSVAL
+*> \verbatim
+*>          NSVAL is INTEGER array, dimension (NNS)
+*>          The values of the number of right hand sides NRHS.
+*> \endverbatim
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB and NX contained in the
+*>          vectors NBVAL and NXVAL.  The blocking parameters are used
+*>          in pairs (NB,NX).
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NNB)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NXVAL
+*> \verbatim
+*>          NXVAL is INTEGER array, dimension (NNB)
+*>          The values of the crossover point NX.
+*> \endverbatim
+*>
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is DOUBLE PRECISION
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (MMAX*NMAX)
+*>          where MMAX is the maximum value of M in MVAL and NMAX is the
+*>          maximum value of N in NVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYA
+*> \verbatim
+*>          COPYA is COMPLEX*16 array, dimension (MMAX*NMAX)
+*> \endverbatim
+*>
+*> \param[out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (MMAX*NSMAX)
+*>          where MMAX is the maximum value of M in MVAL and NSMAX is the
+*>          maximum value of NRHS in NSVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYB
+*> \verbatim
+*>          COPYB is COMPLEX*16 array, dimension (MMAX*NSMAX)
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is DOUBLE PRECISION array, dimension
+*>                      (min(MMAX,NMAX))
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (MMAX)
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension
+*>                      (max(M*max(M,N) + 4*min(M,N) + max(M,N)))
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (4*NMAX)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (2*NMAX)
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16_lin
+*
+*  =====================================================================
+      SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+     $                      B, COPYB, S, TAU,
+     $                      WORK, RWORK, IWORK, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            NM, NN, NNB, NNS, NOUT
+      DOUBLE PRECISION   THRESH
+*     ..
+*     .. Array Arguments ..
+      LOGICAL            DOTYPE( * )
+      INTEGER            IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ),
+     $                   NSVAL( * ), NXVAL( * )
+      DOUBLE PRECISION   S( * ), RWORK( * )
+      COMPLEX*16         A( * ), COPYA( * ), B( * ), COPYB( * ),
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTYPES
+      PARAMETER          ( NTYPES = 19 )
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 5 )
+      DOUBLE PRECISION   ONE, ZERO, BIGNUM
+      COMPLEX*16         CONE, CZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0,
+     $                   CZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   CONE = ( 1.0D+0, 0.0D+0 ),
+     $                   BIGNUM = 1.0D+38 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          DIST, TYPE
+      CHARACTER*3        PATH
+      INTEGER            I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO,
+     $                   INB, IND_OFFSET_GEN,
+     $                   IND_IN, IND_OUT, INS, INFO,
+     $                   ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO,
+     $                   KFACT, KL, KMAX, KU, LDA, LW, LWORK,
+     $                   LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N,
+     $                   NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS,
+     $                   NRUN, NX, T
+      DOUBLE PRECISION   ANORM, CNDNUM, EPS, ABSTOL, RELTOL,
+     $                   DTEMP, MAXC2NRMK, RELMAXC2NRMK
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 ), ISEEDY( 4 )
+      DOUBLE PRECISION   RESULT( NTESTS ), RDUMMY( 1 )
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE
+      EXTERNAL           DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAERH, ALAHD, ALASUM, DLAORD, ICOPY, ZAXPY,
+     $                   XLAENV, ZGEQP3RK, ZLACPY, ZLASET, ZLATB4,
+     $                   ZLATMS, ZUNMQR, ZSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, MAX, MIN, MOD
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER*32       SRNAMT
+      INTEGER            INFOT, IOUNIT, ZUNMQR_LWORK
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, IOUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA               ISEEDY / 1988, 1989, 1990, 1991 /
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants and the random number seed.
+*
+      PATH( 1: 1 ) = 'Zomplex precision'
+      PATH( 2: 3 ) = 'QK'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+      DO  I = 1, 4
+         ISEED( I ) = ISEEDY( I )
+      END DO
+      EPS = DLAMCH( 'Epsilon' )
+      INFOT = 0
+*
+      DO IM = 1, NM
+*
+*        Do for each value of M in MVAL.
+*
+         M = MVAL( IM )
+         LDA = MAX( 1, M )
+*
+         DO IN = 1, NN
+*
+*           Do for each value of N in NVAL.
+*
+            N = NVAL( IN )
+            MINMN = MIN( M, N )
+            LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ),
+     $                   M*N + 2*MINMN + 4*N )
+*
+            DO INS = 1, NNS
+               NRHS = NSVAL( INS )
+*
+*                 Set up parameters with ZLATB4 and generate
+*                 M-by-NRHS B matrix with ZLATMS.
+*                 IMAT = 14:
+*                 Random matrix, CNDNUM = 2, NORM = ONE,
+*                 MODE = 3 (geometric distribution of singular values).
+*
+                  CALL ZLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+                  CALL ZLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYB, LDA, WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M,
+     $                            NRHS, -1, -1, -1, 6, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+               DO IMAT = 1, NTYPES
+*
+*              Do the tests only if DOTYPE( IMAT ) is true.
+*
+               IF( .NOT.DOTYPE( IMAT ) )
+     $            CYCLE
+*
+*              The type of distribution used to generate the random
+*              eigen-/singular values:
+*              ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 )
+*
+*           Do for each type of NON-SYMMETRIC matrix:                               CNDNUM                     NORM                                     MODE
+*            1. Zero matrix
+*            2. Random, Diagonal, CNDNUM = 2                                        CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            3. Random, Upper triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            4. Random, Lower triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            5. Random, First column is zero, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            6. Random, Last MINMN column is zero, CNDNUM = 2                       CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            7. Random, Last N column is zero, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            8. Random, Middle column in MINMN is zero, CNDNUM = 2                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            9. Random, First half of MINMN columns are zero, CNDNUM = 2            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2   CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           11. Random, Half MINMN columns in the middle are zero starting
+*                  from  MINMN/2-(MINMN/2)/2+1, CNDNUM = 2                          CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           12. Random, Odd columns are ZERO, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           13. Random, Even columns are ZERO, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           14. Random, CNDNUM = 2                                                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           15. Random, CNDNUM = sqrt(0.1/EPS)                                      CNDNUM = BADC1 = sqrt(0.1/EPS)  ONE                                      3 ( geometric distribution of singular values )
+*           16. Random, CNDNUM = 0.1/EPS                                            CNDNUM = BADC2 = 0.1/EPS        ONE                                      3 ( geometric distribution of singular values )
+*           17. Random, CNDNUM = 0.1/EPS,                                           CNDNUM = BADC2 = 0.1/EPS        ONE                                      2 ( one small singular value, S(N)=1/CNDNUM )
+*                 one small singular value S(N)=1/CNDNUM
+*           18. Random, CNDNUM = 2, scaled near underflow                           CNDNUM = 2                      SMALL = SAFMIN
+*           19. Random, CNDNUM = 2, scaled near overflow                            CNDNUM = 2                      LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )  3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL ZLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-5.
+*
+*                 Set up parameters with DLATB4 and generate a test
+*                 matrix with ZLATMS.
+*
+                  CALL ZLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+                  CALL ZLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL DLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices MINMN >=2.
+*
+*                 JB_ZERO is the column index of ZERO block.
+*                 NB_ZERO is the column block size of ZERO block.
+*                 NB_GEN is the column blcok size of the
+*                 generated block.
+*                 J_INC in the non_zero column index increment
+*                 for matrix 12 and 13.
+*                 J_FIRS_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns is zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero,
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL ZLASET( 'Full', M, NB_ZERO, CZERO, CZERO,
+     $                         COPYA, LDA )
+*
+*                    2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                       chosen singular value distribution
+*                       in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL ZLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL ZLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                        CNDNUM, ANORM, KL, KU, 'No packing',
+     $                        COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                        WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the gererated colums from the right side
+*                 NB_GEN-size block in COPYA into correct column
+*                 positions.
+*
+                  IF( IMAT.EQ.6
+     $                    .OR. IMAT.EQ.7
+     $                    .OR. IMAT.EQ.8
+     $                    .OR. IMAT.EQ.10
+     $                    .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL ZSWAP( M,
+     $                        COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                        COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero colums in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero colums in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                            + 1
+                        CALL ZSWAP( M,
+     $                              COPYA( IND_OUT ), 1,
+     $                              COPYA( IND_IN), 1 )
+                        END DO
+*
+                  END IF
+*
+*                 5) Order the singular values generated by
+*                    DLAMTS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  CALL DLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                    IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                     CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for DGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We do MIN(M,N)+1 because we need a test for KMAX > N,
+*                 when KMAX is larger than MIN(M,N), KMAX should be
+*                 KMAX = MIN(M,N)
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                 Get a working copy of COPYA into A( 1:M,1:N ).
+*                 Get a working copy of COPYB into A( 1:M, (N+1):NRHS ).
+*                 Get a working copy of COPYB into into B( 1:M, 1:NRHS ).
+*                 Get a working copy of IWORK(1:N) awith zeroes into
+*                 which is going to be used as pivot array IWORK( N+1:2N ).
+*                 NOTE: IWORK(2N+1:3N) is going to be used as a WORK array
+*                 for the routine.
+*
+                  CALL ZLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                  CALL ZLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         A( LDA*N + 1 ),  LDA )
+                  CALL ZLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         B,  LDA )
+                  CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                  ABSTOL = -1.0
+                  RELTOl = -1.0
+*
+*                 Compute the QR factorization with pivoting of A
+*
+                  LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                              3*N + NRHS - 1 ) )
+*
+*                 Compute ZGEQP3RK factorization of A.
+*
+                  SRNAMT = 'ZGEQP3RK'
+                  CALL ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                           A, LDA, KFACT, MAXC2NRMK,
+     $                           RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                           WORK, LW, RWORK, IWORK( 2*N+1 ),
+     $                           INFO )
+*
+*                 Check error code from ZGEQP3RK.
+*
+                  IF( INFO.LT.0 )
+     $               CALL ALAERH( PATH, 'ZGEQP3RK', INFO, 0, ' ',
+     $                            M, N, NX, -1, NB, IMAT,
+     $                            NFAIL, NERRS, NOUT )
+*
+                  IF( KFACT.EQ.MINMN ) THEN
+*
+*                 Compute test 1:
+*
+*                 This test in only for the full rank factorization of
+*                 the matrix A.
+*
+*                 Array S(1:min(M,N)) contains svd(A) the sigular values
+*                 of the original matrix A in decreasing absolute value
+*                 order. The test computes svd(R), the vector sigular
+*                 values of the upper trapezoid of A(1:M,1:N) that
+*                 contains the factor R, in decreasing order. The test
+*                 returns the ratio:
+*
+*                 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                     RESULT( 1 ) = ZQRT12( M, N, A, LDA, S, WORK,
+     $                                     LWORK , RWORK )
+*
+                     DO T = 1, 1
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                        IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                   End test 1
+*
+                  END IF
+
+*                 Compute test 2:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                  RESULT( 2 ) = ZQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                          IWORK( N+1 ), WORK, LWORK )
+*
+*                 Compute test 3:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                  RESULT( 3 ) = ZQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                          LWORK )
+*
+*                 Print information about the tests that did not pass
+*                 the threshold.
+*
+                  DO T = 2, 3
+                     IF( RESULT( T ).GE.THRESH ) THEN
+                        IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                     CALL ALAHD( NOUT, PATH )
+                        WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                      NRHS, KMAX, ABSTOL, RELTOL,
+     $                      NB, NX, IMAT, T, RESULT( T )
+                        NFAIL = NFAIL + 1
+                     END IF
+                  END DO
+                  NRUN = NRUN + 2
+*
+*                 Compute test 4:
+*
+*                 This test is only for the factorizations with the
+*                 rank greater than 2.
+*                 The elements on the diagonal of R should be non-
+*                 increasing.
+*
+*                 The test returns the ratio:
+*
+*                 Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                 K=1:KFACT-1
+*
+                  IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                     DO J = 1, KFACT-1, 1
+*
+                        DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                        IF( DTEMP.LT.ZERO ) THEN
+                           RESULT( 4 ) = BIGNUM
+                        END IF
+*
+                     END DO
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 4, 4
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK',
+     $                        M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T,
+     $                        RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                  END IF
+*
+*                 Compute test 5:
+*
+*                 This test in only for matrix A with min(M,N) > 0.
+*
+*                 The test returns the ratio:
+*
+*                 1-norm(Q**T * B - Q**T * B ) /
+*                       ( M * EPS )
+*
+*                 (1) Compute B:=Q**T * B in the matrix B.
+*
+                  IF( MINMN.GT.0 ) THEN
+*
+                     LWORK_MQR = MAX(1, NRHS)
+                     CALL ZUNMQR( 'Left', 'Conjugate transpose',
+     $                         M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                         WORK, LWORK_MQR, INFO )
+*
+                     DO I = 1, NRHS
+*
+*                       Compare N+J-th column of A and J-column of B.
+*
+                        CALL ZAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                    B( ( I-1 )*LDA+1 ), 1 )
+                     END DO
+*
+                     RESULT( 5 ) =
+     $               ABS(
+     $               ZLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $               ( DBLE( M )*DLAMCH( 'Epsilon' ) )
+     $               )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 5, 5
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                  END IF
+*
+*                 END DO KMAX = 1, MIN(M,N)+1
+*
+                  END DO
+*
+*                 END DO for INB = 1, NNB
+*
+               END DO
+*
+*              END DO  for IMAT = 1, NTYPES
+*
+               END DO
+*
+*              END DO for INS = 1, NNS
+*
+            END DO
+*
+*           END DO for IN = 1, NN
+*
+         END DO
+*
+*        END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $        ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $        ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $        ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of ZCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/zlatb4.f b/lapack-netlib/TESTING/LIN/zlatb4.f
index a6977f4e9..a2b19f83d 100644
--- a/lapack-netlib/TESTING/LIN/zlatb4.f
+++ b/lapack-netlib/TESTING/LIN/zlatb4.f
@@ -154,9 +154,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -174,11 +171,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = DLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL DLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -233,6 +225,110 @@
          ELSE
             ANORM = ONE
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated.  'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular,  CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*          4. Random, Lower triangular,  CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -517,17 +613,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
             CNDNUM = TWO
          END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/zqpt01.f b/lapack-netlib/TESTING/LIN/zqpt01.f
index 4e53f92c8..c69eb658f 100644
--- a/lapack-netlib/TESTING/LIN/zqpt01.f
+++ b/lapack-netlib/TESTING/LIN/zqpt01.f
@@ -33,7 +33,7 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) )
 *> \endverbatim
 *
 *  Arguments:
@@ -172,28 +172,28 @@
 *
       NORMA = ZLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL ZCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL ZUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
 *        Compare i-th column of QR and jpvt(i)-th column of A
 *
          CALL ZAXPY( M, DCMPLX( -ONE ), A( 1, JPVT( J ) ), 1,
      $               WORK( ( J-1 )*M+1 ), 1 )
-   50 CONTINUE
+      END DO
 *
       ZQPT01 = ZLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/zqrt11.f b/lapack-netlib/TESTING/LIN/zqrt11.f
index c3be59c36..dc4af744f 100644
--- a/lapack-netlib/TESTING/LIN/zqrt11.f
+++ b/lapack-netlib/TESTING/LIN/zqrt11.f
@@ -158,9 +158,9 @@
       CALL ZUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU,
      $             WORK, M, WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       ZQRT11 = ZLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( DBLE( M )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/zqrt12.f b/lapack-netlib/TESTING/LIN/zqrt12.f
index 0da6be157..91477b5ea 100644
--- a/lapack-netlib/TESTING/LIN/zqrt12.f
+++ b/lapack-netlib/TESTING/LIN/zqrt12.f
@@ -28,7 +28,7 @@
 *> ZQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s||/(||s||*eps*max(M,N))
 *> \endverbatim
 *
 *  Arguments:
@@ -125,8 +125,8 @@
       EXTERNAL           DASUM, DLAMCH, DNRM2, ZLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DAXPY, DBDSQR, DLABAD, DLASCL, XERBLA, ZGEBD2,
-     $                   ZLASCL, ZLASET
+      EXTERNAL           DAXPY, DBDSQR, DLASCL, XERBLA, ZGEBD2, ZLASCL,
+     $                   ZLASET
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE, DCMPLX, MAX, MIN
@@ -154,17 +154,16 @@
 *
       CALL ZLASET( 'Full', M, N, DCMPLX( ZERO ), DCMPLX( ZERO ), WORK,
      $             M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -208,9 +207,9 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             RWORK( I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
@@ -218,6 +217,7 @@
       CALL DAXPY( MN, -ONE, S, 1, RWORK( 1 ), 1 )
       ZQRT12 = DASUM( MN, RWORK( 1 ), 1 ) /
      $         ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) )
+*
       IF( NRMSVL.NE.ZERO )
      $   ZQRT12 = ZQRT12 / NRMSVL
 *

From 0eb8a87977531b50a70011bad39f90ae57953e06 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 09:56:37 +0100
Subject: [PATCH 419/718] Implement truncated QR with pivoting
 (Reference-LAPACK PR 891)

---
 lapack-netlib/TESTING/ctest.in | 1 +
 lapack-netlib/TESTING/dtest.in | 1 +
 lapack-netlib/TESTING/stest.in | 1 +
 lapack-netlib/TESTING/ztest.in | 1 +
 4 files changed, 4 insertions(+)

diff --git a/lapack-netlib/TESTING/ctest.in b/lapack-netlib/TESTING/ctest.in
index a3588b4a1..74ff31ab8 100644
--- a/lapack-netlib/TESTING/ctest.in
+++ b/lapack-netlib/TESTING/ctest.in
@@ -42,6 +42,7 @@ CRQ    8               List types on next line if 0 < NTYPES <  8
 CLQ    8               List types on next line if 0 < NTYPES <  8
 CQL    8               List types on next line if 0 < NTYPES <  8
 CQP    6               List types on next line if 0 < NTYPES <  6
+CQK   19               List types on next line if 0 < NTYPES < 19
 CTZ    3               List types on next line if 0 < NTYPES <  3
 CLS    6               List types on next line if 0 < NTYPES <  6
 CEQ
diff --git a/lapack-netlib/TESTING/dtest.in b/lapack-netlib/TESTING/dtest.in
index 29bb8b92e..1b6c7bd4a 100644
--- a/lapack-netlib/TESTING/dtest.in
+++ b/lapack-netlib/TESTING/dtest.in
@@ -36,6 +36,7 @@ DRQ    8               List types on next line if 0 < NTYPES <  8
 DLQ    8               List types on next line if 0 < NTYPES <  8
 DQL    8               List types on next line if 0 < NTYPES <  8
 DQP    6               List types on next line if 0 < NTYPES <  6
+DQK   19               LIst types on next line if 0 < NTYPES < 19
 DTZ    3               List types on next line if 0 < NTYPES <  3
 DLS    6               List types on next line if 0 < NTYPES <  6
 DEQ
diff --git a/lapack-netlib/TESTING/stest.in b/lapack-netlib/TESTING/stest.in
index 27ac30040..7faa8b7a1 100644
--- a/lapack-netlib/TESTING/stest.in
+++ b/lapack-netlib/TESTING/stest.in
@@ -36,6 +36,7 @@ SRQ    8               List types on next line if 0 < NTYPES <  8
 SLQ    8               List types on next line if 0 < NTYPES <  8
 SQL    8               List types on next line if 0 < NTYPES <  8
 SQP    6               List types on next line if 0 < NTYPES <  6
+SQK   19               List types on next line if 0 < NTYPES < 19
 STZ    3               List types on next line if 0 < NTYPES <  3
 SLS    6               List types on next line if 0 < NTYPES <  6
 SEQ
diff --git a/lapack-netlib/TESTING/ztest.in b/lapack-netlib/TESTING/ztest.in
index 58da33d60..c83e82e45 100644
--- a/lapack-netlib/TESTING/ztest.in
+++ b/lapack-netlib/TESTING/ztest.in
@@ -42,6 +42,7 @@ ZRQ    8               List types on next line if 0 < NTYPES <  8
 ZLQ    8               List types on next line if 0 < NTYPES <  8
 ZQL    8               List types on next line if 0 < NTYPES <  8
 ZQP    6               List types on next line if 0 < NTYPES <  6
+ZQK   19               List types on next line if 0 < NTYPES < 19
 ZTZ    3               List types on next line if 0 < NTYPES <  3
 ZLS    6               List types on next line if 0 < NTYPES <  6
 ZEQ

From 5bf87c86f580982d741d3870740364659f99a4af Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 12:10:20 +0100
Subject: [PATCH 420/718] Implement truncated QR with pivoting
 (Reference-LAPACK PR 891)

---
 cmake/lapack.cmake | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 22476f561..003a8b3c1 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -52,7 +52,7 @@ set(SLASRC
    sgebrd.f sgecon.f sgeequ.f sgees.f  sgeesx.f sgeev.f  sgeevx.f
    sgehd2.f sgehrd.f sgelq2.f sgelqf.f
    sgels.f  sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f
-   sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
+   sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
    sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f
    sgetrf2.f sgetri.f
    sggbak.f sggbal.f
@@ -67,7 +67,7 @@ set(SLASRC
    slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
    slansy.f slantb.f slantp.f slantr.f slanv2.f
    slapll.f slapmt.f
-   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
+   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
    slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
    slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
    slarf.f  slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
@@ -139,7 +139,7 @@ set(CLASRC
    cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
    cgecon.f cgeequ.f cgees.f  cgeesx.f cgeev.f  cgeevx.f
    cgehd2.f cgehrd.f cgelq2.f cgelqf.f
-   cgels.f  cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
+   cgels.f  cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f
    cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f
    cgesc2.f cgesdd.f cgesvd.f cgesvdx.f
    cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f
@@ -173,7 +173,7 @@ set(CLASRC
    clanhb.f clanhe.f
    clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
    clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
-   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
+   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
    claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
    claqz0.f claqz1.f claqz2.f claqz3.f
    claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
@@ -243,7 +243,7 @@ set(DLASRC
    dgebrd.f dgecon.f dgeequ.f dgees.f  dgeesx.f dgeev.f  dgeevx.f
    dgehd2.f dgehrd.f dgelq2.f dgelqf.f
    dgels.f  dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f
-   dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
+   dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
    dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f
    dgetrf2.f dgetri.f
    dggbak.f dggbal.f
@@ -258,7 +258,7 @@ set(DLASRC
    dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
    dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
    dlapll.f dlapmt.f
-   dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
+   dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
    dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
    dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
    dlarf.f  dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
@@ -331,7 +331,7 @@ set(ZLASRC
    zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
    zgecon.f zgeequ.f zgees.f  zgeesx.f zgeev.f  zgeevx.f
    zgehd2.f zgehrd.f zgelq2.f zgelqf.f
-   zgels.f  zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
+   zgels.f  zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f
    zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
    zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f
    zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f
@@ -367,7 +367,7 @@ set(ZLASRC
    zlanhe.f
    zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
    zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f
-   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
+   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f
    zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
    zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
    zlarcm.f zlarf.f  zlarfb.f zlarfb_gett.f
@@ -557,7 +557,7 @@ set(SLASRC
    sgebrd.c sgecon.c sgeequ.c sgees.c  sgeesx.c sgeev.c  sgeevx.c
    sgehd2.c sgehrd.c sgelq2.c sgelqf.c
    sgels.c  sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
-   sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
+   sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
    sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
    sgetrf2.c sgetri.c
    sggbak.c sggbal.c
@@ -571,7 +571,7 @@ set(SLASRC
    slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c
    slansy.c slantb.c slantp.c slantr.c slanv2.c
    slapll.c slapmt.c
-   slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
+   slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
    slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
    slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
    slarf.c  slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
@@ -643,7 +643,7 @@ set(CLASRC
    cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
    cgecon.c cgeequ.c cgees.c  cgeesx.c cgeev.c  cgeevx.c
    cgehd2.c cgehrd.c cgelq2.c cgelqf.c
-   cgels.c  cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
+   cgels.c  cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c
    cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
    cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
    cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c
@@ -677,7 +677,7 @@ set(CLASRC
    clanhb.c clanhe.c
    clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c
    clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c
-   claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
+   claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c
    claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
    claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
    clarf.c  clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
@@ -746,7 +746,7 @@ set(DLASRC
    dgebrd.c dgecon.c dgeequ.c dgees.c  dgeesx.c dgeev.c  dgeevx.c
    dgehd2.c dgehrd.c dgelq2.c dgelqf.c
    dgels.c  dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
-   dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
+   dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
    dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
    dgetrf2.c dgetri.c
    dggbak.c dggbal.c
@@ -760,7 +760,7 @@ set(DLASRC
    dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c
    dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
    dlapll.c dlapmt.c
-   dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
+   dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
    dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
    dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
    dlarf.c  dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
@@ -833,7 +833,7 @@ set(ZLASRC
    zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
    zgecon.c zgeequ.c zgees.c  zgeesx.c zgeev.c  zgeevx.c
    zgehd2.c zgehrd.c zgelq2.c zgelqf.c
-   zgels.c  zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
+   zgels.c  zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c
    zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
    zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
    zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c
@@ -868,7 +868,7 @@ set(ZLASRC
    zlanhe.c
    zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c
    zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c
-   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
+   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
    zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
    zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
    zlarcm.c zlarf.c  zlarfb.c zlarfb_gett.c

From f437339130a7e3878a94b2a2ea87edbdd05ba5cf Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 12:12:26 +0100
Subject: [PATCH 421/718] Implement truncated QR with pivoting
 (Reference-LAPACK PR 891)

---
 lapack-netlib/SRC/cgeqp3rk.c | 1071 +++++++++++++++++++++++++++++++
 lapack-netlib/SRC/claqp2rk.c |  943 +++++++++++++++++++++++++++
 lapack-netlib/SRC/claqp3rk.c | 1152 +++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/dgeqp3rk.c | 1059 +++++++++++++++++++++++++++++++
 lapack-netlib/SRC/dlaqp2rk.c |  923 +++++++++++++++++++++++++++
 lapack-netlib/SRC/dlaqp3rk.c | 1113 ++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/sgeqp3rk.c | 1055 +++++++++++++++++++++++++++++++
 lapack-netlib/SRC/slaqp2rk.c |  918 +++++++++++++++++++++++++++
 lapack-netlib/SRC/slaqp3rk.c | 1109 ++++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zgeqp3rk.c | 1074 +++++++++++++++++++++++++++++++
 lapack-netlib/SRC/zlaqp2rk.c |  947 ++++++++++++++++++++++++++++
 lapack-netlib/SRC/zlaqp3rk.c | 1157 ++++++++++++++++++++++++++++++++++
 12 files changed, 12521 insertions(+)
 create mode 100644 lapack-netlib/SRC/cgeqp3rk.c
 create mode 100644 lapack-netlib/SRC/claqp2rk.c
 create mode 100644 lapack-netlib/SRC/claqp3rk.c
 create mode 100644 lapack-netlib/SRC/dgeqp3rk.c
 create mode 100644 lapack-netlib/SRC/dlaqp2rk.c
 create mode 100644 lapack-netlib/SRC/dlaqp3rk.c
 create mode 100644 lapack-netlib/SRC/sgeqp3rk.c
 create mode 100644 lapack-netlib/SRC/slaqp2rk.c
 create mode 100644 lapack-netlib/SRC/slaqp3rk.c
 create mode 100644 lapack-netlib/SRC/zgeqp3rk.c
 create mode 100644 lapack-netlib/SRC/zlaqp2rk.c
 create mode 100644 lapack-netlib/SRC/zlaqp3rk.c

diff --git a/lapack-netlib/SRC/cgeqp3rk.c b/lapack-netlib/SRC/cgeqp3rk.c
new file mode 100644
index 000000000..54e7fb140
--- /dev/null
+++ b/lapack-netlib/SRC/cgeqp3rk.c
@@ -0,0 +1,1071 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int cgeqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*kmax, real *abstol, real *reltol, complex *a, integer *lda, integer *
+	k, real *maxc2nrmk, real *relmaxc2nrmk, integer *jpiv, complex *tau, 
+	complex *work, integer *lwork, real *rwork, integer *iwork, integer *
+	info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    real r__1, r__2;
+    complex q__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int claqp3rk_(integer *, integer *, integer *, 
+	    integer *, integer *, real *, real *, integer *, real *, complex *
+	    , integer *, logical *, integer *, real *, real *, integer *, 
+	    complex *, real *, real *, complex *, complex *, integer *, 
+	    integer *, integer *);
+    real maxc2nrm;
+    logical done;
+    integer jmax, j, jmaxc2nrm, jmaxb, nbmin, iinfo, n_sub__, minmn;
+    real myhugeval;
+    extern real scnrm2_(integer *, complex *, integer *);
+    integer jb, nb, kf, nx;
+    extern real slamch_(char *);
+    real safmin;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *, ftnlen, ftnlen), isamax_(integer *, real *, 
+	    integer *);
+    extern logical sisnan_(real *);
+    integer kp1, lwkopt;
+    logical lquery;
+    integer jbf;
+    real eps;
+    integer iws, ioffset;
+    extern /* Subroutine */ int claqp2rk_(integer *, integer *, integer *, 
+	    integer *, integer *, real *, real *, integer *, real *, complex *
+	    , integer *, integer *, real *, real *, integer *, complex *, 
+	    real *, real *, complex *, integer *);
+
+
+/*  -- LAPACK computational routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test input arguments */
+/*     ==================== */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --work;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*kmax < 0) {
+	*info = -4;
+    } else if (sisnan_(abstol)) {
+	*info = -5;
+    } else if (sisnan_(reltol)) {
+	*info = -6;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -8;
+    }
+
+/*     If the input parameters M, N, NRHS, KMAX, LDA are valid: */
+/*       a) Test the input workspace size LWORK for the minimum */
+/*          size requirement IWS. */
+/*       b) Determine the optimal block size NB and optimal */
+/*          workspace size LWKOPT to be returned in WORK(1) */
+/*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., */
+/*          (3) when routine exits. */
+/*     Here, IWS is the miminum workspace required for unblocked */
+/*     code. */
+
+    if (*info == 0) {
+	minmn = f2cmin(*m,*n);
+	if (minmn == 0) {
+	    iws = 1;
+	    lwkopt = 1;
+	} else {
+
+/*           Minimal workspace size in case of using only unblocked */
+/*           BLAS 2 code in CLAQP2RK. */
+/*           1) CLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in CLARF subroutine inside CLAQP2RK to apply an */
+/*              elementary reflector from the left. */
+/*           TOTAL_WORK_SIZE = 3*N + NRHS - 1 */
+
+	    iws = *n + *nrhs - 1;
+
+/*           Assign to NB optimal block size. */
+
+	    nb = ilaenv_(&c__1, "CGEQP3RK", " ", m, n, &c_n1, &c_n1, (ftnlen)
+		    8, (ftnlen)1);
+
+/*           A formula for the optimal workspace size in case of using */
+/*           both unblocked BLAS 2 in CLAQP2RK and blocked BLAS 3 code */
+/*           in CLAQP3RK. */
+/*           1) CGEQP3RK, CLAQP2RK, CLAQP3RK: 2*N to store full and */
+/*              partial column 2-norms. */
+/*           2) CLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in CLARF subroutine to apply an elementary reflector */
+/*              from the left. */
+/*           3) CLAQP3RK: NB*(N+NRHS) to use in the work array F that */
+/*              is used to apply a block reflector from */
+/*              the left. */
+/*           4) CLAQP3RK: NB to use in the auxilixary array AUX. */
+/*           Sizes (2) and ((3) + (4)) should intersect, therefore */
+/*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. */
+
+	    lwkopt = (*n << 1) + nb * (*n + *nrhs + 1);
+	}
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+
+	if (*lwork < iws && ! lquery) {
+	    *info = -15;
+	}
+    }
+
+/*      NOTE: The optimal workspace size is returned in WORK(1), if */
+/*            the input parameters M, N, NRHS, KMAX, LDA are valid. */
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGEQP3RK", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible for M=0 or N=0. */
+
+    if (minmn == 0) {
+	*k = 0;
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize column pivot array JPIV. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jpiv[j] = j;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize storage for partial and exact column 2-norms. */
+/*     a) The elements WORK(1:N) are used to store partial column */
+/*        2-norms of the matrix A, and may decrease in each computation */
+/*        step; initialize to the values of complete columns 2-norms. */
+/*     b) The elements WORK(N+1:2*N) are used to store complete column */
+/*        2-norms of the matrix A, they are not changed during the */
+/*        computation; initialize the values of complete columns 2-norms. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	rwork[j] = scnrm2_(m, &a[j * a_dim1 + 1], &c__1);
+	rwork[*n + j] = rwork[j];
+    }
+
+/*     ================================================================== */
+
+/*     Compute the pivot column index and the maximum column 2-norm */
+/*     for the whole original matrix stored in A(1:M,1:N). */
+
+    kp1 = isamax_(n, &rwork[1], &c__1);
+
+/*     ==================================================================. */
+
+    if (sisnan_(&maxc2nrm)) {
+
+/*        Check if the matrix A contains NaN, set INFO parameter */
+/*        to the column number where the first NaN is found and return */
+/*        from the routine. */
+
+	*k = 0;
+	*info = kp1;
+
+/*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = maxc2nrm;
+
+/*        Array TAU is not set and contains undefined elements. */
+
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+    }
+
+/*     =================================================================== */
+
+    if (maxc2nrm == 0.f) {
+
+/*        Check is the matrix A is a zero matrix, set array TAU and */
+/*        return from the routine. */
+
+	*k = 0;
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	}
+
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+
+    }
+
+/*     =================================================================== */
+
+    myhugeval = slamch_("Overflow");
+
+    if (maxc2nrm > myhugeval) {
+
+/*        Check if the matrix A contains +Inf or -Inf, set INFO parameter */
+/*        to the column number, where the first +/-Inf  is found plus N, */
+/*        and continue the computation. */
+
+	*info = *n + kp1;
+
+    }
+
+/*     ================================================================== */
+
+/*     Quick return if possible for the case when the first */
+/*     stopping criterion is satisfied, i.e. KMAX = 0. */
+
+    if (*kmax == 0) {
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.f;
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	}
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+    eps = slamch_("Epsilon");
+
+/*     Adjust ABSTOL */
+
+    if (*abstol >= 0.f) {
+	safmin = slamch_("Safe minimum");
+/* Computing MAX */
+	r__1 = *abstol, r__2 = safmin * 2.f;
+	*abstol = f2cmax(r__1,r__2);
+    }
+
+/*     Adjust RELTOL */
+
+    if (*reltol >= 0.f) {
+	*reltol = f2cmax(*reltol,eps);
+    }
+
+/*     =================================================================== */
+
+/*     JMAX is the maximum index of the column to be factorized, */
+/*     which is also limited by the first stopping criterion KMAX. */
+
+    jmax = f2cmin(*kmax,minmn);
+
+/*     =================================================================== */
+
+/*     Quick return if possible for the case when the second or third */
+/*     stopping criterion for the whole original matrix is satified, */
+/*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */
+/*     (which is ONE <= RELTOL). */
+
+    if (maxc2nrm <= *abstol || 1.f <= *reltol) {
+
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.f;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	}
+
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+/*     Factorize columns */
+/*     ================================================================== */
+
+/*     Determine the block size. */
+
+    nbmin = 2;
+    nx = 0;
+
+    if (nb > 1 && nb < minmn) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+/*        (for N less than NX, unblocked code should be used). */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/*              Not enough workspace to use optimal block size that */
+/*              is currently stored in NB. */
+/*              Reduce NB and determine the minimum value of NB. */
+
+		nb = (*lwork - (*n << 1)) / (*n + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQP3RK", " ", m, n, &c_n1, 
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/*     ================================================================== */
+
+/*     DONE is the boolean flag to rerpresent the case when the */
+/*     factorization completed in the block factorization routine, */
+/*     before the end of the block. */
+
+    done = FALSE_;
+
+/*     J is the column index. */
+
+    j = 1;
+
+/*     (1) Use blocked code initially. */
+
+/*     JMAXB is the maximum column index of the block, when the */
+/*     blocked code is used, is also limited by the first stopping */
+/*     criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/*        J   is the column index of a column block; */
+/*        JB  is the column block size to pass to block factorization */
+/*            routine in a loop step; */
+/*        JBF is the number of columns that were actually factorized */
+/*            that was returned by the block factorization routine */
+/*            in a loop step, JBF <= JB; */
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/*           Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    claqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, 
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, 
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &
+		    rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], 
+		    &iinfo);
+
+/*           Set INFO on the first occurence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/*              Either the submatrix is zero before the end of the */
+/*              column block, or ABSTOL or RELTOL criterion is */
+/*              satisfied before the end of the column block, we can */
+/*              return from the routine. Perform the following before */
+/*              returning: */
+/*                a) Set the number of factorized columns K, */
+/*                   K = IOFFSET + JBF from the last call of blocked */
+/*                   routine. */
+/*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/*                         by the block factorization routine; */
+/*                      2) The remaining TAUs are set to ZERO by the */
+/*                         block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/*              Set INFO on the first occurrence of NaN, NaN takes */
+/*              prcedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/*              Return from the routine. */
+
+		q__1.r = (real) lwkopt, q__1.i = 0.f;
+		work[1].r = q__1.r, work[1].i = q__1.i;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+/*     J = JMAX+1 means we factorized the maximum possible number of */
+/*     columns, that is in ELSE clause we need to compute */
+/*     the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/*     the blocks. */
+
+    if (j <= jmax) {
+
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	claqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, 
+		relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], &
+		work[1], &iinfo);
+
+/*        ABSTOL or RELTOL criterion is satisfied when the number of */
+/*        the factorized columns KF is smaller then the  number */
+/*        of columns JMAX-J+1 supplied to be factorized by the */
+/*        unblocked routine, we can return from */
+/*        the routine. Perform the following before returning: */
+/*           a) Set the number of factorized columns K, */
+/*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/*              unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/*        Set INFO on the first exception occurence. */
+
+/*        Set INFO on the first exception occurence of Inf or NaN, */
+/*        (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/*        Compute the return values for blocked code. */
+
+/*        Set the number of factorized columns if the unblocked routine */
+/*        was not called. */
+
+	*k = jmax;
+
+/*        If there exits a residual matrix after the blocked code: */
+/*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/*              residual matrix, otherwise set them to ZERO; */
+/*           2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + isamax_(&i__1, &rwork[*k + 1], &c__1);
+	    *maxc2nrmk = rwork[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.f;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		i__2 = j;
+		tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	    }
+
+	} else {
+	    *maxc2nrmk = 0.f;
+	    *relmaxc2nrmk = 0.f;
+
+	}
+
+/*     END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    q__1.r = (real) lwkopt, q__1.i = 0.f;
+    work[1].r = q__1.r, work[1].i = q__1.i;
+
+    return 0;
+
+/*     End of CGEQP3RK */
+
+} /* cgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/claqp2rk.c b/lapack-netlib/SRC/claqp2rk.c
new file mode 100644
index 000000000..4184c5927
--- /dev/null
+++ b/lapack-netlib/SRC/claqp2rk.c
@@ -0,0 +1,943 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int claqp2rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *kmax, real *abstol, real *reltol, integer *kp1, 
+	real *maxc2nrm, complex *a, integer *lda, integer *k, real *maxc2nrmk,
+	 real *relmaxc2nrmk, integer *jpiv, complex *tau, real *vn1, real *
+	vn2, complex *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    real r__1;
+    complex q__1;
+
+    /* Local variables */
+    complex aikk;
+    real temp, temp2;
+    integer i__, j;
+    real tol3z;
+    integer jmaxc2nrm;
+    extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+	    , integer *, complex *, complex *, integer *, complex *), 
+	    cswap_(integer *, complex *, integer *, complex *, integer *);
+    integer itemp, minmnfact;
+    real myhugeval;
+    integer minmnupdt;
+    extern real scnrm2_(integer *, complex *, integer *);
+    integer kk, kp;
+    extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, 
+	    integer *, complex *);
+    extern real slamch_(char *);
+    extern integer isamax_(integer *, real *, integer *);
+    real taunan;
+    extern logical sisnan_(real *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT in the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/*     MINMNUPDT is the smallest dimension */
+/*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which */
+/*     contains the submatrices A(IOFFSET+1:M,1:N) and */
+/*     B(IOFFSET+1:M,1:NRHS) as column blocks. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *kmax = f2cmin(*kmax,minmnfact);
+    tol3z = sqrt(slamch_("Epsilon"));
+    myhugeval = slamch_("Overflow");
+
+/*     Compute the factorization, KK is the lomn loop index. */
+
+    i__1 = *kmax;
+    for (kk = 1; kk <= i__1; ++kk) {
+
+	i__ = *ioffset + kk;
+
+	if (i__ == 1) {
+
+/*           ============================================================ */
+
+/*           We are at the first column of the original whole matrix A, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+/*           ============================================================ */
+
+	} else {
+
+/*           ============================================================ */
+
+/*           Determine the pivot column in KK-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__2 = *n - kk + 1;
+	    kp = kk - 1 + isamax_(&i__2, &vn1[kk], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,KK:N) in step KK. */
+/*           RELMAXC2NRMK  will be computed later, after somecondition */
+/*           checks on MAXC2NRMK. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains NaN, and set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (sisnan_(maxc2nrmk)) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*info = *k + kp;
+
+/*               Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*               Array TAU(K+1:MINMNFACT) is not set and contains */
+/*               undefined elements. */
+
+		return 0;
+	    }
+
+/*           ============================================================ */
+
+/*           Quick return, if the submatrix A(I:M,KK:N) is */
+/*           a zero matrix. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*maxc2nrmk == 0.f) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*relmaxc2nrmk = 0.f;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    i__3 = j;
+		    tau[i__3].r = 0.f, tau[i__3].i = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + kk - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL >= ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+/*              Set K, the number of factorized columns. */
+
+		*k = kk - 1;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    i__3 = j;
+		    tau[i__3].r = 0.f, tau[i__3].i = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,KK:N): */
+/*        1) swap the KK-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) copy the KK-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than KK in the next loop step.) */
+/*        3) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A, not the block A(1:M,1:N). */
+
+	if (kp != kk) {
+	    cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1);
+	    vn1[kp] = vn1[kk];
+	    vn2[kp] = vn2[kk];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[kk];
+	    jpiv[kk] = itemp;
+	}
+
+/*        Generate elementary reflector H(KK) using the column A(I:M,KK), */
+/*        if the column has more than one element, otherwise */
+/*        the elementary reflector would be an identity matrix, */
+/*        and TAU(KK) = CZERO. */
+
+	if (i__ < *m) {
+	    i__2 = *m - i__ + 1;
+	    clarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], &
+		    c__1, &tau[kk]);
+	} else {
+	    i__2 = kk;
+	    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	}
+
+/*        Check if TAU(KK) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(KK) for Inf, */
+/*        since CLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by CLARFG can contain Inf, which requires */
+/*        TAU(KK) to contain NaN. Therefore, this case of generating Inf */
+/*        by CLARFG is covered by checking TAU(KK) for NaN. */
+
+	i__2 = kk;
+	r__1 = tau[i__2].r;
+	if (sisnan_(&r__1)) {
+	    i__2 = kk;
+	    taunan = tau[i__2].r;
+	} else /* if(complicated condition) */ {
+	    r__1 = r_imag(&tau[kk]);
+	    if (sisnan_(&r__1)) {
+		taunan = r_imag(&tau[kk]);
+	    } else {
+		taunan = 0.f;
+	    }
+	}
+
+	if (sisnan_(&taunan)) {
+	    *k = kk - 1;
+	    *info = kk;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = taunan;
+	    *relmaxc2nrmk = taunan;
+
+/*           Array TAU(KK:MINMNFACT) is not set and contains */
+/*           undefined elements, except the first element TAU(KK) = NaN. */
+
+	    return 0;
+	}
+
+/*        Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */
+/*        ( If M >= N, then at KK = N there is no residual matrix, */
+/*         i.e. no columns of A to update, only columns of B. */
+/*         If M < N, then at KK = M-IOFFSET, I = M and we have a */
+/*         one-row residual matrix in A and the elementary */
+/*         reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */
+/*         is needed for the residual matrix in A and the */
+/*         right-hand-side-matrix in B. */
+/*         Therefore, we update only if */
+/*         KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */
+/*         condition is satisfied, not only KK < N+NRHS ) */
+
+	if (kk < minmnupdt) {
+	    i__2 = i__ + kk * a_dim1;
+	    aikk.r = a[i__2].r, aikk.i = a[i__2].i;
+	    i__2 = i__ + kk * a_dim1;
+	    a[i__2].r = 1.f, a[i__2].i = 0.f;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n + *nrhs - kk;
+	    r_cnjg(&q__1, &tau[kk]);
+	    clarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &q__1, 
+		    &a[i__ + (kk + 1) * a_dim1], lda, &work[1]);
+	    i__2 = i__ + kk * a_dim1;
+	    a[i__2].r = aikk.r, a[i__2].i = aikk.i;
+	}
+
+	if (kk < minmnfact) {
+
+/*           Update the partial column 2-norms for the residual matrix, */
+/*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */
+/*           when KK < f2cmin(M-IOFFSET, N). */
+
+	    i__2 = *n;
+	    for (j = kk + 1; j <= i__2; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+/* Computing 2nd power */
+		    r__1 = c_abs(&a[i__ + j * a_dim1]) / vn1[j];
+		    temp = 1.f - r__1 * r__1;
+		    temp = f2cmax(temp,0.f);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                    Compute the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by explicitly computing it, */
+/*                    and store it in both partial 2-norm vector VN1 */
+/*                    and exact column 2-norm vector VN2. */
+
+			i__3 = *m - i__;
+			vn1[j] = scnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], &
+				c__1);
+			vn2[j] = vn1[j];
+
+		    } else {
+
+/*                    Update the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by removing one */
+/*                    element A(I,J) and store it in partial */
+/*                    2-norm vector VN1. */
+
+			vn1[j] *= sqrt(temp);
+
+		    }
+		}
+	    }
+
+	}
+
+/*     End factorization loop */
+
+    }
+
+/*     If we reached this point, all colunms have been factorized, */
+/*     i.e. no condition was triggered to exit the routine. */
+/*     Set the number of factorized columns. */
+
+    *k = *kmax;
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */
+/*     we return. */
+
+    if (*k < minmnfact) {
+
+	i__1 = *n - *k;
+	jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1);
+	*maxc2nrmk = vn1[jmaxc2nrm];
+
+	if (*k == 0) {
+	    *relmaxc2nrmk = 1.f;
+	} else {
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+	}
+
+    } else {
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+    }
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, set TAUs corresponding to the columns that were */
+/*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */
+
+    i__1 = minmnfact;
+    for (j = *k + 1; j <= i__1; ++j) {
+	i__2 = j;
+	tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+    }
+
+    return 0;
+
+/*     End of CLAQP2RK */
+
+} /* claqp2rk_ */
+
diff --git a/lapack-netlib/SRC/claqp3rk.c b/lapack-netlib/SRC/claqp3rk.c
new file mode 100644
index 000000000..ca305fab7
--- /dev/null
+++ b/lapack-netlib/SRC/claqp3rk.c
@@ -0,0 +1,1152 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static complex c_b1 = {0.f,0.f};
+static complex c_b2 = {1.f,0.f};
+static integer c__1 = 1;
+
+/* Subroutine */ int claqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *nb, real *abstol, real *reltol, integer *kp1, real 
+	*maxc2nrm, complex *a, integer *lda, logical *done, integer *kb, real 
+	*maxc2nrmk, real *relmaxc2nrmk, integer *jpiv, complex *tau, real *
+	vn1, real *vn2, complex *auxv, complex *f, integer *ldf, integer *
+	iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2, i__3;
+    real r__1, r__2;
+    complex q__1;
+
+    /* Local variables */
+    real temp, temp2;
+    integer i__, j, k;
+    real tol3z;
+    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *), cgemv_(char *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *, complex *, complex *, integer *), cswap_(
+	    integer *, complex *, integer *, complex *, integer *);
+    integer itemp, minmnfact;
+    real myhugeval;
+    integer minmnupdt;
+    extern real scnrm2_(integer *, complex *, integer *);
+    integer if__, kp;
+    extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, 
+	    integer *, complex *);
+    extern real slamch_(char *);
+    integer lsticc;
+    extern integer isamax_(integer *, real *, integer *);
+    real taunan;
+    extern logical sisnan_(real *);
+    complex aik;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --auxv;
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT in the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *nb = f2cmin(*nb,minmnfact);
+    tol3z = sqrt(slamch_("Epsilon"));
+    myhugeval = slamch_("Overflow");
+
+/*     Compute factorization in a while loop over NB columns, */
+/*     K is the column index in the block A(1:M,1:N). */
+
+    k = 0;
+    lsticc = 0;
+    *done = FALSE_;
+
+    while(k < *nb && lsticc == 0) {
+	++k;
+	i__ = *ioffset + k;
+
+	if (i__ == 1) {
+
+/*           We are at the first column of the original whole matrix A_orig, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+	} else {
+
+/*           Determine the pivot column in K-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__1 = *n - k + 1;
+	    kp = k - 1 + isamax_(&i__1, &vn1[k], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,K:N) in step K. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains NaN, set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (sisnan_(maxc2nrmk)) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*info = *kb + kp;
+
+/*              Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix contains NaN and we stop */
+/*              the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    q__1.r = -1.f, q__1.i = 0.f;
+		    cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs,
+			     kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 
+			    + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*              undefined elements. */
+
+/*              Return from the routine. */
+
+		return 0;
+	    }
+
+/*           Quick return, if the submatrix A(I:M,K:N) is */
+/*           a zero matrix. We need to check it only if the column index */
+/*           (same as row index) is larger than 1, since the condition */
+/*           for the whole original matrix A_orig is checked in the main */
+/*           routine. */
+
+	    if (*maxc2nrmk == 0.f) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*relmaxc2nrmk = 0.f;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix is zero and we stop the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    q__1.r = -1.f, q__1.i = 0.f;
+		    cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs,
+			     kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 
+			    + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */
+/*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    i__2 = j;
+		    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + k - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third tolerance stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig; */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+
+/*              Apply the block reflector to the residual of the */
+/*              matrix A and the residual of the right hand sides B, if */
+/*              the residual matrix and and/or the residual of the right */
+/*              hand sides exist,  i.e. if the submatrix */
+/*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when */
+/*                 KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */
+
+		if (*kb < minmnupdt) {
+		    i__1 = *m - if__;
+		    i__2 = *n + *nrhs - *kb;
+		    q__1.r = -1.f, q__1.i = 0.f;
+		    cgemm_("No transpose", "Conjugate transpose", &i__1, &
+			    i__2, kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*
+			    kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb 
+			    + 1) * a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */
+/*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    i__2 = j;
+		    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,K:N): */
+/*        1) swap the K-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */
+/*        3) copy the K-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than K in the next loop step.) */
+/*        4) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A_orig, not the block A(1:M,1:N). */
+
+	if (kp != k) {
+	    cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	    i__1 = k - 1;
+	    cswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf);
+	    vn1[kp] = vn1[k];
+	    vn2[kp] = vn2[k];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[k];
+	    jpiv[k] = itemp;
+	}
+
+/*        Apply previous Householder reflectors to column K: */
+/*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */
+
+	if (k > 1) {
+	    i__1 = k - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = k + j * f_dim1;
+		r_cnjg(&q__1, &f[k + j * f_dim1]);
+		f[i__2].r = q__1.r, f[i__2].i = q__1.i;
+	    }
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    q__1.r = -1.f, q__1.i = 0.f;
+	    cgemv_("No transpose", &i__1, &i__2, &q__1, &a[i__ + a_dim1], lda,
+		     &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1);
+	    i__1 = k - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = k + j * f_dim1;
+		r_cnjg(&q__1, &f[k + j * f_dim1]);
+		f[i__2].r = q__1.r, f[i__2].i = q__1.i;
+	    }
+	}
+
+/*        Generate elementary reflector H(k) using the column A(I:M,K). */
+
+	if (i__ < *m) {
+	    i__1 = *m - i__ + 1;
+	    clarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], &
+		    c__1, &tau[k]);
+	} else {
+	    i__1 = k;
+	    tau[i__1].r = 0.f, tau[i__1].i = 0.f;
+	}
+
+/*        Check if TAU(K) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(K) for Inf, */
+/*        since CLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by CLARFG can contain Inf, which requires */
+/*        TAU(K) to contain NaN. Therefore, this case of generating Inf */
+/*        by CLARFG is covered by checking TAU(K) for NaN. */
+
+	i__1 = k;
+	r__1 = tau[i__1].r;
+	if (sisnan_(&r__1)) {
+	    i__1 = k;
+	    taunan = tau[i__1].r;
+	} else /* if(complicated condition) */ {
+	    r__1 = r_imag(&tau[k]);
+	    if (sisnan_(&r__1)) {
+		taunan = r_imag(&tau[k]);
+	    } else {
+		taunan = 0.f;
+	    }
+	}
+
+	if (sisnan_(&taunan)) {
+
+	    *done = TRUE_;
+
+/*           Set KB, the number of factorized partial columns */
+/*                   that are non-zero in each step in the block, */
+/*                   i.e. the rank of the factor R. */
+/*           Set IF, the number of processed rows in the block, which */
+/*                   is the same as the number of processed rows in */
+/*                   the original whole matrix A_orig. */
+
+	    *kb = k - 1;
+	    if__ = i__ - 1;
+	    *info = k;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = taunan;
+	    *relmaxc2nrmk = taunan;
+
+/*           There is no need to apply the block reflector to the */
+/*           residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*           since the submatrix contains NaN and we stop */
+/*           the computation. */
+/*           But, we need to apply the block reflector to the residual */
+/*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*           residual right hand sides exist.  This occurs */
+/*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+
+	    if (*nrhs > 0 && *kb < *m - *ioffset) {
+		i__1 = *m - if__;
+		q__1.r = -1.f, q__1.i = 0.f;
+		cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb,
+			 &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1],
+			 lda);
+	    }
+
+/*           There is no need to recompute the 2-norm of the */
+/*           difficult columns, since we stop the factorization. */
+
+/*           Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*           undefined elements. */
+
+/*           Return from the routine. */
+
+	    return 0;
+	}
+
+/*        =============================================================== */
+
+	i__1 = i__ + k * a_dim1;
+	aik.r = a[i__1].r, aik.i = a[i__1].i;
+	i__1 = i__ + k * a_dim1;
+	a[i__1].r = 1.f, a[i__1].i = 0.f;
+
+/*        =============================================================== */
+
+/*        Compute the current K-th column of F: */
+/*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = *n + *nrhs - k;
+	    cgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k 
+		    + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &
+		    f[k + 1 + k * f_dim1], &c__1);
+	}
+
+/*           2) Zero out elements above and on the diagonal of the */
+/*              column K in matrix F, i.e elements F(1:K,K). */
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j + k * f_dim1;
+	    f[i__2].r = 0.f, f[i__2].i = 0.f;
+	}
+
+/*         3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    i__3 = k;
+	    q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i;
+	    cgemv_("Conjugate Transpose", &i__1, &i__2, &q__1, &a[i__ + 
+		    a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1]
+		    , &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    cgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/*        =============================================================== */
+
+/*        Update the current I-th row of A: */
+/*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    q__1.r = -1.f, q__1.i = 0.f;
+	    cgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, &
+		    q__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, &
+		    c_b2, &a[i__ + (k + 1) * a_dim1], lda);
+	}
+
+	i__1 = i__ + k * a_dim1;
+	a[i__1].r = aik.r, a[i__1].i = aik.i;
+
+/*        Update the partial column 2-norms for the residual matrix, */
+/*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*        when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = c_abs(&a[i__ + j * a_dim1]) / vn1[j];
+/* Computing MAX */
+		    r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp);
+		    temp = f2cmax(r__1,r__2);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                    At J-index, we have a difficult column for the */
+/*                    update of the 2-norm. Save the index of the previous */
+/*                    difficult column in IWORK(J-1). */
+/*                    NOTE: ILSTCC > 1, threfore we can use IWORK only */
+/*                    with N-1 elements, where the elements are */
+/*                    shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                    Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*        End of while loop. */
+
+    }
+
+/*     Now, afler the loop: */
+/*        Set KB, the number of factorized columns in the block; */
+/*        Set IF, the number of processed rows in the block, which */
+/*                is the same as the number of processed rows in */
+/*                the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and and/or the residual of the right hand sides */
+/*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	q__1.r = -1.f, q__1.i = 0.f;
+	cgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &q__1,
+		 &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2,
+		 &a[if__ + 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC is the index of the last difficult column is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column. */
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicilty for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        SCNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(SLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = scnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of CLAQP3RK */
+
+} /* claqp3rk_ */
+
diff --git a/lapack-netlib/SRC/dgeqp3rk.c b/lapack-netlib/SRC/dgeqp3rk.c
new file mode 100644
index 000000000..17a78dd5a
--- /dev/null
+++ b/lapack-netlib/SRC/dgeqp3rk.c
@@ -0,0 +1,1059 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int dgeqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*kmax, doublereal *abstol, doublereal *reltol, doublereal *a, integer 
+	*lda, integer *k, doublereal *maxc2nrmk, doublereal *relmaxc2nrmk, 
+	integer *jpiv, doublereal *tau, doublereal *work, integer *lwork, 
+	integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    doublereal d__1, d__2;
+
+    /* Local variables */
+    extern /* Subroutine */ int dlaqp2rk_(integer *, integer *, integer *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, doublereal *, integer *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, doublereal *, doublereal *,
+	     doublereal *, integer *), dlaqp3rk_(integer *, integer *, 
+	    integer *, integer *, integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *, logical *, 
+	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+	     integer *, integer *);
+    doublereal maxc2nrm;
+    logical done;
+    integer jmax;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    integer j, jmaxc2nrm, jmaxb, nbmin, iinfo, n_sub__, minmn;
+    doublereal myhugeval;
+    integer jb, nb, kf;
+    extern doublereal dlamch_(char *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    integer nx;
+    doublereal safmin;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern logical disnan_(doublereal *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *, ftnlen, ftnlen);
+    integer kp1, lwkopt;
+    logical lquery;
+    integer jbf;
+    doublereal eps;
+    integer iws, ioffset;
+
+
+/*  -- LAPACK computational routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test input arguments */
+/*     ==================== */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*kmax < 0) {
+	*info = -4;
+    } else if (disnan_(abstol)) {
+	*info = -5;
+    } else if (disnan_(reltol)) {
+	*info = -6;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -8;
+    }
+
+/*     If the input parameters M, N, NRHS, KMAX, LDA are valid: */
+/*       a) Test the input workspace size LWORK for the minimum */
+/*          size requirement IWS. */
+/*       b) Determine the optimal block size NB and optimal */
+/*          workspace size LWKOPT to be returned in WORK(1) */
+/*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., */
+/*          (3) when routine exits. */
+/*     Here, IWS is the miminum workspace required for unblocked */
+/*     code. */
+
+    if (*info == 0) {
+	minmn = f2cmin(*m,*n);
+	if (minmn == 0) {
+	    iws = 1;
+	    lwkopt = 1;
+	} else {
+
+/*           Minimal workspace size in case of using only unblocked */
+/*           BLAS 2 code in DLAQP2RK. */
+/*           1) DGEQP3RK and DLAQP2RK: 2*N to store full and partial */
+/*              column 2-norms. */
+/*           2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in DLARF subroutine inside DLAQP2RK to apply an */
+/*              elementary reflector from the left. */
+/*           TOTAL_WORK_SIZE = 3*N + NRHS - 1 */
+
+	    iws = *n * 3 + *nrhs - 1;
+
+/*           Assign to NB optimal block size. */
+
+	    nb = ilaenv_(&c__1, "DGEQP3RK", " ", m, n, &c_n1, &c_n1, (ftnlen)
+		    8, (ftnlen)1);
+
+/*           A formula for the optimal workspace size in case of using */
+/*           both unblocked BLAS 2 in DLAQP2RK and blocked BLAS 3 code */
+/*           in DLAQP3RK. */
+/*           1) DGEQP3RK, DLAQP2RK, DLAQP3RK: 2*N to store full and */
+/*              partial column 2-norms. */
+/*           2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in DLARF subroutine to apply an elementary reflector */
+/*              from the left. */
+/*           3) DLAQP3RK: NB*(N+NRHS) to use in the work array F that */
+/*              is used to apply a block reflector from */
+/*              the left. */
+/*           4) DLAQP3RK: NB to use in the auxilixary array AUX. */
+/*           Sizes (2) and ((3) + (4)) should intersect, therefore */
+/*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. */
+
+	    lwkopt = (*n << 1) + nb * (*n + *nrhs + 1);
+	}
+	work[1] = (doublereal) lwkopt;
+
+	if (*lwork < iws && ! lquery) {
+	    *info = -15;
+	}
+    }
+
+/*      NOTE: The optimal workspace size is returned in WORK(1), if */
+/*            the input parameters M, N, NRHS, KMAX, LDA are valid. */
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEQP3RK", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible for M=0 or N=0. */
+
+    if (minmn == 0) {
+	*k = 0;
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+	work[1] = (doublereal) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize column pivot array JPIV. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jpiv[j] = j;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize storage for partial and exact column 2-norms. */
+/*     a) The elements WORK(1:N) are used to store partial column */
+/*        2-norms of the matrix A, and may decrease in each computation */
+/*        step; initialize to the values of complete columns 2-norms. */
+/*     b) The elements WORK(N+1:2*N) are used to store complete column */
+/*        2-norms of the matrix A, they are not changed during the */
+/*        computation; initialize the values of complete columns 2-norms. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	work[j] = dnrm2_(m, &a[j * a_dim1 + 1], &c__1);
+	work[*n + j] = work[j];
+    }
+
+/*     ================================================================== */
+
+/*     Compute the pivot column index and the maximum column 2-norm */
+/*     for the whole original matrix stored in A(1:M,1:N). */
+
+    kp1 = idamax_(n, &work[1], &c__1);
+    maxc2nrm = work[kp1];
+
+/*     ==================================================================. */
+
+    if (disnan_(&maxc2nrm)) {
+
+/*        Check if the matrix A contains NaN, set INFO parameter */
+/*        to the column number where the first NaN is found and return */
+/*        from the routine. */
+
+	*k = 0;
+	*info = kp1;
+
+/*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = maxc2nrm;
+
+/*        Array TAU is not set and contains undefined elements. */
+
+	work[1] = (doublereal) lwkopt;
+	return 0;
+    }
+
+/*     =================================================================== */
+
+    if (maxc2nrm == 0.) {
+
+/*        Check is the matrix A is a zero matrix, set array TAU and */
+/*        return from the routine. */
+
+	*k = 0;
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.;
+	}
+
+	work[1] = (doublereal) lwkopt;
+	return 0;
+
+    }
+
+/*     =================================================================== */
+
+    myhugeval = dlamch_("Overflow");
+
+    if (maxc2nrm > myhugeval) {
+
+/*        Check if the matrix A contains +Inf or -Inf, set INFO parameter */
+/*        to the column number, where the first +/-Inf  is found plus N, */
+/*        and continue the computation. */
+
+	*info = *n + kp1;
+
+    }
+
+/*     ================================================================== */
+
+/*     Quick return if possible for the case when the first */
+/*     stopping criterion is satisfied, i.e. KMAX = 0. */
+
+    if (*kmax == 0) {
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.;
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.;
+	}
+	work[1] = (doublereal) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+    eps = dlamch_("Epsilon");
+
+/*     Adjust ABSTOL */
+
+    if (*abstol >= 0.) {
+	safmin = dlamch_("Safe minimum");
+/* Computing MAX */
+	d__1 = *abstol, d__2 = safmin * 2.;
+	*abstol = f2cmax(d__1,d__2);
+    }
+
+/*     Adjust RELTOL */
+
+    if (*reltol >= 0.) {
+	*reltol = f2cmax(*reltol,eps);
+    }
+
+/*     =================================================================== */
+
+/*     JMAX is the maximum index of the column to be factorized, */
+/*     which is also limited by the first stopping criterion KMAX. */
+
+    jmax = f2cmin(*kmax,minmn);
+
+/*     =================================================================== */
+
+/*     Quick return if possible for the case when the second or third */
+/*     stopping criterion for the whole original matrix is satified, */
+/*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */
+/*     (which is ONE <= RELTOL). */
+
+    if (maxc2nrm <= *abstol || 1. <= *reltol) {
+
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.;
+	}
+
+	work[1] = (doublereal) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+/*     Factorize columns */
+/*     ================================================================== */
+
+/*     Determine the block size. */
+
+    nbmin = 2;
+    nx = 0;
+
+    if (nb > 1 && nb < minmn) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+/*        (for N less than NX, unblocked code should be used). */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "DGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/*              Not enough workspace to use optimal block size that */
+/*              is currently stored in NB. */
+/*              Reduce NB and determine the minimum value of NB. */
+
+		nb = (*lwork - (*n << 1)) / (*n + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "DGEQP3RK", " ", m, n, &c_n1, 
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/*     ================================================================== */
+
+/*     DONE is the boolean flag to rerpresent the case when the */
+/*     factorization completed in the block factorization routine, */
+/*     before the end of the block. */
+
+    done = FALSE_;
+
+/*     J is the column index. */
+
+    j = 1;
+
+/*     (1) Use blocked code initially. */
+
+/*     JMAXB is the maximum column index of the block, when the */
+/*     blocked code is used, is also limited by the first stopping */
+/*     criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/*        J   is the column index of a column block; */
+/*        JB  is the column block size to pass to block factorization */
+/*            routine in a loop step; */
+/*        JBF is the number of columns that were actually factorized */
+/*            that was returned by the block factorization routine */
+/*            in a loop step, JBF <= JB; */
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/*           Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    dlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, 
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, 
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &
+		    work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb 
+		    + 1], &i__1, &iwork[1], &iinfo);
+
+/*           Set INFO on the first occurence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/*              Either the submatrix is zero before the end of the */
+/*              column block, or ABSTOL or RELTOL criterion is */
+/*              satisfied before the end of the column block, we can */
+/*              return from the routine. Perform the following before */
+/*              returning: */
+/*                a) Set the number of factorized columns K, */
+/*                   K = IOFFSET + JBF from the last call of blocked */
+/*                   routine. */
+/*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/*                         by the block factorization routine; */
+/*                      2) The remaining TAUs are set to ZERO by the */
+/*                         block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/*              Set INFO on the first occurrence of NaN, NaN takes */
+/*              prcedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/*              Return from the routine. */
+
+		work[1] = (doublereal) lwkopt;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+/*     J = JMAX+1 means we factorized the maximum possible number of */
+/*     columns, that is in ELSE clause we need to compute */
+/*     the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/*     the blocks. */
+
+    if (j <= jmax) {
+
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	dlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, 
+		relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], &
+		work[(*n << 1) + 1], &iinfo);
+
+/*        ABSTOL or RELTOL criterion is satisfied when the number of */
+/*        the factorized columns KF is smaller then the  number */
+/*        of columns JMAX-J+1 supplied to be factorized by the */
+/*        unblocked routine, we can return from */
+/*        the routine. Perform the following before returning: */
+/*           a) Set the number of factorized columns K, */
+/*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/*              unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/*        Set INFO on the first exception occurence. */
+
+/*        Set INFO on the first exception occurence of Inf or NaN, */
+/*        (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/*        Compute the return values for blocked code. */
+
+/*        Set the number of factorized columns if the unblocked routine */
+/*        was not called. */
+
+	*k = jmax;
+
+/*        If there exits a residual matrix after the blocked code: */
+/*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/*              residual matrix, otherwise set them to ZERO; */
+/*           2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + idamax_(&i__1, &work[*k + 1], &c__1);
+	    *maxc2nrmk = work[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		tau[j] = 0.;
+	    }
+
+	}
+
+/*     END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    work[1] = (doublereal) lwkopt;
+
+    return 0;
+
+/*     End of DGEQP3RK */
+
+} /* dgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/dlaqp2rk.c b/lapack-netlib/SRC/dlaqp2rk.c
new file mode 100644
index 000000000..de216ad97
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp2rk.c
@@ -0,0 +1,923 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int dlaqp2rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *kmax, doublereal *abstol, doublereal *reltol, 
+	integer *kp1, doublereal *maxc2nrm, doublereal *a, integer *lda, 
+	integer *k, doublereal *maxc2nrmk, doublereal *relmaxc2nrmk, integer *
+	jpiv, doublereal *tau, doublereal *vn1, doublereal *vn2, doublereal *
+	work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    doublereal d__1, d__2;
+
+    /* Local variables */
+    doublereal aikk, temp;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    doublereal temp2;
+    integer i__, j;
+    doublereal tol3z;
+    integer jmaxc2nrm;
+    extern /* Subroutine */ int dlarf_(char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *);
+    integer itemp;
+    extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, 
+	    doublereal *, integer *);
+    integer minmnfact;
+    doublereal myhugeval;
+    integer minmnupdt, kk;
+    extern doublereal dlamch_(char *);
+    integer kp;
+    extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *,
+	     integer *, doublereal *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT in the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/*     MINMNUPDT is the smallest dimension */
+/*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which */
+/*     contains the submatrices A(IOFFSET+1:M,1:N) and */
+/*     B(IOFFSET+1:M,1:NRHS) as column blocks. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *kmax = f2cmin(*kmax,minmnfact);
+    tol3z = sqrt(dlamch_("Epsilon"));
+    myhugeval = dlamch_("Overflow");
+
+/*     Compute the factorization, KK is the lomn loop index. */
+
+    i__1 = *kmax;
+    for (kk = 1; kk <= i__1; ++kk) {
+
+	i__ = *ioffset + kk;
+
+	if (i__ == 1) {
+
+/*           ============================================================ */
+
+/*           We are at the first column of the original whole matrix A, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+/*           ============================================================ */
+
+	} else {
+
+/*           ============================================================ */
+
+/*           Determine the pivot column in KK-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__2 = *n - kk + 1;
+	    kp = kk - 1 + idamax_(&i__2, &vn1[kk], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,KK:N) in step KK. */
+/*           RELMAXC2NRMK  will be computed later, after somecondition */
+/*           checks on MAXC2NRMK. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains NaN, and set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (disnan_(maxc2nrmk)) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*info = *k + kp;
+
+/*               Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*               Array TAU(K+1:MINMNFACT) is not set and contains */
+/*               undefined elements. */
+
+		return 0;
+	    }
+
+/*           ============================================================ */
+
+/*           Quick return, if the submatrix A(I:M,KK:N) is */
+/*           a zero matrix. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*maxc2nrmk == 0.) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*relmaxc2nrmk = 0.;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    tau[j] = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + kk - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL >= ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+/*              Set K, the number of factorized columns. */
+
+		*k = kk - 1;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    tau[j] = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,KK:N): */
+/*        1) swap the KK-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) copy the KK-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than KK in the next loop step.) */
+/*        3) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A, not the block A(1:M,1:N). */
+
+	if (kp != kk) {
+	    dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1);
+	    vn1[kp] = vn1[kk];
+	    vn2[kp] = vn2[kk];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[kk];
+	    jpiv[kk] = itemp;
+	}
+
+/*        Generate elementary reflector H(KK) using the column A(I:M,KK), */
+/*        if the column has more than one element, otherwise */
+/*        the elementary reflector would be an identity matrix, */
+/*        and TAU(KK) = ZERO. */
+
+	if (i__ < *m) {
+	    i__2 = *m - i__ + 1;
+	    dlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], &
+		    c__1, &tau[kk]);
+	} else {
+	    tau[kk] = 0.;
+	}
+
+/*        Check if TAU(KK) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(KK) for Inf, */
+/*        since DLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by DLARFG can contain Inf, which requires */
+/*        TAU(KK) to contain NaN. Therefore, this case of generating Inf */
+/*        by DLARFG is covered by checking TAU(KK) for NaN. */
+
+	if (disnan_(&tau[kk])) {
+	    *k = kk - 1;
+	    *info = kk;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = tau[kk];
+	    *relmaxc2nrmk = tau[kk];
+
+/*           Array TAU(KK:MINMNFACT) is not set and contains */
+/*           undefined elements, except the first element TAU(KK) = NaN. */
+
+	    return 0;
+	}
+
+/*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */
+/*        ( If M >= N, then at KK = N there is no residual matrix, */
+/*         i.e. no columns of A to update, only columns of B. */
+/*         If M < N, then at KK = M-IOFFSET, I = M and we have a */
+/*         one-row residual matrix in A and the elementary */
+/*         reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */
+/*         is needed for the residual matrix in A and the */
+/*         right-hand-side-matrix in B. */
+/*         Therefore, we update only if */
+/*         KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */
+/*         condition is satisfied, not only KK < N+NRHS ) */
+
+	if (kk < minmnupdt) {
+	    aikk = a[i__ + kk * a_dim1];
+	    a[i__ + kk * a_dim1] = 1.;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n + *nrhs - kk;
+	    dlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[
+		    kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]);
+	    a[i__ + kk * a_dim1] = aikk;
+	}
+
+	if (kk < minmnfact) {
+
+/*           Update the partial column 2-norms for the residual matrix, */
+/*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */
+/*           when KK < f2cmin(M-IOFFSET, N). */
+
+	    i__2 = *n;
+	    for (j = kk + 1; j <= i__2; ++j) {
+		if (vn1[j] != 0.) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+/* Computing 2nd power */
+		    d__2 = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j];
+		    temp = 1. - d__2 * d__2;
+		    temp = f2cmax(temp,0.);
+/* Computing 2nd power */
+		    d__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (d__1 * d__1);
+		    if (temp2 <= tol3z) {
+
+/*                    Compute the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by explicitly computing it, */
+/*                    and store it in both partial 2-norm vector VN1 */
+/*                    and exact column 2-norm vector VN2. */
+
+			i__3 = *m - i__;
+			vn1[j] = dnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], &
+				c__1);
+			vn2[j] = vn1[j];
+
+		    } else {
+
+/*                    Update the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by removing one */
+/*                    element A(I,J) and store it in partial */
+/*                    2-norm vector VN1. */
+
+			vn1[j] *= sqrt(temp);
+
+		    }
+		}
+	    }
+
+	}
+
+/*     End factorization loop */
+
+    }
+
+/*     If we reached this point, all colunms have been factorized, */
+/*     i.e. no condition was triggered to exit the routine. */
+/*     Set the number of factorized columns. */
+
+    *k = *kmax;
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */
+/*     we return. */
+
+    if (*k < minmnfact) {
+
+	i__1 = *n - *k;
+	jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1);
+	*maxc2nrmk = vn1[jmaxc2nrm];
+
+	if (*k == 0) {
+	    *relmaxc2nrmk = 1.;
+	} else {
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+	}
+
+    } else {
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+    }
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, set TAUs corresponding to the columns that were */
+/*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */
+
+    i__1 = minmnfact;
+    for (j = *k + 1; j <= i__1; ++j) {
+	tau[j] = 0.;
+    }
+
+    return 0;
+
+/*     End of DLAQP2RK */
+
+} /* dlaqp2rk_ */
+
diff --git a/lapack-netlib/SRC/dlaqp3rk.c b/lapack-netlib/SRC/dlaqp3rk.c
new file mode 100644
index 000000000..e8c61c257
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp3rk.c
@@ -0,0 +1,1113 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static doublereal c_b7 = -1.;
+static doublereal c_b8 = 1.;
+static doublereal c_b30 = 0.;
+
+/* Subroutine */ int dlaqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *nb, doublereal *abstol, doublereal *reltol, 
+	integer *kp1, doublereal *maxc2nrm, doublereal *a, integer *lda, 
+	logical *done, integer *kb, doublereal *maxc2nrmk, doublereal *
+	relmaxc2nrmk, integer *jpiv, doublereal *tau, doublereal *vn1, 
+	doublereal *vn2, doublereal *auxv, doublereal *f, integer *ldf, 
+	integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2;
+    doublereal d__1, d__2;
+
+    /* Local variables */
+    doublereal temp;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    doublereal temp2;
+    integer i__, j, k;
+    doublereal tol3z;
+    extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *),
+	     dgemv_(char *, integer *, integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
+	    integer *);
+    integer itemp;
+    extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, 
+	    doublereal *, integer *);
+    integer minmnfact;
+    doublereal myhugeval;
+    integer minmnupdt, if__;
+    extern doublereal dlamch_(char *);
+    integer kp;
+    extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *,
+	     integer *, doublereal *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+    integer lsticc;
+    doublereal aik;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --auxv;
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT in the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *nb = f2cmin(*nb,minmnfact);
+    tol3z = sqrt(dlamch_("Epsilon"));
+    myhugeval = dlamch_("Overflow");
+
+/*     Compute factorization in a while loop over NB columns, */
+/*     K is the column index in the block A(1:M,1:N). */
+
+    k = 0;
+    lsticc = 0;
+    *done = FALSE_;
+
+    while(k < *nb && lsticc == 0) {
+	++k;
+	i__ = *ioffset + k;
+
+	if (i__ == 1) {
+
+/*           We are at the first column of the original whole matrix A_orig, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+	} else {
+
+/*           Determine the pivot column in K-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__1 = *n - k + 1;
+	    kp = k - 1 + idamax_(&i__1, &vn1[k], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,K:N) in step K. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains NaN, set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (disnan_(maxc2nrmk)) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*info = *kb + kp;
+
+/*              Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix contains NaN and we stop */
+/*              the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*              undefined elements. */
+
+/*              Return from the routine. */
+
+		return 0;
+	    }
+
+/*           Quick return, if the submatrix A(I:M,K:N) is */
+/*           a zero matrix. We need to check it only if the column index */
+/*           (same as row index) is larger than 1, since the condition */
+/*           for the whole original matrix A_orig is checked in the main */
+/*           routine. */
+
+	    if (*maxc2nrmk == 0.) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*relmaxc2nrmk = 0.;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix is zero and we stop the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */
+/*              which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    tau[j] = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + k - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third tolerance stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig; */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+
+/*              Apply the block reflector to the residual of the */
+/*              matrix A and the residual of the right hand sides B, if */
+/*              the residual matrix and and/or the residual of the right */
+/*              hand sides exist,  i.e. if the submatrix */
+/*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when */
+/*                 KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+		if (*kb < minmnupdt) {
+		    i__1 = *m - if__;
+		    i__2 = *n + *nrhs - *kb;
+		    dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */
+/*              which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    tau[j] = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,K:N): */
+/*        1) swap the K-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */
+/*        3) copy the K-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than K in the next loop step.) */
+/*        4) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A_orig, not the block A(1:M,1:N). */
+
+	if (kp != k) {
+	    dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	    i__1 = k - 1;
+	    dswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf);
+	    vn1[kp] = vn1[k];
+	    vn2[kp] = vn2[k];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[k];
+	    jpiv[k] = itemp;
+	}
+
+/*        Apply previous Householder reflectors to column K: */
+/*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    dgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda,
+		     &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1);
+	}
+
+/*        Generate elementary reflector H(k) using the column A(I:M,K). */
+
+	if (i__ < *m) {
+	    i__1 = *m - i__ + 1;
+	    dlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], &
+		    c__1, &tau[k]);
+	} else {
+	    tau[k] = 0.;
+	}
+
+/*        Check if TAU(K) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(K) for Inf, */
+/*        since DLARFG cannot produce TAU(K) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by DLARFG can contain Inf, which requires */
+/*        TAU(K) to contain NaN. Therefore, this case of generating Inf */
+/*        by DLARFG is covered by checking TAU(K) for NaN. */
+
+	if (disnan_(&tau[k])) {
+
+	    *done = TRUE_;
+
+/*           Set KB, the number of factorized partial columns */
+/*                   that are non-zero in each step in the block, */
+/*                   i.e. the rank of the factor R. */
+/*           Set IF, the number of processed rows in the block, which */
+/*                   is the same as the number of processed rows in */
+/*                   the original whole matrix A_orig. */
+
+	    *kb = k - 1;
+	    if__ = i__ - 1;
+	    *info = k;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = tau[k];
+	    *relmaxc2nrmk = tau[k];
+
+/*           There is no need to apply the block reflector to the */
+/*           residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*           since the submatrix contains NaN and we stop */
+/*           the computation. */
+/*           But, we need to apply the block reflector to the residual */
+/*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*           residual right hand sides exist.  This occurs */
+/*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+
+	    if (*nrhs > 0 && *kb < *m - *ioffset) {
+		i__1 = *m - if__;
+		dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, &
+			a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, &
+			c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda);
+	    }
+
+/*           There is no need to recompute the 2-norm of the */
+/*           difficult columns, since we stop the factorization. */
+
+/*           Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*           undefined elements. */
+
+/*           Return from the routine. */
+
+	    return 0;
+	}
+
+/*        =============================================================== */
+
+	aik = a[i__ + k * a_dim1];
+	a[i__ + k * a_dim1] = 1.;
+
+/*        =============================================================== */
+
+/*        Compute the current K-th column of F: */
+/*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = *n + *nrhs - k;
+	    dgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * 
+		    a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + 
+		    1 + k * f_dim1], &c__1);
+	}
+
+/*           2) Zero out elements above and on the diagonal of the */
+/*              column K in matrix F, i.e elements F(1:K,K). */
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    f[j + k * f_dim1] = 0.;
+	}
+
+/*         3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    d__1 = -tau[k];
+	    dgemv_("Transpose", &i__1, &i__2, &d__1, &a[i__ + a_dim1], lda, &
+		    a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    dgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/*        =============================================================== */
+
+/*        Update the current I-th row of A: */
+/*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    dgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, 
+		    &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], 
+		    lda);
+	}
+
+	a[i__ + k * a_dim1] = aik;
+
+/*        Update the partial column 2-norms for the residual matrix, */
+/*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*        when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j];
+/* Computing MAX */
+		    d__1 = 0., d__2 = (temp + 1.) * (1. - temp);
+		    temp = f2cmax(d__1,d__2);
+/* Computing 2nd power */
+		    d__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (d__1 * d__1);
+		    if (temp2 <= tol3z) {
+
+/*                    At J-index, we have a difficult column for the */
+/*                    update of the 2-norm. Save the index of the previous */
+/*                    difficult column in IWORK(J-1). */
+/*                    NOTE: ILSTCC > 1, threfore we can use IWORK only */
+/*                    with N-1 elements, where the elements are */
+/*                    shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                    Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*        End of while loop. */
+
+    }
+
+/*     Now, afler the loop: */
+/*        Set KB, the number of factorized columns in the block; */
+/*        Set IF, the number of processed rows in the block, which */
+/*                is the same as the number of processed rows in */
+/*                the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and and/or the residual of the right hand sides */
+/*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ 
+		+ 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ 
+		+ 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC is the index of the last difficult column is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column. */
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicilty for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        DNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(DLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = dnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of DLAQP3RK */
+
+} /* dlaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/sgeqp3rk.c b/lapack-netlib/SRC/sgeqp3rk.c
new file mode 100644
index 000000000..fe52901bf
--- /dev/null
+++ b/lapack-netlib/SRC/sgeqp3rk.c
@@ -0,0 +1,1055 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int sgeqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*kmax, real *abstol, real *reltol, real *a, integer *lda, integer *k, 
+	real *maxc2nrmk, real *relmaxc2nrmk, integer *jpiv, real *tau, real *
+	work, integer *lwork, integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    real r__1, r__2;
+
+    /* Local variables */
+    real maxc2nrm;
+    extern /* Subroutine */ int slaqp2rk_(integer *, integer *, integer *, 
+	    integer *, integer *, real *, real *, integer *, real *, real *, 
+	    integer *, integer *, real *, real *, integer *, real *, real *, 
+	    real *, real *, integer *), slaqp3rk_(integer *, integer *, 
+	    integer *, integer *, integer *, real *, real *, integer *, real *
+	    , real *, integer *, logical *, integer *, real *, real *, 
+	    integer *, real *, real *, real *, real *, real *, integer *, 
+	    integer *, integer *);
+    logical done;
+    integer jmax;
+    extern real snrm2_(integer *, real *, integer *);
+    integer j, jmaxc2nrm, jmaxb, nbmin, iinfo, n_sub__, minmn;
+    real myhugeval;
+    integer jb, nb, kf, nx;
+    extern real slamch_(char *);
+    real safmin;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *, ftnlen, ftnlen), isamax_(integer *, real *, 
+	    integer *);
+    extern logical sisnan_(real *);
+    integer kp1, lwkopt;
+    logical lquery;
+    integer jbf;
+    real eps;
+    integer iws, ioffset;
+
+
+/*  -- LAPACK computational routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test input arguments */
+/*     ==================== */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*kmax < 0) {
+	*info = -4;
+    } else if (sisnan_(abstol)) {
+	*info = -5;
+    } else if (sisnan_(reltol)) {
+	*info = -6;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -8;
+    }
+
+/*     If the input parameters M, N, NRHS, KMAX, LDA are valid: */
+/*       a) Test the input workspace size LWORK for the minimum */
+/*          size requirement IWS. */
+/*       b) Determine the optimal block size NB and optimal */
+/*          workspace size LWKOPT to be returned in WORK(1) */
+/*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., */
+/*          (3) when routine exits. */
+/*     Here, IWS is the miminum workspace required for unblocked */
+/*     code. */
+
+    if (*info == 0) {
+	minmn = f2cmin(*m,*n);
+	if (minmn == 0) {
+	    iws = 1;
+	    lwkopt = 1;
+	} else {
+
+/*           Minimal workspace size in case of using only unblocked */
+/*           BLAS 2 code in SLAQP2RK. */
+/*           1) SGEQP3RK and SLAQP2RK: 2*N to store full and partial */
+/*              column 2-norms. */
+/*           2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in SLARF subroutine inside SLAQP2RK to apply an */
+/*              elementary reflector from the left. */
+/*           TOTAL_WORK_SIZE = 3*N + NRHS - 1 */
+
+	    iws = *n * 3 + *nrhs - 1;
+
+/*           Assign to NB optimal block size. */
+
+	    nb = ilaenv_(&c__1, "SGEQP3RK", " ", m, n, &c_n1, &c_n1, (ftnlen)
+		    8, (ftnlen)1);
+
+/*           A formula for the optimal workspace size in case of using */
+/*           both unblocked BLAS 2 in SLAQP2RK and blocked BLAS 3 code */
+/*           in SLAQP3RK. */
+/*           1) SGEQP3RK, SLAQP2RK, SLAQP3RK: 2*N to store full and */
+/*              partial column 2-norms. */
+/*           2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in SLARF subroutine to apply an elementary reflector */
+/*              from the left. */
+/*           3) SLAQP3RK: NB*(N+NRHS) to use in the work array F that */
+/*              is used to apply a block reflector from */
+/*              the left. */
+/*           4) SLAQP3RK: NB to use in the auxilixary array AUX. */
+/*           Sizes (2) and ((3) + (4)) should intersect, therefore */
+/*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. */
+
+	    lwkopt = (*n << 1) + nb * (*n + *nrhs + 1);
+	}
+	work[1] = (real) lwkopt;
+
+	if (*lwork < iws && ! lquery) {
+	    *info = -15;
+	}
+    }
+
+/*      NOTE: The optimal workspace size is returned in WORK(1), if */
+/*            the input parameters M, N, NRHS, KMAX, LDA are valid. */
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGEQP3RK", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible for M=0 or N=0. */
+
+    if (minmn == 0) {
+	*k = 0;
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+	work[1] = (real) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize column pivot array JPIV. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jpiv[j] = j;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize storage for partial and exact column 2-norms. */
+/*     a) The elements WORK(1:N) are used to store partial column */
+/*        2-norms of the matrix A, and may decrease in each computation */
+/*        step; initialize to the values of complete columns 2-norms. */
+/*     b) The elements WORK(N+1:2*N) are used to store complete column */
+/*        2-norms of the matrix A, they are not changed during the */
+/*        computation; initialize the values of complete columns 2-norms. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	work[j] = snrm2_(m, &a[j * a_dim1 + 1], &c__1);
+	work[*n + j] = work[j];
+    }
+
+/*     ================================================================== */
+
+/*     Compute the pivot column index and the maximum column 2-norm */
+/*     for the whole original matrix stored in A(1:M,1:N). */
+
+    kp1 = isamax_(n, &work[1], &c__1);
+    maxc2nrm = work[kp1];
+
+/*     ==================================================================. */
+
+    if (sisnan_(&maxc2nrm)) {
+
+/*        Check if the matrix A contains NaN, set INFO parameter */
+/*        to the column number where the first NaN is found and return */
+/*        from the routine. */
+
+	*k = 0;
+	*info = kp1;
+
+/*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = maxc2nrm;
+
+/*        Array TAU is not set and contains undefined elements. */
+
+	work[1] = (real) lwkopt;
+	return 0;
+    }
+
+/*     =================================================================== */
+
+    if (maxc2nrm == 0.f) {
+
+/*        Check is the matrix A is a zero matrix, set array TAU and */
+/*        return from the routine. */
+
+	*k = 0;
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.f;
+	}
+
+	work[1] = (real) lwkopt;
+	return 0;
+
+    }
+
+/*     =================================================================== */
+
+    myhugeval = slamch_("Overflow");
+
+    if (maxc2nrm > myhugeval) {
+
+/*        Check if the matrix A contains +Inf or -Inf, set INFO parameter */
+/*        to the column number, where the first +/-Inf  is found plus N, */
+/*        and continue the computation. */
+
+	*info = *n + kp1;
+
+    }
+
+/*     ================================================================== */
+
+/*     Quick return if possible for the case when the first */
+/*     stopping criterion is satisfied, i.e. KMAX = 0. */
+
+    if (*kmax == 0) {
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.f;
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.f;
+	}
+	work[1] = (real) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+    eps = slamch_("Epsilon");
+
+/*     Adjust ABSTOL */
+
+    if (*abstol >= 0.f) {
+	safmin = slamch_("Safe minimum");
+/* Computing MAX */
+	r__1 = *abstol, r__2 = safmin * 2.f;
+	*abstol = f2cmax(r__1,r__2);
+    }
+
+/*     Adjust RELTOL */
+
+    if (*reltol >= 0.f) {
+	*reltol = f2cmax(*reltol,eps);
+    }
+
+/*     =================================================================== */
+
+/*     JMAX is the maximum index of the column to be factorized, */
+/*     which is also limited by the first stopping criterion KMAX. */
+
+    jmax = f2cmin(*kmax,minmn);
+
+/*     =================================================================== */
+
+/*     Quick return if possible for the case when the second or third */
+/*     stopping criterion for the whole original matrix is satified, */
+/*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */
+/*     (which is ONE <= RELTOL). */
+
+    if (maxc2nrm <= *abstol || 1.f <= *reltol) {
+
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.f;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.f;
+	}
+
+	work[1] = (real) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+/*     Factorize columns */
+/*     ================================================================== */
+
+/*     Determine the block size. */
+
+    nbmin = 2;
+    nx = 0;
+
+    if (nb > 1 && nb < minmn) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+/*        (for N less than NX, unblocked code should be used). */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/*              Not enough workspace to use optimal block size that */
+/*              is currently stored in NB. */
+/*              Reduce NB and determine the minimum value of NB. */
+
+		nb = (*lwork - (*n << 1)) / (*n + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQP3RK", " ", m, n, &c_n1, 
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/*     ================================================================== */
+
+/*     DONE is the boolean flag to rerpresent the case when the */
+/*     factorization completed in the block factorization routine, */
+/*     before the end of the block. */
+
+    done = FALSE_;
+
+/*     J is the column index. */
+
+    j = 1;
+
+/*     (1) Use blocked code initially. */
+
+/*     JMAXB is the maximum column index of the block, when the */
+/*     blocked code is used, is also limited by the first stopping */
+/*     criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/*        J   is the column index of a column block; */
+/*        JB  is the column block size to pass to block factorization */
+/*            routine in a loop step; */
+/*        JBF is the number of columns that were actually factorized */
+/*            that was returned by the block factorization routine */
+/*            in a loop step, JBF <= JB; */
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/*           Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    slaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, 
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, 
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &
+		    work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb 
+		    + 1], &i__1, &iwork[1], &iinfo);
+
+/*           Set INFO on the first occurence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/*              Either the submatrix is zero before the end of the */
+/*              column block, or ABSTOL or RELTOL criterion is */
+/*              satisfied before the end of the column block, we can */
+/*              return from the routine. Perform the following before */
+/*              returning: */
+/*                a) Set the number of factorized columns K, */
+/*                   K = IOFFSET + JBF from the last call of blocked */
+/*                   routine. */
+/*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/*                         by the block factorization routine; */
+/*                      2) The remaining TAUs are set to ZERO by the */
+/*                         block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/*              Set INFO on the first occurrence of NaN, NaN takes */
+/*              prcedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/*              Return from the routine. */
+
+		work[1] = (real) lwkopt;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+/*     J = JMAX+1 means we factorized the maximum possible number of */
+/*     columns, that is in ELSE clause we need to compute */
+/*     the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/*     the blocks. */
+
+    if (j <= jmax) {
+
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	slaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, 
+		relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], &
+		work[(*n << 1) + 1], &iinfo);
+
+/*        ABSTOL or RELTOL criterion is satisfied when the number of */
+/*        the factorized columns KF is smaller then the  number */
+/*        of columns JMAX-J+1 supplied to be factorized by the */
+/*        unblocked routine, we can return from */
+/*        the routine. Perform the following before returning: */
+/*           a) Set the number of factorized columns K, */
+/*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/*              unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/*        Set INFO on the first exception occurence. */
+
+/*        Set INFO on the first exception occurence of Inf or NaN, */
+/*        (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/*        Compute the return values for blocked code. */
+
+/*        Set the number of factorized columns if the unblocked routine */
+/*        was not called. */
+
+	*k = jmax;
+
+/*        If there exits a residual matrix after the blocked code: */
+/*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/*              residual matrix, otherwise set them to ZERO; */
+/*           2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + isamax_(&i__1, &work[*k + 1], &c__1);
+	    *maxc2nrmk = work[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.f;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		tau[j] = 0.f;
+	    }
+
+	}
+
+/*     END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    work[1] = (real) lwkopt;
+
+    return 0;
+
+/*     End of SGEQP3RK */
+
+} /* sgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/slaqp2rk.c b/lapack-netlib/SRC/slaqp2rk.c
new file mode 100644
index 000000000..0bfa71ab9
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp2rk.c
@@ -0,0 +1,918 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int slaqp2rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *kmax, real *abstol, real *reltol, integer *kp1, 
+	real *maxc2nrm, real *a, integer *lda, integer *k, real *maxc2nrmk, 
+	real *relmaxc2nrmk, integer *jpiv, real *tau, real *vn1, real *vn2, 
+	real *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    real r__1, r__2;
+
+    /* Local variables */
+    real aikk, temp, temp2;
+    extern real snrm2_(integer *, real *, integer *);
+    integer i__, j;
+    real tol3z;
+    integer jmaxc2nrm;
+    extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, 
+	    integer *, real *, real *, integer *, real *);
+    integer itemp, minmnfact;
+    extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, 
+	    integer *);
+    real myhugeval;
+    integer minmnupdt, kk, kp;
+    extern real slamch_(char *);
+    extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, 
+	    real *);
+    extern integer isamax_(integer *, real *, integer *);
+    extern logical sisnan_(real *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT in the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/*     MINMNUPDT is the smallest dimension */
+/*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which */
+/*     contains the submatrices A(IOFFSET+1:M,1:N) and */
+/*     B(IOFFSET+1:M,1:NRHS) as column blocks. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *kmax = f2cmin(*kmax,minmnfact);
+    tol3z = sqrt(slamch_("Epsilon"));
+    myhugeval = slamch_("Overflow");
+
+/*     Compute the factorization, KK is the lomn loop index. */
+
+    i__1 = *kmax;
+    for (kk = 1; kk <= i__1; ++kk) {
+
+	i__ = *ioffset + kk;
+
+	if (i__ == 1) {
+
+/*           ============================================================ */
+
+/*           We are at the first column of the original whole matrix A, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+/*           ============================================================ */
+
+	} else {
+
+/*           ============================================================ */
+
+/*           Determine the pivot column in KK-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__2 = *n - kk + 1;
+	    kp = kk - 1 + isamax_(&i__2, &vn1[kk], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,KK:N) in step KK. */
+/*           RELMAXC2NRMK  will be computed later, after somecondition */
+/*           checks on MAXC2NRMK. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains NaN, and set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (sisnan_(maxc2nrmk)) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*info = *k + kp;
+
+/*               Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*               Array TAU(K+1:MINMNFACT) is not set and contains */
+/*               undefined elements. */
+
+		return 0;
+	    }
+
+/*           ============================================================ */
+
+/*           Quick return, if the submatrix A(I:M,KK:N) is */
+/*           a zero matrix. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*maxc2nrmk == 0.f) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*relmaxc2nrmk = 0.f;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    tau[j] = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + kk - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL >= ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+/*              Set K, the number of factorized columns. */
+
+		*k = kk - 1;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    tau[j] = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,KK:N): */
+/*        1) swap the KK-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) copy the KK-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than KK in the next loop step.) */
+/*        3) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A, not the block A(1:M,1:N). */
+
+	if (kp != kk) {
+	    sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1);
+	    vn1[kp] = vn1[kk];
+	    vn2[kp] = vn2[kk];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[kk];
+	    jpiv[kk] = itemp;
+	}
+
+/*        Generate elementary reflector H(KK) using the column A(I:M,KK), */
+/*        if the column has more than one element, otherwise */
+/*        the elementary reflector would be an identity matrix, */
+/*        and TAU(KK) = ZERO. */
+
+	if (i__ < *m) {
+	    i__2 = *m - i__ + 1;
+	    slarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], &
+		    c__1, &tau[kk]);
+	} else {
+	    tau[kk] = 0.f;
+	}
+
+/*        Check if TAU(KK) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(KK) for Inf, */
+/*        since SLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by SLARFG can contain Inf, which requires */
+/*        TAU(KK) to contain NaN. Therefore, this case of generating Inf */
+/*        by SLARFG is covered by checking TAU(KK) for NaN. */
+
+	if (sisnan_(&tau[kk])) {
+	    *k = kk - 1;
+	    *info = kk;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = tau[kk];
+	    *relmaxc2nrmk = tau[kk];
+
+/*           Array TAU(KK:MINMNFACT) is not set and contains */
+/*           undefined elements, except the first element TAU(KK) = NaN. */
+
+	    return 0;
+	}
+
+/*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */
+/*        ( If M >= N, then at KK = N there is no residual matrix, */
+/*         i.e. no columns of A to update, only columns of B. */
+/*         If M < N, then at KK = M-IOFFSET, I = M and we have a */
+/*         one-row residual matrix in A and the elementary */
+/*         reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */
+/*         is needed for the residual matrix in A and the */
+/*         right-hand-side-matrix in B. */
+/*         Therefore, we update only if */
+/*         KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */
+/*         condition is satisfied, not only KK < N+NRHS ) */
+
+	if (kk < minmnupdt) {
+	    aikk = a[i__ + kk * a_dim1];
+	    a[i__ + kk * a_dim1] = 1.f;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n + *nrhs - kk;
+	    slarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[
+		    kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]);
+	    a[i__ + kk * a_dim1] = aikk;
+	}
+
+	if (kk < minmnfact) {
+
+/*           Update the partial column 2-norms for the residual matrix, */
+/*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */
+/*           when KK < f2cmin(M-IOFFSET, N). */
+
+	    i__2 = *n;
+	    for (j = kk + 1; j <= i__2; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+/* Computing 2nd power */
+		    r__2 = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j];
+		    temp = 1.f - r__2 * r__2;
+		    temp = f2cmax(temp,0.f);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                    Compute the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by explicitly computing it, */
+/*                    and store it in both partial 2-norm vector VN1 */
+/*                    and exact column 2-norm vector VN2. */
+
+			i__3 = *m - i__;
+			vn1[j] = snrm2_(&i__3, &a[i__ + 1 + j * a_dim1], &
+				c__1);
+			vn2[j] = vn1[j];
+
+		    } else {
+
+/*                    Update the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by removing one */
+/*                    element A(I,J) and store it in partial */
+/*                    2-norm vector VN1. */
+
+			vn1[j] *= sqrt(temp);
+
+		    }
+		}
+	    }
+
+	}
+
+/*     End factorization loop */
+
+    }
+
+/*     If we reached this point, all colunms have been factorized, */
+/*     i.e. no condition was triggered to exit the routine. */
+/*     Set the number of factorized columns. */
+
+    *k = *kmax;
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */
+/*     we return. */
+
+    if (*k < minmnfact) {
+
+	i__1 = *n - *k;
+	jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1);
+	*maxc2nrmk = vn1[jmaxc2nrm];
+
+	if (*k == 0) {
+	    *relmaxc2nrmk = 1.f;
+	} else {
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+	}
+
+    } else {
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+    }
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, set TAUs corresponding to the columns that were */
+/*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */
+
+    i__1 = minmnfact;
+    for (j = *k + 1; j <= i__1; ++j) {
+	tau[j] = 0.f;
+    }
+
+    return 0;
+
+/*     End of SLAQP2RK */
+
+} /* slaqp2rk_ */
+
diff --git a/lapack-netlib/SRC/slaqp3rk.c b/lapack-netlib/SRC/slaqp3rk.c
new file mode 100644
index 000000000..e3632538b
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp3rk.c
@@ -0,0 +1,1109 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static real c_b7 = -1.f;
+static real c_b8 = 1.f;
+static real c_b30 = 0.f;
+
+/* Subroutine */ int slaqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *nb, real *abstol, real *reltol, integer *kp1, real 
+	*maxc2nrm, real *a, integer *lda, logical *done, integer *kb, real *
+	maxc2nrmk, real *relmaxc2nrmk, integer *jpiv, real *tau, real *vn1, 
+	real *vn2, real *auxv, real *f, integer *ldf, integer *iwork, integer 
+	*info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2;
+    real r__1, r__2;
+
+    /* Local variables */
+    real temp, temp2;
+    extern real snrm2_(integer *, real *, integer *);
+    integer i__, j, k;
+    real tol3z;
+    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, 
+	    integer *, real *, real *, integer *, real *, integer *, real *, 
+	    real *, integer *);
+    integer itemp;
+    extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, 
+	    real *, integer *, real *, integer *, real *, real *, integer *);
+    integer minmnfact;
+    extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, 
+	    integer *);
+    real myhugeval;
+    integer minmnupdt, if__, kp;
+    extern real slamch_(char *);
+    extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, 
+	    real *);
+    integer lsticc;
+    extern integer isamax_(integer *, real *, integer *);
+    extern logical sisnan_(real *);
+    real aik;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --auxv;
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT in the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *nb = f2cmin(*nb,minmnfact);
+    tol3z = sqrt(slamch_("Epsilon"));
+    myhugeval = slamch_("Overflow");
+
+/*     Compute factorization in a while loop over NB columns, */
+/*     K is the column index in the block A(1:M,1:N). */
+
+    k = 0;
+    lsticc = 0;
+    *done = FALSE_;
+
+    while(k < *nb && lsticc == 0) {
+	++k;
+	i__ = *ioffset + k;
+
+	if (i__ == 1) {
+
+/*           We are at the first column of the original whole matrix A_orig, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+	} else {
+
+/*           Determine the pivot column in K-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__1 = *n - k + 1;
+	    kp = k - 1 + isamax_(&i__1, &vn1[k], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,K:N) in step K. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains NaN, set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (sisnan_(maxc2nrmk)) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*info = *kb + kp;
+
+/*              Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix contains NaN and we stop */
+/*              the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*              undefined elements. */
+
+/*              Return from the routine. */
+
+		return 0;
+	    }
+
+/*           Quick return, if the submatrix A(I:M,K:N) is */
+/*           a zero matrix. We need to check it only if the column index */
+/*           (same as row index) is larger than 1, since the condition */
+/*           for the whole original matrix A_orig is checked in the main */
+/*           routine. */
+
+	    if (*maxc2nrmk == 0.f) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*relmaxc2nrmk = 0.f;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix is zero and we stop the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */
+/*              which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    tau[j] = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + k - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third tolerance stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig; */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+
+/*              Apply the block reflector to the residual of the */
+/*              matrix A and the residual of the right hand sides B, if */
+/*              the residual matrix and and/or the residual of the right */
+/*              hand sides exist,  i.e. if the submatrix */
+/*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when */
+/*                 KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+		if (*kb < minmnupdt) {
+		    i__1 = *m - if__;
+		    i__2 = *n + *nrhs - *kb;
+		    sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */
+/*              which is equivalent to seting TAU(K:MINMNFACT) = ZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    tau[j] = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,K:N): */
+/*        1) swap the K-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */
+/*        3) copy the K-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than K in the next loop step.) */
+/*        4) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A_orig, not the block A(1:M,1:N). */
+
+	if (kp != k) {
+	    sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	    i__1 = k - 1;
+	    sswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf);
+	    vn1[kp] = vn1[k];
+	    vn2[kp] = vn2[k];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[k];
+	    jpiv[k] = itemp;
+	}
+
+/*        Apply previous Householder reflectors to column K: */
+/*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    sgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda,
+		     &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1);
+	}
+
+/*        Generate elementary reflector H(k) using the column A(I:M,K). */
+
+	if (i__ < *m) {
+	    i__1 = *m - i__ + 1;
+	    slarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], &
+		    c__1, &tau[k]);
+	} else {
+	    tau[k] = 0.f;
+	}
+
+/*        Check if TAU(K) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(K) for Inf, */
+/*        since SLARFG cannot produce TAU(K) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by SLARFG can contain Inf, which requires */
+/*        TAU(K) to contain NaN. Therefore, this case of generating Inf */
+/*        by SLARFG is covered by checking TAU(K) for NaN. */
+
+	if (sisnan_(&tau[k])) {
+
+	    *done = TRUE_;
+
+/*           Set KB, the number of factorized partial columns */
+/*                   that are non-zero in each step in the block, */
+/*                   i.e. the rank of the factor R. */
+/*           Set IF, the number of processed rows in the block, which */
+/*                   is the same as the number of processed rows in */
+/*                   the original whole matrix A_orig. */
+
+	    *kb = k - 1;
+	    if__ = i__ - 1;
+	    *info = k;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = tau[k];
+	    *relmaxc2nrmk = tau[k];
+
+/*           There is no need to apply the block reflector to the */
+/*           residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*           since the submatrix contains NaN and we stop */
+/*           the computation. */
+/*           But, we need to apply the block reflector to the residual */
+/*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*           residual right hand sides exist.  This occurs */
+/*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+
+	    if (*nrhs > 0 && *kb < *m - *ioffset) {
+		i__1 = *m - if__;
+		sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, &
+			a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, &
+			c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda);
+	    }
+
+/*           There is no need to recompute the 2-norm of the */
+/*           difficult columns, since we stop the factorization. */
+
+/*           Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*           undefined elements. */
+
+/*           Return from the routine. */
+
+	    return 0;
+	}
+
+/*        =============================================================== */
+
+	aik = a[i__ + k * a_dim1];
+	a[i__ + k * a_dim1] = 1.f;
+
+/*        =============================================================== */
+
+/*        Compute the current K-th column of F: */
+/*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = *n + *nrhs - k;
+	    sgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * 
+		    a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + 
+		    1 + k * f_dim1], &c__1);
+	}
+
+/*           2) Zero out elements above and on the diagonal of the */
+/*              column K in matrix F, i.e elements F(1:K,K). */
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    f[j + k * f_dim1] = 0.f;
+	}
+
+/*         3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    r__1 = -tau[k];
+	    sgemv_("Transpose", &i__1, &i__2, &r__1, &a[i__ + a_dim1], lda, &
+		    a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    sgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/*        =============================================================== */
+
+/*        Update the current I-th row of A: */
+/*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    sgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, 
+		    &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], 
+		    lda);
+	}
+
+	a[i__ + k * a_dim1] = aik;
+
+/*        Update the partial column 2-norms for the residual matrix, */
+/*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*        when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j];
+/* Computing MAX */
+		    r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp);
+		    temp = f2cmax(r__1,r__2);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                    At J-index, we have a difficult column for the */
+/*                    update of the 2-norm. Save the index of the previous */
+/*                    difficult column in IWORK(J-1). */
+/*                    NOTE: ILSTCC > 1, threfore we can use IWORK only */
+/*                    with N-1 elements, where the elements are */
+/*                    shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                    Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*        End of while loop. */
+
+    }
+
+/*     Now, afler the loop: */
+/*        Set KB, the number of factorized columns in the block; */
+/*        Set IF, the number of processed rows in the block, which */
+/*                is the same as the number of processed rows in */
+/*                the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and and/or the residual of the right hand sides */
+/*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ 
+		+ 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ 
+		+ 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC is the index of the last difficult column is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column. */
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicilty for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        SNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(SLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = snrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of SLAQP3RK */
+
+} /* slaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/zgeqp3rk.c b/lapack-netlib/SRC/zgeqp3rk.c
new file mode 100644
index 000000000..0c8b41c2d
--- /dev/null
+++ b/lapack-netlib/SRC/zgeqp3rk.c
@@ -0,0 +1,1074 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int zgeqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*kmax, doublereal *abstol, doublereal *reltol, doublecomplex *a, 
+	integer *lda, integer *k, doublereal *maxc2nrmk, doublereal *
+	relmaxc2nrmk, integer *jpiv, doublecomplex *tau, doublecomplex *work, 
+	integer *lwork, doublereal *rwork, integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    doublereal d__1, d__2;
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublereal maxc2nrm;
+    logical done;
+    extern /* Subroutine */ int zlaqp2rk_(integer *, integer *, integer *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, doublecomplex *, integer *, integer *, doublereal *,
+	     doublereal *, integer *, doublecomplex *, doublereal *, 
+	    doublereal *, doublecomplex *, integer *), zlaqp3rk_(integer *, 
+	    integer *, integer *, integer *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, doublecomplex *, integer *,
+	     logical *, integer *, doublereal *, doublereal *, integer *, 
+	    doublecomplex *, doublereal *, doublereal *, doublecomplex *, 
+	    doublecomplex *, integer *, integer *, integer *);
+    integer jmax, j, jmaxc2nrm, jmaxb, nbmin, iinfo, n_sub__, minmn;
+    doublereal myhugeval;
+    integer jb;
+    extern doublereal dznrm2_(integer *, doublecomplex *, integer *);
+    integer nb, kf;
+    extern doublereal dlamch_(char *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    integer nx;
+    doublereal safmin;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern logical disnan_(doublereal *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *, ftnlen, ftnlen);
+    integer kp1, lwkopt;
+    logical lquery;
+    integer jbf;
+    doublereal eps;
+    integer iws, ioffset;
+
+
+/*  -- LAPACK computational routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test input arguments */
+/*     ==================== */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --work;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*kmax < 0) {
+	*info = -4;
+    } else if (disnan_(abstol)) {
+	*info = -5;
+    } else if (disnan_(reltol)) {
+	*info = -6;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -8;
+    }
+
+/*     If the input parameters M, N, NRHS, KMAX, LDA are valid: */
+/*       a) Test the input workspace size LWORK for the minimum */
+/*          size requirement IWS. */
+/*       b) Determine the optimal block size NB and optimal */
+/*          workspace size LWKOPT to be returned in WORK(1) */
+/*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., */
+/*          (3) when routine exits. */
+/*     Here, IWS is the miminum workspace required for unblocked */
+/*     code. */
+
+    if (*info == 0) {
+	minmn = f2cmin(*m,*n);
+	if (minmn == 0) {
+	    iws = 1;
+	    lwkopt = 1;
+	} else {
+
+/*           Minimal workspace size in case of using only unblocked */
+/*           BLAS 2 code in ZLAQP2RK. */
+/*           1) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in ZLARF subroutine inside ZLAQP2RK to apply an */
+/*              elementary reflector from the left. */
+/*           TOTAL_WORK_SIZE = 3*N + NRHS - 1 */
+
+	    iws = *n + *nrhs - 1;
+
+/*           Assign to NB optimal block size. */
+
+	    nb = ilaenv_(&c__1, "ZGEQP3RK", " ", m, n, &c_n1, &c_n1, (ftnlen)
+		    8, (ftnlen)1);
+
+/*           A formula for the optimal workspace size in case of using */
+/*           both unblocked BLAS 2 in ZLAQP2RK and blocked BLAS 3 code */
+/*           in ZLAQP3RK. */
+/*           1) ZGEQP3RK, ZLAQP2RK, ZLAQP3RK: 2*N to store full and */
+/*              partial column 2-norms. */
+/*           2) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in ZLARF subroutine to apply an elementary reflector */
+/*              from the left. */
+/*           3) ZLAQP3RK: NB*(N+NRHS) to use in the work array F that */
+/*              is used to apply a block reflector from */
+/*              the left. */
+/*           4) ZLAQP3RK: NB to use in the auxilixary array AUX. */
+/*           Sizes (2) and ((3) + (4)) should intersect, therefore */
+/*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. */
+
+	    lwkopt = (*n << 1) + nb * (*n + *nrhs + 1);
+	}
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+
+	if (*lwork < iws && ! lquery) {
+	    *info = -15;
+	}
+    }
+
+/*      NOTE: The optimal workspace size is returned in WORK(1), if */
+/*            the input parameters M, N, NRHS, KMAX, LDA are valid. */
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("ZGEQP3RK", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible for M=0 or N=0. */
+
+    if (minmn == 0) {
+	*k = 0;
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize column pivot array JPIV. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jpiv[j] = j;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize storage for partial and exact column 2-norms. */
+/*     a) The elements WORK(1:N) are used to store partial column */
+/*        2-norms of the matrix A, and may decrease in each computation */
+/*        step; initialize to the values of complete columns 2-norms. */
+/*     b) The elements WORK(N+1:2*N) are used to store complete column */
+/*        2-norms of the matrix A, they are not changed during the */
+/*        computation; initialize the values of complete columns 2-norms. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	rwork[j] = dznrm2_(m, &a[j * a_dim1 + 1], &c__1);
+	rwork[*n + j] = rwork[j];
+    }
+
+/*     ================================================================== */
+
+/*     Compute the pivot column index and the maximum column 2-norm */
+/*     for the whole original matrix stored in A(1:M,1:N). */
+
+    kp1 = idamax_(n, &rwork[1], &c__1);
+
+/*     ==================================================================. */
+
+    if (disnan_(&maxc2nrm)) {
+
+/*        Check if the matrix A contains NaN, set INFO parameter */
+/*        to the column number where the first NaN is found and return */
+/*        from the routine. */
+
+	*k = 0;
+	*info = kp1;
+
+/*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = maxc2nrm;
+
+/*        Array TAU is not set and contains undefined elements. */
+
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+    }
+
+/*     =================================================================== */
+
+    if (maxc2nrm == 0.) {
+
+/*        Check is the matrix A is a zero matrix, set array TAU and */
+/*        return from the routine. */
+
+	*k = 0;
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0., tau[i__2].i = 0.;
+	}
+
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+
+    }
+
+/*     =================================================================== */
+
+    myhugeval = dlamch_("Overflow");
+
+    if (maxc2nrm > myhugeval) {
+
+/*        Check if the matrix A contains +Inf or -Inf, set INFO parameter */
+/*        to the column number, where the first +/-Inf  is found plus N, */
+/*        and continue the computation. */
+
+	*info = *n + kp1;
+
+    }
+
+/*     ================================================================== */
+
+/*     Quick return if possible for the case when the first */
+/*     stopping criterion is satisfied, i.e. KMAX = 0. */
+
+    if (*kmax == 0) {
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.;
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0., tau[i__2].i = 0.;
+	}
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+    eps = dlamch_("Epsilon");
+
+/*     Adjust ABSTOL */
+
+    if (*abstol >= 0.) {
+	safmin = dlamch_("Safe minimum");
+/* Computing MAX */
+	d__1 = *abstol, d__2 = safmin * 2.;
+	*abstol = f2cmax(d__1,d__2);
+    }
+
+/*     Adjust RELTOL */
+
+    if (*reltol >= 0.) {
+	*reltol = f2cmax(*reltol,eps);
+    }
+
+/*     =================================================================== */
+
+/*     JMAX is the maximum index of the column to be factorized, */
+/*     which is also limited by the first stopping criterion KMAX. */
+
+    jmax = f2cmin(*kmax,minmn);
+
+/*     =================================================================== */
+
+/*     Quick return if possible for the case when the second or third */
+/*     stopping criterion for the whole original matrix is satified, */
+/*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */
+/*     (which is ONE <= RELTOL). */
+
+    if (maxc2nrm <= *abstol || 1. <= *reltol) {
+
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0., tau[i__2].i = 0.;
+	}
+
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+/*     Factorize columns */
+/*     ================================================================== */
+
+/*     Determine the block size. */
+
+    nbmin = 2;
+    nx = 0;
+
+    if (nb > 1 && nb < minmn) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+/*        (for N less than NX, unblocked code should be used). */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "ZGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/*              Not enough workspace to use optimal block size that */
+/*              is currently stored in NB. */
+/*              Reduce NB and determine the minimum value of NB. */
+
+		nb = (*lwork - (*n << 1)) / (*n + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "ZGEQP3RK", " ", m, n, &c_n1, 
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/*     ================================================================== */
+
+/*     DONE is the boolean flag to rerpresent the case when the */
+/*     factorization completed in the block factorization routine, */
+/*     before the end of the block. */
+
+    done = FALSE_;
+
+/*     J is the column index. */
+
+    j = 1;
+
+/*     (1) Use blocked code initially. */
+
+/*     JMAXB is the maximum column index of the block, when the */
+/*     blocked code is used, is also limited by the first stopping */
+/*     criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/*        J   is the column index of a column block; */
+/*        JB  is the column block size to pass to block factorization */
+/*            routine in a loop step; */
+/*        JBF is the number of columns that were actually factorized */
+/*            that was returned by the block factorization routine */
+/*            in a loop step, JBF <= JB; */
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/*           Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    zlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, 
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, 
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &
+		    rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], 
+		    &iinfo);
+
+/*           Set INFO on the first occurence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/*              Either the submatrix is zero before the end of the */
+/*              column block, or ABSTOL or RELTOL criterion is */
+/*              satisfied before the end of the column block, we can */
+/*              return from the routine. Perform the following before */
+/*              returning: */
+/*                a) Set the number of factorized columns K, */
+/*                   K = IOFFSET + JBF from the last call of blocked */
+/*                   routine. */
+/*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/*                         by the block factorization routine; */
+/*                      2) The remaining TAUs are set to ZERO by the */
+/*                         block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/*              Set INFO on the first occurrence of NaN, NaN takes */
+/*              prcedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/*              Return from the routine. */
+
+		z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+		work[1].r = z__1.r, work[1].i = z__1.i;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+/*     J = JMAX+1 means we factorized the maximum possible number of */
+/*     columns, that is in ELSE clause we need to compute */
+/*     the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/*     the blocks. */
+
+    if (j <= jmax) {
+
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	zlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, 
+		relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], &
+		work[1], &iinfo);
+
+/*        ABSTOL or RELTOL criterion is satisfied when the number of */
+/*        the factorized columns KF is smaller then the  number */
+/*        of columns JMAX-J+1 supplied to be factorized by the */
+/*        unblocked routine, we can return from */
+/*        the routine. Perform the following before returning: */
+/*           a) Set the number of factorized columns K, */
+/*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/*              unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/*        Set INFO on the first exception occurence. */
+
+/*        Set INFO on the first exception occurence of Inf or NaN, */
+/*        (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/*        Compute the return values for blocked code. */
+
+/*        Set the number of factorized columns if the unblocked routine */
+/*        was not called. */
+
+	*k = jmax;
+
+/*        If there exits a residual matrix after the blocked code: */
+/*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/*              residual matrix, otherwise set them to ZERO; */
+/*           2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + idamax_(&i__1, &rwork[*k + 1], &c__1);
+	    *maxc2nrmk = rwork[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		i__2 = j;
+		tau[i__2].r = 0., tau[i__2].i = 0.;
+	    }
+
+	} else {
+	    *maxc2nrmk = 0.;
+	    *relmaxc2nrmk = 0.;
+
+	}
+
+/*     END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+    work[1].r = z__1.r, work[1].i = z__1.i;
+
+    return 0;
+
+/*     End of ZGEQP3RK */
+
+} /* zgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/zlaqp2rk.c b/lapack-netlib/SRC/zlaqp2rk.c
new file mode 100644
index 000000000..0d38e71fb
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp2rk.c
@@ -0,0 +1,947 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int zlaqp2rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *kmax, doublereal *abstol, doublereal *reltol, 
+	integer *kp1, doublereal *maxc2nrm, doublecomplex *a, integer *lda, 
+	integer *k, doublereal *maxc2nrmk, doublereal *relmaxc2nrmk, integer *
+	jpiv, doublecomplex *tau, doublereal *vn1, doublereal *vn2, 
+	doublecomplex *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    doublereal d__1;
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublecomplex aikk;
+    doublereal temp, temp2;
+    integer i__, j;
+    doublereal tol3z;
+    integer jmaxc2nrm, itemp;
+    extern /* Subroutine */ int zlarf_(char *, integer *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *);
+    integer minmnfact;
+    extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *);
+    doublereal myhugeval;
+    integer minmnupdt;
+    extern doublereal dznrm2_(integer *, doublecomplex *, integer *);
+    integer kk;
+    extern doublereal dlamch_(char *);
+    integer kp;
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+    extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *);
+    doublereal taunan;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT in the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/*     MINMNUPDT is the smallest dimension */
+/*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which */
+/*     contains the submatrices A(IOFFSET+1:M,1:N) and */
+/*     B(IOFFSET+1:M,1:NRHS) as column blocks. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *kmax = f2cmin(*kmax,minmnfact);
+    tol3z = sqrt(dlamch_("Epsilon"));
+    myhugeval = dlamch_("Overflow");
+
+/*     Compute the factorization, KK is the lomn loop index. */
+
+    i__1 = *kmax;
+    for (kk = 1; kk <= i__1; ++kk) {
+
+	i__ = *ioffset + kk;
+
+	if (i__ == 1) {
+
+/*           ============================================================ */
+
+/*           We are at the first column of the original whole matrix A, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+/*           ============================================================ */
+
+	} else {
+
+/*           ============================================================ */
+
+/*           Determine the pivot column in KK-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__2 = *n - kk + 1;
+	    kp = kk - 1 + idamax_(&i__2, &vn1[kk], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,KK:N) in step KK. */
+/*           RELMAXC2NRMK  will be computed later, after somecondition */
+/*           checks on MAXC2NRMK. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains NaN, and set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (disnan_(maxc2nrmk)) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*info = *k + kp;
+
+/*               Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*               Array TAU(K+1:MINMNFACT) is not set and contains */
+/*               undefined elements. */
+
+		return 0;
+	    }
+
+/*           ============================================================ */
+
+/*           Quick return, if the submatrix A(I:M,KK:N) is */
+/*           a zero matrix. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*maxc2nrmk == 0.) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*relmaxc2nrmk = 0.;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    i__3 = j;
+		    tau[i__3].r = 0., tau[i__3].i = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + kk - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL >= ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+/*              Set K, the number of factorized columns. */
+
+		*k = kk - 1;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    i__3 = j;
+		    tau[i__3].r = 0., tau[i__3].i = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,KK:N): */
+/*        1) swap the KK-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) copy the KK-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than KK in the next loop step.) */
+/*        3) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A, not the block A(1:M,1:N). */
+
+	if (kp != kk) {
+	    zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1);
+	    vn1[kp] = vn1[kk];
+	    vn2[kp] = vn2[kk];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[kk];
+	    jpiv[kk] = itemp;
+	}
+
+/*        Generate elementary reflector H(KK) using the column A(I:M,KK), */
+/*        if the column has more than one element, otherwise */
+/*        the elementary reflector would be an identity matrix, */
+/*        and TAU(KK) = CZERO. */
+
+	if (i__ < *m) {
+	    i__2 = *m - i__ + 1;
+	    zlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], &
+		    c__1, &tau[kk]);
+	} else {
+	    i__2 = kk;
+	    tau[i__2].r = 0., tau[i__2].i = 0.;
+	}
+
+/*        Check if TAU(KK) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(KK) for Inf, */
+/*        since ZLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by ZLARFG can contain Inf, which requires */
+/*        TAU(KK) to contain NaN. Therefore, this case of generating Inf */
+/*        by ZLARFG is covered by checking TAU(KK) for NaN. */
+
+	i__2 = kk;
+	d__1 = tau[i__2].r;
+	if (disnan_(&d__1)) {
+	    i__2 = kk;
+	    taunan = tau[i__2].r;
+	} else /* if(complicated condition) */ {
+	    d__1 = d_imag(&tau[kk]);
+	    if (disnan_(&d__1)) {
+		taunan = d_imag(&tau[kk]);
+	    } else {
+		taunan = 0.;
+	    }
+	}
+
+	if (disnan_(&taunan)) {
+	    *k = kk - 1;
+	    *info = kk;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = taunan;
+	    *relmaxc2nrmk = taunan;
+
+/*           Array TAU(KK:MINMNFACT) is not set and contains */
+/*           undefined elements, except the first element TAU(KK) = NaN. */
+
+	    return 0;
+	}
+
+/*        Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */
+/*        ( If M >= N, then at KK = N there is no residual matrix, */
+/*         i.e. no columns of A to update, only columns of B. */
+/*         If M < N, then at KK = M-IOFFSET, I = M and we have a */
+/*         one-row residual matrix in A and the elementary */
+/*         reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */
+/*         is needed for the residual matrix in A and the */
+/*         right-hand-side-matrix in B. */
+/*         Therefore, we update only if */
+/*         KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */
+/*         condition is satisfied, not only KK < N+NRHS ) */
+
+	if (kk < minmnupdt) {
+	    i__2 = i__ + kk * a_dim1;
+	    aikk.r = a[i__2].r, aikk.i = a[i__2].i;
+	    i__2 = i__ + kk * a_dim1;
+	    a[i__2].r = 1., a[i__2].i = 0.;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n + *nrhs - kk;
+	    d_cnjg(&z__1, &tau[kk]);
+	    zlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &z__1, 
+		    &a[i__ + (kk + 1) * a_dim1], lda, &work[1]);
+	    i__2 = i__ + kk * a_dim1;
+	    a[i__2].r = aikk.r, a[i__2].i = aikk.i;
+	}
+
+	if (kk < minmnfact) {
+
+/*           Update the partial column 2-norms for the residual matrix, */
+/*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */
+/*           when KK < f2cmin(M-IOFFSET, N). */
+
+	    i__2 = *n;
+	    for (j = kk + 1; j <= i__2; ++j) {
+		if (vn1[j] != 0.) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+/* Computing 2nd power */
+		    d__1 = z_abs(&a[i__ + j * a_dim1]) / vn1[j];
+		    temp = 1. - d__1 * d__1;
+		    temp = f2cmax(temp,0.);
+/* Computing 2nd power */
+		    d__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (d__1 * d__1);
+		    if (temp2 <= tol3z) {
+
+/*                    Compute the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by explicitly computing it, */
+/*                    and store it in both partial 2-norm vector VN1 */
+/*                    and exact column 2-norm vector VN2. */
+
+			i__3 = *m - i__;
+			vn1[j] = dznrm2_(&i__3, &a[i__ + 1 + j * a_dim1], &
+				c__1);
+			vn2[j] = vn1[j];
+
+		    } else {
+
+/*                    Update the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by removing one */
+/*                    element A(I,J) and store it in partial */
+/*                    2-norm vector VN1. */
+
+			vn1[j] *= sqrt(temp);
+
+		    }
+		}
+	    }
+
+	}
+
+/*     End factorization loop */
+
+    }
+
+/*     If we reached this point, all colunms have been factorized, */
+/*     i.e. no condition was triggered to exit the routine. */
+/*     Set the number of factorized columns. */
+
+    *k = *kmax;
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */
+/*     we return. */
+
+    if (*k < minmnfact) {
+
+	i__1 = *n - *k;
+	jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1);
+	*maxc2nrmk = vn1[jmaxc2nrm];
+
+	if (*k == 0) {
+	    *relmaxc2nrmk = 1.;
+	} else {
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+	}
+
+    } else {
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+    }
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, set TAUs corresponding to the columns that were */
+/*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */
+
+    i__1 = minmnfact;
+    for (j = *k + 1; j <= i__1; ++j) {
+	i__2 = j;
+	tau[i__2].r = 0., tau[i__2].i = 0.;
+    }
+
+    return 0;
+
+/*     End of ZLAQP2RK */
+
+} /* zlaqp2rk_ */
+
diff --git a/lapack-netlib/SRC/zlaqp3rk.c b/lapack-netlib/SRC/zlaqp3rk.c
new file mode 100644
index 000000000..cb44e4d34
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp3rk.c
@@ -0,0 +1,1157 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {0.,0.};
+static doublecomplex c_b2 = {1.,0.};
+static integer c__1 = 1;
+
+/* Subroutine */ int zlaqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *nb, doublereal *abstol, doublereal *reltol, 
+	integer *kp1, doublereal *maxc2nrm, doublecomplex *a, integer *lda, 
+	logical *done, integer *kb, doublereal *maxc2nrmk, doublereal *
+	relmaxc2nrmk, integer *jpiv, doublecomplex *tau, doublereal *vn1, 
+	doublereal *vn2, doublecomplex *auxv, doublecomplex *f, integer *ldf, 
+	integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2, i__3;
+    doublereal d__1, d__2;
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublereal temp, temp2;
+    integer i__, j, k;
+    doublereal tol3z;
+    integer itemp;
+    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *), zgemv_(char *, integer *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *);
+    integer minmnfact;
+    extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *);
+    doublereal myhugeval;
+    integer minmnupdt;
+    extern doublereal dznrm2_(integer *, doublecomplex *, integer *);
+    integer if__;
+    extern doublereal dlamch_(char *);
+    integer kp;
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+    integer lsticc;
+    extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *);
+    doublereal taunan;
+    doublecomplex aik;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --auxv;
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT in the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *nb = f2cmin(*nb,minmnfact);
+    tol3z = sqrt(dlamch_("Epsilon"));
+    myhugeval = dlamch_("Overflow");
+
+/*     Compute factorization in a while loop over NB columns, */
+/*     K is the column index in the block A(1:M,1:N). */
+
+    k = 0;
+    lsticc = 0;
+    *done = FALSE_;
+
+    while(k < *nb && lsticc == 0) {
+	++k;
+	i__ = *ioffset + k;
+
+	if (i__ == 1) {
+
+/*           We are at the first column of the original whole matrix A_orig, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+	} else {
+
+/*           Determine the pivot column in K-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__1 = *n - k + 1;
+	    kp = k - 1 + idamax_(&i__1, &vn1[k], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,K:N) in step K. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains NaN, set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (disnan_(maxc2nrmk)) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*info = *kb + kp;
+
+/*              Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix contains NaN and we stop */
+/*              the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    z__1.r = -1., z__1.i = 0.;
+		    zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs,
+			     kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 
+			    + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*              undefined elements. */
+
+/*              Return from the routine. */
+
+		return 0;
+	    }
+
+/*           Quick return, if the submatrix A(I:M,K:N) is */
+/*           a zero matrix. We need to check it only if the column index */
+/*           (same as row index) is larger than 1, since the condition */
+/*           for the whole original matrix A_orig is checked in the main */
+/*           routine. */
+
+	    if (*maxc2nrmk == 0.) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*relmaxc2nrmk = 0.;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix is zero and we stop the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    z__1.r = -1., z__1.i = 0.;
+		    zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs,
+			     kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 
+			    + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */
+/*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    i__2 = j;
+		    tau[i__2].r = 0., tau[i__2].i = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + k - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third tolerance stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig; */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+
+/*              Apply the block reflector to the residual of the */
+/*              matrix A and the residual of the right hand sides B, if */
+/*              the residual matrix and and/or the residual of the right */
+/*              hand sides exist,  i.e. if the submatrix */
+/*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when */
+/*                 KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */
+
+		if (*kb < minmnupdt) {
+		    i__1 = *m - if__;
+		    i__2 = *n + *nrhs - *kb;
+		    z__1.r = -1., z__1.i = 0.;
+		    zgemm_("No transpose", "Conjugate transpose", &i__1, &
+			    i__2, kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*
+			    kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb 
+			    + 1) * a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */
+/*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    i__2 = j;
+		    tau[i__2].r = 0., tau[i__2].i = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,K:N): */
+/*        1) swap the K-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */
+/*        3) copy the K-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than K in the next loop step.) */
+/*        4) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A_orig, not the block A(1:M,1:N). */
+
+	if (kp != k) {
+	    zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	    i__1 = k - 1;
+	    zswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf);
+	    vn1[kp] = vn1[k];
+	    vn2[kp] = vn2[k];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[k];
+	    jpiv[k] = itemp;
+	}
+
+/*        Apply previous Householder reflectors to column K: */
+/*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */
+
+	if (k > 1) {
+	    i__1 = k - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = k + j * f_dim1;
+		d_cnjg(&z__1, &f[k + j * f_dim1]);
+		f[i__2].r = z__1.r, f[i__2].i = z__1.i;
+	    }
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    z__1.r = -1., z__1.i = 0.;
+	    zgemv_("No transpose", &i__1, &i__2, &z__1, &a[i__ + a_dim1], lda,
+		     &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1);
+	    i__1 = k - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = k + j * f_dim1;
+		d_cnjg(&z__1, &f[k + j * f_dim1]);
+		f[i__2].r = z__1.r, f[i__2].i = z__1.i;
+	    }
+	}
+
+/*        Generate elementary reflector H(k) using the column A(I:M,K). */
+
+	if (i__ < *m) {
+	    i__1 = *m - i__ + 1;
+	    zlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], &
+		    c__1, &tau[k]);
+	} else {
+	    i__1 = k;
+	    tau[i__1].r = 0., tau[i__1].i = 0.;
+	}
+
+/*        Check if TAU(K) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(K) for Inf, */
+/*        since ZLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by ZLARFG can contain Inf, which requires */
+/*        TAU(K) to contain NaN. Therefore, this case of generating Inf */
+/*        by ZLARFG is covered by checking TAU(K) for NaN. */
+
+	i__1 = k;
+	d__1 = tau[i__1].r;
+	if (disnan_(&d__1)) {
+	    i__1 = k;
+	    taunan = tau[i__1].r;
+	} else /* if(complicated condition) */ {
+	    d__1 = d_imag(&tau[k]);
+	    if (disnan_(&d__1)) {
+		taunan = d_imag(&tau[k]);
+	    } else {
+		taunan = 0.;
+	    }
+	}
+
+	if (disnan_(&taunan)) {
+
+	    *done = TRUE_;
+
+/*           Set KB, the number of factorized partial columns */
+/*                   that are non-zero in each step in the block, */
+/*                   i.e. the rank of the factor R. */
+/*           Set IF, the number of processed rows in the block, which */
+/*                   is the same as the number of processed rows in */
+/*                   the original whole matrix A_orig. */
+
+	    *kb = k - 1;
+	    if__ = i__ - 1;
+	    *info = k;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = taunan;
+	    *relmaxc2nrmk = taunan;
+
+/*           There is no need to apply the block reflector to the */
+/*           residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*           since the submatrix contains NaN and we stop */
+/*           the computation. */
+/*           But, we need to apply the block reflector to the residual */
+/*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*           residual right hand sides exist.  This occurs */
+/*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+
+	    if (*nrhs > 0 && *kb < *m - *ioffset) {
+		i__1 = *m - if__;
+		z__1.r = -1., z__1.i = 0.;
+		zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb,
+			 &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1],
+			 lda);
+	    }
+
+/*           There is no need to recompute the 2-norm of the */
+/*           difficult columns, since we stop the factorization. */
+
+/*           Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*           undefined elements. */
+
+/*           Return from the routine. */
+
+	    return 0;
+	}
+
+/*        =============================================================== */
+
+	i__1 = i__ + k * a_dim1;
+	aik.r = a[i__1].r, aik.i = a[i__1].i;
+	i__1 = i__ + k * a_dim1;
+	a[i__1].r = 1., a[i__1].i = 0.;
+
+/*        =============================================================== */
+
+/*        Compute the current K-th column of F: */
+/*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = *n + *nrhs - k;
+	    zgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k 
+		    + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &
+		    f[k + 1 + k * f_dim1], &c__1);
+	}
+
+/*           2) Zero out elements above and on the diagonal of the */
+/*              column K in matrix F, i.e elements F(1:K,K). */
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j + k * f_dim1;
+	    f[i__2].r = 0., f[i__2].i = 0.;
+	}
+
+/*         3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    i__3 = k;
+	    z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i;
+	    zgemv_("Conjugate Transpose", &i__1, &i__2, &z__1, &a[i__ + 
+		    a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1]
+		    , &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    zgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/*        =============================================================== */
+
+/*        Update the current I-th row of A: */
+/*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    z__1.r = -1., z__1.i = 0.;
+	    zgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, &
+		    z__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, &
+		    c_b2, &a[i__ + (k + 1) * a_dim1], lda);
+	}
+
+	i__1 = i__ + k * a_dim1;
+	a[i__1].r = aik.r, a[i__1].i = aik.i;
+
+/*        Update the partial column 2-norms for the residual matrix, */
+/*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*        when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = z_abs(&a[i__ + j * a_dim1]) / vn1[j];
+/* Computing MAX */
+		    d__1 = 0., d__2 = (temp + 1.) * (1. - temp);
+		    temp = f2cmax(d__1,d__2);
+/* Computing 2nd power */
+		    d__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (d__1 * d__1);
+		    if (temp2 <= tol3z) {
+
+/*                    At J-index, we have a difficult column for the */
+/*                    update of the 2-norm. Save the index of the previous */
+/*                    difficult column in IWORK(J-1). */
+/*                    NOTE: ILSTCC > 1, threfore we can use IWORK only */
+/*                    with N-1 elements, where the elements are */
+/*                    shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                    Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*        End of while loop. */
+
+    }
+
+/*     Now, afler the loop: */
+/*        Set KB, the number of factorized columns in the block; */
+/*        Set IF, the number of processed rows in the block, which */
+/*                is the same as the number of processed rows in */
+/*                the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and and/or the residual of the right hand sides */
+/*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	z__1.r = -1., z__1.i = 0.;
+	zgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &z__1,
+		 &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2,
+		 &a[if__ + 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC is the index of the last difficult column is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column. */
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicilty for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        DZNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(DLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = dznrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of ZLAQP3RK */
+
+} /* zlaqp3rk_ */
+

From 20a2a83f498ada982c431bd34da39181d2785ba1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 12:18:15 +0100
Subject: [PATCH 422/718] Implement truncated QR with pivoting
 (Reference-LAPACK PR 891)

---
 lapack-netlib/SRC/Makefile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 448fbd8df..de2242701 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -136,7 +136,7 @@ SLASRC_O = \
    sgebrd.o sgecon.o sgeequ.o sgees.o  sgeesx.o sgeev.o  sgeevx.o \
    sgehd2.o sgehrd.o sgelq2.o sgelqf.o \
    sgels.o  sgelsd.o sgelss.o sgelsy.o sgeql2.o sgeqlf.o \
-   sgeqp3.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \
+   sgeqp3.o sgeqp3rk.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \
    sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o  sgesvd.o sgesvdx.o sgesvx.o \
    sgetc2.o sgetf2.o sgetri.o \
    sggbak.o sggbal.o sgges.o  sgges3.o sggesx.o \
@@ -151,7 +151,7 @@ SLASRC_O = \
    slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \
    slansy.o slantb.o slantp.o slantr.o slanv2.o \
    slapll.o slapmt.o \
-   slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
+   slaqgb.o slaqge.o slaqp2.o slaqp2rk.o slaqp3rk.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
    slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
    slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
    slarf.o  slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
@@ -232,7 +232,7 @@ CLASRC_O = \
    cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \
    cgecon.o cgeequ.o cgees.o  cgeesx.o cgeev.o  cgeevx.o \
    cgehd2.o cgehrd.o cgelq2.o cgelqf.o \
-   cgels.o  cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \
+   cgels.o  cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o cgeqp3rk.o \
    cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \
    cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o  cgesvd.o  cgesvdx.o \
    cgesvj.o cgejsv.o cgsvj0.o cgsvj1.o \
@@ -266,7 +266,7 @@ CLASRC_O = \
    clanhb.o clanhe.o \
    clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \
    clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \
-   claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \
+   claqhb.o claqhe.o claqhp.o claqp2.o claqp2rk.o claqp3rk.o claqps.o claqsb.o \
    claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
    claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
    claqz0.o claqz1.o claqz2.o claqz3.o \
@@ -345,7 +345,7 @@ DLASRC_O = \
    dgebrd.o dgecon.o dgeequ.o dgees.o  dgeesx.o dgeev.o  dgeevx.o \
    dgehd2.o dgehrd.o dgelq2.o dgelqf.o \
    dgels.o  dgelsd.o dgelss.o dgelsy.o dgeql2.o dgeqlf.o \
-   dgeqp3.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \
+   dgeqp3.o dgeqp3rk.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \
    dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o  dgesvd.o dgesvdx.o dgesvx.o \
    dgetc2.o dgetf2.o dgetrf.o dgetri.o \
    dgetrs.o dggbak.o dggbal.o dgges.o  dgges3.o dggesx.o \
@@ -360,7 +360,7 @@ DLASRC_O = \
    dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \
    dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \
    dlapll.o dlapmt.o \
-   dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \
+   dlaqgb.o dlaqge.o dlaqp2.o dlaqp2rk.o dlaqp3rk.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \
    dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
    dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
    dlarf.o  dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \
@@ -437,7 +437,7 @@ ZLASRC_O = \
    zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \
    zgecon.o zgeequ.o zgees.o  zgeesx.o zgeev.o  zgeevx.o \
    zgehd2.o zgehrd.o zgelq2.o zgelqf.o \
-   zgels.o  zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \
+   zgels.o  zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o zgeqp3rk.o \
    zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \
    zgesc2.o zgesdd.o zgesv.o  zgesvd.o zgesvdx.o \
    zgesvj.o zgejsv.o zgsvj0.o zgsvj1.o \
@@ -473,7 +473,7 @@ ZLASRC_O = \
    zlanhe.o \
    zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \
    zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \
-   zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \
+   zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqp2rk.o zlaqp3rk.o zlaqps.o zlaqsb.o \
    zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \
    zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
    zlaqz0.o zlaqz1.o zlaqz2.o zlaqz3.o \

From 8b2a9568905b6480b457fb8873a7dff7b116a9fc Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 14:20:12 +0100
Subject: [PATCH 423/718] Implement truncated QR with pivot (Reference-LAPACK
 PR 891)

---
 lapack-netlib/SRC/ilaenv.c | 83 +++++++++++++++++++++++++++++---------
 1 file changed, 63 insertions(+), 20 deletions(-)

diff --git a/lapack-netlib/SRC/ilaenv.c b/lapack-netlib/SRC/ilaenv.c
index c47224a0c..8f3b2db8e 100644
--- a/lapack-netlib/SRC/ilaenv.c
+++ b/lapack-netlib/SRC/ilaenv.c
@@ -191,7 +191,7 @@ typedef struct Namelist Namelist;
 #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
 #ifdef _MSC_VER
 #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
-#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
 #else
 #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
 #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
@@ -252,11 +252,11 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
 #define myexit_() break;
-#define mycycle() continue;
-#define myceiling(w) {ceil(w)}
-#define myhuge(w) {HUGE_VAL}
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
 //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
-#define mymaxloc(w,s,e,n) {dmaxloc_(w,*(s),*(e),n)}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
 
 /* procedure parameter types for -A and -C++ */
 
@@ -509,12 +509,18 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
 
 /* Table of constant values */
 
 static integer c__1 = 1;
-static real c_b174 = 0.f;
-static real c_b175 = 1.f;
+static real c_b179 = 0.f;
+static real c_b180 = 1.f;
 static integer c__0 = 0;
 
 /* > \brief \b ILAENV */
@@ -599,9 +605,9 @@ f"> */
 /* >          = 9: maximum size of the subproblems at the bottom of the */
 /* >               computation tree in the divide-and-conquer algorithm */
 /* >               (used by xGELSD and xGESDD) */
-/* >          =10: ieee NaN arithmetic can be trusted not to trap */
+/* >          =10: ieee infinity and NaN arithmetic can be trusted not to trap */
 /* >          =11: infinity arithmetic can be trusted not to trap */
-/* >          12 <= ISPEC <= 16: */
+/* >          12 <= ISPEC <= 17: */
 /* >               xHSEQR or related subroutines, */
 /* >               see IPARMQ for detailed explanation */
 /* > \endverbatim */
@@ -652,9 +658,7 @@ f"> */
 /* > \author Univ. of Colorado Denver */
 /* > \author NAG Ltd. */
 
-/* > \date November 2019 */
-
-/* > \ingroup OTHERauxiliary */
+/* > \ingroup ilaenv */
 
 /* > \par Further Details: */
 /*  ===================== */
@@ -685,7 +689,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1,
 	opts_len)
 {
     /* System generated locals */
-    integer ret_val;
+    integer ret_val, i__1, i__2, i__3;
 
     /* Local variables */
     logical twostage;
@@ -702,10 +706,9 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1,
 	    integer *, integer *);
 
 
-/*  -- LAPACK auxiliary routine (version 3.9.0) -- */
+/*  -- LAPACK auxiliary routine -- */
 /*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
 /*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
-/*     November 2019 */
 
 
 /*  ===================================================================== */
@@ -728,6 +731,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1,
 	case 14:  goto L160;
 	case 15:  goto L160;
 	case 16:  goto L160;
+	case 17:  goto L160;
     }
 
 /*     Invalid value for ISPEC */
@@ -908,6 +912,12 @@ L50:
 	    } else {
 		nb = 64;
 	    }
+	} else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) {
+	    if (sname) {
+		nb = 32;
+	    } else {
+		nb = 32;
+	    }
 	}
     } else if (s_cmp(c2, "PO", (ftnlen)2, (ftnlen)2) == 0) {
 	if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1034,6 +1044,21 @@ L50:
 	    } else {
 		nb = 64;
 	    }
+	} else if (s_cmp(c3, "SYL", (ftnlen)3, (ftnlen)3) == 0) {
+/*           The upper bound is to prevent overly aggressive scaling. */
+	    if (sname) {
+/* Computing MIN */
+/* Computing MAX */
+		i__2 = 48, i__3 = (f2cmin(*n1,*n2) << 4) / 100;
+		i__1 = f2cmax(i__2,i__3);
+		nb = f2cmin(i__1,240);
+	    } else {
+/* Computing MIN */
+/* Computing MAX */
+		i__2 = 24, i__3 = (f2cmin(*n1,*n2) << 3) / 100;
+		i__1 = f2cmax(i__2,i__3);
+		nb = f2cmin(i__1,80);
+	    }
 	}
     } else if (s_cmp(c2, "LA", (ftnlen)2, (ftnlen)2) == 0) {
 	if (s_cmp(c3, "UUM", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1042,6 +1067,12 @@ L50:
 	    } else {
 		nb = 64;
 	    }
+	} else if (s_cmp(c3, "TRS", (ftnlen)3, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 32;
+	    } else {
+		nb = 32;
+	    }
 	}
     } else if (sname && s_cmp(c2, "ST", (ftnlen)2, (ftnlen)2) == 0) {
 	if (s_cmp(c3, "EBZ", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1093,6 +1124,12 @@ L60:
 	    } else {
 		nbmin = 2;
 	    }
+	} else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) {
+	    if (sname) {
+		nbmin = 2;
+	    } else {
+		nbmin = 2;
+	    }
 	}
     } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) {
 	if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1184,6 +1221,12 @@ L70:
 	    } else {
 		nx = 128;
 	    }
+	} else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)4, (ftnlen)5) == 0) {
+	    if (sname) {
+		nx = 128;
+	    } else {
+		nx = 128;
+	    }
 	}
     } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) {
 	if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1270,29 +1313,29 @@ L130:
 
 L140:
 
-/*     ISPEC = 10: ieee NaN arithmetic can be trusted not to trap */
+/*     ISPEC = 10: ieee and infinity NaN arithmetic can be trusted not to trap */
 
 /*     ILAENV = 0 */
     ret_val = 1;
     if (ret_val == 1) {
-	ret_val = ieeeck_(&c__1, &c_b174, &c_b175);
+	ret_val = ieeeck_(&c__1, &c_b179, &c_b180);
     }
     return ret_val;
 
 L150:
 
-/*     ISPEC = 11: infinity arithmetic can be trusted not to trap */
+/*     ISPEC = 11: ieee infinity arithmetic can be trusted not to trap */
 
 /*     ILAENV = 0 */
     ret_val = 1;
     if (ret_val == 1) {
-	ret_val = ieeeck_(&c__0, &c_b174, &c_b175);
+	ret_val = ieeeck_(&c__0, &c_b179, &c_b180);
     }
     return ret_val;
 
 L160:
 
-/*     12 <= ISPEC <= 16: xHSEQR or related subroutines. */
+/*     12 <= ISPEC <= 17: xHSEQR or related subroutines. */
 
     ret_val = iparmq_(ispec, name__, opts, n1, n2, n3, n4)
 	    ;

From ee47e4e494629b29d39557d4415349566060a8e6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 15:21:32 +0100
Subject: [PATCH 424/718] run m1/llvm/cmake buid on all 4 cores

---
 .cirrus.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 6c2baf8a0..b4c4870d0 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -29,7 +29,7 @@ task:
   - mkdir build
   - cd build
   - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
-  - make
+  - make -j 4
 
 task:
   name: AppleM1/GCC/MAKE/OPENMP

From cc622f2406dc7f2ca47af27db5eb3bd57ab46ae0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 15 Nov 2023 22:51:09 +0100
Subject: [PATCH 425/718] restore OpenBLAS-specific target_link_libraries

---
 lapack-netlib/TESTING/LIN/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index e28818c76..143fd0597 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -239,7 +239,8 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
 
 macro(add_lin_executable name)
   add_executable(${name} ${ARGN})
-  target_link_libraries(${name} ${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
+  target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 
 if(BUILD_SINGLE)

From f8ad5344c210960fc399ca5b0ad8559ab5ca253e Mon Sep 17 00:00:00 2001
From: Bart Oldeman <bart.oldeman@calculquebec.ca>
Date: Fri, 17 Nov 2023 23:49:34 +0000
Subject: [PATCH 426/718] Fix casum fallback kernel.

This kernel is only used on Skylake+ if the kernel with AVX512
intrinsics can't be used, but used the variable x1 incorrectly
in the tail end of the loop, as it is still at the initial
value instead of where x points to.

This caused 55 "other error"s in the LAPACK tests
(https://github.com/OpenMathLib/OpenBLAS/issues/4282)

This change makes casum.c as similar as possible as zasum.c,
because zasum.c does this correctly.
---
 kernel/x86_64/casum.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c
index e4d054311..28d78d279 100644
--- a/kernel/x86_64/casum.c
+++ b/kernel/x86_64/casum.c
@@ -9,12 +9,12 @@
 #endif
 
 #ifndef HAVE_CASUM_KERNEL
-static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
 {
 
     BLASLONG i=0;
     BLASLONG n_8 = n & -8;
-    FLOAT *x = x1;
+    FLOAT *x1 = x;
     FLOAT temp0, temp1, temp2, temp3;
     FLOAT temp4, temp5, temp6, temp7;
     FLOAT sum0 = 0.0;
@@ -24,14 +24,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
     FLOAT sum4 = 0.0;
     
     while (i < n_8) {
-        temp0 = ABS_K(x[0]);
-        temp1 = ABS_K(x[1]);
-        temp2 = ABS_K(x[2]);
-        temp3 = ABS_K(x[3]);
-        temp4 = ABS_K(x[4]);
-        temp5 = ABS_K(x[5]);
-        temp6 = ABS_K(x[6]);
-        temp7 = ABS_K(x[7]);
+        temp0 = ABS_K(x1[0]);
+        temp1 = ABS_K(x1[1]);
+        temp2 = ABS_K(x1[2]);
+        temp3 = ABS_K(x1[3]);
+        temp4 = ABS_K(x1[4]);
+        temp5 = ABS_K(x1[5]);
+        temp6 = ABS_K(x1[6]);
+        temp7 = ABS_K(x1[7]);
         
         sum0 += temp0;
         sum1 += temp1;
@@ -43,12 +43,12 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
         sum2 += temp6;
         sum3 += temp7;
         
-        x+=8;
+        x1+=8;
         i+=4;
     }
 
      while (i < n) {
-        sum4 += (ABS_K(x1[0]) + ABS_K(x1[1]));
+        sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
         x1 += 2;
         i++;
      }

From 47b03fd4b4ce7bc51d5b56397e52e6da3c5f3f36 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 18 Nov 2023 23:45:02 +0100
Subject: [PATCH 427/718] Copy XCode15-specific workaround to Fortran flags to
 fix build of tests

---
 Makefile.system | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile.system b/Makefile.system
index 1b84195e4..ff06e503c 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -407,6 +407,7 @@ XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/vers
 endif
 ifeq (x$(XCVER), x 15)
 CCOMMON_OPT += -Wl,-ld_classic
+FCOMMON_OPT += -Wl,-ld_classic
 endif
 endif
 

From 22aa401656032c70c4c0efb8d24bf1a63577c992 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 19 Nov 2023 00:04:31 +0100
Subject: [PATCH 428/718] Temporarily disable the AVX512 CASUM/ZASUM
 microkernels for any version of NVIDIA HPC (#4327)

* Temporarily disable the C/ZASUM microkernels for any version of NVHPC
---
 kernel/x86_64/casum_microk_skylakex-2.c | 3 ++-
 kernel/x86_64/zasum_microk_skylakex-2.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index d261962de..5d37fd541 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -4,7 +4,8 @@
 #endif
 #if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
 
-#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) ))
+//&& NVCOMPVERS < 2309))
 
 #define HAVE_CASUM_KERNEL 1
 
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index dddf03fe2..7260922e7 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -4,7 +4,8 @@
 #endif
 #if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
 
-#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) ))
+//&& NVCOMPVERS < 2309))
 
 #define HAVE_ZASUM_KERNEL 1
 

From 6b2651ece32365201793b02e78e198abd1f137ac Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Sun, 19 Nov 2023 02:57:13 -0600
Subject: [PATCH 429/718] Fix building test_sbgemm

---
 test/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/Makefile b/test/Makefile
index 715842b4d..56acf1c5b 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -260,6 +260,7 @@ endif
 
 
 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
+CLDFLAGS = $(CFLAGS) $(LDFLAGS)
 
 
 ifeq ($(USE_OPENMP), 1)
@@ -331,7 +332,7 @@ endif
 
 ifeq ($(BUILD_BFLOAT16),1)
 test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME)
-	$(CC) $(CFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
+	$(CC) $(CLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
 endif
 
 ifeq ($(BUILD_COMPLEX),1)

From c34e2cf380a757621f166e6ce8ea8755155158f2 Mon Sep 17 00:00:00 2001
From: Bart Oldeman <bart.oldeman@calculquebec.ca>
Date: Sun, 19 Nov 2023 21:21:23 +0000
Subject: [PATCH 430/718] Use _mm_set1_epi{32,64x} to init mask in x86-64
 [cz]asum

for skylake kernels. This is the same method as used in [sd]asum.
_mm_set1_epi64x was commented out for zasum, but has the advantage
of avoiding possible undefined behaviour (using an uninitialized
variable), optimized out by NVHPC and icx. The new code works
fine with those compilers.

For GCC 12.3 the generated code is identical; no matter what method
you use, the compiler optimizes the code into a compile-time
constant, there is no performance benefit using mm_cmpeq_epi8
since the corresponding instruction (VPCMPEQB) isn't actually
generated!
---
 kernel/x86_64/casum_microk_skylakex-2.c | 10 ++++------
 kernel/x86_64/zasum_microk_skylakex-2.c | 11 ++++-------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index 5d37fd541..10b70ff20 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -2,10 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
 
-#if (!(defined(__NVCOMPILER) ))
-//&& NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
 
 #define HAVE_CASUM_KERNEL 1
 
@@ -21,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
     
     if (n2 < 64) {
         __m128 accum_10, accum_11, accum_12, accum_13;
-        __m128 abs_mask1 = abs_mask1;
+        __m128 abs_mask1;
 
         accum_10 = _mm_setzero_ps();
         accum_11 = _mm_setzero_ps();
         accum_12 = _mm_setzero_ps();
         accum_13 = _mm_setzero_ps();
         
-        abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-        abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
+        abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);
                 
         _mm_prefetch(&x1[0], _MM_HINT_T0);
         
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index 7260922e7..f6bc8e37b 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -2,10 +2,9 @@
 #ifdef __NVCOMPILER
 #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
 #endif
-#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
 
-#if (!(defined(__NVCOMPILER) ))
-//&& NVCOMPVERS < 2309))
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
 
 #define HAVE_ZASUM_KERNEL 1
 
@@ -22,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
 
     if (n2 < 32) {
         __m128d accum_10, accum_11, accum_12, accum_13;
-        __m128d abs_mask1 = abs_mask1;
+        __m128d abs_mask1;
 
         accum_10 = _mm_setzero_pd();
         accum_11 = _mm_setzero_pd();
         accum_12 = _mm_setzero_pd();
         accum_13 = _mm_setzero_pd();
         
-        // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
-        abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-        abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
+        abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
                 
         _mm_prefetch(&x1[0], _MM_HINT_T0);
         if (n2 >= 16){

From 54be8f4d67185c8bf4b7f0737504130e0fa53e83 Mon Sep 17 00:00:00 2001
From: Yuning Zhang <codeworm96@outlook.com>
Date: Mon, 20 Nov 2023 13:28:25 -0800
Subject: [PATCH 431/718] Update the list of default dynamic targets for x86_64
 in the README to be consistent with the Makefile

Signed-off-by: Yuning Zhang <codeworm96@outlook.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3c4e38f18..aaadd0d9c 100644
--- a/README.md
+++ b/README.md
@@ -202,7 +202,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
 
-For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
+For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
 
 `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
 Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.

From 47da601a2d456d1b85b8261e6b1d21a1c2c7e315 Mon Sep 17 00:00:00 2001
From: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
Date: Mon, 20 Nov 2023 17:24:22 -0600
Subject: [PATCH 432/718] POWER: Fixing Makefile error

Recent commit d99aad8ee308600832da39105a6511275cfe32ad added
extra `)`. This patch fixes the warning from Makefile.
---
 Makefile.power | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.power b/Makefile.power
index aa1ca080a..3fa6d6faf 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -11,7 +11,7 @@ endif
 
 ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
-ifeq ($(C_COMPILER), GCC))
+ifeq ($(C_COMPILER), GCC)
 ifeq ($(GCCVERSIONGTEQ10), 1)
 CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 else ifneq ($(GCCVERSIONGT4), 1)

From c883abf838f33354f78f044ea9945278558173e8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 22 Nov 2023 22:41:53 +0100
Subject: [PATCH 433/718] Revert unintentional change to linking rule from PR
 4322

---
 lapack-netlib/TESTING/LIN/Makefile | 40 +++++++++++++++---------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile
index 46e096c2f..714efa52a 100644
--- a/lapack-netlib/TESTING/LIN/Makefile
+++ b/lapack-netlib/TESTING/LIN/Makefile
@@ -269,35 +269,35 @@ proto-double: xlintstds xlintstrfd
 proto-complex: xlintstrfc
 proto-complex16: xlintstzc xlintstrfz
 
-xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) $(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 $(ALINTST): $(FRC)
 $(SCLNTST): $(FRC)

From f745f02f35641061511a3b2c368a21ef374c56f4 Mon Sep 17 00:00:00 2001
From: Shiyou Yin <yinshiyou-hf@loongson.cn>
Date: Fri, 24 Nov 2023 14:51:37 +0800
Subject: [PATCH 434/718] benchmark: Fix missing colons in outputs of
 ./strsv.goto

---
 benchmark/trsv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/trsv.c b/benchmark/trsv.c
index 66ac3a3c7..e17c57157 100644
--- a/benchmark/trsv.c
+++ b/benchmark/trsv.c
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]){
       long long muls = n*(n+1)/2.0;
       long long adds = (n - 1.0)*n/2.0;
 
-      fprintf(stderr, "%10d   %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
+      fprintf(stderr, "%10d :   %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
       if(a != NULL){
         free(a);
       }

From ca5a87ff1de25fd9d6dd22f4b4c635ec040fe59d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 25 Nov 2023 15:31:18 +0100
Subject: [PATCH 435/718] Small documentation fix for Truncated QR With
 Pivoting (Reference-LAPACK PR 941)

---
 lapack-netlib/SRC/cgeqp3rk.f | 12 ++++++------
 lapack-netlib/SRC/claqp2rk.f |  6 +++---
 lapack-netlib/SRC/claqp3rk.f |  6 +++---
 lapack-netlib/SRC/dgeqp3rk.f |  4 ++--
 lapack-netlib/SRC/dlaqp2rk.f |  4 ++--
 lapack-netlib/SRC/dlaqp3rk.f |  4 ++--
 lapack-netlib/SRC/sgeqp3rk.f |  4 ++--
 lapack-netlib/SRC/slaqp2rk.f |  4 ++--
 lapack-netlib/SRC/slaqp3rk.f |  4 ++--
 lapack-netlib/SRC/zgeqp3rk.f | 12 ++++++------
 lapack-netlib/SRC/zlaqp2rk.f |  6 +++---
 lapack-netlib/SRC/zlaqp3rk.f |  6 +++---
 12 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f
index 70789e64f..587860684 100644
--- a/lapack-netlib/SRC/cgeqp3rk.f
+++ b/lapack-netlib/SRC/cgeqp3rk.f
@@ -55,7 +55,7 @@
 *> where:
 *>
 *>  P(K)            is an N-by-N permutation matrix;
-*>  Q(K)            is an M-by-M orthogonal matrix;
+*>  Q(K)            is an M-by-M unitary matrix;
 *>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
 *>                    full rank factor R with K-by-K upper-triangular
 *>                    R11(K) and K-by-N rectangular R12(K). The diagonal
@@ -124,14 +124,14 @@
 *>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
 *>        column 2-norm of the original matrix A, which is equal
 *>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
-*>     e) Q(K)**H * B, the matrix B with the orthogonal
+*>     e) Q(K)**H * B, the matrix B with the unitary
 *>        transformation Q(K)**H applied on the left.
 *>
 *> The N-by-N permutation matrix P(K) is stored in a compact form in
 *> the integer array JPIV. For 1 <= j <= N, column j
 *> of the matrix A was interchanged with column JPIV(j).
 *>
-*> The M-by-M orthogonal matrix Q is represented as a product
+*> The M-by-M unitary matrix Q is represented as a product
 *> of elementary Householder reflectors
 *>
 *>     Q(K) = H(1) *  H(2) * . . . * H(K),
@@ -300,7 +300,7 @@
 *>
 *>              1. The elements below the diagonal of the subarray
 *>                 A(1:M,1:K) together with TAU(1:K) represent the
-*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 unitary matrix Q(K) as a product of K Householder
 *>                 elementary reflectors.
 *>
 *>              2. The elements on and above the diagonal of
@@ -579,8 +579,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f
index 073ad0f88..6b1db085a 100644
--- a/lapack-netlib/SRC/claqp2rk.f
+++ b/lapack-netlib/SRC/claqp2rk.f
@@ -178,7 +178,7 @@
 *>          On exit:
 *>          1. The elements in block A(IOFFSET+1:M,1:K) below
 *>             the diagonal together with the array TAU represent
-*>             the orthogonal matrix Q(K) as a product of elementary
+*>             the unitary matrix Q(K) as a product of elementary
 *>             reflectors.
 *>          2. The upper triangular block of the matrix A stored
 *>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
@@ -332,8 +332,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f
index af5e85645..3703bcbd6 100644
--- a/lapack-netlib/SRC/claqp3rk.f
+++ b/lapack-netlib/SRC/claqp3rk.f
@@ -196,7 +196,7 @@
 *>          On exit:
 *>          1. The elements in block A(IOFFSET+1:M,1:KB) below
 *>             the diagonal together with the array TAU represent
-*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             the unitary matrix Q(KB) as a product of elementary
 *>             reflectors.
 *>          2. The upper triangular block of the matrix A stored
 *>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
@@ -383,8 +383,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f
index ace97b712..117a68287 100644
--- a/lapack-netlib/SRC/dgeqp3rk.f
+++ b/lapack-netlib/SRC/dgeqp3rk.f
@@ -573,8 +573,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dlaqp2rk.f b/lapack-netlib/SRC/dlaqp2rk.f
index b5a84d0de..aecd6bb69 100644
--- a/lapack-netlib/SRC/dlaqp2rk.f
+++ b/lapack-netlib/SRC/dlaqp2rk.f
@@ -331,8 +331,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dlaqp3rk.f b/lapack-netlib/SRC/dlaqp3rk.f
index 39e617d0e..8139345ed 100644
--- a/lapack-netlib/SRC/dlaqp3rk.f
+++ b/lapack-netlib/SRC/dlaqp3rk.f
@@ -389,8 +389,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f
index 17559c7f4..bb5da72dc 100644
--- a/lapack-netlib/SRC/sgeqp3rk.f
+++ b/lapack-netlib/SRC/sgeqp3rk.f
@@ -573,8 +573,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/slaqp2rk.f b/lapack-netlib/SRC/slaqp2rk.f
index d3dbb3d7c..f88b0ce90 100644
--- a/lapack-netlib/SRC/slaqp2rk.f
+++ b/lapack-netlib/SRC/slaqp2rk.f
@@ -331,8 +331,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/slaqp3rk.f b/lapack-netlib/SRC/slaqp3rk.f
index fa735bb9d..b2dc2b334 100644
--- a/lapack-netlib/SRC/slaqp3rk.f
+++ b/lapack-netlib/SRC/slaqp3rk.f
@@ -389,8 +389,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f
index f8ef986c7..247a3c379 100644
--- a/lapack-netlib/SRC/zgeqp3rk.f
+++ b/lapack-netlib/SRC/zgeqp3rk.f
@@ -55,7 +55,7 @@
 *> where:
 *>
 *>  P(K)            is an N-by-N permutation matrix;
-*>  Q(K)            is an M-by-M orthogonal matrix;
+*>  Q(K)            is an M-by-M unitary matrix;
 *>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
 *>                    full rank factor R with K-by-K upper-triangular
 *>                    R11(K) and K-by-N rectangular R12(K). The diagonal
@@ -124,14 +124,14 @@
 *>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
 *>        column 2-norm of the original matrix A, which is equal
 *>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
-*>     e) Q(K)**H * B, the matrix B with the orthogonal
+*>     e) Q(K)**H * B, the matrix B with the unitary
 *>        transformation Q(K)**H applied on the left.
 *>
 *> The N-by-N permutation matrix P(K) is stored in a compact form in
 *> the integer array JPIV. For 1 <= j <= N, column j
 *> of the matrix A was interchanged with column JPIV(j).
 *>
-*> The M-by-M orthogonal matrix Q is represented as a product
+*> The M-by-M unitary matrix Q is represented as a product
 *> of elementary Householder reflectors
 *>
 *>     Q(K) = H(1) *  H(2) * . . . * H(K),
@@ -300,7 +300,7 @@
 *>
 *>              1. The elements below the diagonal of the subarray
 *>                 A(1:M,1:K) together with TAU(1:K) represent the
-*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 unitary matrix Q(K) as a product of K Householder
 *>                 elementary reflectors.
 *>
 *>              2. The elements on and above the diagonal of
@@ -579,8 +579,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zlaqp2rk.f b/lapack-netlib/SRC/zlaqp2rk.f
index f1e9f4899..f6bf555c2 100644
--- a/lapack-netlib/SRC/zlaqp2rk.f
+++ b/lapack-netlib/SRC/zlaqp2rk.f
@@ -178,7 +178,7 @@
 *>          On exit:
 *>          1. The elements in block A(IOFFSET+1:M,1:K) below
 *>             the diagonal together with the array TAU represent
-*>             the orthogonal matrix Q(K) as a product of elementary
+*>             the unitary matrix Q(K) as a product of elementary
 *>             reflectors.
 *>          2. The upper triangular block of the matrix A stored
 *>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
@@ -332,8 +332,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zlaqp3rk.f b/lapack-netlib/SRC/zlaqp3rk.f
index 7a9fdfd95..0dd8bf8e3 100644
--- a/lapack-netlib/SRC/zlaqp3rk.f
+++ b/lapack-netlib/SRC/zlaqp3rk.f
@@ -196,7 +196,7 @@
 *>          On exit:
 *>          1. The elements in block A(IOFFSET+1:M,1:KB) below
 *>             the diagonal together with the array TAU represent
-*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             the unitary matrix Q(KB) as a product of elementary
 *>             reflectors.
 *>          2. The upper triangular block of the matrix A stored
 *>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
@@ -383,8 +383,8 @@
 *> \verbatim
 *>
 *>  November  2023, Igor Kozachenko, James Demmel,
-*>                  Computer Science Division,
-*>                  University of California, Berkeley
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
 *>
 *> \endverbatim
 *

From f7351e493c6ce928e5c924c15d709c3de7c62d7b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 25 Nov 2023 18:49:34 +0100
Subject: [PATCH 436/718] Update Reference-LAPACK docs to 3.12.0

---
 lapack-netlib/DOCS/Doxyfile          |  478 +++++---
 lapack-netlib/DOCS/DoxygenLayout.xml |  197 ++++
 lapack-netlib/DOCS/groups-usr.dox    | 1575 ++++++++++++++++----------
 3 files changed, 1480 insertions(+), 770 deletions(-)
 create mode 100644 lapack-netlib/DOCS/DoxygenLayout.xml

diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile
index 43cea43b5..577675772 100644
--- a/lapack-netlib/DOCS/Doxyfile
+++ b/lapack-netlib/DOCS/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.10
+# Doxyfile 1.9.1
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
@@ -38,7 +38,7 @@ PROJECT_NAME           = LAPACK
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 3.9.0
+PROJECT_NUMBER         = 3.12.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES    = NO
 
 OUTPUT_LANGUAGE        = English
 
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
 # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
@@ -179,6 +187,16 @@ SHORT_NAMES            = NO
 
 JAVADOC_AUTOBRIEF      = NO
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -199,6 +217,14 @@ QT_AUTOBRIEF           = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -226,16 +252,15 @@ TAB_SIZE               = 8
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
 # "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
 
 ALIASES                =
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -264,28 +289,40 @@ OPTIMIZE_FOR_FORTRAN   = YES
 
 OPTIMIZE_OUTPUT_VHDL   = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
@@ -293,6 +330,15 @@ EXTENSION_MAPPING      =
 
 MARKDOWN_SUPPORT       = YES
 
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
+
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
 # be prevented in individual cases by putting a % sign in front of the word or
@@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT    = NO
 CPP_CLI_SUPPORT        = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -341,7 +387,7 @@ IDL_PROPERTY_SUPPORT   = YES
 # all members of a group must be documented explicitly.
 # The default value is: NO.
 
-DISTRIBUTE_GROUP_DOC   = YES
+DISTRIBUTE_GROUP_DOC   = NO
 
 # If one adds a struct or class to a group and this option is enabled, then also
 # any nested class or struct is added to the same group. By default this option
@@ -404,6 +450,19 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which efficively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -424,6 +483,12 @@ EXTRACT_ALL            = YES
 
 EXTRACT_PRIVATE        = NO
 
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
 # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
@@ -461,6 +526,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -478,8 +550,8 @@ HIDE_UNDOC_MEMBERS     = NO
 HIDE_UNDOC_CLASSES     = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
@@ -498,11 +570,18 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = NO
@@ -684,12 +763,12 @@ FILE_VERSION_FILTER    =
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
 # tag is left empty.
 
-LAYOUT_FILE            =
+LAYOUT_FILE            = DOCS/DoxygenLayout.xml
 
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -705,7 +784,7 @@ CITE_BIB_FILES         =
 # messages are off.
 # The default value is: NO.
 
-QUIET                  = YES
+QUIET                  = NO
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
 # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
@@ -734,10 +813,20 @@ WARN_IF_DOC_ERROR      = YES
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
 # value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
 # The default value is: NO.
 
-WARN_NO_PARAMDOC       = NO
+WARN_NO_PARAMDOC       = YES
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
 
 # The WARN_FORMAT tag determines the format of the warning messages that doxygen
 # can produce. The string should contain the $file, $line, and $text tags, which
@@ -753,7 +842,7 @@ WARN_FORMAT            = "$file:$line: $text"
 # messages should be written. If left blank the output is written to standard
 # error (stderr).
 
-WARN_LOGFILE           = output_err
+WARN_LOGFILE           = doxygen_error
 
 #---------------------------------------------------------------------------
 # Configuration options related to the input files
@@ -762,17 +851,18 @@ WARN_LOGFILE           = output_err
 # The INPUT tag is used to specify the files and/or directories that contain
 # documented source files. You may enter file names like myfile.cpp or
 # directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = . \
-                         DOCS/groups-usr.dox
+INPUT                  = BLAS CBLAS SRC INSTALL TESTING \
+                         DOCS/groups-usr.dox \
+                         README.md
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
@@ -785,14 +875,19 @@ INPUT_ENCODING         = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
 # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
+# *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          = *.c \
                          *.f \
+                         *.f90 \
                          *.h
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
@@ -808,34 +903,15 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                = CMAKE \
-                         DOCS \
-                         .svn \
-                         CBLAS/.svn \
-                         CBLAS/src/.svn \
-                         CBLAS/testing/.svn \
-                         CBLAS/example/.svn \
-                         CBLAS/include/.svn \
-                         BLAS/.svn \
-                         BLAS/SRC/.svn \
-                         BLAS/TESTING/.svn \
-                         SRC/.svn \
-                         SRC/VARIANTS/.svn \
-                         SRC/VARIANTS/LIB/.svn \
-                         SRC/VARIANTS/cholesky/.svn \
-                         SRC/VARIANTS/cholesky/RL/.svn \
-                         SRC/VARIANTS/cholesky/TOP/.svn \
-                         SRC/VARIANTS/lu/.svn \
-                         SRC/VARIANTS/lu/CR/.svn \
-                         SRC/VARIANTS/lu/LL/.svn \
-                         SRC/VARIANTS/lu/REC/.svn \
-                         SRC/VARIANTS/qr/.svn \
-                         SRC/VARIANTS/qr/LL/.svn \
-                         INSTALL/.svn \
-                         TESTING/.svn \
-                         TESTING/EIG/.svn \
-                         TESTING/MATGEN/.svn \
-                         TESTING/LIN/.svn
+# Exclude functions that are duplicated, creating conflicts.
+EXCLUDE                = .git \
+                         .github \
+                         SRC/VARIANTS \
+                         BLAS/SRC/lsame.f \
+                         BLAS/SRC/xerbla.f \
+                         BLAS/SRC/xerbla_array.f \
+                         INSTALL/slamchf77.f \
+                         INSTALL/dlamchf77.f \
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -908,6 +984,10 @@ IMAGE_PATH             =
 # Note that the filter must not add or remove lines; it is applied before the
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER           =
 
@@ -917,6 +997,10 @@ INPUT_FILTER           =
 # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS        =
 
@@ -969,7 +1053,7 @@ INLINE_SOURCES         = YES
 STRIP_CODE_COMMENTS    = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = NO
@@ -1001,12 +1085,12 @@ SOURCE_TOOLTIPS        = YES
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -1028,25 +1112,6 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          =
-
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1058,13 +1123,6 @@ CLANG_OPTIONS          =
 
 ALPHABETICAL_INDEX     = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
 # can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1165,7 +1223,7 @@ HTML_EXTRA_FILES       =
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
 # this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1201,6 +1259,17 @@ HTML_COLORSTYLE_GAMMA  = 80
 
 HTML_TIMESTAMP         = YES
 
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1224,13 +1293,14 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
@@ -1269,8 +1339,8 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# (see:
+# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1300,7 +1370,7 @@ CHM_FILE               =
 HHC_LOCATION           =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
@@ -1345,7 +1415,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1353,8 +1424,8 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1362,30 +1433,30 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1462,6 +1533,17 @@ TREEVIEW_WIDTH         = 250
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1471,7 +1553,7 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
@@ -1482,8 +1564,14 @@ FORMULA_FONTSIZE       = 10
 
 FORMULA_TRANSPARENT    = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1495,7 +1583,7 @@ USE_MATHJAX            = NO
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
 # Possible values are: HTML-CSS (which is slower, but has the best
 # compatibility), NativeMML (i.e. MathML) and SVG.
 # The default value is: HTML-CSS.
@@ -1510,8 +1598,8 @@ MATHJAX_FORMAT         = HTML-CSS
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
@@ -1525,7 +1613,8 @@ MATHJAX_EXTENSIONS     =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1553,7 +1642,7 @@ MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1572,7 +1661,8 @@ SERVER_BASED_SEARCH    = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1585,8 +1675,9 @@ EXTERNAL_SEARCH        = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL       =
@@ -1637,21 +1728,35 @@ LATEX_OUTPUT           = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME         = latex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
 # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
@@ -1736,9 +1841,11 @@ LATEX_EXTRA_FILES      =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1772,12 +1879,28 @@ LATEX_SOURCE_CODE      = NO
 
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BIB_STYLE        = plain
 
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
@@ -1817,9 +1940,9 @@ COMPACT_RTF            = NO
 
 RTF_HYPERLINKS         = YES
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1828,8 +1951,8 @@ RTF_HYPERLINKS         = YES
 RTF_STYLESHEET_FILE    =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
@@ -1915,6 +2038,13 @@ XML_OUTPUT             = xml
 
 XML_PROGRAMLISTING     = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
@@ -1947,9 +2077,9 @@ DOCBOOK_PROGRAMLISTING = NO
 #---------------------------------------------------------------------------
 
 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
@@ -2116,12 +2246,6 @@ EXTERNAL_GROUPS        = YES
 
 EXTERNAL_PAGES         = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /sw/bin/perl
-
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
@@ -2135,15 +2259,6 @@ PERL_PATH              = /sw/bin/perl
 
 CLASS_DIAGRAMS         = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
@@ -2241,10 +2356,32 @@ UML_LOOK               = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS   = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will wrapped across multiple lines. Some heuristics are apply
+# to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2371,6 +2508,11 @@ DIAFILE_DIRS           =
 
 PLANTUML_JAR_PATH      =
 
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
 # When using plantuml, the specified paths are searched for files specified by
 # the !include statement in a plantuml block.
 
@@ -2429,9 +2571,11 @@ DOT_MULTI_TARGETS      = NO
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc and
+# plantuml temporary files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
diff --git a/lapack-netlib/DOCS/DoxygenLayout.xml b/lapack-netlib/DOCS/DoxygenLayout.xml
new file mode 100644
index 000000000..aeb346d8d
--- /dev/null
+++ b/lapack-netlib/DOCS/DoxygenLayout.xml
@@ -0,0 +1,197 @@
+<doxygenlayout version="1.0">
+  <!-- Generated by doxygen 1.8.13 -->
+  <!-- Navigation index tabs for HTML output -->
+  <navindex>
+    <tab type="mainpage" visible="yes" title=""/>
+    <tab type="pages" visible="yes" title="" intro=""/>
+
+    <!-- LAPACK: change "Modules" to "Routines" -->
+    <tab type="modules" visible="yes" title="Routines" intro=""/>
+
+    <tab type="namespaces" visible="yes" title="">
+      <tab type="namespacelist" visible="yes" title="" intro=""/>
+      <tab type="namespacemembers" visible="yes" title="" intro=""/>
+    </tab>
+    <tab type="classes" visible="yes" title="">
+      <tab type="classlist" visible="yes" title="" intro=""/>
+      <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
+      <tab type="hierarchy" visible="yes" title="" intro=""/>
+      <tab type="classmembers" visible="yes" title="" intro=""/>
+    </tab>
+    <tab type="files" visible="yes" title="">
+      <tab type="filelist" visible="yes" title="" intro=""/>
+      <tab type="globals" visible="yes" title="" intro=""/>
+    </tab>
+    <tab type="examples" visible="yes" title="" intro=""/>
+  </navindex>
+
+  <!-- Layout definition for a class page -->
+  <class>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <inheritancegraph visible="$CLASS_GRAPH"/>
+    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+    <memberdecl>
+      <nestedclasses visible="yes" title=""/>
+      <publictypes title=""/>
+      <services title=""/>
+      <interfaces title=""/>
+      <publicslots title=""/>
+      <signals title=""/>
+      <publicmethods title=""/>
+      <publicstaticmethods title=""/>
+      <publicattributes title=""/>
+      <publicstaticattributes title=""/>
+      <protectedtypes title=""/>
+      <protectedslots title=""/>
+      <protectedmethods title=""/>
+      <protectedstaticmethods title=""/>
+      <protectedattributes title=""/>
+      <protectedstaticattributes title=""/>
+      <packagetypes title=""/>
+      <packagemethods title=""/>
+      <packagestaticmethods title=""/>
+      <packageattributes title=""/>
+      <packagestaticattributes title=""/>
+      <properties title=""/>
+      <events title=""/>
+      <privatetypes title=""/>
+      <privateslots title=""/>
+      <privatemethods title=""/>
+      <privatestaticmethods title=""/>
+      <privateattributes title=""/>
+      <privatestaticattributes title=""/>
+      <friends title=""/>
+      <related title="" subtitle=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <services title=""/>
+      <interfaces title=""/>
+      <constructors title=""/>
+      <functions title=""/>
+      <related title=""/>
+      <variables title=""/>
+      <properties title=""/>
+      <events title=""/>
+    </memberdef>
+    <allmemberslink visible="yes"/>
+    <usedfiles visible="$SHOW_USED_FILES"/>
+    <authorsection visible="yes"/>
+  </class>
+
+  <!-- Layout definition for a namespace page -->
+  <namespace>
+    <briefdescription visible="yes"/>
+    <memberdecl>
+      <nestednamespaces visible="yes" title=""/>
+      <constantgroups visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </namespace>
+
+  <!-- Layout definition for a file page -->
+  <file>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <includegraph visible="$INCLUDE_GRAPH"/>
+    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+    <sourcelink visible="yes"/>
+    <memberdecl>
+      <classes visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <constantgroups visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection/>
+  </file>
+
+  <!-- Layout definition for a group page -->
+  <group>
+    <briefdescription visible="yes"/>
+    <groupgraph visible="$GROUP_GRAPHS"/>
+    <memberdecl>
+      <nestedgroups visible="yes" title=""/>
+      <dirs visible="yes" title=""/>
+      <files visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <pagedocs/>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </group>
+
+  <!-- Layout definition for a directory page -->
+  <directory>
+    <briefdescription visible="yes"/>
+    <directorygraph visible="yes"/>
+    <memberdecl>
+      <dirs visible="yes"/>
+      <files visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+  </directory>
+</doxygenlayout>
diff --git a/lapack-netlib/DOCS/groups-usr.dox b/lapack-netlib/DOCS/groups-usr.dox
index 6c31e4cf5..cbd747165 100644
--- a/lapack-netlib/DOCS/groups-usr.dox
+++ b/lapack-netlib/DOCS/groups-usr.dox
@@ -1,605 +1,974 @@
 /**
- *
- ****
- *
- * @defgroup lapack LAPACK
- *   This is the group of LAPACK routines.
- *
- * @defgroup GE  General Matrices
- *   @ingroup lapack
- *   This is the group of General Matrices routines
- * @defgroup GB  General Band Matrix
- *   @ingroup lapack
- *   This is the group of General Band routines
- * @defgroup SY  Symmetric Matrix
- *   @ingroup lapack
- *   This is the group of Symmetric Matrix routines
- * @defgroup HE  Hermitian Matrix
- *   @ingroup lapack
- *   This is the group of Hermitian Matrix routines
- * @defgroup PO  Positive Definite Matrix
- *   @ingroup lapack
- *   This is the group of Positive Definite routines
- * @defgroup GT  General tridiagonal Matrix
- *   @ingroup lapack
- *   This is the group of General tridiagonal routines
- * @defgroup PT  Positive Definite tridiagonal Matrix
- *   @ingroup lapack
- *   This is the group of Positive Definite tridiagonal routines
- * @defgroup OTHEReigen  Eigenvalue
- *   @ingroup lapack
- *   This is the group of Other Eigenvalue routines
- * @defgroup OTHERauxiliary  Other Auxiliary Routines
- *   @ingroup lapack
- *   This is the group of Other Auxiliary routines
- * @defgroup OTHERcomputational  Other Computational Routines
- *   @ingroup lapack
- *   This is the group of Other Computational routines
- * @defgroup OTHERsolve  Other Solve Routines
- *   @ingroup lapack
- *   This is the group of Other Solve routines
- *
- ****
- *
- * @defgroup solveGE  Linear Solve
- *   @ingroup GE
- *   This is the group of Linear Solve Driver routines
- * @defgroup solveGB  Linear Solve
- *   @ingroup GB
- *   This is the group of Linear Solve Driver routines
- * @defgroup solveSY  Linear Solve
- *   @ingroup SY
- *   This is the group of Linear Solve Driver routines
- * @defgroup solveHE  Linear Solve
- *   @ingroup HE
- *   This is the group of Linear Solve Driver routines
- * @defgroup solvePO  Linear Solve
- *   @ingroup PO
- *   This is the group of Linear Solve Driver routines
- * @defgroup solveGT  Linear Solve
- *   @ingroup GT
- *   This is the group of Linear Solve Driver routines
- * @defgroup solvePT  Linear Solve
- *   @ingroup PT
- *   This is the group of Linear Solve Driver routines
- * @defgroup eigenGE  Eigenvalue
- *   @ingroup GE
- *   This is the group of Eigenvalue Driver routines
- * @defgroup eigenSY  Eigenvalue
- *   @ingroup SY
- *   This is the group of Eigenvalue Driver routines
- * @defgroup eigenHE  Eigenvalue
- *   @ingroup HE
- *   This is the group of Eigenvalue Driver routines
- * @defgroup singGE  Singular Value
- *   @ingroup GE
- *   This is the group of Singular Value Driver routines
- * @defgroup computationalGE  Computational routines
- *   @ingroup GE
- *   This is the group of Computational routines
- * @defgroup variantsGEcomputational  Variants Computational routines
- *   @ingroup GE
- *   This is the group of Variants Computational routines
- * @defgroup computationalGB  Computational routines
- *   @ingroup GB
- *   This is the group of Computational routines
- * @defgroup computationalSY  Computational routines
- *   @ingroup SY
- *   This is the group of Computational routines
- * @defgroup computationalHE  Computational routines
- *   @ingroup HE
- *   This is the group of Computational routines
- * @defgroup computationalPO  Computational routines
- *   @ingroup PO
- *   This is the group of Computational routines
- * @defgroup variantsPOcomputational  Variants Computational routines
- *   @ingroup PO
- *   This is the group of Variants Computational routines
- * @defgroup computationalGT  Computational routines
- *   @ingroup GT
- *   This is the group of Computational routines
- * @defgroup computationalPT  Computational routines
- *   @ingroup PT
- *   This is the group of Computational routines
- * @defgroup variantsOTHERcomputational  Variants Computational routines
- *   @ingroup OTHERcomputational
- *   This is the group of Variants Computational routines
- * @defgroup auxiliaryGE  Auxiliary routines
- *   @ingroup GE
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryGB  Auxiliary routines
- *   @ingroup GB
- *   This is the group of Auxiliary routines
- * @defgroup auxiliarySY  Auxiliary routines
- *   @ingroup SY
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryHE  Auxiliary routines
- *   @ingroup HE
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryPO  Auxiliary routines
- *   @ingroup PO
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryGT  Auxiliary routines
- *   @ingroup GT
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryPT  Auxiliary routines
- *   @ingroup PT
- *   This is the group of Auxiliary routines
- *
- ****
- *
- * @defgroup doubleGEsolve double
- *   @ingroup solveGE
- *   This is the group of double solve driver functions for GE matrices
- * @defgroup doublePOsolve double
- *   @ingroup solvePO
- *   This is the group of double solve driver functions for PO matrices
- * @defgroup doubleSYsolve double
- *   @ingroup solveSY
- *   This is the group of double solve driver functions for SY matrices
- * @defgroup doubleGBsolve double
- *   @ingroup solveGB
- *   This is the group of double solve driver functions for GB matrices
- * @defgroup doubleGTsolve double
- *   @ingroup solveGT
- *   This is the group of double solve driver functions for GT matrices
- * @defgroup doublePTsolve double
- *   @ingroup solvePT
- *   This is the group of double solve driver functions for PT matrices
- * @defgroup doubleGEeigen double
- *   @ingroup eigenGE
- *   This is the group of double eigenvalue driver functions for GE matrices
- * @defgroup doubleSYeigen double
- *   @ingroup eigenSY
- *   This is the group of double eigenvalue driver functions for SY matrices
- * @defgroup doubleGEsing double
- *   @ingroup singGE
- *   This is the group of double singular value driver functions for GE matrices
- * @defgroup doubleGEcomputational double
- *   @ingroup computationalGE
- *   This is the group of double computational functions for GE matrices
- * @defgroup doublePOcomputational double
- *   @ingroup computationalPO
- *   This is the group of double computational functions for PO matrices
- * @defgroup doubleSYcomputational double
- *   @ingroup computationalSY
- *   This is the group of double computational functions for SY matrices
- * @defgroup doubleGBcomputational double
- *   @ingroup computationalGB
- *   This is the group of double computational functions for GB matrices
- * @defgroup doubleGTcomputational double
- *   @ingroup computationalGT
- *   This is the group of double computational functions for GT matrices
- * @defgroup doublePTcomputational double
- *   @ingroup computationalPT
- *   This is the group of double computational functions for PT matrices
- * @defgroup doubleGEauxiliary double
- *   @ingroup auxiliaryGE
- *   This is the group of double auxiliary functions for GE matrices
- * @defgroup doublePOauxiliary double
- *   @ingroup auxiliaryPO
- *   This is the group of double auxiliary functions for PO matrices
- * @defgroup doubleSYauxiliary double
- *   @ingroup auxiliarySY
- *   This is the group of double auxiliary functions for SY matrices
- * @defgroup doubleGBauxiliary double
- *   @ingroup auxiliaryGB
- *   This is the group of double auxiliary functions for GB matrices
- * @defgroup doublePTauxiliary double
- *   @ingroup auxiliaryPT
- *   This is the group of double auxiliary functions for PT matrices
- * @defgroup doubleGTauxiliary double
- *   @ingroup auxiliaryGT
- *   This is the group of double auxiliary functions for GT matrices
- * @defgroup doubleOTHERauxiliary  double
- *   @ingroup OTHERauxiliary
- *   This is the group of double other auxiliary routines
- * @defgroup doubleOTHERcomputational  double
- *   @ingroup OTHERcomputational
- *   This is the group of double other Computational routines
- * @defgroup doubleOTHERsolve  double Other Solve Routines
- *   @ingroup OTHERsolve
- *   This is the group of double Other Solve routines
- * @defgroup doubleOTHEReigen  double
- *   @ingroup OTHEReigen
- *   This is the group of double Other Eigenvalue routines
- *
- ****
- *
- * @defgroup realGEsolve real
- *   @ingroup solveGE
- *   This is the group of real solve driver functions for GE matrices
- * @defgroup realPOsolve real
- *   @ingroup solvePO
- *   This is the group of real solve driver functions for PO matrices
- * @defgroup realSYsolve real
- *   @ingroup solveSY
- *   This is the group of real solve driver functions for SY matrices
- * @defgroup realGBsolve real
- *   @ingroup solveGB
- *   This is the group of real solve driver functions for GB matrices
- * @defgroup realGTsolve real
- *   @ingroup solveGT
- *   This is the group of real solve driver functions for GT matrices
- * @defgroup realPTsolve real
- *   @ingroup solvePT
- *   This is the group of real solve driver functions for PT matrices
- * @defgroup realGEeigen real
- *   @ingroup eigenGE
- *   This is the group of real eigenvalue driver functions for GE matrices
- * @defgroup realSYeigen real
- *   @ingroup eigenSY
- *   This is the group of real eigenvalue driver functions for SY matrices
- * @defgroup realGEsing real
- *   @ingroup singGE
- *   This is the group of real singular value driver functions for GE matrices
- * @defgroup realGEcomputational real
- *   @ingroup computationalGE
- *   This is the group of real computational functions for GE matrices
- * @defgroup realPOcomputational real
- *   @ingroup computationalPO
- *   This is the group of real computational functions for PO matrices
- * @defgroup realSYcomputational real
- *   @ingroup computationalSY
- *   This is the group of real computational functions for SY matrices
- * @defgroup realGBcomputational real
- *   @ingroup computationalGB
- *   This is the group of real computational functions for GB matrices
- * @defgroup realPTcomputational real
- *   @ingroup computationalPT
- *   This is the group of real computational functions for PT matrices
- * @defgroup realGTcomputational real
- *   @ingroup computationalGT
- *   This is the group of real computational functions for GT matrices
- * @defgroup realGEauxiliary real
- *   @ingroup auxiliaryGE
- *   This is the group of real auxiliary functions for GE matrices
- * @defgroup realPOauxiliary real
- *   @ingroup auxiliaryPO
- *   This is the group of real auxiliary functions for PO matrices
- * @defgroup realSYauxiliary real
- *   @ingroup auxiliarySY
- *   This is the group of real auxiliary functions for SY matrices
- * @defgroup realGBauxiliary real
- *   @ingroup auxiliaryGB
- *   This is the group of real auxiliary functions for GB matrices
- * @defgroup realGTauxiliary real
- *   @ingroup auxiliaryGT
- *   This is the group of real auxiliary functions for GT matrices
- * @defgroup realPTauxiliary real
- *   @ingroup auxiliaryPT
- *   This is the group of real auxiliary functions for PT matrices
- * @defgroup realOTHERauxiliary  real
- *   @ingroup OTHERauxiliary
- *   This is the group of real other auxiliary routines
- * @defgroup realOTHERcomputational  real
- *   @ingroup OTHERcomputational
- *   This is the group of real other Computational routines
- * @defgroup realOTHERsolve  real Other Solve Routines
- *   @ingroup OTHERsolve
- *   This is the group of real Other Solve routines
- * @defgroup realOTHEReigen  real
- *   @ingroup OTHEReigen
- *   This is the group of real Other Eigenvalue routines
- *
- ****
- *
- * @defgroup complexGEsolve complex
- *   @ingroup solveGE
- *   This is the group of complex solve driver functions for GE matrices
- * @defgroup complexPOsolve complex
- *   @ingroup solvePO
- *   This is the group of complex solve driver functions for PO matrices
- * @defgroup complexSYsolve complex
- *   @ingroup solveSY
- *   This is the group of complex solve driver functions for SY matrices
- * @defgroup complexHEsolve complex
- *   @ingroup solveHE
- *   This is the group of complex solve driver functions for HE matrices
- * @defgroup complexGBsolve complex
- *   @ingroup solveGB
- *   This is the group of complex solve driver functions for GB matrices
- * @defgroup complexGTsolve complex
- *   @ingroup solveGT
- *   This is the group of complex solve driver functions for GT matrices
- * @defgroup complexPTsolve complex
- *   @ingroup solvePT
- *   This is the group of complex solve driver functions for PT matrices
- * @defgroup complexGEeigen complex
- *   @ingroup eigenGE
- *   This is the group of complex eigenvalue driver functions for GE matrices
- * @defgroup complexSYeigen complex
- *   @ingroup eigenSY
- *   This is the group of complex eigenvalue driver functions for SY matrices
- * @defgroup complexHEeigen complex
- *   @ingroup eigenHE
- *   This is the group of complex eigenvalue driver functions for HE matrices
- * @defgroup complexGEsing complex
- *   @ingroup singGE
- *   This is the group of complex singular value driver functions for GE matrices
- * @defgroup complexGEcomputational complex
- *   @ingroup computationalGE
- *   This is the group of complex computational functions for GE matrices
- * @defgroup complexPOcomputational complex
- *   @ingroup computationalPO
- *   This is the group of complex computational functions for PO matrices
- * @defgroup complexSYcomputational complex
- *   @ingroup computationalSY
- *   This is the group of complex computational functions for SY matrices
- * @defgroup complexHEcomputational complex
- *   @ingroup computationalHE
- *   This is the group of complex computational functions for HE matrices
- * @defgroup complexGBcomputational complex
- *   @ingroup computationalGB
- *   This is the group of complex computational functions for GB matrices
- * @defgroup complexGTcomputational complex
- *   @ingroup computationalGT
- *   This is the group of complex computational functions for GT matrices
- * @defgroup complexPTcomputational complex
- *   @ingroup computationalPT
- *   This is the group of complex computational functions for PT matrices
- * @defgroup complexGEauxiliary complex
- *   @ingroup auxiliaryGE
- *   This is the group of complex auxiliary functions for GE matrices
- * @defgroup complexPOauxiliary complex
- *   @ingroup auxiliaryPO
- *   This is the group of complex auxiliary functions for PO matrices
- * @defgroup complexSYauxiliary complex
- *   @ingroup auxiliarySY
- *   This is the group of complex auxiliary functions for SY matrices
- * @defgroup complexHEauxiliary complex
- *   @ingroup auxiliaryHE
- *   This is the group of complex auxiliary functions for HE matrices
- * @defgroup complexGBauxiliary complex
- *   @ingroup auxiliaryGB
- *   This is the group of complex auxiliary functions for GB matrices
- * @defgroup complexOTHERauxiliary  complex
- *   @ingroup OTHERauxiliary
- *   This is the group of complex other auxiliary routines
- * @defgroup complexOTHERcomputational  complex
- *   @ingroup OTHERcomputational
- *   This is the group of complex other Computational routines
- * @defgroup complexOTHERsolve  complex Other Solve Routines
- *   @ingroup OTHERsolve
- *   This is the group of complex Other Solve routines
- * @defgroup complexOTHEReigen  complex Other Eigenvalue routines
- *   @ingroup OTHEReigen
- *   This is the group of complex Other Eigenvalue routines
- *
- ****
- *
- * @defgroup complex16GEsolve complex16
- *   @ingroup solveGE
- *   This is the group of complex16 solve driver functions for GE matrices
- * @defgroup complex16POsolve complex16
- *   @ingroup solvePO
- *   This is the group of complex16 solve driver functions for PO matrices
- * @defgroup complex16SYsolve complex16
- *   @ingroup solveSY
- *   This is the group of complex16 solve driver functions for SY matrices
- * @defgroup complex16HEsolve complex16
- *   @ingroup solveHE
- *   This is the group of complex16 solve driver functions for HE matrices
- * @defgroup complex16GBsolve complex16
- *   @ingroup solveGB
- *   This is the group of complex16 solve driver functions for GB matrices
- * @defgroup complex16GTsolve complex16
- *   @ingroup solveGT
- *   This is the group of complex16 solve driver functions for GT matrices
- * @defgroup complex16PTsolve complex16
- *   @ingroup solvePT
- *   This is the group of complex16 solve driver functions for PT matrices
- * @defgroup complex16GEeigen complex16
- *   @ingroup eigenGE
- *   This is the group of complex16 eigenvalue driver functions for GE matrices
- * @defgroup complex16SYeigen complex16
- *   @ingroup eigenSY
- *   This is the group of complex16 eigenvalue driver functions for SY matrices
- * @defgroup complex16HEeigen complex16
- *   @ingroup eigenHE
- *   This is the group of complex16 eigenvalue driver functions for HE matrices
- * @defgroup complex16GEsing complex16
- *   @ingroup singGE
- *   This is the group of complex16 singular value driver functions for GE matrices
- * @defgroup complex16GEcomputational complex16
- *   @ingroup computationalGE
- *   This is the group of complex16 computational functions for GE matrices
- * @defgroup complex16POcomputational complex16
- *   @ingroup computationalPO
- *   This is the group of complex16 computational functions for PO matrices
- * @defgroup complex16SYcomputational complex16
- *   @ingroup computationalSY
- *   This is the group of complex16 computational functions for SY matrices
- * @defgroup complex16HEcomputational complex16
- *   @ingroup computationalHE
- *   This is the group of complex16 computational functions for HE matrices
- * @defgroup complex16GBcomputational complex16
- *   @ingroup computationalGB
- *   This is the group of complex16 computational functions for GB matrices
- * @defgroup complex16GTcomputational complex16
- *   @ingroup computationalGT
- *   This is the group of complex16 computational functions for GT matrices
- * @defgroup complex16PTcomputational complex16
- *   @ingroup computationalPT
- *   This is the group of complex16 computational functions for PT matrices
- * @defgroup complex16GEauxiliary complex16
- *   @ingroup auxiliaryGE
- *   This is the group of complex16 auxiliary functions for GE matrices
- * @defgroup complex16POauxiliary complex16
- *   @ingroup auxiliaryPO
- *   This is the group of complex16 auxiliary functions for PO matrices
- * @defgroup complex16SYauxiliary complex16
- *   @ingroup auxiliarySY
- *   This is the group of complex16 auxiliary functions for SY matrices
- * @defgroup complex16HEauxiliary complex16
- *   @ingroup auxiliaryHE
- *   This is the group of complex16 auxiliary functions for HE matrices
- * @defgroup complex16GBauxiliary complex16
- *   @ingroup auxiliaryGB
- *   This is the group of complex16 auxiliary functions for GB matrices
- * @defgroup complex16OTHERcomputational  complex16
- *   @ingroup OTHERcomputational
- *   This is the group of complex16 other Computational routines
- * @defgroup complex16OTHERauxiliary  complex16
- *   @ingroup OTHERauxiliary
- *   This is the group of complex16 other auxiliary routines
- * @defgroup auxOTHERcomputational  auxiliary Computational routines
- *   @ingroup OTHERcomputational
- *   This is the group of auxiliary Computational routines
- * @defgroup complex16OTHERsolve  complex16 Other Solve Routines
- *   @ingroup OTHERsolve
- *   This is the group of complex16 Other Solve routines
- * @defgroup complex16OTHEReigen  complex16 Other Eigenvalue routines
- *   @ingroup OTHEReigen
- *   This is the group of complex16 Other Eigenvalue routines
- *
- ****
- *
- * @defgroup testing LAPACK Testing
- *   This is the group of LAPACK TESTING routines.
- *
- * @defgroup matgen Matrix Generation
- *   @ingroup testing
- *   This is the group of LAPACK TESTING MATGEN routines.
- *
- * @defgroup lin Linear Solve
- *   @ingroup testing
- *   This is the group of LAPACK TESTING LIN routines.
- *
- * @defgroup eig Eigenvalue and Singular value
- *   @ingroup testing
- *   This is the group of LAPACK TESTING EIG routines.
- *
- * @defgroup real_matgen real
- *   @ingroup matgen
- *   This is the group of real LAPACK TESTING MATGEN routines.
- *
- * @defgroup double_matgen double
- *   @ingroup matgen
- *   This is the group of double LAPACK TESTING MATGEN routines.
- *
- * @defgroup complex_matgen complex
- *   @ingroup matgen
- *   This is the group of complex LAPACK TESTING MATGEN routines.
- *
- * @defgroup complex16_matgen complex16
- *   @ingroup matgen
- *   This is the group of complex16 LAPACK TESTING MATGEN routines.
- *
- * @defgroup aux_matgen aux
- *   @ingroup matgen
- *   This is the group of auxiliary LAPACK TESTING MATGEN routines.
- *
- * @defgroup single_lin real
- *   @ingroup lin
- *   This is the group of real LAPACK TESTING LIN routines.
- *
- * @defgroup double_lin double
- *   @ingroup lin
- *   This is the group of double LAPACK TESTING LIN routines.
- *
- * @defgroup complex_lin complex
- *   @ingroup lin
- *   This is the group of complex LAPACK TESTING LIN routines.
- *
- * @defgroup complex16_lin complex16
- *   @ingroup lin
- *   This is the group of complex16 LAPACK TESTING LIN routines.
- *
- * @defgroup aux_lin aux
- *   @ingroup lin
- *   This is the group of auxiliary LAPACK TESTING LIN routines.
- *
- * @defgroup single_eig real
- *   @ingroup eig
- *   This is the group of real LAPACK TESTING EIG routines.
- *
- * @defgroup double_eig double
- *   @ingroup eig
- *   This is the group of double LAPACK TESTING EIG routines.
- *
- * @defgroup complex_eig complex
- *   @ingroup eig
- *   This is the group of complex LAPACK TESTING EIG routines.
- *
- * @defgroup complex16_eig complex16
- *   @ingroup eig
- *   This is the group of complex16 LAPACK TESTING EIG routines.
- *
- * @defgroup aux_eig aux
- *   @ingroup eig
- *   This is the group of auxiliary LAPACK TESTING EIG routines.
- *
- ****
- * @defgroup blas Reference BLAS
- *   This is the group of BLAS routines.
- *
- * @defgroup level1 Level1
- *   @ingroup blas
- *   This is the group of LEVEL 1 BLAS routines.
- * @defgroup level2 Level2
- *   @ingroup blas
- *   This is the group of LEVEL 2 BLAS routines.
- * @defgroup level3 Level3
- *   @ingroup blas
- *   This is the group of LEVEL 3 BLAS routines.
-  * @defgroup aux_blas Auxiliary BLAS
- *   @ingroup blas
- *   This is the group of Auxiliary 3 BLAS routines.
-* @defgroup blastesting Testing
- *   @ingroup blas
- *   This is the group of BLAS TESTING routines.
- *
- * @defgroup single_blas_level1 real
- *   @ingroup level1
- *   This is the group of real LEVEL 1 BLAS routines.
- * @defgroup double_blas_level1 double
- *   @ingroup level1
- *   This is the group of double LEVEL 1 BLAS routines.
- * @defgroup complex_blas_level1 complex
- *   @ingroup level1
- *   This is the group of complex LEVEL 1 BLAS routines.
- * @defgroup complex16_blas_level1 complex16
- *   @ingroup level1
- *   This is the group of complex16 LEVEL 1 BLAS routines.
- *
- * @defgroup single_blas_level2 real
- *   @ingroup level2
- *   This is the group of real LEVEL 2 BLAS routines.
- * @defgroup double_blas_level2 double
- *   @ingroup level2
- *   This is the group of double LEVEL 2 BLAS routines.
- * @defgroup complex_blas_level2 complex
- *   @ingroup level2
- *   This is the group of complex LEVEL 2 BLAS routines.
- * @defgroup complex16_blas_level2 complex16
- *   @ingroup level2
- *   This is the group of complex16 LEVEL 2 BLAS routines.
- *
- * @defgroup single_blas_level3 real
- *   @ingroup level3
- *   This is the group of real LEVEL 3 BLAS routines.
- * @defgroup double_blas_level3 double
- *   @ingroup level3
- *   This is the group of double LEVEL 3 BLAS routines.
- * @defgroup complex_blas_level3 complex
- *   @ingroup level3
- *   This is the group of complex LEVEL 3 BLAS routines.
- * @defgroup complex16_blas_level3 complex16
- *   @ingroup level3
- *   This is the group of complex16 LEVEL 3 BLAS routines.
- *
- * @defgroup single_blas_testing real
- *   @ingroup blastesting
- *   This is the group of real BLAS TESTING routines.
- * @defgroup double_blas_testing double
- *   @ingroup blastesting
- *   This is the group of double BLAS TESTING routines.
- * @defgroup complex_blas_testing complex
- *   @ingroup blastesting
- *   This is the group of complex BLAS TESTING routines.
- * @defgroup complex16_blas_testing complex16
- *   @ingroup blastesting
- *   This is the group of complex16 BLAS TESTING routines.
- *
+
+@defgroup lapack_top                LAPACK
+@{
+    @defgroup solve_top             Linear solve, AX = B
+    @{
+        @defgroup gesv_driver_grp   LU: General matrix, driver
+        @{
+            @defgroup gesv_driver   --- full ---
+            @defgroup gesv          gesv:           factor and solve
+            @defgroup gesvx         gesvx:          factor and solve, expert
+            @defgroup gesvxx        gesvxx:         factor and solve, extra precise
+            @defgroup gesv_mixed    gesv:           factor and solve, mixed precision
+
+            @defgroup gbsv_driver   --- banded ---
+            @defgroup gbsv          gbsv:           factor and solve
+            @defgroup gbsvx         gbsvx:          factor and solve, expert
+            @defgroup gbsvxx        gbsvxx:         factor and solve, extra precise
+
+            @defgroup gtsv_driver   --- tridiagonal ---
+            @defgroup gtsv          gtsv:           factor and solve
+            @defgroup gtsvx         gtsvx:          factor and solve, expert
+        @}
+
+        @defgroup gesv_comp_grp     LU: computational routines (factor, cond, etc.)
+        @{
+            @defgroup gesv_comp     --- full ---
+            @defgroup gecon         gecon:          condition number estimate
+            @defgroup getrf         getrf:          triangular factor
+            @defgroup getrf2        getrf2:         triangular factor panel, recursive?
+            @defgroup getf2         getf2:          triangular factor panel, level 2
+            @defgroup getrs         getrs:          triangular solve using factor
+            @defgroup getri         getri:          triangular inverse
+            @defgroup gerfs         gerfs:          iterative refinement
+            @defgroup gerfsx        gerfsx:         iterative refinement, expert
+            @defgroup geequ         geequ:          equilibration
+            @defgroup geequb        geequb:         equilibration, power of 2
+            @defgroup laqge         laqge:          row/col scale matrix
+            @defgroup laswp         laswp:          swap permutation
+
+            @defgroup getc2         getc2:          triangular factor, with complete pivoting
+            @defgroup gesc2         gesc2:          triangular solve using factor, with complete pivoting
+            @defgroup latdf         latdf:          Dif-estimate with complete pivoting LU, step in tgsen
+
+            @defgroup la_gercond    la_gercond: Skeel condition number estimate
+            @defgroup la_gerpvgrw   la_gerpvgrw: reciprocal pivot growth
+            @defgroup la_gerfsx_extended  la_gerfsx_extended:  step in gerfsx
+
+            @defgroup gbsv_comp     --- banded ---
+            @defgroup gbcon         gbcon:          condition number estimate
+            @defgroup gbtrf         gbtrf:          triangular factor
+            @defgroup gbtf2         gbtf2:          triangular factor, level 2
+            @defgroup gbtrs         gbtrs:          triangular solve using factor
+            @defgroup gbrfs         gbrfs:          iterative refinement
+            @defgroup gbrfsx        gbrfsx:         iterative refinement, expert
+            @defgroup gbequ         gbequ:          equilibration
+            @defgroup gbequb        gbequb:         equilibration, power of 2
+            @defgroup laqgb         laqgb:          row/col scale matrix
+            @defgroup la_gbrcond    la_gbrcond:     Skeel condition number estimate
+            @defgroup la_gbrpvgrw   la_gbrpvgrw:    reciprocal pivot growth
+            @defgroup la_gbrfsx_extended  la_gbrfsx_extended:  step in gbrfsx
+
+            @defgroup gtsv_comp     --- tridiagonal ---
+            @defgroup gtcon         gtcon:          condition number estimate
+            @defgroup gttrf         gttrf:          triangular factor
+            @defgroup gttrs         gttrs:          triangular solve using factor
+            @defgroup gtts2         gtts2:          triangular solve using factor
+            @defgroup gtrfs         gtrfs:          iterative refinement
+        @}
+
+        @defgroup posv_driver_grp   Cholesky: Hermitian/symmetric positive definite matrix, driver
+        @{
+            @defgroup posv_driver   --- full ---
+            @defgroup posv          posv:           factor and solve
+            @defgroup posvx         posvx:          factor and solve, expert
+            @defgroup posvxx        posvxx:         factor and solve, extra precise
+            @defgroup posv_mixed    posv:           factor and solve, mixed precision
+
+            @defgroup ppsv_driver   --- packed ---
+            @defgroup ppsv          ppsv:           factor and solve
+            @defgroup ppsvx         ppsvx:          factor and solve, expert
+
+            @defgroup pfsv_driver   --- rectangular full packed (RFP) ---
+            @defgroup pfsv          pfsv:           factor and solve [not available]
+
+            @defgroup pbsv_driver   --- banded ---
+            @defgroup pbsv          pbsv:           factor and solve
+            @defgroup pbsvx         pbsvx:          factor and solve, expert
+
+            @defgroup ptsv_driver   --- tridiagonal ---
+            @defgroup ptsv          ptsv:           factor and solve
+            @defgroup ptsvx         ptsvx:          factor and solve, expert
+        @}
+
+        @defgroup posv_comp_grp     Cholesky: computational routines (factor, cond, etc.)
+        @{
+            @defgroup posv_comp     --- full ---
+            @defgroup pocon         pocon:          condition number estimate
+            @defgroup potrf         potrf:          triangular factor
+            @defgroup potrf2        potrf2:         triangular factor panel, recursive?
+            @defgroup potf2         potf2:          triangular factor panel, level 2
+            @defgroup pstrf         pstrf:          triangular factor, with pivoting
+            @defgroup pstf2         pstf2:          triangular factor, with pivoting panel, level 2
+            @defgroup potrs         potrs:          triangular solve using factor
+            @defgroup potri         potri:          triangular inverse
+            @defgroup porfs         porfs:          iterative refinement
+            @defgroup porfsx        porfsx:         iterative refinement, expert
+            @defgroup poequ         poequ:          equilibration
+            @defgroup poequb        poequb:         equilibration, power of 2
+            @defgroup laqhe         laqhe:          row/col scale matrix
+            @defgroup la_porcond    la_porcond:     Skeel condition number estimate
+            @defgroup la_porpvgrw   la_porpvgrw:    reciprocal pivot growth
+            @defgroup la_porfsx_extended  la_porfsx_extended:  step in porfsx
+
+            @defgroup ppsv_comp     --- packed ---
+            @defgroup ppcon         ppcon:          condition number estimate
+            @defgroup pptrf         pptrf:          triangular factor
+            @defgroup pptrs         pptrs:          triangular solve using factor
+            @defgroup pptri         pptri:          triangular inverse
+            @defgroup pprfs         pprfs:          iterative refinement
+            @defgroup ppequ         ppequ:          equilibration
+            @defgroup laqhp         laqhp:          row/col scale matrix
+
+            @defgroup pfsv_comp     --- rectangular full packed (RFP) ---
+            @defgroup pftrf         pftrf:          triangular factor
+            @defgroup pftrs         pftrs:          triangular solve using factor
+            @defgroup pftri         pftri:          triangular inverse
+
+            @defgroup pbsv_comp     --- banded ---
+            @defgroup pbcon         pbcon:          condition number estimate
+            @defgroup pbtrf         pbtrf:          triangular factor
+            @defgroup pbtf2         pbtf2:          triangular factor panel, level 2
+            @defgroup pbtrs         pbtrs:          triangular solve using factor
+            @defgroup pbrfs         pbrfs:          iterative refinement
+            @defgroup pbequ         pbequ:          equilibration
+            @defgroup laqhb         laqhb:          row/col scale matrix
+
+            @defgroup ptsv_comp     --- tridiagonal ---
+            @defgroup ptcon         ptcon:          condition number estimate
+            @defgroup pttrf         pttrf:          triangular factor
+            @defgroup pttrs         pttrs:          triangular solve using factor
+            @defgroup ptts2         ptts2:          triangular solve using factor, unblocked
+            @defgroup ptrfs         ptrfs:          iterative refinement
+        @}
+
+        @defgroup hesv_driver_grp   LDL: Hermitian/symmetric indefinite matrix, driver
+        @{
+            @defgroup hesv_driver   --- full, rook pivoting ---
+            @defgroup hesv          {he,sy}sv:      rook (v1)
+            @defgroup hesv_rook     {he,sy}sv_rook: rook (v2)
+            @defgroup hesv_rk       {he,sy}sv_rk:   rook (v3)
+            @defgroup hesvx         {he,sy}svx:     rook (v1, expert)
+            @defgroup hesvxx        {he,sy}svxx:    rook (v1, expert)
+
+            @defgroup hpsv_driver   --- packed, rook pivoting ---
+            @defgroup hpsv          {hp,sp}sv:      factor and solve
+            @defgroup hpsvx         {hp,sp}svx:     factor and solve, expert
+
+            @defgroup hesv_aa_driver  --- full, Aasen ---
+            @defgroup hesv_aa       {he,sy}sv_aa:   Aasen
+            @defgroup hesv_aa_2stage  {he,sy}sv_aa_2stage:  Aasen, blocked 2-stage
+        @}
+
+        @defgroup hesv_comp_grp     LDL: computational routines (factor, cond, etc.)
+        @{
+            @defgroup hesv_comp_v1  --- full, rook v1 ---
+            @defgroup hecon         {he,sy}con:     condition number estimate
+            @defgroup hetrf         {he,sy}trf:     triangular factor
+            @defgroup lahef         la{he,sy}f:     step in hetrf
+            @defgroup hetf2         {he,sy}tf2:     triangular factor, level 2
+            @defgroup hetrs         {he,sy}trs:     triangular solve using factor
+            @defgroup hetri         {he,sy}tri:     triangular inverse
+            @defgroup herfs         {he,sy}rfs:     iterative refinement
+            @defgroup herfsx        {he,sy}rfsx:    iterative refinement, expert
+            @defgroup heequb        {he,sy}equb:    equilibration, power of 2
+            @defgroup syconv        syconv:         convert to/from L and D from hetrf
+
+            @defgroup hecon_3       {he,sy}con_3:   condition number estimate
+            @defgroup hetri2        {he,sy}tri2:    inverse
+            @defgroup hetri2x       {he,sy}tri2x:   inverse
+            @defgroup hetri_3       {he,sy}tri_3:   inverse
+            @defgroup hetri_3x      {he,sy}tri_3x:  inverse
+            @defgroup hetrs2        {he,sy}trs2:    solve using factor
+            @defgroup hetrs_3       {he,sy}trs_3:   solve using factor
+
+            @defgroup heswapr             {he,sy}swapr:         apply 2-sided permutation
+            @defgroup la_hercond          la_hercond:           Skeel condition number estimate
+            @defgroup la_herfsx_extended  la_herfsx_extended:   step in herfsx
+            @defgroup la_herpvgrw         la_herpvgrw:          reciprocal pivot growth
+
+            @defgroup hpsv_comp     --- packed, rook v1 ---
+            @defgroup hpcon         {hp,sp}con:     condition number estimate
+            @defgroup hptrf         {hp,sp}trf:     triangular factor
+            @defgroup hptrs         {hp,sp}trs:     triangular solve using factor
+            @defgroup hptri         {hp,sp}tri:     triangular inverse
+            @defgroup hprfs         {hp,sp}rfs:     iterative refinement
+
+            @defgroup hesv_comp_v2  --- full, rook v2 ---
+            @defgroup hecon_rook    {he,sy}con_rook:  condition number estimate
+            @defgroup hetrf_rook    {he,sy}trf_rook:  triangular factor
+            @defgroup lahef_rook    la{he,sy}f_rook:  triangular factor step
+            @defgroup hetf2_rook    {he,sy}tf2_rook:  triangular factor, level 2
+            @defgroup hetrs_rook    {he,sy}trs_rook:  triangular solve using factor
+            @defgroup hetri_rook    {he,sy}tri_rook:  triangular inverse
+
+            @defgroup hesv_comp_v3  --- full, rook v3 ---
+            @defgroup hetrf_rk      {he,sy}trf_rk:  triangular factor
+            @defgroup lahef_rk      la{he,sy}f_rk:  triangular factor step
+            @defgroup hetf2_rk      {he,sy}tf2_rk:  triangular factor, level 2
+            @defgroup syconvf       syconvf:        convert to/from hetrf      to hetrf_rk format
+            @defgroup syconvf_rook  syconvf_rook:   convert to/from hetrf_rook to hetrf_rk format
+
+            @defgroup hesv_comp_aasen  --- full, Aasen ---
+            @defgroup hetrf_aa      {he,sy}trf_aa:  triangular factor
+            @defgroup lahef_aa      la{he,sy}f_aa:  triangular factor partial factor
+            @defgroup hetrs_aa      {he,sy}trs_aa:  triangular solve using factor
+
+            @defgroup hesv_comp_aasen2 --- full, Aasen, blocked 2-stage ---
+            @defgroup hetrf_aa_2stage  {he,sy}trf_aa_2stage:  triangular factor
+            @defgroup hetrs_aa_2stage  {he,sy}trs_aa_2stage:  triangular solve using factor
+        @}
+
+        @defgroup trsv_comp_grp     Triangular computational routines (solve, cond, etc.)
+        @{
+            @defgroup trsv_comp     --- full ---
+            @defgroup trcon         trcon:          condition number estimate
+            @defgroup trtrs         trtrs:          triangular solve
+            @defgroup latrs         latrs:          triangular solve with robust scaling
+            @defgroup latrs3        latrs3:         triangular solve with robust scaling, level 3
+            @defgroup trtri         trtri:          triangular inverse
+            @defgroup trti2         trti2:          triangular inverse, level 2
+            @defgroup trrfs         trrfs:          triangular iterative refinement
+            @defgroup lauum         lauum:          triangular multiply: U^H U
+            @defgroup lauu2         lauu2:          triangular multiply: U^H U, level 2
+
+            @defgroup tpsv_comp     --- packed ---
+            @defgroup tpcon         tpcon:          condition number estimate
+            @defgroup tptrs         tptrs:          triangular solve
+            @defgroup latps         latps:          triangular solve with robust scaling
+            @defgroup tptri         tptri:          triangular inverse
+            @defgroup tprfs         tprfs:          triangular iterative refinement
+
+            @defgroup tfsv_comp     --- rectangular full packed (RFP) ---
+            @defgroup tftri         tftri:          triangular inverse, RFP
+
+            @defgroup tbsv_comp     --- banded ---
+            @defgroup tbcon         tbcon:          condition number estimate
+            @defgroup tbtrs         tbtrs:          triangular solve
+            @defgroup latbs         latbs:          triangular solve with scaling
+            @defgroup tbrfs         tbrfs:          triangular iterative refinement
+        @}
+
+        @defgroup solve_aux_grp     Auxiliary routines
+        @{
+            @defgroup lacn2         lacn2:          1-norm estimate, e.g., || A^{-1} ||_1 in gecon
+            @defgroup lacon         lacon:          1-norm estimate, e.g., || A^{-1} ||_1 in gecon, old
+            @defgroup la_lin_berr   la_lin_berr:    backward error
+        @}
+    @}
+
+    @defgroup gels_top Least squares
+    @{
+        @defgroup gels_driver_grp   Standard least squares, min || Ax - b ||_2
+        @{
+            @defgroup gels          gels:           least squares using QR/LQ
+            @defgroup gelst         gelst:          least squares using QR/LQ with T matrix
+            @defgroup gelss         gelss:          least squares using SVD, QR iteration
+            @defgroup gelsd         gelsd:          least squares using SVD, divide and conquer
+            @defgroup gelsy         gelsy:          least squares using complete orthogonal factor
+            @defgroup getsls        getsls:         least squares using tall-skinny QR/LQ
+        @}
+
+        @defgroup ggls_driver_grp   Constrained least squares
+        @{
+            @defgroup gglse         gglse:          equality-constrained least squares
+            @defgroup ggglm         ggglm:          Gauss-Markov linear model
+        @}
+
+        @defgroup gels_aux_grp      Auxiliary routines
+        @{
+            @defgroup laic1         laic1:          condition estimate, step in gelsy
+            @defgroup lals0         lals0:          back multiplying factors, step in gelsd
+            @defgroup lalsa         lalsa:          SVD of coefficient matrix, step in gelsd
+            @defgroup lalsd         lalsd:          uses SVD for least squares, step in gelsd
+        @}
+    @}
+
+    @defgroup unitary_top Orthogonal/unitary factors (QR, CS, etc.)
+    @{
+        @defgroup geqr_comp_grp     QR
+        @{
+            @defgroup geqr_comp1    --- flexible ---
+            @defgroup geqr          geqr:           QR factor, flexible
+            @defgroup gemqr         gemqr:          multiply by Q from geqr
+
+            @defgroup geqr_comp2    --- classic ---
+            @defgroup geqrf         geqrf:          QR factor
+            @defgroup geqr2         geqr2:          QR factor, level 2
+            @defgroup ungqr         {un,or}gqr:     generate explicit Q from geqrf
+            @defgroup ung2r         {un,or}g2r:     generate explicit Q from geqrf, level 2
+            @defgroup unmqr         {un,or}mqr:     multiply by Q from geqrf
+            @defgroup unm2r         {un,or}m2r:     multiply by Q from geqrf, level 2
+
+            @defgroup geqr_comp3    --- with T ---
+            @defgroup geqrt         geqrt:          QR factor, with T
+            @defgroup geqrt2        geqrt2:         QR factor, with T, level 2
+            @defgroup geqrt3        geqrt3:         QR factor, with T, recursive panel
+            @defgroup gemqrt        gemqrt:         multiply by Q from geqrt
+
+            @defgroup geqr_comp4    --- positive ---
+            @defgroup geqrfp        geqrfp:         QR factor, diag( R ) ≥ 0
+            @defgroup geqr2p        geqr2p:         QR factor, diag( R ) ≥ 0, level 2
+        @}
+
+        @defgroup geqpf_comp_grp    QR with pivoting
+        @{
+            @defgroup geqp3         geqp3:          QR factor with pivoting, level 3
+            @defgroup laqp2         laqp2:          step of geqp3
+            @defgroup laqps         laqps:          step of geqp3
+        @}
+
+        @defgroup getsqr_comp_grp   QR, tall-skinny
+        @{
+            @defgroup latsqr        latsqr:         tall-skinny QR factor
+            @defgroup ungtsqr       {un,or}gtsqr:   generate Q from latsqr
+            @defgroup ungtsqr_row   {un,or}gtsqr_row:   generate Q from latsqr
+            @defgroup larfb_gett    larfb_gett:     step in ungtsqr_row
+            @defgroup lamtsqr       lamtsqr:        multiply by Q from latsqr
+            @defgroup getsqrhrt     getsqrhrt:      tall-skinny QR factor, with Householder reconstruction
+            @defgroup unhr_col      {un,or}hr_col:  Householder reconstruction
+            @defgroup launhr_col_getrfnp   la{un,or}hr_col_getrfnp:   LU factor without pivoting
+            @defgroup launhr_col_getrfnp2  la{un,or}hr_col_getrfnp2:  LU factor without pivoting, level 2
+        @}
+
+        @defgroup tpqr_comp_grp     QR, triangular-pentagonal
+        @{
+            @defgroup tpqrt         tpqrt:          QR factor
+            @defgroup tpqrt2        tpqrt2:         QR factor, level 2
+            @defgroup tpmqrt        tpmqrt:         applies Q
+            @defgroup tprfb         tprfb:          applies Q (like larfb)
+        @}
+
+        @defgroup ggqr_comp_grp     Generalized QR
+        @{
+            @defgroup ggqrf         ggqrf:          Generalized QR factor
+        @}
+
+        @defgroup gelq_comp_grp     LQ
+        @{
+            @defgroup gelq_comp1    --- flexible ---
+            @defgroup gelq          gelq:           LQ factor, flexible
+            @defgroup gemlq         gemlq:          multiply by Q from gelq
+
+            @defgroup gelq_comp2    --- classic ---
+            @defgroup gelqf         gelqf:          LQ factor
+            @defgroup gelq2         gelq2:          LQ factor, level 2
+            @defgroup unglq         {un,or}glq:     generate explicit Q from gelqf
+            @defgroup ungl2         {un,or}gl2:     generate explicit Q, level 2, step in unglq
+            @defgroup unmlq         {un,or}mlq:     multiply by Q from gelqf
+            @defgroup unml2         {un,or}ml2:     multiply by Q, level 2, step in unmlq
+
+            @defgroup gelq_comp3    --- with T ---
+            @defgroup gelqt         gelqt:          LQ factor, with T
+            @defgroup gelqt3        gelqt3:         LQ factor, with T, recursive
+            @defgroup gemlqt        gemlqt:         multiply by Q from gelqt
+        @}
+
+        @defgroup geswlq_comp_grp   LQ, short-wide
+        @{
+            @defgroup laswlq        laswlq:         short-wide LQ factor
+            @defgroup lamswlq       lamswlq:        multiply by Q from laswlq
+        @}
+
+        @defgroup tplq_comp_grp     LQ, triangular-pentagonal
+        @{
+            @defgroup tplqt         tplqt:          QR factor
+            @defgroup tplqt2        tplqt2:         QR factor, level 2
+            @defgroup tpmlqt        tpmlqt:         applies Q
+        @}
+
+        @defgroup geql_comp_grp     QL
+        @{
+            @defgroup geqlf         geqlf:          QL factor
+            @defgroup geql2         geql2:          QL factor, level 2
+            @defgroup ungql         {un,or}gql:     generate explicit Q from geqlf
+            @defgroup unmql         {un,or}mql:     multiply by Q from geqlf
+            @defgroup ung2l         {un,or}g2l:     step in ungql
+            @defgroup unm2l         {un,or}m2l:     step in unmql
+        @}
+
+        @defgroup gerq_comp_grp     RQ
+        @{
+            @defgroup gerqf         gerqf:          RQ factor
+            @defgroup gerq2         gerq2:          RQ factor, level 2
+            @defgroup ungrq         {un,or}grq:     generate explicit Q from gerqf
+            @defgroup unmrq         {un,or}mrq:     multiply by Q from gerqf
+            @defgroup unmr2         {un,or}mr2:     step in unmrq
+            @defgroup ungr2         {un,or}gr2:     step in ungrq
+        @}
+
+        @defgroup ggrq_comp_grp     Generalized RQ
+        @{
+            @defgroup ggrqf         ggrqf:          Generalized RQ factor
+        @}
+
+        @defgroup gerz_comp_grp     RZ
+        @{
+            @defgroup tzrzf         tzrzf:          RZ factor
+            @defgroup latrz         latrz:          RZ factor step
+            @defgroup unmrz         {un,or}mrz:     multiply by Z from tzrzf
+            @defgroup unmr3         {un,or}mr3:     step in unmrz
+            @defgroup larz          larz:           apply reflector
+            @defgroup larzb         larzb:          apply block reflector
+            @defgroup larzt         larzt:          generate T matrix
+        @}
+
+        @defgroup gecs_comp_grp     Cosine-Sine (CS) decomposition
+        @{
+            @defgroup bbcsd         bbcsd:          ??
+            @defgroup uncsd         {un,or}csd:     ??
+            @defgroup uncsd2by1     {un,or}csd2by1: ??
+            @defgroup unbdb         {un,or}bdb:     bidiagonalize partitioned unitary matrix, step in uncsd
+            @defgroup unbdb1        {un,or}bdb1:    step in uncsd2by1
+            @defgroup unbdb2        {un,or}bdb2:    step in uncsd2by1
+            @defgroup unbdb3        {un,or}bdb3:    step in uncsd2by1
+            @defgroup unbdb4        {un,or}bdb4:    step in uncsd2by1
+            @defgroup unbdb5        {un,or}bdb5:    step in uncsd2by1
+            @defgroup unbdb6        {un,or}bdb6:    step in uncsd2by1
+
+            @defgroup lapmr         lapmr:          permute rows
+            @defgroup lapmt         lapmt:          permute cols
+        @}
+
+        @defgroup reflector_aux_grp Householder reflectors
+        @{
+            @defgroup larf          larf:           apply Householder reflector
+            @defgroup larfx         larfx:          apply Householder reflector, unrolled
+            @defgroup larfy         larfy:          apply Householder reflector symmetrically (2-sided)
+            @defgroup larfb         larfb:          apply block Householder reflector
+            @defgroup larfg         larfg:          generate Householder reflector
+            @defgroup larfgp        larfgp:         generate Householder reflector, beta ≥ 0
+            @defgroup larft         larft:          generate T matrix
+        @}
+
+        @defgroup rot_aux_grp       Givens/Jacobi plane rotations
+        @{
+            @defgroup lartg         lartg:          generate plane rotation, more accurate than BLAS rot
+            @defgroup lartgp        lartgp:         generate plane rotation, more accurate than BLAS rot
+            @defgroup lasr          lasr:           apply series of plane rotations
+            @defgroup largv         largv:          generate vector of plane rotations
+            @defgroup lartv         lartv:          apply vector of plane rotations to vectors
+            @defgroup lar2v         lar2v:          apply vector of plane rotations to 2x2 matrices
+            @defgroup lacrt         lacrt:          apply plane rotation (unused?)
+        @}
+    @}
+
+    @defgroup geev_top              Non-symmetric eigenvalues
+    @{
+        @defgroup geev_driver_grp   Standard eig driver, AV = VΛ
+        @{
+            @defgroup geev          geev:           eig
+            @defgroup geevx         geevx:          eig, expert
+
+            @defgroup gees          gees:           Schur form
+            @defgroup geesx         geesx:          Schur form, expert
+        @}
+
+        @defgroup ggev_driver_grp   Generalized eig driver
+        @{
+            @defgroup ggev3         ggev3:          eig
+            @defgroup ggev          ggev:           eig, unblocked
+            @defgroup ggevx         ggevx:          eig, expert
+
+            @defgroup gges3         gges3:          Schur form
+            @defgroup gges          gges:           Schur form, unblocked
+            @defgroup ggesx         ggesx:          Schur form, expert
+        @}
+
+        @defgroup gedmd             DMD driver, Dynamic Mode Decomposition
+
+        @defgroup geev_comp_grp     Eig computational routines
+        @{
+            @defgroup gebal         gebal:          balance matrix
+            @defgroup gehrd         gehrd:          reduction to Hessenberg
+            @defgroup gehd2         gehd2:          reduction to Hessenberg, level 2
+            @defgroup lahr2         lahr2:          step in gehrd
+            @defgroup unghr         {un,or}ghr:     generate Q from gehrd
+            @defgroup unmhr         {un,or}mhr:     multiply by Q from gehrd
+            @defgroup gebak         gebak:          back-transform eigvec
+            @defgroup hseqr         hseqr:          Hessenberg eig, QR iteration
+            @defgroup hsein         hsein:          Hessenberg inverse iteration for eigvec
+            @defgroup trevc         trevc:          eigenvectors of triangular Schur form, old
+            @defgroup trevc3        trevc3:         eigenvectors of triangular Schur form, blocked
+            @defgroup laln2         laln2:          1x1 or 2x2 solve, step in trevc
+
+            @defgroup trsyl         trsyl:          Sylvester equation
+            @defgroup trsyl3        trsyl3:         Sylvester equation, level 3
+            @defgroup lasy2         lasy2:          Sylvester equation
+
+            @defgroup trsna         trsna:          eig condition numbers
+            @defgroup laqtr         laqtr:          quasi-triangular solve
+
+            @defgroup trexc         trexc:          reorder Schur form
+            @defgroup trsen         trsen:          reorder Schur form
+            @defgroup laexc         laexc:          reorder Schur form
+
+            @defgroup lanv2         lanv2:          2x2 Schur factor
+
+            @defgroup laqr_group    --- hseqr auxiliary ---
+            @defgroup laein         laein:          eigvec by Hessenberg inverse iteration
+            @defgroup lahqr         lahqr:          eig of Hessenberg, step in hseqr
+            @defgroup laqr0         laqr0:          eig of Hessenberg, step in hseqr
+            @defgroup laqr1         laqr1:          step in hseqr
+            @defgroup laqr2         laqr2:          step in hseqr
+            @defgroup laqr3         laqr3:          step in hseqr
+            @defgroup laqr4         laqr4:          eig of Hessenberg, step in hseqr
+            @defgroup laqr5         laqr5:          step in hseqr
+
+            @defgroup iparmq        iparmq:         set parameters for hseqr
+
+            @defgroup laqz_group    --- ggev3, gges3 auxiliary ---
+            @defgroup laqz0         laqz0:          step in ggev3, gges3
+            @defgroup laqz1         laqz1:          step in ggev3, gges3
+            @defgroup laqz2         laqz2:          step in ggev3, gges3
+            @defgroup laqz3         laqz3:          step in ggev3, gges3
+            @defgroup laqz4         laqz4:          step in ggev3, gges3
+        @}
+
+        @defgroup ggev_comp_grp     Generalized eig computational routines
+        @{
+            @defgroup ggbal         ggbal:          balance matrix
+            @defgroup gghrd         gghrd:          reduction to Hessenberg
+            @defgroup gghd3         gghd3:          reduction to Hessenberg, level 3
+            @defgroup hgeqz         hgeqz:          generalized Hessenberg eig
+            @defgroup ggbak         ggbak:          back-transform eigvec
+            @defgroup tgsen         tgsen:          reorder generalized Schur form
+            @defgroup tgsna         tgsna:          reciprocal cond est
+            @defgroup tgsyl         tgsyl:          Sylvester equation
+            @defgroup tgsy2         tgsy2:          Sylvester equation panel (?)
+            @defgroup unm22         {un,or}m22:     multiply by banded Q, step in gghd3
+            @defgroup lagv2         lagv2:          2x2 generalized Schur factor
+            @defgroup tgevc         tgevc:          eigvec of pair of matrices
+            @defgroup tgexc         tgexc:          reorder generalized Schur form
+            @defgroup tgex2         tgex2:          reorder generalized Schur form
+        @}
+    @}
+
+    @defgroup heev_top              Hermitian/symmetric eigenvalues
+    @{
+        @defgroup heev_driver_grp   Standard eig driver, AV = VΛ
+        @{
+            @defgroup heev_driver   --- full ---
+            @defgroup heev          {he,sy}ev:      eig, QR iteration
+            @defgroup heevd         {he,sy}evd:     eig, divide and conquer
+            @defgroup heevr         {he,sy}evr:     eig, MRRR
+            @defgroup heevx         {he,sy}evx:     eig, bisection
+
+            @defgroup heev_driver2  --- full, 2-stage ---
+            @defgroup heev_2stage   {he,sy}ev_2stage:   eig, QR iteration
+            @defgroup heevd_2stage  {he,sy}evd_2stage:  eig, divide and conquer
+            @defgroup heevr_2stage  {he,sy}evr_2stage:  eig, MRRR
+            @defgroup heevx_2stage  {he,sy}evx_2stage:  eig, bisection
+
+            @defgroup hpev_driver   --- packed ---
+            @defgroup hpev          {hp,sp}ev:      eig, QR iteration
+            @defgroup hpevd         {hp,sp}evd:     eig, divide and conquer
+            @defgroup hpevx         {hp,sp}evx:     eig, bisection
+
+            @defgroup hbev_driver   --- banded ---
+            @defgroup hbev          {hb,sb}ev:      eig, QR iteration
+            @defgroup hbevd         {hb,sb}evd:     eig, divide and conquer
+            @defgroup hbevx         {hb,sb}evx:     eig, bisection
+
+            @defgroup hbev_driver2  --- banded, 2nd-stage ---
+            @defgroup hbev_2stage   {hb,sb}ev_2stage:   eig, QR iteration
+            @defgroup hbevd_2stage  {hb,sb}evd_2stage:  eig, divide and conquer
+            @defgroup hbevx_2stage  {hb,sb}evx_2stage:  eig, bisection
+
+            @defgroup stev_driver   --- tridiagonal ---
+            @defgroup stev          stev:           eig, QR iteration
+            @defgroup stevd         stevd:          eig, divide and conquer
+            @defgroup stevr         stevr:          eig, MRRR
+            @defgroup stevx         stevx:          eig, bisection
+            @defgroup pteqr         pteqr:          eig, positive definite tridiagonal
+
+            @defgroup stebz         stebz:          eig, Kahan
+            @defgroup sterf         sterf:          eig, QR iteration
+            @defgroup stedc         stedc:          eig, divide and conquer
+            @defgroup stegr         stegr:          eig, bisection, see stemr
+            @defgroup stein         stein:          eig, inverse iteration
+            @defgroup stemr         stemr:          eig, relatively robust representation (RRR)
+            @defgroup steqr         steqr:          eig, QR iteration
+        @}
+
+        @defgroup hegv_driver_grp   Generalized eig driver, AV = BVΛ, etc.
+        @{
+            @defgroup hegv_driver   --- full ---
+            @defgroup hegv          {he,sy}gv:      eig, QR iteration
+            @defgroup hegv_2stage   {he,sy}gv_2stage: eig, QR iteration, 2-stage
+            @defgroup hegvd         {he,sy}gvd:     eig, divide and conquer
+            @defgroup hegvx         {he,sy}gvx:     eig, bisection
+
+            @defgroup hpgv_driver   --- packed ---
+            @defgroup hpgv          {hp,sp}gv:      eig, QR iteration
+            @defgroup hpgvd         {hp,sp}gvd:     eig, divide and conquer
+            @defgroup hpgvx         {hp,sp}gvx:     eig, bisection
+
+            @defgroup hbgv_driver   --- banded ---
+            @defgroup hbgv          {hb,sb}gv:      eig, QR iteration
+            @defgroup hbgvd         {hb,sb}gvd:     eig, divide and conquer
+            @defgroup hbgvx         {hb,sb}gvx:     eig, bisection
+        @}
+
+        @defgroup heev_comp_grp     Eig computational routines
+        @{
+            @defgroup heev_comp     --- full ---
+            @defgroup disna         disna:          eig condition numbers
+            @defgroup hetrd         {he,sy}trd:     reduction to tridiagonal
+            @defgroup hetd2         {he,sy}td2:     reduction to tridiagonal, level 2
+            @defgroup latrd         latrd:          step in hetrd
+            @defgroup ungtr         {un,or}gtr:     generate Q from hetrd
+            @defgroup unmtr         {un,or}mtr:     multiply by Q from hetrd
+
+            @defgroup hetrd_2stage  {he,sy}trd_2stage: reduction to tridiagonal, 2-stage
+            @defgroup hetrd_he2hb   {he,sy}trd_he2hb:   full to band (1st stage)
+            @defgroup hetrd_hb2st   {he,sy}trd_hb2st:   band to tridiagonal (2nd stage)
+            @defgroup hb2st_kernels {hb,sb}2st_kernels: band to tridiagonal (2nd stage)
+
+            @defgroup lae2          lae2:           2x2 eig, step in steqr, stemr
+            @defgroup laesy         laesy:          2x2 eig
+            @defgroup laev2         laev2:          2x2 eig
+            @defgroup lagtf         lagtf:          LU factor of (T - λI)
+            @defgroup lagts         lagts:          LU solve  of (T - λI) x = y
+
+            @defgroup hpev_comp     --- packed ---
+            @defgroup hptrd         {hp,sp}trd:     reduction to tridiagonal
+            @defgroup upgtr         {up,op}gtr:     generate Q from hetrd
+            @defgroup upmtr         {up,op}mtr:     multiply by Q from hptrd
+
+            @defgroup hbev_comp     --- banded ---
+            @defgroup hbtrd         {hb,sb}trd:     reduction to tridiagonal
+        @}
+
+        @defgroup hegv_comp_grp     Generalized eig computational routines
+        @{
+            @defgroup hegst         {he,sy}gst:     reduction to standard form
+            @defgroup hegs2         {he,sy}gs2:     reduction to standard form, level 2
+            @defgroup hpgst         {hp,sp}gst:     reduction to standard form, packed
+            @defgroup hbgst         {hb,sb}gst:     reduction to standard form, banded
+            @defgroup pbstf         pbstf:          split Cholesky factor, use with hbgst
+            @defgroup lag2          lag2:           2x2 eig
+        @}
+
+        @defgroup stev_comp_grp     tridiag bisection routines
+        @{
+            @defgroup laebz         laebz:          counts eigvals <= value
+            @defgroup laneg         laneg:          Sturm count
+        @}
+
+        @defgroup laed_comp_grp     tridiag divide and conquer (D&C) routines
+        @{
+            @defgroup laed0         laed0:          D&C step: top level solver
+            @defgroup laed1         laed1:          D&C step: merge subproblems
+            @defgroup laed2         laed2:          D&C step: deflation
+            @defgroup laed3         laed3:          D&C step: secular equation
+            @defgroup laed4         laed4:          D&C step: secular equation nonlinear solver
+            @defgroup laed5         laed5:          D&C step: secular equation, 2x2
+            @defgroup laed6         laed6:          D&C step: secular equation Newton step
+            @defgroup lamrg         lamrg:          permutation to merge 2 sorted lists
+
+            @defgroup laed_comp2    --- eig value only or update Q ---
+            @defgroup laed7         laed7:          D&C step: merge subproblems
+            @defgroup laed8         laed8:          D&C step: deflation
+            @defgroup laed9         laed9:          D&C step: secular equation
+            @defgroup laeda         laeda:          D&C step: z vector
+        @}
+
+        @defgroup larr_comp_grp     tridiag RRR routines
+        @{
+            @defgroup larra         larra:          step in stemr
+            @defgroup larrb         larrb:          step in stemr
+            @defgroup larrc         larrc:          step in stemr
+            @defgroup larrd         larrd:          step in stemr, tridiag eig
+            @defgroup larre         larre:          step in stemr
+            @defgroup larrf         larrf:          step in stemr, find relative robust representation (RRR)
+            @defgroup larrj         larrj:          step in stemr, refine eigval estimates
+            @defgroup larrk         larrk:          step in stemr, compute one eigval
+            @defgroup larrr         larrr:          step in stemr, test to do expensive tridiag eig algorithm
+            @defgroup larrv         larrv:          eig tridiagonal, step in stemr & stegr
+            @defgroup lar1v         lar1v:          step in larrv, hence stemr & stegr
+        @}
+    @}
+
+    @defgroup svd_top               Singular Value Decomposition (SVD)
+    @{
+        @defgroup svd_driver_grp    Standard SVD driver, A = UΣV^H
+        @{
+            @defgroup gesvd_driver  --- full ---
+            @defgroup gesvd         gesvd:          SVD, QR iteration
+            @defgroup gesvdq        gesvdq:         SVD, QR with pivoting
+            @defgroup gesdd         gesdd:          SVD, divide and conquer
+            @defgroup gesvdx        gesvdx:         SVD, bisection
+            @defgroup gejsv         gejsv:          SVD, Jacobi, high-level
+            @defgroup gesvj         gesvj:          SVD, Jacobi, low-level
+
+            @defgroup bdsvd_driver  --- bidiagonal ---
+            @defgroup bdsqr         bdsqr:          bidiagonal SVD, QR iteration (dqds)
+            @defgroup bdsdc         bdsdc:          bidiagonal SVD, divide and conquer
+            @defgroup bdsvdx        bdsvdx:         bidiagonal SVD, bisection
+        @}
+
+        @defgroup ggsvd_driver_grp  Generalized SVD driver
+        @{
+            @defgroup ggsvd3        ggsvd3:         SVD, QR iteration
+        @}
+
+        @defgroup gesvd_comp_grp    SVD computational routines
+        @{
+            @defgroup gebrd         gebrd:          reduction to bidiagonal
+            @defgroup gebd2         gebd2:          reduction to bidiagonal, level 2
+            @defgroup labrd         labrd:          step in gebrd
+            @defgroup gbbrd         gbbrd:          band to bidiagonal
+            @defgroup ungbr         {un,or}gbr:     generate Q, P from gebrd
+            @defgroup unmbr         {un,or}mbr:     multiply by Q, P from gebrd
+
+            @defgroup gesvd_aux     --- auxiliary routines ---
+            @defgroup gsvj0         gsvj0:          step in gesvj
+            @defgroup gsvj1         gsvj1:          step in gesvj
+            @defgroup las2          las2:           2x2 triangular SVD
+            @defgroup lasv2         lasv2:          2x2 triangular SVD
+            @defgroup lartgs        lartgs:         generate plane rotation for bidiag SVD
+        @}
+
+        @defgroup ggsvd_comp_grp    Generalized SVD computational routines
+        @{
+            @defgroup ggsvp3        ggsvp3:         step in ggsvd
+            @defgroup tgsja         tgsja:          generalized SVD of trapezoidal matrices, step in ggsvd3
+            @defgroup lags2         lags2:          2x2 orthogonal factor, step in tgsja
+            @defgroup lapll         lapll:          linear dependence of 2 vectors
+        @}
+
+        @defgroup lasq_comp_grp     bidiag QR iteration routines
+        @{
+            @defgroup lasq1         lasq1:          dqds step
+            @defgroup lasq2         lasq2:          dqds step
+            @defgroup lasq3         lasq3:          dqds step
+            @defgroup lasq4         lasq4:          dqds step
+            @defgroup lasq5         lasq5:          dqds step
+            @defgroup lasq6         lasq6:          dqds step
+        @}
+
+        @defgroup lasd_comp_grp     bidiag D&C routines
+        @{
+            @defgroup lasd0         lasd0:          D&C step: top level solver
+            @defgroup lasdt         lasdt:          D&C step: tree
+            @defgroup lasd1         lasd1:          D&C step: merge subproblems
+            @defgroup lasd2         lasd2:          D&C step: deflation
+            @defgroup lasd3         lasd3:          D&C step: secular equation
+            @defgroup lasd4         lasd4:          D&C step: secular equation nonlinear solver
+            @defgroup lasd5         lasd5:          D&C step: secular equation, 2x2
+            @defgroup lasdq         lasdq:          D&C step: leaf using bdsqr
+
+            @defgroup lasd_comp2    --- singular values only or factored form ---
+            @defgroup lasda         lasda:          D&C step: top level solver
+            @defgroup lasd6         lasd6:          D&C step: merge subproblems
+            @defgroup lasd7         lasd7:          D&C step: deflation
+            @defgroup lasd8         lasd8:          D&C step: secular equation
+        @}
+    @}
+
+    @defgroup blas_like_top         BLAS-like
+    @{
+        @defgroup set_grp           Initialize, copy, convert
+        @{
+            @defgroup laset         laset:          set matrix
+            @defgroup larnv         larnv:          random vector
+            @defgroup laruv         laruv:          random uniform vector
+
+            @defgroup lacpy         lacpy:          copy matrix
+
+            @defgroup lacp2         lacp2:          general matrix, convert real to complex
+            @defgroup _lag2_        _lag2_:         general matrix, convert double <=> single
+            @defgroup _lat2_        _lat2_:         triangular matrix, convert double <=> single
+
+            @defgroup tfttp         tfttp:          triangular matrix, RFP    (tf) to packed (tp)
+            @defgroup tfttr         tfttr:          triangular matrix, RFP    (tf) to full   (tr)
+            @defgroup tpttf         tpttf:          triangular matrix, packed (tp) to RFP    (tf)
+            @defgroup tpttr         tpttr:          triangular matrix, packed (tp) to full   (tr)
+            @defgroup trttf         trttf:          triangular matrix, full   (tr) to RFP    (tf)
+            @defgroup trttp         trttp:          triangular matrix, full   (tr) to packed (tp)
+        @}
+
+        @defgroup norm_grp          Matrix norm
+        @{
+            @defgroup lange         lange:          general matrix
+            @defgroup langb         langb:          general matrix, banded
+            @defgroup langt         langt:          general matrix, tridiagonal
+
+            @defgroup lanhs         lanhs:          Hessenberg
+
+            @defgroup lanhe         lan{he,sy}:     Hermitian/symmetric matrix
+            @defgroup lanhf         lan{hf,sf}:     Hermitian/symmetric matrix, RFP
+            @defgroup lanhp         lan{hp,sp}:     Hermitian/symmetric matrix, packed
+            @defgroup lanhb         lan{hb,sb}:     Hermitian/symmetric matrix, banded
+            @defgroup lanht         lan{ht,st}:     Hermitian/symmetric matrix, tridiagonal
+
+            @defgroup lantr         lantr:          triangular matrix
+            @defgroup lantp         lantp:          triangular matrix, packed
+            @defgroup lantb         lantb:          triangular matrix, banded
+        @}
+
+        @defgroup blas0_like_grp    Scalar operations
+        @{
+            @defgroup isnan         isnan:          test for NaN
+            @defgroup laisnan       laisnan:        test for NaN, unoptimized
+            @defgroup ladiv         ladiv:          complex divide
+            @defgroup lapy2         lapy2:          robust sqrt( x^2 + y^2 )
+            @defgroup lapy3         lapy3:          robust sqrt( x^2 + y^2 + z^2 )
+            @defgroup larmm         larmm:          scale factor to avoid overflow, step in latrs
+        @}
+
+        @defgroup blas1_like_grp    Level 1 BLAS-like vector ops
+        @{
+            @defgroup lacgv         lacgv:          conjugate vector
+            @defgroup lasrt         lasrt:          sort vector
+            @defgroup lassq         lassq:          sum-of-squares, avoiding over/underflow
+            @defgroup rscl          rscl:           scale vector by reciprocal
+        @}
+
+        @defgroup blas2_like_grp    Level 2 BLAS-like matrix-vector ops
+        @{
+            @defgroup ilalc         ilalc:          find non-zero col
+            @defgroup ilalr         ilalr:          find non-zero row
+            @defgroup lascl         lascl:          scale matrix
+            @defgroup la_geamv      la_geamv:       matrix-vector multiply |A| * |x|, general
+            @defgroup la_gbamv      la_gbamv:       matrix-vector multiply |A| * |x|, general banded
+            @defgroup la_heamv      la_heamv:       matrix-vector multiply |A| * |x|, Hermitian/symmetric
+            @defgroup lascl2        lascl2:         diagonal scale matrix, A = D A
+            @defgroup larscl2       larscl2:        reciprocal diagonal scale matrix, A = D^{-1} A
+            @defgroup la_wwaddw     la_wwaddw:      add to double-double or single-single vector
+        @}
+
+        @defgroup blas3_like_grp    Level 3 BLAS-like matrix-matrix ops
+        @{
+            @defgroup lagtm         lagtm:          tridiagonal matrix-matrix multiply
+            @defgroup lacrm         lacrm:          complex * real matrix-matrix multiply
+            @defgroup larcm         larcm:          real * complex matrix-matrix multiply
+            @defgroup hfrk          hfrk:           Hermitian rank-k update, RFP format
+            @defgroup tfsm          tfsm:           triangular-matrix solve, RFP format
+        @}
+    @}
+
+    @defgroup aux_top               Auxiliary routines
+    @{
+        @defgroup aux_grp           Other auxiliary routines
+        @{
+            @defgroup lsame         lsame:          string comparison
+            @defgroup lsamen        lsamen:         string comparison
+            @defgroup roundup_lwork roundup_lwork:  fix rounding integer to float
+            @defgroup second        second:         wall clock timer
+        @}
+
+        @defgroup params_grp        Parameters
+        @{
+            @defgroup lamch         lamch:          machine parameters
+            @defgroup lamc1         lamc1:          ??
+            @defgroup lamc2         lamc2:          ??
+            @defgroup lamc3         lamc3:          ??
+            @defgroup lamc4         lamc4:          ??
+            @defgroup lamc5         lamc5:          ??
+            @defgroup labad         labad:          over/underflow on obsolete pre-IEEE machines
+            @defgroup ilaver        ilaver:         LAPACK version
+            @defgroup ilaenv        ilaenv:         tuning parameters
+            @defgroup ilaenv2stage  ilaenv2stage:   tuning parameters for 2-stage eig
+            @defgroup iparam2stage  iparam2stage:   sets parameters for 2-stage eig
+            @defgroup ieeeck        ieeeck:         verify inf and NaN are safe
+            @defgroup la_constants  la_constants:   Fortran 95 module of constants
+
+            @defgroup blast_aux     --- BLAST constants ---
+            @defgroup iladiag       iladiag:        diag      string to BLAST const
+            @defgroup ilaprec       ilaprec:        precision string to BLAST const
+            @defgroup ilatrans      ilatrans:       trans     string to BLAST const
+            @defgroup ilauplo       ilauplo:        uplo      string to BLAST const
+            @defgroup la_transtype  la_transtype:   BLAST const to string
+        @}
+
+        @defgroup xerbla_grp        Error reporting
+        @{
+            @defgroup xerbla        xerbla:         error reporting
+            @defgroup xerbla_array  xerbla_array:   error reporting, callable from C
+        @}
+    @}
+@}
+
+@defgroup blas_top BLAS
+BLAS are defined by three papers:
+Basic linear algebra subprograms for {FORTRAN} usage, Lawson et al, 1979.
+An extended set of {FORTRAN} basic linear algebra subprograms, Dongarra et al, 1988.
+A set of level 3 basic linear algebra subprograms, Dongarra et al, 1990.
+Some BLAS-like routines (e.g., csymv, crot, csum1, icmax1) exist in
+LAPACK rather than the classic BLAS.
+These were extended by the Extra Precision BLAS (XBLAS, not documented here)
+https://www.netlib.org/xblas/
+
+@{
+        @defgroup scalar_grp        Scalar operations
+        @{
+            @defgroup abs1          abs1:           | real( x ) | + | imag( x ) |
+        @}
+
+        @defgroup blas1_grp         Level 1 BLAS: vector ops
+        @{
+            @defgroup asum          asum:           sum | real( x_i ) | + | imag( x_i ) |
+            @defgroup sum1          sum1:           sum | x_i | (in LAPACK)
+            @defgroup axpy          axpy:           y = ax + y
+            @defgroup copy          copy:           y = x
+            @defgroup dot           dot:            x^H x and x^T x
+            @defgroup iamax         iamax:          argmax_i | real( x_i ) | + | imag( x_i ) |
+            @defgroup imax1         imax1:          argmax_i | x_i | (in LAPACK)
+            @defgroup nrm2          nrm2:           || x ||_2
+            @defgroup scal          scal:           x = alpha x
+            @defgroup swap          swap:           x <=> y
+
+            @defgroup rot_comp      --- Givens/Jacobi plane rotations ---
+            @defgroup rot           rot:            apply plane rotation ([cz]rot in LAPACK)
+            @defgroup rotg          rotg:           generate plane rotation (cf. lartg)
+            @defgroup rotm          rotm:           apply modified (fast) plane rotation
+            @defgroup rotmg         rotmg:          generate modified (fast) plane rotation
+        @}
+
+        @defgroup blas2_grp         Level 2 BLAS: matrix-vector ops
+        @{
+            @defgroup blas2_full    --- full ---
+            @defgroup gemv          gemv:           general matrix-vector multiply
+            @defgroup ger           ger:            general matrix rank-1 update
+
+            @defgroup hemv          {he,sy}mv:      Hermitian/symmetric matrix-vector multiply ([cz]symv in LAPACK)
+            @defgroup her           {he,sy}r:       Hermitian/symmetric rank-1 update
+            @defgroup her2          {he,sy}r2:      Hermitian/symmetric rank-2 update
+
+            @defgroup trmv          trmv:           triangular matrix-vector multiply
+            @defgroup trsv          trsv:           triangular matrix-vector solve
+
+            @defgroup blas2_packed  --- packed ---
+            @defgroup hpmv          {hp,sp}mv:      Hermitian/symmetric matrix-vector multiply
+            @defgroup hpr           {hp,sp}r:       Hermitian/symmetric rank-1 update
+            @defgroup hpr2          {hp,sp}r2:      Hermitian/symmetric rank-2 update
+
+            @defgroup tpmv          tpmv:           triangular matrix-vector multiply
+            @defgroup tpsv          tpsv:           triangular matrix-vector solve
+
+            @defgroup blas2_banded  --- banded ---
+            @defgroup gbmv          gbmv:           general matrix-vector multiply
+
+            @defgroup hbmv          {hb,sb}mv:      Hermitian/symmetric matrix-vector multiply
+
+            @defgroup tbmv          tbmv:           triangular matrix-vector multiply
+            @defgroup tbsv          tbsv:           triangular matrix-vector solve
+        @}
+
+        @defgroup blas3_grp         Level 3 BLAS: matrix-matrix ops
+        @{
+            @defgroup gemm          gemm:           general matrix-matrix multiply
+
+            @defgroup hemm          {he,sy}mm:      Hermitian/symmetric matrix-matrix multiply
+            @defgroup herk          {he,sy}rk:      Hermitian/symmetric rank-k update
+            @defgroup her2k         {he,sy}r2k:     Hermitian/symmetric rank-2k update
+
+            @defgroup trmm          trmm:           triangular matrix-matrix multiply
+            @defgroup trsm          trsm:           triangular matrix-matrix solve
+        @}
+@}
+
 **/

From 3d9e20f61455ca6893744c2f9a0e19d89d9c5998 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 25 Nov 2023 18:51:54 +0100
Subject: [PATCH 437/718] Update version to 3.12.0

---
 lapack-netlib/INSTALL/ilaver.f | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f
index a246c37cb..1827d5cd2 100644
--- a/lapack-netlib/INSTALL/ilaver.f
+++ b/lapack-netlib/INSTALL/ilaver.f
@@ -44,7 +44,6 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date November 2019
 *
 *> \ingroup auxOTHERauxiliary
 *
@@ -60,7 +59,7 @@
       INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH
 *  =====================================================================
       VERS_MAJOR = 3
-      VERS_MINOR = 11
+      VERS_MINOR = 12
       VERS_PATCH = 0
 *  =====================================================================
 *

From 578f0f95901617941c7de5ddc62fa97f12f9c0a0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 25 Nov 2023 18:53:16 +0100
Subject: [PATCH 438/718] Update version number to 3.12.0

---
 lapack-netlib/INSTALL/ilaver.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c
index 184e9b78d..d64c841a2 100644
--- a/lapack-netlib/INSTALL/ilaver.c
+++ b/lapack-netlib/INSTALL/ilaver.c
@@ -315,7 +315,6 @@ typedef struct Namelist Namelist;
 /* > \author Univ. of Colorado Denver */
 /* > \author NAG Ltd. */
 
-/* > \date November 2019 */
 
 /* > \ingroup auxOTHERauxiliary */
 
@@ -332,7 +331,7 @@ typedef struct Namelist Namelist;
 
 /*  ===================================================================== */
     *vers_major__ = 3;
-    *vers_minor__ = 11;
+    *vers_minor__ = 12;
     *vers_patch__ = 0;
 /*  ===================================================================== */
 

From 08be9004f89e41607e4c19de1e3d1018ee536728 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 25 Nov 2023 18:57:17 +0100
Subject: [PATCH 439/718] Update version number and copyright date to
 Reference-LAPACK 3.12.0

---
 lapack-netlib/CMakeLists.txt | 240 ++++++++++++++++-------------------
 lapack-netlib/LICENSE        |   6 +-
 lapack-netlib/README.md      |   3 +-
 3 files changed, 111 insertions(+), 138 deletions(-)

diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt
index fefaa8b89..f1f47ae24 100644
--- a/lapack-netlib/CMakeLists.txt
+++ b/lapack-netlib/CMakeLists.txt
@@ -1,9 +1,9 @@
-cmake_minimum_required(VERSION 3.2)
+cmake_minimum_required(VERSION 3.6)
 
-project(LAPACK Fortran C)
+project(LAPACK)
 
 set(LAPACK_MAJOR_VERSION 3)
-set(LAPACK_MINOR_VERSION 11)
+set(LAPACK_MINOR_VERSION 12)
 set(LAPACK_PATCH_VERSION 0)
 set(
   LAPACK_VERSION
@@ -45,6 +45,14 @@ if(_is_coverage_build)
   find_package(codecov)
 endif()
 
+# Use valgrind if it is found
+option( LAPACK_TESTING_USE_PYTHON "Use Python for testing. Disable it on memory checks." ON )
+find_program( MEMORYCHECK_COMMAND valgrind )
+if( MEMORYCHECK_COMMAND )
+  message( STATUS "Found valgrind: ${MEMORYCHECK_COMMAND}" )
+  set( MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --show-leak-kinds=all --track-origins=yes" )
+endif()
+
 # By default test Fortran compiler complex abs and complex division
 option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF)
 if( TEST_FORTRAN_COMPILER )
@@ -76,7 +84,7 @@ if( TEST_FORTRAN_COMPILER )
     WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
     COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err"
     SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f )
-    
+
 endif()
 
 # By default static library
@@ -99,6 +107,8 @@ else()
   set(LAPACKELIB "lapacke")
   set(TMGLIB "tmglib")
 endif()
+# By default build standard API and extended _64 API
+option(BUILD_INDEX64_EXT_API "Build Index-64 API as extended API with _64 suffix" ON)
 
 include(GNUInstallDirs)
 
@@ -127,90 +137,6 @@ configure_file(
 include(PreventInSourceBuilds)
 include(PreventInBuildInstalls)
 
-# Check if recursive flag exists
-include(CheckFortranCompilerFlag)
-if(CMAKE_Fortran_COMPILER_ID STREQUAL Flang)
-  check_fortran_compiler_flag("-Mrecursive" _MrecursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
-  check_fortran_compiler_flag("-frecursive" _frecursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
-  check_fortran_compiler_flag("-recursive" _recursiveFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
-  check_fortran_compiler_flag("-qrecur" _qrecurFlag)
-elseif(CMAKE_Fortran_COMPILER_ID STREQUAL NAG)
-  check_fortran_compiler_flag("-recursive" _recursiveFlag)
-else()
-  message(WARNING "Fortran local arrays should be allocated on the stack."
-    " Please use a compiler which guarantees that feature."
-    " See https://github.com/Reference-LAPACK/lapack/pull/188 and references therein.")
-endif()
-
-# Add recursive flag
-if(_MrecursiveFlag)
-  string(REGEX MATCH "-Mrecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -Mrecursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_frecursiveFlag)
-  string(REGEX MATCH "-frecursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -frecursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_recursiveFlag)
-  string(REGEX MATCH "-recursive" output_test <string> "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -recursive"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-elseif(_qrecurFlag)
-  string(REGEX MATCH "-qrecur" output_test <string> "${CMAKE_Fortran_FLAGS}")
-  if(NOT output_test)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qrecur"
-      CACHE STRING "Recursive flag must be set" FORCE)
-  endif()
-endif()
-
-if(UNIX)
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
-  endif()
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict")
-  endif()
-# Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
-# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
-  string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}")
-endif()
-
-if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq)
-  if(WIN32)
-    if(CMAKE_GENERATOR STREQUAL "NMake Makefiles")
-      get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE)
-      message(STATUS "Using Compaq Fortran compiler with command name ${CMAKE_Fortran_COMPILER_CMDNAM}")
-      set(cmd ${CMAKE_Fortran_COMPILER_CMDNAM})
-      string(TOLOWER "${cmd}" cmdlc)
-      if(cmdlc STREQUAL "df")
-        message(STATUS "Assume the Compaq Visual Fortran Compiler is being used")
-        set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
-        set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_INCLUDES 1)
-        #This is a workaround that is needed to avoid forward-slashes in the
-        #filenames listed in response files from incorrectly being interpreted as
-        #introducing compiler command options
-        if(${BUILD_SHARED_LIBS})
-          message(FATAL_ERROR "Making of shared libraries with CVF has not been tested.")
-        endif()
-        set(str "NMake version 9 or later should be used. NMake version 6.0 which is\n")
-        set(str "${str}   included with the CVF distribution fails to build Lapack because\n")
-        set(str "${str}   the number of source files exceeds the limit for NMake v6.0\n")
-        message(STATUS ${str})
-        set(CMAKE_Fortran_LINK_EXECUTABLE "LINK /out:<TARGET> <LINK_FLAGS> <LINK_LIBRARIES> <OBJECTS>")
-      endif()
-    endif()
-  endif()
-endif()
-
 # Add option to enable flat namespace for symbol resolution on macOS
 if(APPLE)
   option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF)
@@ -268,26 +194,6 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/bin)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
 
-# --------------------------------------------------
-# Check for any necessary platform specific compiler flags
-include(CheckLAPACKCompilerFlags)
-CheckLAPACKCompilerFlags()
-
-# --------------------------------------------------
-# Check second function
-
-include(CheckTimeFunction)
-set(TIME_FUNC NONE)
-CHECK_TIME_FUNCTION(NONE TIME_FUNC)
-CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
-CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
-CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
-CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
-message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
-
-set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
-set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
-
 # deprecated LAPACK and LAPACKE routines
 option(BUILD_DEPRECATED "Build deprecated routines" OFF)
 message(STATUS "Build deprecated routines: ${BUILD_DEPRECATED}")
@@ -380,18 +286,27 @@ endif()
 
 # Check the usage of the user provided or automatically found LAPACK libraries
 if(LAPACK_LIBRARIES)
-  include(CheckFortranFunctionExists)
-  set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
-  # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
-  CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND)
-  unset(CMAKE_REQUIRED_LIBRARIES)
-  if(LATESTLAPACK_FOUND)
-    message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
+  include(CheckLanguage)
+  check_language(Fortran)
+  if(CMAKE_Fortran_COMPILER)
+    enable_language(Fortran)
+    include(CheckFortranFunctionExists)
+    set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
+    # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
+    CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND)
+    unset(CMAKE_REQUIRED_LIBRARIES)
+    if(LATESTLAPACK_FOUND)
+      message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
+    else()
+      message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.")
+      message(ERROR "-->     Will use REFERENCE LAPACK (by default)")
+      message(ERROR "-->     Or Correct your LAPACK_LIBRARIES entry ")
+      message(ERROR "-->     Or Consider checking USE_OPTIMIZED_LAPACK")
+    endif()
   else()
-    message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.")
-    message(ERROR "-->     Will use REFERENCE LAPACK (by default)")
-    message(ERROR "-->     Or Correct your LAPACK_LIBRARIES entry ")
-    message(ERROR "-->     Or Consider checking USE_OPTIMIZED_LAPACK")
+    message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.")
+    message(STATUS "--> CMake couldn't find a Fortran compiler, so it cannot check if the provided LAPACK library works.")
+    set(LATESTLAPACK_FOUND TRUE)
   endif()
 endif()
 
@@ -399,6 +314,27 @@ endif()
 if(NOT LATESTLAPACK_FOUND)
   message(STATUS "Using supplied NETLIB LAPACK implementation")
   set(LAPACK_LIBRARIES ${LAPACKLIB})
+
+  enable_language(Fortran)
+
+  # Check for any necessary platform specific compiler flags
+  include(CheckLAPACKCompilerFlags)
+  CheckLAPACKCompilerFlags()
+
+  # Check second function
+  include(CheckTimeFunction)
+  set(TIME_FUNC NONE)
+  CHECK_TIME_FUNCTION(NONE TIME_FUNC)
+  CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
+  CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
+  CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
+  CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
+
+  # Set second function
+  message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
+  set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
+  set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
+
   add_subdirectory(SRC)
 else()
   set(CMAKE_EXE_LINKER_FLAGS
@@ -431,9 +367,11 @@ endif()
 # Cache export target
 set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME})
 if(BUILD_TESTING OR LAPACKE_WITH_TMG)
+  enable_language(Fortran)
   if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG)
     set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
     # Check if dlatms (part of tmg) is found
+    include(CheckFortranFunctionExists)
     CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND)
     unset(CMAKE_REQUIRED_LIBRARIES)
     if(NOT LAPACK_WITH_TMGLIB_FOUND)
@@ -448,6 +386,12 @@ endif()
 set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE})
 unset(LAPACK_INSTALL_EXPORT_NAME_CACHE)
 
+
+#-------------------------------------
+# LAPACKE
+# Include lapack.h and lapacke_mangling.h even if LAPACKE is not built
+add_subdirectory(LAPACKE/include)
+
 if(LAPACKE)
   add_subdirectory(LAPACKE)
 endif()
@@ -474,8 +418,8 @@ if (BLAS++)
   ExternalProject_Add(blaspp
 	  URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz
 	  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp
-	  BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-	  INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+	  BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+	  INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
 	  )
   ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES})
 endif()
@@ -487,16 +431,16 @@ if (LAPACK++)
   ExternalProject_Add(lapackpp
 	  URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
 	  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}  ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
-	  BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-	  INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+	  BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+	  INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
 	  )
   else ()
 # FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files
   ExternalProject_Add(lapackpp
 	  URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
 	  CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran"  -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}  ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
-	  BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" make
-	  INSTALL_COMMAND make PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" install
+	  BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+	  INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
   )
   endif()
   ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
@@ -671,22 +615,34 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
     set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage")
     set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION})
     set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS)
-    set(PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
+    set(DOXYGEN_PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
     set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES)
     set(DOXYGEN_SOURCE_BROWSER YES)
-    set(DISTRIBUTE_GROUP_DOC YES)
     set(DOXYGEN_CREATE_SUBDIRS YES)
     set(DOXYGEN_SEPARATE_MEMBER_PAGES YES)
     set(DOXYGEN_EXTRACT_ALL YES)
-    set(DOXYGEN_FILE_PATTERNS "*.f;*.c;*.h")
+    set(DOXYGEN_FILE_PATTERNS *.f *.f90 *.c *.h )
     set(DOXYGEN_RECURSIVE YES)
     set(DOXYGEN_GENERATE_TREEVIEW YES)
+    set(DOXYGEN_DOT_IMAGE_FORMAT svg)
     set(DOXYGEN_INTERACTIVE_SVG YES)
-    set(DOXYGEN_QUIET YES)
-    set(DOXYGEN_WARNINGS NO)
+    set(DOXYGEN_QUIET NO)
+    set(DOXYGEN_WARNINGS YES)
+    set(DOXYGEN_WARN_NO_PARAMDOC YES)
+    set(DOXYGEN_WARN_LOGFILE doxygen_error)
     set(DOXYGEN_GENERATE_HTML NO)
     set(DOXYGEN_GENERATE_MAN NO)
-
+    set(DOXYGEN_LAYOUT_FILE "DOCS/DoxygenLayout.xml")
+
+    # Exclude functions that are duplicated, creating conflicts.
+    set(DOXYGEN_EXCLUDE .git
+                        .github
+                        SRC/VARIANTS
+                        BLAS/SRC/lsame.f
+                        BLAS/SRC/xerbla.f
+                        BLAS/SRC/xerbla_array.f
+                        INSTALL/slamchf77.f
+                        INSTALL/dlamchf77.f )
 
     if (BUILD_HTML_DOCUMENTATION)
         set(DOXYGEN_GENERATE_HTML YES)
@@ -697,13 +653,21 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
 
         doxygen_add_docs(
             html
-            ${PROJECT_SOURCE_DIR}
+
+            # Doxygen INPUT =
+            ${PROJECT_SOURCE_DIR}/README.md
+            ${PROJECT_SOURCE_DIR}/BLAS
+            ${PROJECT_SOURCE_DIR}/CBLAS
+            ${PROJECT_SOURCE_DIR}/SRC
+            ${PROJECT_SOURCE_DIR}/INSTALL
+            ${PROJECT_SOURCE_DIR}/TESTING
+            ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
+
             COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)"
         )
     endif()
     if (BUILD_MAN_DOCUMENTATION)
         set(DOXYGEN_GENERATE_MAN YES)
-        set(DOXYGEN_EXCLUDE SRC/VARIANTS)
         set(DOXYGEN_MAN_LINKS YES)
         set(DOXYGEN_INLINE_SOURCES NO)
         set(DOXYGEN_CALL_GRAPH NO)
@@ -711,7 +675,15 @@ if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
 
         doxygen_add_docs(
             man
-            ${PROJECT_SOURCE_DIR}
+
+            # Doxygen INPUT =
+            ${PROJECT_SOURCE_DIR}/BLAS
+            ${PROJECT_SOURCE_DIR}/CBLAS
+            ${PROJECT_SOURCE_DIR}/SRC
+            ${PROJECT_SOURCE_DIR}/INSTALL
+            ${PROJECT_SOURCE_DIR}/TESTING
+            ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
+
             COMMENT "Generating man LAPACK documentation"
         )
     endif()
diff --git a/lapack-netlib/LICENSE b/lapack-netlib/LICENSE
index 94cdb0f85..96b04c988 100644
--- a/lapack-netlib/LICENSE
+++ b/lapack-netlib/LICENSE
@@ -1,9 +1,9 @@
-Copyright (c) 1992-2017 The University of Tennessee and The University
+Copyright (c) 1992-2023 The University of Tennessee and The University
                         of Tennessee Research Foundation.  All rights
                         reserved.
-Copyright (c) 2000-2017 The University of California Berkeley. All
+Copyright (c) 2000-2023 The University of California Berkeley. All
                         rights reserved.
-Copyright (c) 2006-2017 The University of Colorado Denver.  All rights
+Copyright (c) 2006-2023 The University of Colorado Denver.  All rights
                         reserved.
 
 $COPYRIGHT$
diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md
index 142aa7b72..a00d4c51d 100644
--- a/lapack-netlib/README.md
+++ b/lapack-netlib/README.md
@@ -6,7 +6,7 @@
 [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/)
 [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack)
 [![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions)
-
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/Reference-LAPACK/lapack/badge)](https://securityscorecards.dev/viewer/?uri=github.com/Reference-LAPACK/lapack)
 
 * VERSION 1.0   :  February 29, 1992
 * VERSION 1.0a  :  June 30, 1992
@@ -37,6 +37,7 @@
 * VERSION 3.10.0 : June 2021
 * VERSION 3.10.1 : April 2022
 * VERSION 3.11.0 : November 2022
+* VERSION 3.12.0 : November 2023
 
 LAPACK is a library of Fortran subroutines for solving the most commonly
 occurring problems in numerical linear algebra.

From 01c7010543d1abb5e99da0b97b731aff1acd5aa4 Mon Sep 17 00:00:00 2001
From: "Kirill A. Korinsky" <kirill@korins.ky>
Date: Mon, 27 Nov 2023 14:28:08 +0000
Subject: [PATCH 440/718] cmake/openblas.pc.in: fixed version and URL

---
 cmake/openblas.pc.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in
index 7e120af86..11e5606e5 100644
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ 
 Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
-Version: @OPENBLAS_VERSION@
-URL: https://github.com/xianyi/OpenBLAS
+Version: @OpenBLAS_VERSION@
+URL: https://github.com/OpenMathLib/OpenBLAS
 Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} 
 Cflags: -I${includedir}

From 9beee55167c80d7fee7aadb78856c27c7d4b4fe2 Mon Sep 17 00:00:00 2001
From: "Kirill A. Korinsky" <kirill@korins.ky>
Date: Mon, 27 Nov 2023 16:54:49 +0000
Subject: [PATCH 441/718] Enable overstep of too long args without DYNAMIC_ARCH

---
 CMakeLists.txt | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 69077322a..7c6b96f41 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -249,20 +249,21 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
   endif()
 endif()
 
-if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
+# Seems that this hack doesn't required since macOS 11 Big Sur
+if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
   set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   if (NOT NOFORTRAN)
   set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
- "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
- "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+ "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+ "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
  "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
  "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
  "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
   else ()
   set (CMAKE_C_CREATE_SHARED_LIBRARY
-   "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
-   "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+   "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+   "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
    "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
   endif ()
 endif()

From ff92e6e7071aa09dcf0f3d7196bdf53d1bbfc422 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 28 Nov 2023 12:53:35 +0100
Subject: [PATCH 442/718] Fix installation location of lapacke_mangling header

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c6b96f41..6b02864ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -542,7 +542,7 @@ if(NOT NO_LAPACKE)
   ADD_CUSTOM_TARGET(genlapacke
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
   )
-  install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
+  install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()
 
 # Install pkg-config files

From 1310a0931bc614d8b8bc211ddb82d72cd64aee43 Mon Sep 17 00:00:00 2001
From: Shiyou Yin <yinshiyou-hf@loongson.cn>
Date: Wed, 15 Nov 2023 16:54:06 +0800
Subject: [PATCH 443/718] loongarch: Refine build control for loongarch64.

1. Use getauxval instead of cpucfg to test hardware capability.
2. Remove unnecessary code and option for compiler check in c_check.
---
 c_check                             |  6 ++----
 c_check.pl                          |  6 ++----
 cpuid_loongarch64.c                 | 16 +++++++++++-----
 driver/others/dynamic_loongarch64.c | 19 +++++++------------
 4 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/c_check b/c_check
index b018c10a8..b5e4a9ad0 100755
--- a/c_check
+++ b/c_check
@@ -199,8 +199,7 @@ if [ "$architecture" = "loongarch64" ]; then
     tmpd="$(mktemp -d)"
     tmplsx="$tmpd/lsx.c"
     codelsx='"vadd.b $vr0, $vr0, $vr0"'
-    lsx_flags='-march=loongarch64 -mlsx'
-    printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
+    lsx_flags='-march=loongarch64'
     printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
     args="$lsx_flags -o $tmplsx.o $tmplsx"
     {
@@ -211,8 +210,7 @@ if [ "$architecture" = "loongarch64" ]; then
 
     tmplasx="$tmpd/lasx.c"
     codelasx='"xvadd.b $xr0, $xr0, $xr0"'
-    lasx_flags='-march=loongarch64 -mlasx'
-    printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
+    lasx_flags='-march=loongarch64'
     printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
     args="$lasx_flags -o $tmplasx.o $tmplasx"
     {
diff --git a/c_check.pl b/c_check.pl
index 7a860a211..d9c36793c 100644
--- a/c_check.pl
+++ b/c_check.pl
@@ -241,8 +241,7 @@ if (($architecture eq "loongarch64")) {
     } else {
 	$tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
 	$codelsx = '"vadd.b $vr0, $vr0, $vr0"';
-	$lsx_flags = "-march=loongarch64 -mlsx";
-	print $tmplsx "#include <lsxintrin.h>\n\n";
+	$lsx_flags = "-march=loongarch64";
 	print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
 
 	$args = "$lsx_flags -o $tmplsx.o $tmplsx";
@@ -257,8 +256,7 @@ if (($architecture eq "loongarch64")) {
 
 	$tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
 	$codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
-	$lasx_flags = "-march=loongarch64 -mlasx";
-	print $tmplasx "#include <lasxintrin.h>\n\n";
+	$lasx_flags = "-march=loongarch64";
 	print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
 
 	$args = "$lasx_flags -o $tmplasx.o $tmplasx";
diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c
index 7c389db27..0ad32ae4e 100644
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_LOONGSON3R5    1
 #define CPU_LOONGSON2K1000 2
 
-#define LA_HWCAP_LSX    (1<<4)
-#define LA_HWCAP_LASX   (1<<5)
+#define LA_HWCAP_LSX    (1U << 4)
+#define LA_HWCAP_LASX   (1U << 5)
 
 static char *cpuname[] = {
   "LOONGSONGENERIC",
@@ -64,11 +64,11 @@ static char *cpuname_lower[] = {
 
 int detect(void) {
 #ifdef __linux
-  int flag  = (int)getauxval(AT_HWCAP);
+  int hwcap  = (int)getauxval(AT_HWCAP);
 
-  if (flag & LA_HWCAP_LASX)
+  if (hwcap & LA_HWCAP_LASX)
     return CPU_LOONGSON3R5;
-  else if (flag & LA_HWCAP_LSX)
+  else if (hwcap & LA_HWCAP_LSX)
     return CPU_LOONGSON2K1000;
   else
     return CPU_GENERIC;
@@ -94,7 +94,9 @@ void get_subdirname(void) {
 }
 
 void get_cpuconfig(void) {
+  uint32_t hwcaps = 0;
   int d = detect();
+
   switch (d) {
     case CPU_LOONGSON3R5:
       printf("#define LOONGSON3R5\n");
@@ -129,6 +131,10 @@ void get_cpuconfig(void) {
       printf("#define L2_ASSOCIATIVE 16\n");
     break;
   }
+
+  hwcaps = (uint32_t)getauxval( AT_HWCAP );
+  if (hwcaps & LA_HWCAP_LSX)      printf("#define HAVE_LSX\n");
+  if (hwcaps & LA_HWCAP_LASX)     printf("#define HAVE_LASX\n");
 }
 
 void get_libname(void){
diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c
index 52f8bcb2f..44de59669 100644
--- a/driver/others/dynamic_loongarch64.c
+++ b/driver/others/dynamic_loongarch64.c
@@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/
 
+#include <sys/auxv.h>
 #include "common.h"
 
 extern gotoblas_t  gotoblas_LOONGSON3R5;
@@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) {
   return NULL;
 }
 
-#define LASX_MASK       1<<7
-#define LSX_MASK        1<<6
-#define LOONGARCH_CFG2  0x02
+#define LA_HWCAP_LSX    (1U << 4)
+#define LA_HWCAP_LASX   (1U << 5)
 
 static gotoblas_t *get_coretype(void) {
-  int ret = 0;
-  __asm__ volatile (
-    "cpucfg %0, %1 \n\t"
-    : "+&r"(ret)
-    : "r"(LOONGARCH_CFG2)
-  );
-
-  if (ret & LASX_MASK)
+  int hwcap = (int)getauxval(AT_HWCAP);
+
+  if (hwcap & LA_HWCAP_LASX)
     return &gotoblas_LOONGSON3R5;
-  else if (ret & LSX_MASK)
+  else if (hwcap & LA_HWCAP_LSX)
     return &gotoblas_LOONGSON2K1000;
   else
     return &gotoblas_LOONGSONGENERIC;

From 3def6a8143253a7b3c5b6477d803ecdb1165bfbc Mon Sep 17 00:00:00 2001
From: Shiyou Yin <yinshiyou-hf@loongson.cn>
Date: Wed, 15 Nov 2023 17:24:33 +0800
Subject: [PATCH 444/718] loongarch: Add LASX optimization for dot.

---
 common_loongarch64.h                     |  19 ++
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   0
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   4 +
 kernel/loongarch64/dot_lasx.S            | 309 +++++++++++++++++++++++
 4 files changed, 332 insertions(+)
 create mode 100644 kernel/loongarch64/KERNEL.LOONGSON2K1000
 create mode 100644 kernel/loongarch64/dot_lasx.S

diff --git a/common_loongarch64.h b/common_loongarch64.h
index ce1fcf091..4963b2f07 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -124,7 +124,17 @@ static inline int WhereAmI(void){
 #define CMPLE   fcmp.cle.d
 #define CMPLT   fcmp.clt.d
 #define NEG     fneg.d
+
+#define XVFSUB  xvfsub.d
+#define XVFADD  xvfadd.d
+#define XVFMADD xvfmadd.d
+
+#define VFSUB  vfsub.d
+#define VFADD  vfadd.d
+#define VFMADD vfmadd.d
+
 #else
+
 #define LD      fld.s
 #define ST      fst.s
 #define MADD    fmadd.s
@@ -142,6 +152,15 @@ static inline int WhereAmI(void){
 #define CMPLE   fcmp.cle.s
 #define CMPLT   fcmp.clt.s
 #define NEG     fneg.s
+
+#define XVFSUB  xvfsub.s
+#define XVFADD  xvfadd.s
+#define XVFMADD xvfmadd.s
+
+#define VFSUB  vfsub.s
+#define VFADD  vfadd.s
+#define VFMADD vfmadd.s
+
 #endif /* defined(DOUBLE) */
 
 #if defined(__64BIT__) && defined(USE64BITINT)
diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
new file mode 100644
index 000000000..e69de29bb
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 011e8b89e..d1b93b0ca 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -1,4 +1,8 @@
 ifndef NO_LASX
+
+SDOTKERNEL = dot_lasx.S
+DDOTKERNEL = dot_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S
new file mode 100644
index 000000000..9d3cbf63d
--- /dev/null
+++ b/kernel/loongarch64/dot_lasx.S
@@ -0,0 +1,309 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r17
+#define TEMP   $r18
+
+/* Don't change following FR unless you know the effects. */
+#define s1     $f8
+#define s2     $f9
+#define a1     $f10
+#define b1     $f11
+
+PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT     N,      0(N)
+    LDINT     INCX,   0(INCX)
+    LDINT     INCY,   0(INCY)
+#endif
+    SUB       s1,     s1,      s1
+    SUB       s2,     s2,      s2
+    slli.d    INCX,   INCX,    BASE_SHIFT
+    li.d      TEMP,   SIZE
+    slli.d    INCY,   INCY,    BASE_SHIFT
+    bge       $r0,    N,       .L999
+    bne       INCX,   TEMP,    .L20   /* inc_x=1 */
+    bne       INCY,   TEMP,    .L20   /* inc_y=1 */
+#ifdef DOUBLE
+    srai.d    I,      N,       4
+#else
+    srai.d    I,      N,       5
+#endif
+
+    /* init $xr8 and $xr9 to zero */
+#ifdef DOUBLE
+    xvldrepl.d $xr0,  X,       0
+#else
+    xvldrepl.w $xr0,  X,       0
+#endif
+    XVFSUB    $xr8,   $xr0,    $xr0
+    XVFSUB    $xr9,   $xr0,    $xr0
+
+/* !((inc_x == 1) && (inc_y == 1)) */
+    bge       $r0,    I,       .L12   /* <32 */
+.L11:
+    /* case 32~ */
+    xvld      $xr0,   X,       0
+    xvld      $xr1,   X,       32
+    xvld      $xr2,   X,       64
+    xvld      $xr3,   X,       96
+    xvld      $xr4,   Y,       0
+    xvld      $xr5,   Y,       32
+    xvld      $xr6,   Y,       64
+    xvld      $xr7,   Y,       96
+    addi.w    I,      I,       -1
+    addi.d    X,      X,       128
+    addi.d    Y,      Y,       128
+    XVFMADD   $xr8,   $xr0,    $xr4,   $xr8
+    XVFMADD   $xr9,   $xr1,    $xr5,   $xr9
+    XVFMADD   $xr8,   $xr2,    $xr6,   $xr8
+    XVFMADD   $xr9,   $xr3,    $xr7,   $xr9
+    bnez      I,      .L11
+.L12:
+#ifdef DOUBLE
+    andi      I,      N,       0xf
+    srai.d    I,      I,       2
+#else
+    andi      I,      N,       0x1f
+    srai.d    I,      I,       3
+#endif
+    bge       $r0,    I,       .L14   /* <8 */
+.L13:
+    /* case 8~31 */
+    xvld      $xr0,   X,       0
+    xvld      $xr4,   Y,       0
+    addi.w    I,      I,       -1
+    addi.d    X,      X,       32
+    addi.d    Y,      Y,       32
+    XVFMADD   $xr8,   $xr0,    $xr4,   $xr8
+    bnez      I,      .L13
+.L14:
+    /* store dot in s1 $f8 */
+    XVFADD    $xr8,   $xr8,    $xr9
+    SUB       s2,     s2,      s2   /* set s2 to 0.0 */
+    xvpermi.q $xr0,   $xr8,    0x1
+    VFADD     $vr8,   $vr8,    $vr0
+    vpackod.d $vr0,   $vr8,    $vr8
+#ifdef DOUBLE
+    VFADD     $vr8,   $vr8,    $vr0
+#else
+    VFADD     $vr8,   $vr8,    $vr0
+    vpackod.w $vr0,   $vr8,    $vr8
+    VFADD     $vr8,   $vr8,    $vr0
+#endif
+.L15:
+#ifdef DOUBLE
+    andi      I,      N,       0x3
+#else
+    andi      I,      N,       0x7
+#endif
+    bge       $r0,    I,       .L999  /* =0 */
+   .align  3
+.L16:
+    /* case 1~7 */
+    LD        a1,     X,       0
+    LD        b1,     Y,       0
+#ifdef DSDOT
+    fcvt.d.s  a1,     a1
+    fcvt.d.s  b1,     b1
+    fmadd.d   s1,     b1,      a1,     s1
+#else
+    MADD      s1,     b1,      a1,     s1
+#endif
+    addi.d    I,      I,       -1
+    addi.d    X,      X,       SIZE
+    addi.d    Y,      Y,       SIZE
+    bnez      I,      .L16
+    b         .L999
+    .align 3
+
+.L20:
+/* !((inc_x == 1) && (inc_y == 1)) */
+    srai.d    I,      N,       3
+#ifdef F_INTERFACE
+    bgez    INCX, .L21
+    addi.d  TEMP, N, -1
+    mult    TEMP, INCX
+    mflo    TEMP
+    dsub    X, X, TEMP
+    .align 3
+
+.L21:
+    bgez    INCY, .L22
+    addi.d  TEMP, N, -1
+    mult    TEMP, INCY
+    mflo    TEMP
+    dsub    Y, Y, TEMP
+    .align 3
+
+.L22:
+#endif
+    bge $r0,    I, .L25  /* <8 */
+    .align 3
+
+.L23:
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+    addi.d  I, I, -1
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+    blt $r0,    I, .L23
+    .align 3
+
+.L25:
+    andi    I,  N, 7
+    bge $r0,    I, .L999
+    .align  3
+
+.L26:
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+    addi.d  I, I, -1
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+    blt $r0,    I, .L26
+    .align 3
+
+.L999:
+#ifdef DSDOT
+    fadd.d    $f0,    s1,      s2
+#else
+    ADD       $f0,    s1,      s2
+#endif
+    move      $r4,    $r17
+    jirl      $r0,    $r1,     0x0
+
+EPILOGUE

From 13b8c44b44f7fc2eea4f1839b034802a8b1edcae Mon Sep 17 00:00:00 2001
From: Shiyou Yin <yinshiyou-hf@loongson.cn>
Date: Fri, 24 Nov 2023 16:40:32 +0800
Subject: [PATCH 445/718] loongarch: Add optimization for dsdot kernel.

---
 kernel/loongarch64/KERNEL.LOONGSON3R5 |  5 +-
 kernel/loongarch64/dot_lasx.S         | 83 +++++++++++++++++++++++----
 2 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index d1b93b0ca..020a82303 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -1,7 +1,8 @@
 ifndef NO_LASX
 
-SDOTKERNEL = dot_lasx.S
-DDOTKERNEL = dot_lasx.S
+SDOTKERNEL  = dot_lasx.S
+DSDOTKERNEL = dot_lasx.S
+DDOTKERNEL  = dot_lasx.S
 
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S
index 9d3cbf63d..0715b6311 100644
--- a/kernel/loongarch64/dot_lasx.S
+++ b/kernel/loongarch64/dot_lasx.S
@@ -51,6 +51,8 @@ PROLOGUE
     LDINT     INCX,   0(INCX)
     LDINT     INCY,   0(INCY)
 #endif
+
+    /* init $f8 and $f9 to zero */
     SUB       s1,     s1,      s1
     SUB       s2,     s2,      s2
     slli.d    INCX,   INCX,    BASE_SHIFT
@@ -59,11 +61,8 @@ PROLOGUE
     bge       $r0,    N,       .L999
     bne       INCX,   TEMP,    .L20   /* inc_x=1 */
     bne       INCY,   TEMP,    .L20   /* inc_y=1 */
-#ifdef DOUBLE
-    srai.d    I,      N,       4
-#else
-    srai.d    I,      N,       5
-#endif
+
+    /* !((inc_x == 1) && (inc_y == 1)) */
 
     /* init $xr8 and $xr9 to zero */
 #ifdef DOUBLE
@@ -71,13 +70,24 @@ PROLOGUE
 #else
     xvldrepl.w $xr0,  X,       0
 #endif
+#ifdef DSDOT
+    xvfcvtl.d.s       $xr0,    $xr0
+    xvfsub.d  $xr8,   $xr0,    $xr0
+    xvfsub.d  $xr9,   $xr0,    $xr0
+#else
     XVFSUB    $xr8,   $xr0,    $xr0
     XVFSUB    $xr9,   $xr0,    $xr0
+#endif
 
-/* !((inc_x == 1) && (inc_y == 1)) */
-    bge       $r0,    I,       .L12   /* <32 */
+#ifdef DOUBLE
+    srai.d    I,      N,       4
+#else
+    srai.d    I,      N,       5
+#endif
+    bge       $r0,    I,       .L12   /* FLOAT: <32 ; DOUBLE: <16 */
+    .align  3
 .L11:
-    /* case 32~ */
+    /* FLOAT: 32~ ; DOUBLE: 16~ */
     xvld      $xr0,   X,       0
     xvld      $xr1,   X,       32
     xvld      $xr2,   X,       64
@@ -89,11 +99,39 @@ PROLOGUE
     addi.w    I,      I,       -1
     addi.d    X,      X,       128
     addi.d    Y,      Y,       128
+#ifdef DSDOT
+    xvfcvtl.d.s       $xr10,   $xr0
+    xvfcvtl.d.s       $xr11,   $xr4
+    xvfcvth.d.s       $xr12,   $xr0
+    xvfcvth.d.s       $xr13,   $xr4
+    xvfmadd.d $xr8,   $xr10,   $xr12,  $xr8
+    xvfmadd.d $xr9,   $xr11,   $xr13,  $xr9
+    xvfcvtl.d.s       $xr10,   $xr1
+    xvfcvtl.d.s       $xr11,   $xr5
+    xvfcvth.d.s       $xr12,   $xr1
+    xvfcvth.d.s       $xr13,   $xr5
+    xvfmadd.d $xr8,   $xr10,   $xr12,  $xr8
+    xvfmadd.d $xr9,   $xr11,   $xr13,  $xr9
+    xvfcvtl.d.s       $xr10,   $xr2
+    xvfcvtl.d.s       $xr11,   $xr6
+    xvfcvth.d.s       $xr12,   $xr2
+    xvfcvth.d.s       $xr13,   $xr6
+    xvfmadd.d $xr8,   $xr10,   $xr12,  $xr8
+    xvfmadd.d $xr9,   $xr11,   $xr13,  $xr9
+    xvfcvtl.d.s       $xr10,   $xr3
+    xvfcvtl.d.s       $xr11,   $xr7
+    xvfcvth.d.s       $xr12,   $xr3
+    xvfcvth.d.s       $xr13,   $xr7
+    xvfmadd.d $xr8,   $xr10,   $xr12,  $xr8
+    xvfmadd.d $xr9,   $xr11,   $xr13,  $xr9
+#else
     XVFMADD   $xr8,   $xr0,    $xr4,   $xr8
     XVFMADD   $xr9,   $xr1,    $xr5,   $xr9
     XVFMADD   $xr8,   $xr2,    $xr6,   $xr8
     XVFMADD   $xr9,   $xr3,    $xr7,   $xr9
+#endif
     bnez      I,      .L11
+    .align  3
 .L12:
 #ifdef DOUBLE
     andi      I,      N,       0xf
@@ -102,18 +140,37 @@ PROLOGUE
     andi      I,      N,       0x1f
     srai.d    I,      I,       3
 #endif
-    bge       $r0,    I,       .L14   /* <8 */
+    bge       $r0,    I,       .L14   /* DOUBLE: <4 ; FLOAT: <8 */
+    .align  3
 .L13:
-    /* case 8~31 */
+    /* FLOAT: 8~31 ; DOUBLE: 4~15 */
     xvld      $xr0,   X,       0
     xvld      $xr4,   Y,       0
     addi.w    I,      I,       -1
     addi.d    X,      X,       32
     addi.d    Y,      Y,       32
+#ifdef DSDOT
+    xvfcvtl.d.s       $xr10,   $xr0
+    xvfcvtl.d.s       $xr11,   $xr4
+    xvfcvth.d.s       $xr12,   $xr0
+    xvfcvth.d.s       $xr13,   $xr4
+    xvfmadd.d $xr8,   $xr10,   $xr12,  $xr8
+    xvfmadd.d $xr9,   $xr11,   $xr13,  $xr9
+#else
     XVFMADD   $xr8,   $xr0,    $xr4,   $xr8
+#endif
     bnez      I,      .L13
+    .align  3
 .L14:
     /* store dot in s1 $f8 */
+#ifdef DSDOT
+    xvfadd.d  $xr8,   $xr8,    $xr9
+    fsub.s    s2,     s2,      s2,  /* set s2 to 0.0 */
+    xvpermi.q $xr0,   $xr8,    0x1
+    vfadd.d   $vr8,   $vr8,    $vr0
+    vpackod.d $vr0,   $vr8,    $vr8
+    vfadd.d   $vr8,   $vr8,    $vr0
+#else
     XVFADD    $xr8,   $xr8,    $xr9
     SUB       s2,     s2,      s2   /* set s2 to 0.0 */
     xvpermi.q $xr0,   $xr8,    0x1
@@ -125,7 +182,9 @@ PROLOGUE
     VFADD     $vr8,   $vr8,    $vr0
     vpackod.w $vr0,   $vr8,    $vr8
     VFADD     $vr8,   $vr8,    $vr0
-#endif
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align  3
 .L15:
 #ifdef DOUBLE
     andi      I,      N,       0x3
@@ -135,7 +194,7 @@ PROLOGUE
     bge       $r0,    I,       .L999  /* =0 */
    .align  3
 .L16:
-    /* case 1~7 */
+    /* FLOAT: 1~7 ; DOUBLE: 1~3 */
     LD        a1,     X,       0
     LD        b1,     Y,       0
 #ifdef DSDOT

From 9fe07d82fdeb4c2d81bdf57415ce1db157109bb1 Mon Sep 17 00:00:00 2001
From: Shiyou Yin <yinshiyou-hf@loongson.cn>
Date: Fri, 24 Nov 2023 17:57:14 +0800
Subject: [PATCH 446/718] loongarch: Add LSX optimization for dot.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   7 +
 kernel/loongarch64/dot_lsx.S             | 364 +++++++++++++++++++++++
 2 files changed, 371 insertions(+)
 create mode 100644 kernel/loongarch64/dot_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index e69de29bb..b2a396674 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -0,0 +1,7 @@
+ifndef NO_LSX
+
+SDOTKERNEL  = dot_lsx.S
+DSDOTKERNEL = dot_lsx.S
+DDOTKERNEL  = dot_lsx.S
+
+endif
diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S
new file mode 100644
index 000000000..f7f613553
--- /dev/null
+++ b/kernel/loongarch64/dot_lsx.S
@@ -0,0 +1,364 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r17
+#define TEMP   $r18
+
+/* Don't change following FR unless you know the effects. */
+#define s1     $f8
+#define s2     $f9
+#define a1     $f10
+#define b1     $f11
+
+PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT     N,      0(N)
+    LDINT     INCX,   0(INCX)
+    LDINT     INCY,   0(INCY)
+#endif
+
+    /* init $f8 and $f9 to zero */
+    SUB       s1,     s1,      s1
+    SUB       s2,     s2,      s2
+    slli.d    INCX,   INCX,    BASE_SHIFT
+    li.d      TEMP,   SIZE
+    slli.d    INCY,   INCY,    BASE_SHIFT
+    bge       $r0,    N,       .L999
+    bne       INCX,   TEMP,    .L20   /* inc_x=1 */
+    bne       INCY,   TEMP,    .L20   /* inc_y=1 */
+
+    /* !((inc_x == 1) && (inc_y == 1)) */
+
+    /* init $vr8 and $vr9 to zero */
+#ifdef DOUBLE
+    vldrepl.d  $vr0,  X,       0
+#else
+    vldrepl.w  $vr0,  X,       0
+#endif
+#ifdef DSDOT
+    vfcvtl.d.s        $vr0,    $vr0
+    vfsub.d   $vr8,   $vr0,    $vr0
+    vfsub.d   $vr9,   $vr0,    $vr0
+#else
+    VFSUB     $vr8,   $vr0,    $vr0
+    VFSUB     $vr9,   $vr0,    $vr0
+#endif
+
+#ifdef DOUBLE
+    srai.d    I,      N,       3
+#else
+    srai.d    I,      N,       4
+#endif
+    bge       $r0,    I,       .L12   /* FLOAT: <16 ; DOUBLE: <8 */
+    .align  3
+.L11:
+    /* FLOAT: 16~ ; DOUBLE: 8~ */
+    vld       $vr0,   X,       0
+    vld       $vr1,   X,       16
+    vld       $vr2,   X,       32
+    vld       $vr3,   X,       48
+    vld       $vr4,   Y,       0
+    vld       $vr5,   Y,       16
+    vld       $vr6,   Y,       32
+    vld       $vr7,   Y,       48
+    addi.w    I,      I,       -1
+    addi.d    X,      X,       64
+    addi.d    Y,      Y,       64
+#ifdef DSDOT
+    vfcvtl.d.s        $vr10,   $vr0
+    vfcvtl.d.s        $vr11,   $vr4
+    vfcvth.d.s        $vr12,   $vr0
+    vfcvth.d.s        $vr13,   $vr4
+    vfmadd.d $vr8,    $vr10,   $vr12,  $vr8
+    vfmadd.d $vr9,    $vr11,   $vr13,  $vr9
+    vfcvtl.d.s        $vr10,   $vr1
+    vfcvtl.d.s        $vr11,   $vr5
+    vfcvth.d.s        $vr12,   $vr1
+    vfcvth.d.s        $vr13,   $vr5
+    vfmadd.d $vr8,    $vr10,   $vr12,  $vr8
+    vfmadd.d $vr9,    $vr11,   $vr13,  $vr9
+    vfcvtl.d.s        $vr10,   $vr2
+    vfcvtl.d.s        $vr11,   $vr6
+    vfcvth.d.s        $vr12,   $vr2
+    vfcvth.d.s        $vr13,   $vr6
+    vfmadd.d $vr8,    $vr10,   $vr12,  $vr8
+    vfmadd.d $vr9,    $vr11,   $vr13,  $vr9
+    vfcvtl.d.s        $vr10,   $vr3
+    vfcvtl.d.s        $vr11,   $vr7
+    vfcvth.d.s        $vr12,   $vr3
+    vfcvth.d.s        $vr13,   $vr7
+    vfmadd.d $vr8,    $vr10,   $vr12,  $vr8
+    vfmadd.d $vr9,    $vr11,   $vr13,  $vr9
+#else
+    VFMADD    $vr8,   $vr0,    $vr4,   $vr8
+    VFMADD    $vr9,   $vr1,    $vr5,   $vr9
+    VFMADD    $vr8,   $vr2,    $vr6,   $vr8
+    VFMADD    $vr9,   $vr3,    $vr7,   $vr9
+#endif
+    bnez      I,      .L11
+    .align  3
+.L12:
+#ifdef DOUBLE
+    andi      I,      N,       0x7
+    srai.d    I,      I,       1
+#else
+    andi      I,      N,       0xf
+    srai.d    I,      I,       2
+#endif
+    bge       $r0,    I,       .L14   /* DOUBLE: <2 ; FLOAT: <4 */
+    .align  3
+.L13:
+    /* FLOAT: 4~15 ; DOUBLE: 2~7 */
+    vld       $vr0,   X,       0
+    vld       $vr4,   Y,       0
+    addi.w    I,      I,       -1
+    addi.d    X,      X,       16
+    addi.d    Y,      Y,       16
+#ifdef DSDOT
+    vfcvtl.d.s        $vr10,   $vr0
+    vfcvtl.d.s        $vr11,   $vr4
+    vfcvth.d.s        $vr12,   $vr0
+    vfcvth.d.s        $vr13,   $vr4
+    vfmadd.d  $vr8,   $vr10,   $vr12,  $vr8
+    vfmadd.d  $vr9,   $vr11,   $vr13,  $vr9
+#else
+    VFMADD    $vr8,   $vr0,    $vr4,   $vr8
+#endif
+    bnez      I,      .L13
+    .align  3
+.L14:
+    /* store dot in s1 $f8 */
+#ifdef DSDOT
+    vfadd.d   $vr8,   $vr8,    $vr9
+    fsub.s    s2,     s2,      s2,  /* set s2 to 0.0 */
+    vpackod.d $vr0,   $vr8,    $vr8
+    vfadd.d   $vr8,   $vr8,    $vr0
+#else
+    VFADD     $vr8,   $vr8,    $vr9
+    SUB       s2,     s2,      s2   /* set s2 to 0.0 */
+    vpackod.d $vr0,   $vr8,    $vr8
+#ifdef DOUBLE
+    VFADD     $vr8,   $vr8,    $vr0
+#else
+    VFADD     $vr8,   $vr8,    $vr0
+    vpackod.w $vr0,   $vr8,    $vr8
+    VFADD     $vr8,   $vr8,    $vr0
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align  3
+.L15:
+#ifdef DOUBLE
+    andi      I,      N,       0x1
+#else
+    andi      I,      N,       0x3
+#endif
+    bge       $r0,    I,       .L999  /* =0 */
+   .align  3
+.L16:
+    /* DOUBLE: 1 ; FLOAT: 1~3 */
+    LD        a1,     X,       0
+    LD        b1,     Y,       0
+#ifdef DSDOT
+    fcvt.d.s  a1,     a1
+    fcvt.d.s  b1,     b1
+    fmadd.d   s1,     b1,      a1,     s1
+#else
+    MADD      s1,     b1,      a1,     s1
+#endif
+    addi.d    I,      I,       -1
+    addi.d    X,      X,       SIZE
+    addi.d    Y,      Y,       SIZE
+    bnez      I,      .L16
+    b         .L999
+    .align 3
+
+.L20:
+/* !((inc_x == 1) && (inc_y == 1)) */
+    srai.d    I,      N,       3
+#ifdef F_INTERFACE
+    bgez    INCX, .L21
+    addi.d  TEMP, N, -1
+    mult    TEMP, INCX
+    mflo    TEMP
+    dsub    X, X, TEMP
+    .align 3
+
+.L21:
+    bgez    INCY, .L22
+    addi.d  TEMP, N, -1
+    mult    TEMP, INCY
+    mflo    TEMP
+    dsub    Y, Y, TEMP
+    .align 3
+
+.L22:
+#endif
+    bge $r0,    I, .L25  /* <8 */
+    .align 3
+
+.L23:
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+    addi.d  I, I, -1
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+    blt $r0,    I, .L23
+    .align 3
+
+.L25:
+    andi    I,  N, 7
+    bge $r0,    I, .L999
+    .align  3
+
+.L26:
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+    addi.d  I, I, -1
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+    blt $r0,    I, .L26
+    .align 3
+
+.L999:
+#ifdef DSDOT
+    fadd.d    $f0,    s1,      s2
+#else
+    ADD       $f0,    s1,      s2
+#endif
+    move      $r4,    $r17
+    jirl      $r0,    $r1,     0x0
+
+EPILOGUE

From a1562e4baec6ead6a4c4263fb6d22d5d35f7394e Mon Sep 17 00:00:00 2001
From: "Kirill A. Korinsky" <kirill@korins.ky>
Date: Tue, 28 Nov 2023 14:04:01 +0000
Subject: [PATCH 447/718] Allow weak linking on old macOS

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c6b96f41..7e5d1e0fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -258,13 +258,13 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
  "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
  "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
  "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
- "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
+ "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
  "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
   else ()
   set (CMAKE_C_CREATE_SHARED_LIBRARY
    "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
    "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
-   "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
+   "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
   endif ()
 endif()
 

From 08fde5ebd206d4f808069668a4020d64852da34d Mon Sep 17 00:00:00 2001
From: "Kirill A. Korinsky" <kirill@korins.ky>
Date: Thu, 30 Nov 2023 21:24:58 +0000
Subject: [PATCH 448/718] Use 64bit build on `CMAKE_SYSTEM_PROCESSOR=i386` on
 Darwin

Here a bit tricky things.

A value `CMAKE_SYSTEM_PROCESSOR` is came from output of `uname -m` which
migth be 32bit with 64bit building applicaiton.

So, for that case use `CMAKE_SIZEOF_VOID_P` to detect the target.

See https://trac.macports.org/ticket/68488
---
 cmake/system_check.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index 49b9863e3..c9671b379 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -46,7 +46,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
   set(LOONGARCH64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
   set(RISCV64 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*"))
   if (NOT BINARY)
     if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
       set(X86_64 1)

From 89fa51d495560b88bd2057aae4782e2ee58ce706 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Erik=20Br=C3=A5then=20Solem?= <erikbsolem@hotmail.com>
Date: Sun, 3 Dec 2023 19:06:49 +0100
Subject: [PATCH 449/718] Revert 42b5e08 ("Allow weak linking on old macOS")

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7203088a0..6b02864ca 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -258,13 +258,13 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
  "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
  "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
  "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
- "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
+ "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
  "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
   else ()
   set (CMAKE_C_CREATE_SHARED_LIBRARY
    "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
    "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
-   "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
+   "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
   endif ()
 endif()
 

From 2381132ada5e97a576f47abff7f97f6d2a26266a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Erik=20Br=C3=A5then=20Solem?= <erikbsolem@hotmail.com>
Date: Sun, 3 Dec 2023 19:13:53 +0100
Subject: [PATCH 450/718] Darwin < 20: always write xerbla.c.o into archive

Write xerbla.c.o into archive regardless of timestamp by using ar -rs
instead of ar -ru.
---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b02864ca..f3eac2edf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -256,14 +256,14 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
   set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
  "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
- "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+ "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
  "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
  "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
  "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
   else ()
   set (CMAKE_C_CREATE_SHARED_LIBRARY
    "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
-   "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+   "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
    "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
   endif ()
 endif()

From e4586e81b896b85b600c50f9670e59989cbdabf7 Mon Sep 17 00:00:00 2001
From: Octavian Maghiar <octavian.maghiar@imgtec.com>
Date: Mon, 4 Dec 2023 11:02:18 +0000
Subject: [PATCH 451/718] [RISC-V] Add RISC-V Vector 128-bit target

Current RVV x280 target depends on vlen=512-bits for Level 3 operations.
Commit adds generic target that supports vlen=128-bits.
New target uses the same scalable kernels as x280 for Level 1&2 operations, and autogenerated kernels for Level 3 operations.
Functional correctness of Level 3 operations tested on vlen=128-bits using QEMU v8.1.1 for ctests and BLAS-Tester.
---
 Makefile.prebuild                         |    4 +
 Makefile.riscv64                          |    4 +
 TargetList.txt                            |    1 +
 cpuid_riscv64.c                           |    4 +-
 getarch.c                                 |   13 +-
 kernel/riscv64/KERNEL.RISCV64_ZVL128B     |  243 +++++
 kernel/riscv64/cgemm_kernel_8x4_zvl128b.c |  996 +++++++++++++++++++
 kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c | 1102 +++++++++++++++++++++
 kernel/riscv64/dgemm_kernel_8x4_zvl128b.c |  492 +++++++++
 kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c |  660 ++++++++++++
 kernel/riscv64/sgemm_kernel_8x8_zvl128b.c |  791 +++++++++++++++
 kernel/riscv64/strmm_kernel_8x8_zvl128b.c |  991 ++++++++++++++++++
 kernel/riscv64/zgemm_kernel_4x4_zvl128b.c |  720 ++++++++++++++
 kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c |  805 +++++++++++++++
 param.h                                   |   39 +
 15 files changed, 6863 insertions(+), 2 deletions(-)
 create mode 100644 kernel/riscv64/KERNEL.RISCV64_ZVL128B
 create mode 100644 kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
 create mode 100644 kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c
 create mode 100644 kernel/riscv64/dgemm_kernel_8x4_zvl128b.c
 create mode 100644 kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c
 create mode 100644 kernel/riscv64/sgemm_kernel_8x8_zvl128b.c
 create mode 100644 kernel/riscv64/strmm_kernel_8x8_zvl128b.c
 create mode 100644 kernel/riscv64/zgemm_kernel_4x4_zvl128b.c
 create mode 100644 kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c

diff --git a/Makefile.prebuild b/Makefile.prebuild
index c4f4a2602..b56169da0 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -59,6 +59,10 @@ ifeq ($(TARGET), x280)
 TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 
+ifeq ($(TARGET), RISCV64_ZVL128B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
 ifeq ($(TARGET), RISCV64_GENERIC)
 TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
 endif
diff --git a/Makefile.riscv64 b/Makefile.riscv64
index ce7a27141..93e270bde 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -6,6 +6,10 @@ ifeq ($(CORE), x280)
 CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
+ifeq ($(CORE), RISCV64_ZVL128B)
+CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d 
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
diff --git a/TargetList.txt b/TargetList.txt
index f76f605cc..5b7a63831 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -119,6 +119,7 @@ Z14
 
 10.RISC-V 64:
 RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
+RISCV64_ZVL128B
 C910V
 x280
 
diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 1b6b62f21..15a539c20 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -73,11 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_GENERIC         0
 #define CPU_C910V           1
 #define CPU_RISCV64_ZVL256B 2
+#define CPU_RISCV64_ZVL128B 3
 
 static char *cpuname[] = {
   "RISCV64_GENERIC",
   "C910V",
-  "CPU_RISCV64_ZVL256B"
+  "CPU_RISCV64_ZVL256B",
+  "CPU_RISCV64_ZVL128B"
 };
 
 int detect(void){
diff --git a/getarch.c b/getarch.c
index 772836347..b8b7ef7e0 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1691,7 +1691,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "x280"
 #else
 #endif
-
+#ifdef FORCE_RISCV64_ZVL128B
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "RISCV64_ZVL128B"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG "-DRISCV64_ZVL128B "                          \
+                   "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
+                   "-DL2_SIZE=1048576 -DL2_LINESIZE=32 "         \
+                   "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME  "riscv64_zvl128b"
+#define CORENAME "RISCV64_ZVL128B"
+#endif
 
 #if defined(FORCE_E2K) || defined(__e2k__)
 #define FORCE
diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
new file mode 100644
index 000000000..fec69ee09
--- /dev/null
+++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
@@ -0,0 +1,243 @@
+SAMAXKERNEL  = amax_rvv.c
+DAMAXKERNEL  = amax_rvv.c
+CAMAXKERNEL  = zamax_rvv.c
+ZAMAXKERNEL  = zamax_rvv.c
+
+SAMINKERNEL  = amin_rvv.c
+DAMINKERNEL  = amin_rvv.c
+CAMINKERNEL  = zamin_rvv.c
+ZAMINKERNEL  = zamin_rvv.c
+
+SMAXKERNEL   = max_rvv.c
+DMAXKERNEL   = max_rvv.c
+
+SMINKERNEL   = min_rvv.c
+DMINKERNEL   = min_rvv.c
+
+ISAMAXKERNEL = iamax_rvv.c
+IDAMAXKERNEL = iamax_rvv.c
+ICAMAXKERNEL = izamax_rvv.c
+IZAMAXKERNEL = izamax_rvv.c
+
+ISAMINKERNEL = iamin_rvv.c
+IDAMINKERNEL = iamin_rvv.c
+ICAMINKERNEL = izamin_rvv.c
+IZAMINKERNEL = izamin_rvv.c
+
+ISMAXKERNEL  = imax_rvv.c
+IDMAXKERNEL  = imax_rvv.c
+
+ISMINKERNEL  = imin_rvv.c
+IDMINKERNEL  = imin_rvv.c
+
+SASUMKERNEL  = asum_rvv.c
+DASUMKERNEL  = asum_rvv.c
+CASUMKERNEL  = zasum_rvv.c
+ZASUMKERNEL  = zasum_rvv.c
+
+SSUMKERNEL  = sum_rvv.c
+DSUMKERNEL  = sum_rvv.c
+CSUMKERNEL  = zsum_rvv.c
+ZSUMKERNEL  = zsum_rvv.c
+
+SAXPYKERNEL  = axpy_rvv.c
+DAXPYKERNEL  = axpy_rvv.c
+CAXPYKERNEL  = zaxpy_rvv.c
+ZAXPYKERNEL  = zaxpy_rvv.c
+
+SAXPBYKERNEL  = axpby_rvv.c
+DAXPBYKERNEL  = axpby_rvv.c
+CAXPBYKERNEL  = zaxpby_rvv.c
+ZAXPBYKERNEL  = zaxpby_rvv.c
+
+SCOPYKERNEL  = copy_rvv.c
+DCOPYKERNEL  = copy_rvv.c
+CCOPYKERNEL  = zcopy_rvv.c
+ZCOPYKERNEL  = zcopy_rvv.c
+
+SDOTKERNEL   = dot_rvv.c
+DDOTKERNEL   = dot_rvv.c
+CDOTKERNEL   = zdot_rvv.c
+ZDOTKERNEL   = zdot_rvv.c
+DSDOTKERNEL  = dot_rvv.c
+
+SNRM2KERNEL  = nrm2_rvv.c
+DNRM2KERNEL  = nrm2_rvv.c
+CNRM2KERNEL  = znrm2_rvv.c
+ZNRM2KERNEL  = znrm2_rvv.c
+
+SROTKERNEL   = rot_rvv.c
+DROTKERNEL   = rot_rvv.c
+CROTKERNEL   = zrot_rvv.c
+ZROTKERNEL   = zrot_rvv.c
+
+SSCALKERNEL  = scal_rvv.c
+DSCALKERNEL  = scal_rvv.c
+CSCALKERNEL  = zscal_rvv.c
+ZSCALKERNEL  = zscal_rvv.c
+
+SSWAPKERNEL  = swap_rvv.c
+DSWAPKERNEL  = swap_rvv.c
+CSWAPKERNEL  = zswap_rvv.c
+ZSWAPKERNEL  = zswap_rvv.c
+
+SGEMVNKERNEL = gemv_n_rvv.c
+DGEMVNKERNEL = gemv_n_rvv.c
+CGEMVNKERNEL = zgemv_n_rvv.c
+ZGEMVNKERNEL = zgemv_n_rvv.c
+
+SGEMVTKERNEL = gemv_t_rvv.c
+DGEMVTKERNEL = gemv_t_rvv.c
+CGEMVTKERNEL = zgemv_t_rvv.c
+ZGEMVTKERNEL = zgemv_t_rvv.c
+
+SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c
+DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c
+CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c
+ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+STRMMKERNEL	   =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c
+STRMMUNCOPY_M  =  ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
+STRMMLNCOPY_M  =  ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
+STRMMUTCOPY_M  =  ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
+STRMMLTCOPY_M  =  ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
+
+DTRMMKERNEL	   =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c
+DTRMMUNCOPY_M  =  ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
+DTRMMLNCOPY_M  =  ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
+DTRMMUTCOPY_M  =  ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
+DTRMMLTCOPY_M  =  ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
+
+CTRMMKERNEL	   =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c
+CTRMMUNCOPY_M  =  ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
+CTRMMLNCOPY_M  =  ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+CTRMMUTCOPY_M  =  ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
+CTRMMLTCOPY_M  =  ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+
+ZTRMMKERNEL	   =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c
+ZTRMMUNCOPY_M  =  ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
+ZTRMMLNCOPY_M  =  ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
+ZTRMMUTCOPY_M  =  ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
+ZTRMMLTCOPY_M  =  ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
+
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+SSYMV_U_KERNEL =  symv_U_rvv.c 
+SSYMV_L_KERNEL =  symv_L_rvv.c
+DSYMV_U_KERNEL =  symv_U_rvv.c 
+DSYMV_L_KERNEL =  symv_L_rvv.c
+CSYMV_U_KERNEL =  zsymv_U_rvv.c
+CSYMV_L_KERNEL =  zsymv_L_rvv.c
+ZSYMV_U_KERNEL =  zsymv_U_rvv.c
+ZSYMV_L_KERNEL =  zsymv_L_rvv.c
+
+CHEMV_L_KERNEL =  zhemv_LM_rvv.c
+CHEMV_M_KERNEL =  zhemv_LM_rvv.c
+CHEMV_U_KERNEL =  zhemv_UV_rvv.c
+CHEMV_V_KERNEL =  zhemv_UV_rvv.c
+ZHEMV_L_KERNEL =  zhemv_LM_rvv.c
+ZHEMV_M_KERNEL =  zhemv_LM_rvv.c
+ZHEMV_U_KERNEL =  zhemv_UV_rvv.c
+ZHEMV_V_KERNEL =  zhemv_UV_rvv.c
+
+SSYMMUCOPY_M   =  ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c 
+SSYMMLCOPY_M   =  ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c
+
+DSYMMUCOPY_M   =  ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c 
+DSYMMLCOPY_M   =  ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c
+
+CSYMMUCOPY_M   =  ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c 
+CSYMMLCOPY_M   =  ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
+
+ZSYMMUCOPY_M   =  ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c 
+ZSYMMLCOPY_M   =  ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
+
+CHEMMLTCOPY_M  =  ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
+CHEMMUTCOPY_M  =  ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
+
+ZHEMMLTCOPY_M  =  ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
+ZHEMMUTCOPY_M  =  ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
+
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+
+ifndef SGEMM_BETA
+SGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = zgemm_beta_rvv.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = zgemm_beta_rvv.c
+endif
diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..bd615389c
--- /dev/null
+++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
@@ -0,0 +1,996 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=8
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='float'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f32m2'
+ VFMUL='__riscv_vfmul_vf_f32m2'
+ VLEV='__riscv_vle32_v_f32m2'
+ VLSEV='__riscv_vlse32_v_f32m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+ VSETVL='__riscv_vsetvl_e32m2'
+ VSEV='__riscv_vse32_v_f32m2'
+ VSSEV='__riscv_vsse32_v_f32m2'
+ acc_vector_t='vfloat32m2_t'
+ output='cgemm_kernel_8x4_zvl128b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define S0 1
+#define S1 -1
+#define S2 1
+#define S3 1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfmacc
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define S0 1
+#define S1 1
+#define S2 1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfmsac
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define S0 1
+#define S1 1
+#define S2 -1
+#define S3 1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfnmsac
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define S0 1
+#define S1 -1
+#define S2 -1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfnmacc
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m2(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            float B2r = B[bi + 2 * 2 + 0];
+            float B2i = B[bi + 2 * 2 + 1];
+            float B3r = B[bi + 3 * 2 + 0];
+            float B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat32m2_t ACC2r = tmp0r;
+            vfloat32m2_t ACC2i = tmp0i;
+            vfloat32m2_t ACC3r = tmp1r;
+            vfloat32m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
+            C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
+            C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
+            C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            float B2r = B[bi + 2 * 2 + 0];
+            float B2i = B[bi + 2 * 2 + 1];
+            float B3r = B[bi + 3 * 2 + 0];
+            float B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat32m2_t ACC2r = tmp0r;
+            vfloat32m2_t ACC2i = tmp0i;
+            vfloat32m2_t ACC3r = tmp1r;
+            vfloat32m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
+            C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
+            C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
+            C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
+                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
+                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
+                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
+                ai += 2 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
+            Cr += result8 * alphar;
+            Ci += result9 * alphar;
+            Cr -= result9 * alphai;
+            Ci += result8 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 1) * 2 + 1];
+            Cr += result10 * alphar;
+            Ci += result11 * alphar;
+            Cr -= result11 * alphai;
+            Ci += result10 * alphai;
+            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
+            Cr += result12 * alphar;
+            Ci += result13 * alphar;
+            Cr -= result13 * alphai;
+            Ci += result12 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 1) * 2 + 1];
+            Cr += result14 * alphar;
+            Ci += result15 * alphar;
+            Cr -= result15 * alphai;
+            Ci += result14 * alphai;
+            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                ai += 1 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                ai += 2 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                ai += 1 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                ai += 2 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                ai += 1 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..3268cb810
--- /dev/null
+++ b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c
@@ -0,0 +1,1102 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=8
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='float'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f32m2'
+ VFMUL='__riscv_vfmul_vf_f32m2'
+ VLEV='__riscv_vle32_v_f32m2'
+ VLSEV='__riscv_vlse32_v_f32m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+ VSETVL='__riscv_vsetvl_e32m2'
+ VSEV='__riscv_vse32_v_f32m2'
+ VSSEV='__riscv_vsse32_v_f32m2'
+ acc_vector_t='vfloat32m2_t'
+ output='ctrmm_kernel_8x4_zvl128b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define S0 1
+#define S1 -1
+#define S2 1
+#define S3 1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfmacc
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define S0 1
+#define S1 1
+#define S2 1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfmsac
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define S0 1
+#define S1 1
+#define S2 -1
+#define S3 1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfnmsac
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define S0 1
+#define S1 -1
+#define S2 -1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfnmacc
+#endif
+
+#if defined(LEFT) != defined(TRANSA)
+#define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m2(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            float B2r = B[bi + 2 * 2 + 0];
+            float B2i = B[bi + 2 * 2 + 1];
+            float B3r = B[bi + 3 * 2 + 0];
+            float B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat32m2_t ACC2r = tmp0r;
+            vfloat32m2_t ACC2i = tmp0i;
+            vfloat32m2_t ACC3r = tmp1r;
+            vfloat32m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl);
+            vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl);
+            vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl);
+            vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            float B2r = B[bi + 2 * 2 + 0];
+            float B2i = B[bi + 2 * 2 + 1];
+            float B3r = B[bi + 3 * 2 + 0];
+            float B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat32m2_t ACC2r = tmp0r;
+            vfloat32m2_t ACC2i = tmp0i;
+            vfloat32m2_t ACC3r = tmp1r;
+            vfloat32m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl);
+            vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl);
+            vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl);
+            vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
+                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
+                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
+                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
+                ai += 2 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result8 * alphar;
+            Ci = result9 * alphar;
+            Cr -= result9 * alphai;
+            Ci += result8 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result10 * alphar;
+            Ci = result11 * alphar;
+            Cr -= result11 * alphai;
+            Ci += result10 * alphai;
+            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result12 * alphar;
+            Ci = result13 * alphar;
+            Cr -= result13 * alphai;
+            Ci += result12 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result14 * alphar;
+            Ci = result15 * alphar;
+            Cr -= result15 * alphai;
+            Ci += result14 * alphai;
+            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                ai += 1 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                ai += 2 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                ai += 1 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                ai += 2 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                ai += 1 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..a613f0bce
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c
@@ -0,0 +1,492 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=4
+ M=8
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='double'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=4
+ VFMACC='__riscv_vfmacc_vf_f64m4'
+ VFMUL='__riscv_vfmul_vf_f64m4'
+ VLEV='__riscv_vle64_v_f64m4'
+ VLSEV='__riscv_vlse64_v_f64m4'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
+ VSETVL='__riscv_vsetvl_e64m4'
+ VSEV='__riscv_vse64_v_f64m4'
+ VSSEV='__riscv_vsse64_v_f64m4'
+ acc_vector_t='vfloat64m4_t'
+ output='dgemm_kernel_8x4_zvl128b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m4_t'
+
+*/
+
+#include "common.h"
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m4(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            double B2 = B[bi + 2];
+            double B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
+            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            double B2 = B[bi + 2];
+            double B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
+            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                ai += 2;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            C[ci + 2 * ldc + 0] += alpha * result4;
+            C[ci + 2 * ldc + 1] += alpha * result5;
+            C[ci + 3 * ldc + 0] += alpha * result6;
+            C[ci + 3 * ldc + 1] += alpha * result7;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                ai += 1;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            C[ci + 2 * ldc + 0] += alpha * result2;
+            C[ci + 3 * ldc + 0] += alpha * result3;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e64m4(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                ai += 2;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                ai += 1;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e64m4(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                ai += 2;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                ai += 1;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..c1e0da86e
--- /dev/null
+++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c
@@ -0,0 +1,660 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=4
+ M=8
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='double'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=4
+ VFMACC='__riscv_vfmacc_vf_f64m4'
+ VFMUL='__riscv_vfmul_vf_f64m4'
+ VLEV='__riscv_vle64_v_f64m4'
+ VLSEV='__riscv_vlse64_v_f64m4'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
+ VSETVL='__riscv_vsetvl_e64m4'
+ VSEV='__riscv_vse64_v_f64m4'
+ VSSEV='__riscv_vsse64_v_f64m4'
+ acc_vector_t='vfloat64m4_t'
+ output='dtrmm_kernel_8x4_zvl128b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m4_t'
+
+*/
+
+#include "common.h"
+
+#if defined(LEFT) != defined(TRANSA)
+#define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m4(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            double B2 = B[bi + 2];
+            double B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
+            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
+            vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl);
+            vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            double B2 = B[bi + 2];
+            double B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
+            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
+            vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl);
+            vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                ai += 2;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            C[ci + 2 * ldc + 0] = alpha * result4;
+            C[ci + 2 * ldc + 1] = alpha * result5;
+            C[ci + 3 * ldc + 0] = alpha * result6;
+            C[ci + 3 * ldc + 1] = alpha * result7;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                ai += 1;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            C[ci + 2 * ldc + 0] = alpha * result2;
+            C[ci + 3 * ldc + 0] = alpha * result3;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e64m4(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                ai += 2;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                ai += 1;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e64m4(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                ai += 2;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                ai += 1;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c
new file mode 100644
index 000000000..ad720e694
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c
@@ -0,0 +1,791 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=8
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='float'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f32m2'
+ VFMUL='__riscv_vfmul_vf_f32m2'
+ VLEV='__riscv_vle32_v_f32m2'
+ VLSEV='__riscv_vlse32_v_f32m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+ VSETVL='__riscv_vsetvl_e32m2'
+ VSEV='__riscv_vse32_v_f32m2'
+ VSSEV='__riscv_vsse32_v_f32m2'
+ acc_vector_t='vfloat32m2_t'
+ output='sgemm_kernel_8x8_zvl128b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 8; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m2(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            float B4 = B[bi + 4];
+            float B5 = B[bi + 5];
+            float B6 = B[bi + 6];
+            float B7 = B[bi + 7];
+            bi += 8;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+            vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
+            vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
+            vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
+            vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                B4 = B[bi + 4];
+                B5 = B[bi + 5];
+                B6 = B[bi + 6];
+                B7 = B[bi + 7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+            c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
+            c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
+            c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
+            c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c4, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c5, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c6, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            float B4 = B[bi + 4];
+            float B5 = B[bi + 5];
+            float B6 = B[bi + 6];
+            float B7 = B[bi + 7];
+            bi += 8;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+            vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
+            vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
+            vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
+            vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                B4 = B[bi + 4];
+                B5 = B[bi + 5];
+                B6 = B[bi + 6];
+                B7 = B[bi + 7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+            c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
+            c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
+            c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
+            c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c4, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c5, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c6, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                result8 += A[ai + 0] * B[bi + 4];
+                result9 += A[ai + 1] * B[bi + 4];
+                result10 += A[ai + 0] * B[bi + 5];
+                result11 += A[ai + 1] * B[bi + 5];
+                result12 += A[ai + 0] * B[bi + 6];
+                result13 += A[ai + 1] * B[bi + 6];
+                result14 += A[ai + 0] * B[bi + 7];
+                result15 += A[ai + 1] * B[bi + 7];
+                ai += 2;
+                bi += 8;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            C[ci + 2 * ldc + 0] += alpha * result4;
+            C[ci + 2 * ldc + 1] += alpha * result5;
+            C[ci + 3 * ldc + 0] += alpha * result6;
+            C[ci + 3 * ldc + 1] += alpha * result7;
+            C[ci + 4 * ldc + 0] += alpha * result8;
+            C[ci + 4 * ldc + 1] += alpha * result9;
+            C[ci + 5 * ldc + 0] += alpha * result10;
+            C[ci + 5 * ldc + 1] += alpha * result11;
+            C[ci + 6 * ldc + 0] += alpha * result12;
+            C[ci + 6 * ldc + 1] += alpha * result13;
+            C[ci + 7 * ldc + 0] += alpha * result14;
+            C[ci + 7 * ldc + 1] += alpha * result15;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                result4 += A[ai + 0] * B[bi + 4];
+                result5 += A[ai + 0] * B[bi + 5];
+                result6 += A[ai + 0] * B[bi + 6];
+                result7 += A[ai + 0] * B[bi + 7];
+                ai += 1;
+                bi += 8;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            C[ci + 2 * ldc + 0] += alpha * result2;
+            C[ci + 3 * ldc + 0] += alpha * result3;
+            C[ci + 4 * ldc + 0] += alpha * result4;
+            C[ci + 5 * ldc + 0] += alpha * result5;
+            C[ci + 6 * ldc + 0] += alpha * result6;
+            C[ci + 7 * ldc + 0] += alpha * result7;
+            m_top += 1;
+        }
+
+        n_top += 8;
+    }
+
+    // -- tails for N=4
+
+    if (N & 4) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                ai += 2;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            C[ci + 2 * ldc + 0] += alpha * result4;
+            C[ci + 2 * ldc + 1] += alpha * result5;
+            C[ci + 3 * ldc + 0] += alpha * result6;
+            C[ci + 3 * ldc + 1] += alpha * result7;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                ai += 1;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            C[ci + 2 * ldc + 0] += alpha * result2;
+            C[ci + 3 * ldc + 0] += alpha * result3;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                ai += 2;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                ai += 1;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                ai += 2;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                ai += 1;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/strmm_kernel_8x8_zvl128b.c b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c
new file mode 100644
index 000000000..ef18f036c
--- /dev/null
+++ b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c
@@ -0,0 +1,991 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=8
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='float'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f32m2'
+ VFMUL='__riscv_vfmul_vf_f32m2'
+ VLEV='__riscv_vle32_v_f32m2'
+ VLSEV='__riscv_vlse32_v_f32m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+ VSETVL='__riscv_vsetvl_e32m2'
+ VSEV='__riscv_vse32_v_f32m2'
+ VSSEV='__riscv_vsse32_v_f32m2'
+ acc_vector_t='vfloat32m2_t'
+ output='strmm_kernel_8x8_zvl128b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(LEFT) != defined(TRANSA)
+#define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 8; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m2(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 8;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 8;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            float B4 = B[bi + 4];
+            float B5 = B[bi + 5];
+            float B6 = B[bi + 6];
+            float B7 = B[bi + 7];
+            bi += 8;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+            vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
+            vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
+            vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
+            vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                B4 = B[bi + 4];
+                B5 = B[bi + 5];
+                B6 = B[bi + 6];
+                B7 = B[bi + 7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
+            vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
+            vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl);
+            vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl);
+            vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl);
+            vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c4, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c5, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c6, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 8;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 8;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            float B4 = B[bi + 4];
+            float B5 = B[bi + 5];
+            float B6 = B[bi + 6];
+            float B7 = B[bi + 7];
+            bi += 8;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+            vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
+            vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
+            vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
+            vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                B4 = B[bi + 4];
+                B5 = B[bi + 5];
+                B6 = B[bi + 6];
+                B7 = B[bi + 7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
+            vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
+            vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl);
+            vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl);
+            vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl);
+            vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c4, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c5, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c6, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 8;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 8;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                result8 += A[ai + 0] * B[bi + 4];
+                result9 += A[ai + 1] * B[bi + 4];
+                result10 += A[ai + 0] * B[bi + 5];
+                result11 += A[ai + 1] * B[bi + 5];
+                result12 += A[ai + 0] * B[bi + 6];
+                result13 += A[ai + 1] * B[bi + 6];
+                result14 += A[ai + 0] * B[bi + 7];
+                result15 += A[ai + 1] * B[bi + 7];
+                ai += 2;
+                bi += 8;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            C[ci + 2 * ldc + 0] = alpha * result4;
+            C[ci + 2 * ldc + 1] = alpha * result5;
+            C[ci + 3 * ldc + 0] = alpha * result6;
+            C[ci + 3 * ldc + 1] = alpha * result7;
+            C[ci + 4 * ldc + 0] = alpha * result8;
+            C[ci + 4 * ldc + 1] = alpha * result9;
+            C[ci + 5 * ldc + 0] = alpha * result10;
+            C[ci + 5 * ldc + 1] = alpha * result11;
+            C[ci + 6 * ldc + 0] = alpha * result12;
+            C[ci + 6 * ldc + 1] = alpha * result13;
+            C[ci + 7 * ldc + 0] = alpha * result14;
+            C[ci + 7 * ldc + 1] = alpha * result15;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 8;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 8;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                result4 += A[ai + 0] * B[bi + 4];
+                result5 += A[ai + 0] * B[bi + 5];
+                result6 += A[ai + 0] * B[bi + 6];
+                result7 += A[ai + 0] * B[bi + 7];
+                ai += 1;
+                bi += 8;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            C[ci + 2 * ldc + 0] = alpha * result2;
+            C[ci + 3 * ldc + 0] = alpha * result3;
+            C[ci + 4 * ldc + 0] = alpha * result4;
+            C[ci + 5 * ldc + 0] = alpha * result5;
+            C[ci + 6 * ldc + 0] = alpha * result6;
+            C[ci + 7 * ldc + 0] = alpha * result7;
+            m_top += 1;
+        }
+
+        n_top += 8;
+    }
+
+    // -- tails for N=4
+
+    if (N & 4) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
+            vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
+            vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                ai += 2;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            C[ci + 2 * ldc + 0] = alpha * result4;
+            C[ci + 2 * ldc + 1] = alpha * result5;
+            C[ci + 3 * ldc + 0] = alpha * result6;
+            C[ci + 3 * ldc + 1] = alpha * result7;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                ai += 1;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            C[ci + 2 * ldc + 0] = alpha * result2;
+            C[ci + 3 * ldc + 0] = alpha * result3;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                ai += 2;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                ai += 1;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                ai += 2;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                ai += 1;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c
new file mode 100644
index 000000000..0776f03fd
--- /dev/null
+++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c
@@ -0,0 +1,720 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=4
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='double'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f64m2'
+ VFMUL='__riscv_vfmul_vf_f64m2'
+ VLEV='__riscv_vle64_v_f64m2'
+ VLSEV='__riscv_vlse64_v_f64m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m2'
+ VSETVL='__riscv_vsetvl_e64m2'
+ VSEV='__riscv_vse64_v_f64m2'
+ VSSEV='__riscv_vsse64_v_f64m2'
+ acc_vector_t='vfloat64m2_t'
+ output='zgemm_kernel_4x4_zvl128b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define S0 1
+#define S1 -1
+#define S2 1
+#define S3 1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfmacc
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define S0 1
+#define S1 1
+#define S2 1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfmsac
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define S0 1
+#define S1 1
+#define S2 -1
+#define S3 1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfnmsac
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define S0 1
+#define S1 -1
+#define S2 -1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfnmacc
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m2(4);
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            double B1r = B[bi + 1 * 2 + 0];
+            double B1i = B[bi + 1 * 2 + 1];
+            double B2r = B[bi + 2 * 2 + 0];
+            double B2i = B[bi + 2 * 2 + 1];
+            double B3r = B[bi + 3 * 2 + 0];
+            double B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+            vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+            vfloat64m2_t ACC1r = tmp1r;
+            vfloat64m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat64m2_t ACC2r = tmp0r;
+            vfloat64m2_t ACC2i = tmp0i;
+            vfloat64m2_t ACC3r = tmp1r;
+            vfloat64m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
+            C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
+            C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
+            C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 4;
+        }
+
+        // -- tails for main pass
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
+                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
+                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
+                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
+                ai += 2 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
+            Cr += result8 * alphar;
+            Ci += result9 * alphar;
+            Cr -= result9 * alphai;
+            Ci += result8 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 1) * 2 + 1];
+            Cr += result10 * alphar;
+            Ci += result11 * alphar;
+            Cr -= result11 * alphai;
+            Ci += result10 * alphai;
+            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
+            Cr += result12 * alphar;
+            Ci += result13 * alphar;
+            Cr -= result13 * alphai;
+            Ci += result12 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 1) * 2 + 1];
+            Cr += result14 * alphar;
+            Ci += result15 * alphar;
+            Cr -= result15 * alphai;
+            Ci += result14 * alphai;
+            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                ai += 1 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e64m2(4);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            double B1r = B[bi + 1 * 2 + 0];
+            double B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+            vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+            vfloat64m2_t ACC1r = tmp1r;
+            vfloat64m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                ai += 2 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                ai += 1 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e64m2(4);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                ai += 2 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                ai += 1 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c
new file mode 100644
index 000000000..d7d5e5fea
--- /dev/null
+++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c
@@ -0,0 +1,805 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=4
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='double'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f64m2'
+ VFMUL='__riscv_vfmul_vf_f64m2'
+ VLEV='__riscv_vle64_v_f64m2'
+ VLSEV='__riscv_vlse64_v_f64m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m2'
+ VSETVL='__riscv_vsetvl_e64m2'
+ VSEV='__riscv_vse64_v_f64m2'
+ VSSEV='__riscv_vsse64_v_f64m2'
+ acc_vector_t='vfloat64m2_t'
+ output='ztrmm_kernel_4x4_zvl128b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define S0 1
+#define S1 -1
+#define S2 1
+#define S3 1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfmacc
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define S0 1
+#define S1 1
+#define S2 1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfmsac
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define S0 1
+#define S1 1
+#define S2 -1
+#define S3 1
+#define VFMACC_RR __riscv_vfmacc
+#define VFMACC_RI __riscv_vfnmsac
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define S0 1
+#define S1 -1
+#define S2 -1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmsac
+#define VFMACC_RI __riscv_vfnmacc
+#endif
+
+#if defined(LEFT) != defined(TRANSA)
+#define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m2(4);
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            double B1r = B[bi + 1 * 2 + 0];
+            double B1i = B[bi + 1 * 2 + 1];
+            double B2r = B[bi + 2 * 2 + 0];
+            double B2i = B[bi + 2 * 2 + 1];
+            double B3r = B[bi + 3 * 2 + 0];
+            double B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+            vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+            vfloat64m2_t ACC1r = tmp1r;
+            vfloat64m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat64m2_t ACC2r = tmp0r;
+            vfloat64m2_t ACC2i = tmp0i;
+            vfloat64m2_t ACC3r = tmp1r;
+            vfloat64m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl);
+            vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl);
+            vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl);
+            vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 4;
+        }
+
+        // -- tails for main pass
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
+                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
+                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
+                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
+                ai += 2 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result8 * alphar;
+            Ci = result9 * alphar;
+            Cr -= result9 * alphai;
+            Ci += result8 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result10 * alphar;
+            Ci = result11 * alphar;
+            Cr -= result11 * alphai;
+            Ci += result10 * alphai;
+            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result12 * alphar;
+            Ci = result13 * alphar;
+            Cr -= result13 * alphai;
+            Ci += result12 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result14 * alphar;
+            Ci = result15 * alphar;
+            Cr -= result15 * alphai;
+            Ci += result14 * alphai;
+            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                ai += 1 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e64m2(4);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            double B1r = B[bi + 1 * 2 + 0];
+            double B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+            vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+            vfloat64m2_t ACC1r = tmp1r;
+            vfloat64m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                ai += 2 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                ai += 1 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e64m2(4);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                ai += 2 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                ai += 1 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/param.h b/param.h
index c5c70b78e..a1a70400c 100644
--- a/param.h
+++ b/param.h
@@ -3123,6 +3123,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
+#ifdef RISCV64_ZVL128B
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 8
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
 #ifdef RISCV64_ZVL256B
 #define GEMM_DEFAULT_OFFSET_A 0
 #define GEMM_DEFAULT_OFFSET_B 0

From 4a12cf53ec116c06e5d74073b54a3bca6046cb17 Mon Sep 17 00:00:00 2001
From: Octavian Maghiar <octavian.maghiar@imgtec.com>
Date: Mon, 4 Dec 2023 11:13:35 +0000
Subject: [PATCH 452/718] [RISC-V] Improve RVV kernel generator LMUL usage

The RVV kernel generation script uses the provided LMUL to increase the number of accumulator registers.
Since the effect of the LMUL is to group together the vector registers into larger ones, it actually should be used as a multiplier in the calculation of vlenmax.
At the moment, no matter what LMUL is provided, the generated kernels would only set the maximum number of vector elements equal to VLEN/SEW.
Commit changes the use of LMUL to properly adjust vlenmax. Note that an increase in LMUL results in a decrease in the number of effective vector registers.
---
 kernel/riscv64/generate_kernel.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py
index e2ce97971..8be7c9f9c 100755
--- a/kernel/riscv64/generate_kernel.py
+++ b/kernel/riscv64/generate_kernel.py
@@ -197,13 +197,13 @@ def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ):
         dest.write("ai += {M}*2;")
         dest.write()
 
-
-        accumulation_regs = a_regs * N * settings['LMUL_ACC'].value
+        # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results
+        accumulation_regs = a_regs * N
         dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k",
                 a_regs=a_regs*2, accumulation_regs=accumulation_regs*2
             )
         pass_regs = (accumulation_regs + a_regs)*2
-        tmp_regs = 32-pass_regs
+        tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs
         if tmp_regs < 2:
             raise RuntimeError("Complex kernel would use too many registers!")
 
@@ -337,10 +337,12 @@ def generate_gemm_kernel( settings, OUTPUT ):
 
     M = settings['M'].value
     N = settings['N'].value
-    vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value )
+    vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value /
+                  settings['ELEN_PARAM'].value)
     a_regs = max(int(M/vlenmax), 1)
 
-    accumulation_regs = a_regs * N * settings['LMUL_ACC'].value
+    # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results
+    accumulation_regs = a_regs * N
     required_regs = accumulation_regs + a_regs
     if is_complex:
         required_regs = required_regs * 2 + 2
@@ -380,9 +382,9 @@ def generate_gemm_kernel( settings, OUTPUT ):
 '''.format(tail_policy=settings['tail_policy'].value))
 
 
-    if required_regs > 32:
-        raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only 32 are available".format(
-            required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else '')
+    if required_regs > (32 // settings['LMUL_ACC'].value):
+        raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format(
+            required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value
             ))
 
     TRMM = (settings['op'].value == 'trmm')
@@ -448,7 +450,8 @@ def generate_gemm_kernel( settings, OUTPUT ):
 def generate_M_tails( dest, settings, M, N ):
     M_tail = int(M/2)
     M_tail_min = settings['M_tail_scalar_from'].value
-    vlenmax = int( settings['reg_width_bits'].value / settings['ELEN_PARAM'].value )
+    vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value
+                  / settings['ELEN_PARAM'].value )
     TRMM           = (settings['op'].value == 'trmm')
     is_complex = settings['complex'].value
     generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real
@@ -667,4 +670,4 @@ def main():
             ERROR("unsupported kernel type {}".format(settings['op']))
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From a53a79e059545e2935e2745f41cf3498277f1a1e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Dec 2023 15:41:39 +0100
Subject: [PATCH 453/718] Add tests for the DMD functions (Reference-LAPACK PR
 736)

---
 lapack-netlib/TESTING/EIG/Makefile    |  58 +-
 lapack-netlib/TESTING/EIG/cchkdmd.f90 | 721 +++++++++++++++++++++++
 lapack-netlib/TESTING/EIG/dchkdmd.f90 | 813 ++++++++++++++++++++++++++
 lapack-netlib/TESTING/EIG/schkdmd.f90 | 792 +++++++++++++++++++++++++
 lapack-netlib/TESTING/EIG/zchkdmd.f90 | 745 +++++++++++++++++++++++
 5 files changed, 3116 insertions(+), 13 deletions(-)
 create mode 100644 lapack-netlib/TESTING/EIG/cchkdmd.f90
 create mode 100644 lapack-netlib/TESTING/EIG/dchkdmd.f90
 create mode 100644 lapack-netlib/TESTING/EIG/schkdmd.f90
 create mode 100644 lapack-netlib/TESTING/EIG/zchkdmd.f90

diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile
index 942ae6982..5de315b6e 100644
--- a/lapack-netlib/TESTING/EIG/Makefile
+++ b/lapack-netlib/TESTING/EIG/Makefile
@@ -64,6 +64,8 @@ SEIGTST = schkee.o \
    sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \
    sstt22.o ssyl01.o ssyt21.o ssyt22.o
 
+SDMDEIGTST = schkdmd.o
+
 CEIGTST = cchkee.o \
    cbdt01.o cbdt02.o cbdt03.o cbdt05.o \
    cchkbb.o cchkbd.o cchkbk.o cchkbl.o cchkec.o \
@@ -81,6 +83,8 @@ CEIGTST = cchkee.o \
    csgt01.o cslect.o csyl01.o\
    cstt21.o cstt22.o cunt01.o cunt03.o
 
+CDMDEIGTST = cchkdmd.o
+
 DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \
    dsvdch.o dsvdct.o dsxt1.o
 
@@ -101,6 +105,8 @@ DEIGTST = dchkee.o \
    dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \
    dstt22.o dsyl01.o dsyt21.o dsyt22.o
 
+DDMDEIGTST = dchkdmd.o
+
 ZEIGTST = zchkee.o \
    zbdt01.o zbdt02.o zbdt03.o zbdt05.o \
    zchkbb.o zchkbd.o zchkbk.o zchkbl.o zchkec.o \
@@ -118,27 +124,45 @@ ZEIGTST = zchkee.o \
    zsgt01.o zslect.o zsyl01.o\
    zstt21.o zstt22.o zunt01.o zunt03.o
 
+ZDMDEIGTST = zchkdmd.o
+
 .PHONY: all
 all: single complex double complex16
 
 .PHONY: single complex double complex16
-single: xeigtsts
-complex: xeigtstc
-double: xeigtstd
-complex16: xeigtstz
+single: xeigtsts xdmdeigtsts
+complex: xeigtstc xdmdeigtstc
+double: xeigtstd xdmdeigtstd
+complex16: xeigtstz xdmdeigtstz
+
+xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+
+xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
+xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+
+xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+
+$(SDMDEIGTST): $(FRC)
+$(CDMDEIGTST): $(FRC)
+$(DDMDEIGTST): $(FRC)
+$(ZDMDEIGTST): $(FRC)
 $(AEIGTST): $(FRC)
 $(SCIGTST): $(FRC)
 $(DZIGTST): $(FRC)
@@ -155,7 +179,7 @@ clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
 cleanexe:
-	rm -f xeigtst*
+	rm -f xeigtst* xdmdeigtst*
 
 schkee.o: schkee.F
 	$(FC) $(FFLAGS_DRV) -c -o $@ $<
@@ -165,3 +189,11 @@ cchkee.o: cchkee.F
 	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 zchkee.o: zchkee.F
 	$(FC) $(FFLAGS_DRV) -c -o $@ $<
+schkdmd.o: schkdmd.f90
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
+cchkdmd.o: cchkdmd.f90
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
+dchkdmd.o: dchkdmd.f90
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
+zchkdmd.o: zchkdmd.f90
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
diff --git a/lapack-netlib/TESTING/EIG/cchkdmd.f90 b/lapack-netlib/TESTING/EIG/cchkdmd.f90
new file mode 100644
index 000000000..a9c181da9
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/cchkdmd.f90
@@ -0,0 +1,721 @@
+!   This is a test program for checking the implementations of
+!   the implementations of the following subroutines
+!
+!   CGEDMD,  for computation of the
+!            Dynamic Mode Decomposition (DMD)
+!   CGEDMDQ, for computation of a
+!            QR factorization based compressed DMD
+!
+!   Developed and supported by:
+!   ===========================
+!   Developed and coded by Zlatko Drmac, Faculty of Science,
+!   University of Zagreb;  drmac@math.hr
+!   In cooperation with
+!   AIMdyn Inc., Santa Barbara, CA.
+!   ========================================================
+!   How to run the code (compiler, link info)
+!   ========================================================
+!   Compile as FORTRAN 90 (or later) and link with BLAS and
+!   LAPACK libraries.
+!   NOTE: The code is developed and tested on top of the
+!   Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!   using the Intel Fortran compiler.
+!
+!   For developers of the C++ implementation
+!   ========================================================
+!   See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!   Note on a development of the GPU HP implementation
+!   ========================================================
+!   Work in progress. See CUDA, MAGMA, SLATE.
+!   NOTE: The four SVD subroutines used in this code are
+!   included as a part of R&D and for the completeness.
+!   This was also an opportunity to test those SVD codes.
+!   If the scaling option is used all four are essentially
+!   equally good. For implementations on HP platforms,
+!   one can use whichever SVD is available.
+!............................................................
+
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+
+      use iso_fortran_env
+      IMPLICIT NONE
+      integer, parameter :: WP = real32
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+
+      COMPLEX(KIND=WP), PARAMETER ::  CONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: CZERO = ( 0.0_WP, 0.0_WP )
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   :: RES, &
+                     RES1, RESEX, SINGVX, SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   ::   IWORK
+      REAL(KIND=WP) :: WDUMMY(2)
+      INTEGER       :: IDUMMY(4), ISEED(4)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS,       &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ,  TMP_XW, &
+                       TMP_EX
+!............................................................
+      COMPLEX(KIND=WP) :: CMAX
+      INTEGER :: LCWORK
+      COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::  A, AC,  &
+                                 AU, F, F0, F1, S, W,  &
+                                 X, X0, Y, Y0, Y1, Z, Z1
+      COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::  CDA, CDR, &
+                                       CDL, CEIGS, CEIGSA, CWORK
+      COMPLEX(KIND=WP) ::  CDUMMY(22), CDUM2X2(2,2)
+!............................................................
+      INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY,  &
+                 LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK
+      INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j,     &
+                 NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ,     &
+                 NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL,  &
+                 NFAIL_Z_XV,  MODE, MODEL, MODER, WHTSVD
+      INTEGER :: iNRNK, iWHTSVD,  K_traj, LWMINOPT
+      CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN,   &
+                   SCALE, RESIDS, WANTQ, WANTR
+      LOGICAL :: TEST_QRDMD
+
+!..... external subroutines (BLAS and LAPACK)
+      EXTERNAL CAXPY, CGEEV, CGEMM, CGEMV, CLASCL
+!.....external subroutines DMD package
+!     subroutines under test
+      EXTERNAL CGEDMD, CGEDMDQ
+!..... external functions (BLAS and LAPACK)
+      EXTERNAL         SCNRM2, SLAMCH
+      REAL(KIND=WP) :: SCNRM2, SLAMCH
+      EXTERNAL         CLANGE
+      REAL(KIND=WP) :: CLANGE
+      EXTERNAL ICAMAX
+      INTEGER  ICAMAX
+      EXTERNAL LSAME
+      LOGICAL  LSAME
+
+      INTRINSIC ABS, INT, MIN, MAX, SIGN
+!............................................................
+
+
+      WRITE(*,*) 'COMPLEX CODE TESTING'
+
+      ! The test is always in pairs : ( CGEDMD and CGEDMDQ)
+      ! because the test includes comparing the results (in pairs).
+!.....................................................................................
+      ! This code by default performs tests on CGEDMDQ
+      ! Since the QR factorizations based algorithm is designed for
+      ! single trajectory data, only single trajectory tests will
+      ! be performed with xGEDMDQ.
+
+      WANTQ = 'Q'
+      WANTR = 'R'
+!.................................................................................
+
+      EPS = SLAMCH( 'P' )  ! machine precision WP
+
+      ! Global counters of failures of some particular tests
+      NFAIL      = 0
+      NFAIL_REZ  = 0
+      NFAIL_REZQ = 0
+      NFAIL_Z_XV = 0
+      NFAIL_F_QR = 0
+      NFAIL_AU   = 0
+      NFAIL_SVDIFF = 0
+      NFAIL_TOTAL  = 0
+      NFAILQ_TOTAL = 0
+
+      DO LLOOP = 1, 4
+
+      WRITE(*,*) 'L Loop Index = ', LLOOP
+
+      ! Set the dimensions of the problem ...
+      READ(*,*) M
+      WRITE(*,*) 'M = ', M
+      ! ... and the number of snapshots.
+      READ(*,*) N
+      WRITE(*,*) 'N = ', N
+
+      ! Test the dimensions
+      IF ( ( MIN(M,N) == 0 ) .OR. ( M < N )  ) THEN
+          WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.'
+          STOP
+      END IF
+!.............
+      ! The seed inside the LLOOP so that each pass can be reproduced easily.
+      ISEED(1) = 4
+      ISEED(2) = 3
+      ISEED(3) = 2
+      ISEED(4) = 1
+
+      LDA  = M
+      LDF  = M
+      LDX  = M
+      LDY  = M
+      LDW  = N
+      LDZ  = M
+      LDAU = M
+      LDS  = N
+
+      TMP_XW  = ZERO
+      TMP_AU   = ZERO
+      TMP_REZ  = ZERO
+      TMP_REZQ = ZERO
+      SVDIFF   = ZERO
+      TMP_EX   = ZERO
+
+      ALLOCATE( A(LDA,M) )
+      ALLOCATE( AC(LDA,M) )
+      ALLOCATE( F(LDF,N+1) )
+      ALLOCATE( F0(LDF,N+1) )
+      ALLOCATE( F1(LDF,N+1) )
+      ALLOCATE( X(LDX,N) )
+      ALLOCATE( X0(LDX,N) )
+      ALLOCATE( Y(LDY,N+1) )
+      ALLOCATE( Y0(LDY,N+1) )
+      ALLOCATE( Y1(LDY,N+1) )
+      ALLOCATE( AU(LDAU,N) )
+      ALLOCATE( W(LDW,N) )
+      ALLOCATE( S(LDS,N) )
+      ALLOCATE( Z(LDZ,N) )
+      ALLOCATE( Z1(LDZ,N) )
+      ALLOCATE( RES(N) )
+      ALLOCATE( RES1(N) )
+      ALLOCATE( RESEX(N) )
+      ALLOCATE( CEIGS(N) )
+      ALLOCATE( SINGVX(N) )
+      ALLOCATE( SINGVQX(N) )
+
+      TOL  = 10*M*EPS
+      TOL2 = 10*M*N*EPS
+
+!.............
+
+      DO K_traj = 1, 2
+      !  Number of intial conditions in the simulation/trajectories (1 or 2)
+
+      COND   = 1.0D4
+      CMAX   = (1.0D1,1.0D1)
+      RSIGN  = 'F'
+      GRADE  = 'N'
+      MODEL  = 6
+      CONDL  = 1.0D1
+      MODER  = 6
+      CONDR  = 1.0D1
+      PIVTNG = 'N'
+      ! Loop over all parameter MODE values for CLATMR (+-1,..,+-6)
+
+      DO MODE = 1, 6
+
+      ALLOCATE( IWORK(2*M) )
+      ALLOCATE( CDA(M) )
+      ALLOCATE( CDL(M) )
+      ALLOCATE( CDR(M) )
+
+      CALL CLATMR( M, M, 'N', ISEED, 'N', CDA, MODE, COND, &
+                   CMAX, RSIGN, GRADE, CDL, MODEL,  CONDL, &
+                   CDR, MODER, CONDR, PIVTNG, IWORK, M, M, &
+                   ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO )
+      DEALLOCATE( CDR )
+      DEALLOCATE( CDL )
+      DEALLOCATE( CDA )
+      DEALLOCATE( IWORK )
+
+      LCWORK = MAX(1,2*M)
+      ALLOCATE( CEIGSA(M) )
+      ALLOCATE( CWORK(LCWORK) )
+      ALLOCATE( WORK(2*M) )
+      AC(1:M,1:M) = A(1:M,1:M)
+      CALL CGEEV( 'N','N', M, AC, LDA, CEIGSA, CDUM2X2, 2, &
+                  CDUM2X2, 2, CWORK, LCWORK, WORK, INFO ) ! LAPACK CALL
+      DEALLOCATE(WORK)
+      DEALLOCATE(CWORK)
+
+      TMP = ABS(CEIGSA(ICAMAX(M, CEIGSA, 1))) ! The spectral radius of A
+      ! Scale the matrix A to have unit spectral radius.
+      CALL CLASCL( 'G',0, 0, TMP, ONE, M, M, &
+                   A, LDA, INFO )
+      CALL CLASCL( 'G',0, 0, TMP, ONE, M, 1, &
+                   CEIGSA, M, INFO )
+      ANORM = CLANGE( 'F', M, M, A, LDA, WDUMMY )
+
+      IF ( K_traj == 2 ) THEN
+          ! generate data as two trajectories
+          ! with two inital conditions
+          CALL CLARNV(2, ISEED, M, F(1,1) )
+          DO i = 1, N/2
+             CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1,  &
+                  CZERO, F(1,i+1), 1 )
+          END DO
+          X0(1:M,1:N/2) = F(1:M,1:N/2)
+          Y0(1:M,1:N/2) = F(1:M,2:N/2+1)
+
+          CALL CLARNV(2, ISEED, M, F(1,1) )
+          DO i = 1, N-N/2
+             CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1,  &
+                  CZERO, F(1,i+1), 1 )
+          END DO
+          X0(1:M,N/2+1:N) = F(1:M,1:N-N/2)
+          Y0(1:M,N/2+1:N) = F(1:M,2:N-N/2+1)
+      ELSE
+          CALL CLARNV(2, ISEED, M, F(1,1) )
+          DO i = 1, N
+             CALL CGEMV( 'N', M, M, CONE, A, M, F(1,i), 1,  &
+                  CZERO, F(1,i+1), 1 )
+          END DO
+          F0(1:M,1:N+1) = F(1:M,1:N+1)
+          X0(1:M,1:N) = F0(1:M,1:N)
+          Y0(1:M,1:N) = F0(1:M,2:N+1)
+      END IF
+
+      DEALLOCATE( CEIGSA )
+!........................................................................
+
+      DO iJOBZ = 1, 4
+
+          SELECT CASE ( iJOBZ )
+          CASE(1)
+              JOBZ   = 'V'
+              RESIDS = 'R'
+          CASE(2)
+              JOBZ   = 'V'
+              RESIDS = 'N'
+          CASE(3)
+              JOBZ   = 'F'
+              RESIDS = 'N'
+          CASE(4)
+              JOBZ   = 'N'
+              RESIDS = 'N'
+          END SELECT
+
+      DO iJOBREF = 1, 3
+
+          SELECT CASE ( iJOBREF )
+          CASE(1)
+              JOBREF = 'R'
+          CASE(2)
+              JOBREF = 'E'
+          CASE(3)
+              JOBREF = 'N'
+          END SELECT
+
+      DO iSCALE = 1, 4
+
+          SELECT CASE ( iSCALE )
+          CASE(1)
+              SCALE = 'S'
+          CASE(2)
+              SCALE = 'C'
+          CASE(3)
+              SCALE = 'Y'
+          CASE(4)
+              SCALE = 'N'
+          END SELECT
+
+      DO iNRNK = -1, -2, -1
+          NRNK   = iNRNK
+
+      DO iWHTSVD = 1,  3
+         ! Check all four options to compute the POD basis
+         ! via the SVD.
+         WHTSVD   = iWHTSVD
+
+      DO LWMINOPT = 1, 2
+         ! Workspace query for the minimal (1) and for the optimal
+         ! (2) workspace lengths determined by workspace query.
+
+      ! CGEDMD is always tested and its results are also used for
+      ! comparisons with CGEDMDQ.
+
+      X(1:M,1:N) = X0(1:M,1:N)
+      Y(1:M,1:N) = Y0(1:M,1:N)
+
+      CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,  &
+                M,  N, X, LDX, Y, LDY, NRNK, TOL,  &
+                K, CEIGS, Z, LDZ,  RES,  &
+                AU, LDAU, W,  LDW,   S, LDS,        &
+                CDUMMY, -1, WDUMMY, -1, IDUMMY, -1, INFO )
+
+      IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) &
+                       .OR. ( INFO < 0 ) ) THEN
+        WRITE(*,*) 'Call to CGEDMD workspace query failed. &
+                   &Check the calling sequence and the code.'
+        WRITE(*,*) 'The error code is ', INFO
+        WRITE(*,*) 'The input parameters were ',      &
+        SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,          &
+        M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS
+        STOP
+      ELSE
+        !WRITE(*,*) '... done. Workspace length computed.'
+      END IF
+
+      LCWORK = INT(CDUMMY(LWMINOPT))
+      ALLOCATE(CWORK(LCWORK))
+      LIWORK = IDUMMY(1)
+      ALLOCATE(IWORK(LIWORK))
+      LWORK = INT(WDUMMY(1))
+      ALLOCATE(WORK(LWORK))
+
+      CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,  &
+                   M,  N, X, LDX, Y, LDY, NRNK, TOL,  &
+                   K, CEIGS, Z, LDZ,  RES,  &
+                   AU, LDAU, W,  LDW,   S, LDS,        &
+                   CWORK, LCWORK, WORK, LWORK, IWORK, LIWORK, INFO )
+      IF ( INFO /= 0 ) THEN
+           WRITE(*,*) 'Call to CGEDMD failed. &
+           &Check the calling sequence and the code.'
+           WRITE(*,*) 'The error code is ', INFO
+           WRITE(*,*) 'The input parameters were ',&
+           SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+           M, N, LDX, LDY, NRNK, TOL
+           STOP
+      END IF
+      SINGVX(1:N) = WORK(1:N)
+
+      !...... CGEDMD check point
+      IF ( LSAME(JOBZ,'V')  ) THEN
+          ! Check that Z = X*W, on return from CGEDMD
+          ! This checks that the returned eigenvectors in Z are
+          ! the product of the SVD'POD basis returned in X
+          ! and the eigenvectors of the Rayleigh quotient
+          ! returned in W
+          CALL CGEMM( 'N', 'N', M, K, K, CONE, X, LDX, W, LDW, &
+                      CZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL CAXPY( M, -CONE, Z(1,i), 1, Z1(1,i), 1)
+             TMP = MAX(TMP, SCNRM2( M, Z1(1,i), 1 ) )
+          END DO
+          TMP_XW = MAX(TMP_XW, TMP )
+          IF ( TMP_XW <= TOL ) THEN
+              !WRITE(*,*) ' :) .... OK .........CGEDMD PASSED.'
+          ELSE
+              NFAIL_Z_XV = NFAIL_Z_XV + 1
+              WRITE(*,*) ':( .................CGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+      END IF
+      !...... CGEDMD check point
+
+      IF ( LSAME(JOBREF,'R') ) THEN
+           ! The matrix A*U is returned for computing refined Ritz vectors.
+           ! Check that A*U is computed correctly using the formula
+           ! A*U = Y * V * inv(SIGMA). This depends on the
+           ! accuracy in the computed singular values and vectors of X.
+           ! See the paper for an error analysis.
+           ! Note that the left singular vectors of the input matrix X
+           ! are returned in the array X.
+           CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, X, LDX, &
+                      CZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL CAXPY( M, -CONE, AU(1,i), 1, Z1(1,i), 1)
+             TMP = MAX( TMP, SCNRM2( M, Z1(1,i),1 ) * &
+                     SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_AU = MAX( TMP_AU, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) ':) .... OK .........CGEDMD PASSED.'
+          ELSE
+              NFAIL_AU = NFAIL_AU + 1
+              WRITE(*,*) ':( .................CGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL2
+          END IF
+      ELSEIF ( LSAME(JOBREF,'E') ) THEN
+          ! The unscaled vectors of the Exact DMD are computed.
+          ! This option is included for the sake of completeness,
+          ! for users who prefer the Exact DMD vectors. The
+          ! returned vectors are in the real form, in the same way
+          ! as the Ritz vectors. Here we just save the vectors
+          ! and test them separately using a Matlab script.
+          CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, AU, LDAU, CZERO, Y1, LDY )
+
+          DO i=1, K
+             CALL CAXPY( M, -CEIGS(i), AU(1,i), 1, Y1(1,i), 1 )
+             RESEX(i) = SCNRM2( M, Y1(1,i), 1) / SCNRM2(M,AU(1,i),1)
+          END DO
+      END IF
+      !...... CGEDMD check point
+
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by CGEDMD with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigencalues. (See the description of Z in CGEDMD,)
+
+          DO i=1, K
+                ! have a real eigenvalue with real eigenvector
+                CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 )
+                RES1(i) = SCNRM2( M, Y1(1,i), 1)
+          END DO
+          TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                    SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_REZ = MAX( TMP_REZ, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) ':) .... OK ..........CGEDMD PASSED.'
+          ELSE
+              NFAIL_REZ = NFAIL_REZ + 1
+              WRITE(*,*) ':( ..................CGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+
+         IF ( LSAME(JOBREF,'E') ) THEN
+            TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) )
+          END DO
+          TMP_EX = MAX(TMP_EX,TMP)
+         END IF
+
+      END IF
+
+      DEALLOCATE(CWORK)
+      DEALLOCATE(WORK)
+      DEALLOCATE(IWORK)
+
+!.......................................................................................................
+
+      IF ( K_traj == 1 ) THEN
+
+          F(1:M,1:N+1) = F0(1:M,1:N+1)
+          CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, &
+                    WHTSVD, M, N+1, F, LDF,  X, LDX,  Y, LDY,  &
+                    NRNK,  TOL, K, CEIGS, Z, LDZ, RES,  AU,  &
+                    LDAU, W, LDW, S, LDS, CDUMMY, -1,   &
+                    WDUMMY,  -1, IDUMMY, -1, INFO )
+
+          LCWORK = INT(CDUMMY(LWMINOPT))
+          ALLOCATE(CWORK(LCWORK))
+          LIWORK = IDUMMY(1)
+          ALLOCATE(IWORK(LIWORK))
+          LWORK = INT(WDUMMY(1))
+          ALLOCATE(WORK(LWORK))
+
+          CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, &
+                        WHTSVD, M, N+1, F, LDF,  X, LDX,  Y, LDY,  &
+                        NRNK,  TOL, KQ, CEIGS, Z, LDZ, RES,  AU,  &
+                        LDAU, W, LDW, S, LDS, CWORK, LCWORK,   &
+                        WORK,  LWORK, IWORK, LIWORK, INFO )
+          IF ( INFO /= 0 ) THEN
+                 WRITE(*,*) 'Call to CGEDMDQ failed. &
+                 &Check the calling sequence and the code.'
+                 WRITE(*,*) 'The error code is ', INFO
+                 WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+                 STOP
+          END IF
+          SINGVQX(1:N) =WORK(1:N)
+
+          !..... ZGEDMDQ check point
+
+          TMP = ZERO
+          DO i = 1, MIN(K, KQ)
+             TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / &
+                                   SINGVX(1) )
+          END DO
+          SVDIFF = MAX( SVDIFF, TMP )
+          IF ( TMP > TOL2 ) THEN
+               WRITE(*,*) 'FAILED! Something was wrong with the run.'
+             NFAIL_SVDIFF = NFAIL_SVDIFF + 1
+          END IF
+          !..... CGEDMDQ check point
+
+          !..... CGEDMDQ check point
+          IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN
+             ! Check that the QR factors are computed and returned
+             ! as requested. The residual ||F-Q*R||_F / ||F||_F
+             ! is compared to M*N*EPS.
+             F1(1:M,1:N+1) = F0(1:M,1:N+1)
+             CALL CGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -CONE, F, &
+                         LDF, Y, LDY, CONE, F1, LDF )
+             TMP_FQR = CLANGE( 'F', M, N+1, F1, LDF, WORK ) / &
+                   CLANGE( 'F', M, N+1, F0,  LDF, WORK )
+             IF ( TMP_FQR <= TOL2 ) THEN
+                !WRITE(*,*) ':) CGEDMDQ ........ PASSED.'
+             ELSE
+                WRITE(*,*) ':( CGEDMDQ ........ FAILED.'
+                NFAIL_F_QR = NFAIL_F_QR + 1
+             END IF
+          END IF
+          !..... ZGEDMDQ checkpoint
+                 !..... ZGEDMDQ checkpoint
+          IF ( LSAME(RESIDS, 'R') ) THEN
+              ! Compare the residuals returned by ZGEDMDQ with the
+              ! explicitly computed residuals using the matrix A.
+              ! Compute explicitly Y1 = A*Z
+              CALL CGEMM( 'N', 'N', M, KQ, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY )
+              ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+              ! of the invariant subspaces that correspond to complex conjugate
+              ! pairs of eigencalues. (See the description of Z in ZGEDMDQ)
+              DO i = 1, KQ
+                    ! have a real eigenvalue with real eigenvector
+                    CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 )
+                    ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i)
+                    RES1(i) = SCNRM2( M, Y1(1,i), 1)
+              END DO
+              TMP = ZERO
+              DO i = 1, KQ
+              TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                  SINGVQX(KQ)/(ANORM*SINGVQX(1)) )
+              END DO
+              TMP_REZQ = MAX( TMP_REZQ, TMP )
+              IF ( TMP <= TOL2 ) THEN
+                  !WRITE(*,*) '.... OK ........ CGEDMDQ PASSED.'
+              ELSE
+                  NFAIL_REZQ = NFAIL_REZQ + 1
+                  WRITE(*,*) '................ CGEDMDQ FAILED!', &
+                      'Check the code for implementation errors.'
+              END IF
+          END IF
+
+          DEALLOCATE(CWORK)
+          DEALLOCATE(WORK)
+          DEALLOCATE(IWORK)
+
+      END IF
+
+      END DO   ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO   ! iWHTSVD
+      !write(*,*) 'WHTSVD loop completed'
+      END DO   ! iNRNK  -2:-1
+      !write(*,*) 'NRNK loop completed'
+      END DO   ! iSCALE  1:4
+      !write(*,*) 'SCALE loop completed'
+      END DO
+      !write(*,*) 'JOBREF loop completed'
+      END DO   ! iJOBZ
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE -6:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories  loop completed'
+
+      DEALLOCATE( A )
+      DEALLOCATE( AC )
+      DEALLOCATE( Z )
+      DEALLOCATE( F )
+      DEALLOCATE( F0 )
+      DEALLOCATE( F1 )
+      DEALLOCATE( X )
+      DEALLOCATE( X0 )
+      DEALLOCATE( Y )
+      DEALLOCATE( Y0 )
+      DEALLOCATE( Y1 )
+      DEALLOCATE( AU )
+      DEALLOCATE( W )
+      DEALLOCATE( S )
+      DEALLOCATE( Z1 )
+      DEALLOCATE( RES )
+      DEALLOCATE( RES1 )
+      DEALLOCATE( RESEX )
+      DEALLOCATE( CEIGS )
+      DEALLOCATE( SINGVX )
+      DEALLOCATE( SINGVQX )
+
+      END DO ! LLOOP
+
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for CGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_XW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_z_XV
+      END IF
+
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+
+      IF ( NFAIL_REZ == 0 ) THEN
+         WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+        WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)'
+        WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+      IF ( NFAIL_TOTAL == 0 ) THEN
+        WRITE(*,*) '>>>> CGEDMD :: ALL TESTS PASSED.'
+      ELSE
+        WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+        WRITE(*,*) '>>>>>>>>>>>>>> CGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for CGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+        WRITE(*,*) '>>>> CGEDMD and CGEDMDQ computed singular &
+           &values test PASSED.'
+      ELSE
+        WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in &
+            &the singular values unacceptable ', &
+            NFAIL_SVDIFF, ' times. Test FAILED.'
+        WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+      IF ( NFAIL_F_QR == 0 ) THEN
+        WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+        WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+        WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+        WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+        WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)'
+        WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+        WRITE(*,*) '>>>>>>> CGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+        WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+        WRITE(*,*) '>>>>>>> CGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/dchkdmd.f90 b/lapack-netlib/TESTING/EIG/dchkdmd.f90
new file mode 100644
index 000000000..4fbf7531b
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/dchkdmd.f90
@@ -0,0 +1,813 @@
+!     This is a test program for checking the implementations of
+!     the implementations of the following subroutines
+!
+!     DGEDMD  for computation of the
+!             Dynamic Mode Decomposition (DMD)
+!     DGEDMDQ for computation of a
+!             QR factorization based compressed DMD
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     ========================================================
+!     How to run the code (compiler, link info)
+!     ========================================================
+!     Compile as FORTRAN 90 (or later) and link with BLAS and
+!     LAPACK libraries.
+!     NOTE: The code is developed and tested on top of the
+!     Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!     using the Intel Fortran compiler.
+!
+!     For developers of the C++ implementation
+!     ========================================================
+!     See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!     Note on a development of the GPU HP implementation
+!     ========================================================
+!     Work in progress. See CUDA, MAGMA, SLATE.
+!     NOTE: The four SVD subroutines used in this code are
+!     included as a part of R&D and for the completeness.
+!     This was also an opportunity to test those SVD codes.
+!     If the scaling option is used all four are essentially
+!     equally good. For implementations on HP platforms,
+!     one can use whichever SVD is available.
+!...  .........................................................
+!     NOTE:
+!     When using the Intel MKL 2022.0.3 the subroutine xGESVDQ
+!     (optionally used in xGEDMD) may cause access violation
+!     error for x = S, D, C, Z, but only if called with the
+!     work space query. (At least in our Windows 10 MSVS 2019.)
+!     The problem can be mitigated by downloading the source
+!     code of xGESVDQ from the LAPACK repository and use it
+!     localy instead of the one in the MKL. This seems to
+!     indicate that the problem is indeed in the MKL.
+!     This problem did not appear whith Intel MKL 2022.2.0.
+!
+!     NOTE:
+!     xGESDD seems to have a problem with workspace. In some
+!     cases the length of the optimal workspace is returned
+!     smaller than the minimal workspace, as specified in the
+!     code. As a precaution, all optimal workspaces are
+!     set as MAX(minimal, optimal).
+!     Latest implementations of complex xGESDD have different
+!     length of the real worksapce. We use max value over
+!     two versions.
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+      use iso_fortran_env, only: real64
+      IMPLICIT NONE
+      integer, parameter :: WP = real64
+
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::          &
+                     A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,&
+                     Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::          &
+                     DA, DL, DR, REIG, REIGA, REIGQ, IEIG,   &
+                     IEIGA, IEIGQ,  RES, RES1, RESEX, SINGVX,&
+                     SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   ::   IWORK
+      REAL(KIND=WP) :: AB(2,2),   WDUMMY(2)
+      INTEGER       :: IDUMMY(2), ISEED(4), RJOBDATA(8)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ,  TMP_ZXW, &
+                       TMP_EX, XNORM, YNORM
+!............................................................
+      INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY,  &
+                 LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK
+      INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, KDIFF,  &
+                 NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ,     &
+                 NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, &
+                 NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD
+      INTEGER    iNRNK, iWHTSVD, K_TRAJ, LWMINOPT
+      CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN,   &
+                       SCALE, RESIDS, WANTQ, WANTR
+
+      LOGICAL  TEST_QRDMD
+!..... external subroutines (BLAS and LAPACK)
+      EXTERNAL DAXPY,  DGEEV, DGEMM, DGEMV, DLACPY, DLASCL
+      EXTERNAL DLARNV, DLATMR
+!.....external subroutines DMD package, part 1
+!     subroutines under test
+      EXTERNAL DGEDMD, DGEDMDQ
+
+!..... external functions (BLAS and LAPACK)
+      EXTERNAL         DLAMCH, DLANGE, DNRM2
+      REAL(KIND=WP) :: DLAMCH, DLANGE, DNRM2
+      EXTERNAL         LSAME
+      LOGICAL          LSAME
+
+      INTRINSIC ABS, INT, MIN, MAX
+!............................................................
+
+      ! The test is always in pairs : ( DGEDMD and DGEDMDQ )
+      ! because the test includes comparing the results (in pairs).
+!.....................................................................................
+      TEST_QRDMD = .TRUE. ! This code by default performs tests on DGEDMDQ
+                          ! Since the QR factorizations based algorithm is designed for
+                          ! single trajectory data, only single trajectory tests will
+                          ! be performed with xGEDMDQ.
+      WANTQ = 'Q'
+      WANTR = 'R'
+!.................................................................................
+
+      EPS = DLAMCH( 'P' )  ! machine precision DP
+
+      ! Global counters of failures of some particular tests
+      NFAIL      = 0
+      NFAIL_REZ  = 0
+      NFAIL_REZQ = 0
+      NFAIL_Z_XV = 0
+      NFAIL_F_QR = 0
+      NFAIL_AU   = 0
+      KDIFF      = 0
+      NFAIL_SVDIFF = 0
+      NFAIL_TOTAL  = 0
+      NFAILQ_TOTAL = 0
+
+
+      DO LLOOP = 1, 4
+
+      WRITE(*,*) 'L Loop Index = ', LLOOP
+
+      ! Set the dimensions of the problem ...
+      WRITE(*,*) 'M = '
+      READ(*,*) M
+      WRITE(*,*) M
+      ! ... and the number of snapshots.
+      WRITE(*,*) 'N = '
+      READ(*,*) N
+      WRITE(*,*) N
+
+      ! ... Test the dimensions
+      IF ( ( MIN(M,N) == 0 ) .OR. ( M < N )  ) THEN
+          WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.'
+          STOP
+      END IF
+!.............
+      ! The seed inside the LLOOP so that each pass can be reproduced easily.
+
+      ISEED(1) = 4
+      ISEED(2) = 3
+      ISEED(3) = 2
+      ISEED(4) = 1
+
+      LDA  = M
+      LDF  = M
+      LDX  = MAX(M,N+1)
+      LDY  = MAX(M,N+1)
+      LDW  = N
+      LDZ  = M
+      LDAU = MAX(M,N+1)
+      LDS  = N
+
+      TMP_ZXW  = ZERO
+      TMP_AU   = ZERO
+      TMP_REZ  = ZERO
+      TMP_REZQ = ZERO
+      SVDIFF   = ZERO
+      TMP_EX   = ZERO
+
+      !
+      ! Test the subroutines on real data snapshots. All
+      ! computation is done in real arithmetic, even when
+      ! Koopman eigenvalues and modes are real.
+      !
+      ! Allocate memory space
+      ALLOCATE( A(LDA,M) )
+      ALLOCATE( AC(LDA,M) )
+      ALLOCATE( DA(M) )
+      ALLOCATE( DL(M) )
+      ALLOCATE( F(LDF,N+1) )
+      ALLOCATE( F1(LDF,N+1) )
+      ALLOCATE( F2(LDF,N+1) )
+      ALLOCATE( X(LDX,N) )
+      ALLOCATE( X0(LDX,N) )
+      ALLOCATE( SINGVX(N) )
+      ALLOCATE( SINGVQX(N) )
+      ALLOCATE( Y(LDY,N+1) )
+      ALLOCATE( Y0(LDY,N+1) )
+      ALLOCATE( Y1(M,N+1) )
+      ALLOCATE( Z(LDZ,N) )
+      ALLOCATE( Z1(LDZ,N) )
+      ALLOCATE( RES(N)  )
+      ALLOCATE( RES1(N) )
+      ALLOCATE( RESEX(N) )
+      ALLOCATE( REIG(N) )
+      ALLOCATE( IEIG(N) )
+      ALLOCATE( REIGQ(N) )
+      ALLOCATE( IEIGQ(N) )
+      ALLOCATE( REIGA(M) )
+      ALLOCATE( IEIGA(M) )
+      ALLOCATE( VA(LDA,M) )
+      ALLOCATE( LAMBDA(N,2) )
+      ALLOCATE( LAMBDAQ(N,2) )
+      ALLOCATE( EIGA(M,2) )
+      ALLOCATE( W(LDW,N) )
+      ALLOCATE( AU(LDAU,N) )
+      ALLOCATE( S(N,N) )
+
+      TOL  = M*EPS
+      ! This mimics O(M*N)*EPS bound for accumulated roundoff error.
+      ! The factor 10 is somewhat arbitrary.
+      TOL2 = 10*M*N*EPS
+
+!.............
+
+      DO K_TRAJ = 1, 2
+      !  Number of intial conditions in the simulation/trajectories (1 or 2)
+
+      COND = 1.0D8
+      DMAX = 1.0D2
+      RSIGN = 'F'
+      GRADE = 'N'
+      MODEL = 6
+      CONDL = 1.0D2
+      MODER = 6
+      CONDR = 1.0D2
+      PIVTNG = 'N'
+
+      ! Loop over all parameter MODE values for ZLATMR (+1,..,+6)
+      DO MODE = 1, 6
+
+      ALLOCATE( IWORK(2*M) )
+      ALLOCATE(DR(N))
+      CALL DLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, &
+                   DMAX, RSIGN, GRADE, DL, MODEL,  CONDL, &
+                   DR, MODER, CONDR, PIVTNG, IWORK, M, M, &
+                   ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO )
+      DEALLOCATE(IWORK)
+      DEALLOCATE(DR)
+
+      LWORK = 4*M+1
+      ALLOCATE(WORK(LWORK))
+      AC  = A
+      CALL DGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, &
+                  VA, M, WORK, LWORK, INFO ) ! LAPACK CALL
+      DEALLOCATE(WORK)
+      TMP = ZERO
+      DO i = 1, M
+         EIGA(i,1) = REIGA(i)
+         EIGA(i,2) = IEIGA(i)
+         TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2))
+      END DO
+
+      ! Scale A to have the desirable spectral radius.
+      CALL DLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO )
+      CALL DLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO )
+
+      ! Compute the norm of A
+      ANORM = DLANGE( 'F', N, N, A, M, WDUMMY )
+
+      IF ( K_TRAJ == 2 ) THEN
+          ! generate data with two inital conditions
+      CALL DLARNV(2, ISEED, M, F1(1,1) )
+      F1(1:M,1) = 1.0E-10*F1(1:M,1)
+      DO i = 1, N/2
+         CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, &
+              F1(1,i+1), 1 )
+      END DO
+      X0(1:M,1:N/2) = F1(1:M,1:N/2)
+      Y0(1:M,1:N/2) = F1(1:M,2:N/2+1)
+
+      CALL DLARNV(2, ISEED, M, F1(1,1) )
+      DO i = 1, N-N/2
+         CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, &
+              F1(1,i+1), 1 )
+      END DO
+      X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2)
+      Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1)
+      ELSE
+      CALL DLARNV(2, ISEED, M, F(1,1) )
+      DO i = 1, N
+         CALL DGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, &
+              F(1,i+1), 1 )
+      END DO
+      X0(1:M,1:N) = F(1:M,1:N)
+      Y0(1:M,1:N) = F(1:M,2:N+1)
+      END IF
+
+      XNORM = DLANGE( 'F', M, N, X0, LDX, WDUMMY )
+      YNORM = DLANGE( 'F', M, N, Y0, LDX, WDUMMY )
+!............................................................
+
+      DO iJOBZ = 1, 4
+
+          SELECT CASE ( iJOBZ )
+          CASE(1)
+              JOBZ   = 'V' ! Ritz vectors will be computed
+              RESIDS = 'R' ! Residuals will be computed
+          CASE(2)
+              JOBZ   = 'V'
+              RESIDS = 'N'
+          CASE(3)
+              JOBZ   = 'F' ! Ritz vectors in factored form
+              RESIDS = 'N'
+          CASE(4)
+              JOBZ   = 'N'
+              RESIDS = 'N'
+          END SELECT
+
+      DO iJOBREF = 1, 3
+
+          SELECT CASE ( iJOBREF )
+          CASE(1)
+              JOBREF = 'R' ! Data for refined Ritz vectors
+          CASE(2)
+              JOBREF = 'E' ! Exact DMD vectors
+          CASE(3)
+              JOBREF = 'N'
+          END SELECT
+
+      DO iSCALE = 1, 4
+
+          SELECT CASE ( iSCALE )
+          CASE(1)
+              SCALE = 'S' ! X data normalized
+          CASE(2)
+              SCALE = 'C' ! X normalized, consist. check
+          CASE(3)
+              SCALE = 'Y' ! Y data normalized
+          CASE(4)
+              SCALE = 'N'
+          END SELECT
+
+      DO iNRNK = -1, -2, -1
+          ! Two truncation strategies. The "-2" case for R&D
+          ! purposes only - it uses possibly low accuracy small
+          ! singular values, in which case the formulas used in
+          ! the DMD are highly sensitive.
+          NRNK   = iNRNK
+
+      DO iWHTSVD = 1, 4
+          ! Check all four options to compute the POD basis
+          ! via the SVD.
+          WHTSVD   = iWHTSVD
+
+      DO LWMINOPT = 1, 2
+          ! Workspace query for the minimal (1) and for the optimal
+          ! (2) workspace lengths determined by workspace query.
+
+      X(1:M,1:N) = X0(1:M,1:N)
+      Y(1:M,1:N) = Y0(1:M,1:N)
+
+      ! DGEDMD: Workspace query and workspace allocation
+      CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, &
+           N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, &
+           LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, &
+           IDUMMY, -1, INFO )
+
+      LIWORK = IDUMMY(1)
+      ALLOCATE( IWORK(LIWORK) )
+      LWORK = INT(WDUMMY(LWMINOPT))
+      ALLOCATE( WORK(LWORK) )
+
+      ! DGEDMD test: CALL DGEDMD
+      CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, &
+           N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, &
+           LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,&
+           IWORK, LIWORK, INFO )
+
+      SINGVX(1:N) = WORK(1:N)
+
+      !...... DGEDMD check point
+      IF ( LSAME(JOBZ,'V')  ) THEN
+          ! Check that Z = X*W, on return from DGEDMD
+          ! This checks that the returned aigenvectors in Z are
+          ! the product of the SVD'POD basis returned in X
+          ! and the eigenvectors of the rayleigh quotient
+          ! returned in W
+          CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, &
+                      ZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL DAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1)
+             TMP = MAX(TMP, DNRM2( M, Z1(1,i), 1 ) )
+          END DO
+          TMP_ZXW = MAX(TMP_ZXW, TMP )
+
+          IF ( TMP_ZXW > 10*M*EPS ) THEN
+              NFAIL_Z_XV = NFAIL_Z_XV + 1
+              WRITE(*,*) ':( .................DGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+      END IF
+
+      !...... DGEDMD check point
+      IF ( LSAME(JOBREF,'R') ) THEN
+          ! The matrix A*U is returned for computing refined Ritz vectors.
+          ! Check that A*U is computed correctly using the formula
+          ! A*U = Y * V * inv(SIGMA). This depends on the
+          ! accuracy in the computed singular values and vectors of X.
+          ! See the paper for an error analysis.
+          ! Note that the left singular vectors of the input matrix X
+          ! are returned in the array X.
+          CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, &
+                     ZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+              CALL DAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1)
+              TMP = MAX( TMP, DNRM2( M, Z1(1,i),1 ) * &
+                       SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_AU = MAX( TMP_AU, TMP )
+
+          IF ( TMP > TOL2 ) THEN
+              NFAIL_AU = NFAIL_AU + 1
+              WRITE(*,*) ':( .................DGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+      ELSEIF ( LSAME(JOBREF,'E') ) THEN
+      ! The unscaled vectors of the Exact DMD are computed.
+      ! This option is included for the sake of completeness,
+      ! for users who prefer the Exact DMD vectors. The
+      ! returned vectors are in the real form, in the same way
+      ! as the Ritz vectors. Here we just save the vectors
+      ! and test them separately using a Matlab script.
+
+       CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M )
+       i=1
+       DO WHILE ( i <= K )
+           IF ( IEIG(i) == ZERO ) THEN
+           ! have a real eigenvalue with real eigenvector
+           CALL DAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 )
+           RESEX(i) = DNRM2( M, Y1(1,i), 1) / DNRM2(M,AU(1,i),1)
+           i = i + 1
+           ELSE
+           ! Have a complex conjugate pair
+           ! REIG(i) +- sqrt(-1)*IMEIG(i).
+           ! Since all computation is done in real
+           ! arithmetic, the formula for the residual
+           ! is recast for real representation of the
+           ! complex conjugate eigenpair. See the
+           ! description of RES.
+           AB(1,1) =  REIG(i)
+           AB(2,1) = -IEIG(i)
+           AB(1,2) =  IEIG(i)
+           AB(2,2) =  REIG(i)
+           CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), &
+                       M, AB, 2, ONE, Y1(1,i), M )
+           RESEX(i)   = DLANGE( 'F', M, 2, Y1(1,i), M, &
+                        WORK )/ DLANGE( 'F', M, 2, AU(1,i), M, &
+                        WORK )
+           RESEX(i+1) = RESEX(i)
+           i = i + 2
+           END IF
+       END DO
+
+      END IF
+
+      !...... DGEDMD check point
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by DGEDMD with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigencalues. (See the description of Z in DGEDMD,)
+          i = 1
+          DO WHILE ( i <= K )
+              IF ( IEIG(i) == ZERO ) THEN
+                  ! have a real eigenvalue with real eigenvector
+                  CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 )
+                  RES1(i) = DNRM2( M, Y1(1,i), 1)
+                  i = i + 1
+              ELSE
+                  ! Have a complex conjugate pair
+                  ! REIG(i) +- sqrt(-1)*IMEIG(i).
+                  ! Since all computation is done in real
+                  ! arithmetic, the formula for the residual
+                  ! is recast for real representation of the
+                  ! complex conjugate eigenpair. See the
+                  ! description of RES.
+                  AB(1,1) =  REIG(i)
+                  AB(2,1) = -IEIG(i)
+                  AB(1,2) =  IEIG(i)
+                  AB(2,2) =  REIG(i)
+                  CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                              M, AB, 2, ONE, Y1(1,i), M )
+                  RES1(i)   = DLANGE( 'F', M, 2, Y1(1,i), M, &
+                                     WORK )
+                  RES1(i+1) = RES1(i)
+                  i = i + 2
+              END IF
+          END DO
+          TMP = ZERO
+          DO i = 1, K
+              TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                        SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_REZ = MAX( TMP_REZ, TMP )
+
+          IF ( TMP > TOL2 ) THEN
+              NFAIL_REZ = NFAIL_REZ + 1
+              WRITE(*,*) ':( ..................DGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+          IF ( LSAME(JOBREF,'E') ) THEN
+              TMP = ZERO
+              DO i = 1, K
+                  TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) )
+              END DO
+              TMP_EX = MAX(TMP_EX,TMP)
+          END IF
+
+      END IF
+
+      !..... store the results for inspection
+      DO i = 1, K
+          LAMBDA(i,1) = REIG(i)
+          LAMBDA(i,2) = IEIG(i)
+      END DO
+
+      DEALLOCATE(IWORK)
+      DEALLOCATE(WORK)
+
+      !======================================================================
+      !     Now test the DGEDMDQ
+      !======================================================================
+      IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN
+          RJOBDATA(2) = 1
+          F1 = F
+
+          ! DGEDMDQ test: Workspace query and workspace allocation
+          CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, &
+               JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, &
+               LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ,   &
+               RES, AU, LDAU, W, LDW, S, LDS, WDUMMY,      &
+               -1, IDUMMY, -1, INFO )
+          LIWORK = IDUMMY(1)
+          ALLOCATE( IWORK(LIWORK) )
+          LWORK = INT(WDUMMY(LWMINOPT))
+          ALLOCATE(WORK(LWORK))
+          ! DGEDMDQ test: CALL DGEDMDQ
+          CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, &
+               JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, &
+               LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ,   &
+               RES, AU, LDAU, W, LDW, S, LDS,              &
+               WORK, LWORK, IWORK, LIWORK, INFO )
+
+          SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ)
+
+          !..... DGEDMDQ check point
+          IF ( KQ /= K ) THEN
+              KDIFF = KDIFF+1
+          END IF
+
+          TMP = ZERO
+          DO i = 1, MIN(K, KQ)
+              TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / &
+                                    SINGVX(1) )
+          END DO
+          SVDIFF = MAX( SVDIFF, TMP )
+          IF ( TMP > M*N*EPS ) THEN
+              WRITE(*,*) 'FAILED! Something was wrong with the run.'
+              NFAIL_SVDIFF = NFAIL_SVDIFF + 1
+              DO j =1, 3
+                  write(*,*) j, SINGVX(j), SINGVQX(j)
+                  read(*,*)
+              END DO
+          END IF
+
+          !..... DGEDMDQ check point
+          IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN
+              ! Check that the QR factors are computed and returned
+              ! as requested. The residual ||F-Q*R||_F / ||F||_F
+              ! is compared to M*N*EPS.
+              F2 = F
+              CALL DGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, &
+                          LDF, Y, LDY, ONE, F2, LDF )
+              TMP_FQR = DLANGE( 'F', M, N+1, F2, LDF, WORK ) / &
+                    DLANGE( 'F', M, N+1, F,  LDF, WORK )
+              IF ( TMP_FQR > TOL2 ) THEN
+                  WRITE(*,*) 'FAILED! Something was wrong with the run.'
+                  NFAIL_F_QR = NFAIL_F_QR + 1
+              END IF
+          END IF
+
+          !..... DGEDMDQ check point
+          IF ( LSAME(RESIDS, 'R') ) THEN
+              ! Compare the residuals returned by DGEDMDQ with the
+              ! explicitly computed residuals using the matrix A.
+              ! Compute explicitly Y1 = A*Z
+              CALL DGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M )
+              ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+              ! of the invariant subspaces that correspond to complex conjugate
+              ! pairs of eigencalues. (See the description of Z in DGEDMDQ)
+              i = 1
+              DO WHILE ( i <= KQ )
+                  IF ( IEIGQ(i) == ZERO ) THEN
+                      ! have a real eigenvalue with real eigenvector
+                      CALL DAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 )
+                      ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i)
+                      RES1(i) = DNRM2( M, Y1(1,i), 1)
+                      i = i + 1
+                  ELSE
+                     ! Have a complex conjugate pair
+                     ! REIG(i) +- sqrt(-1)*IMEIG(i).
+                     ! Since all computation is done in real
+                     ! arithmetic, the formula for the residual
+                     ! is recast for real representation of the
+                     ! complex conjugate eigenpair. See the
+                     ! description of RES.
+                     AB(1,1) =  REIGQ(i)
+                     AB(2,1) = -IEIGQ(i)
+                     AB(1,2) =  IEIGQ(i)
+                     AB(2,2) =  REIGQ(i)
+                     CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                                 M, AB, 2, ONE, Y1(1,i), M )             ! BLAS CALL
+                     ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+                     RES1(i)   = DLANGE( 'F', M, 2, Y1(1,i), M, &
+                                        WORK )                           ! LAPACK CALL
+                     RES1(i+1) = RES1(i)
+                     i = i + 2
+                  END IF
+              END DO
+              TMP = ZERO
+              DO i = 1, KQ
+                  TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                      SINGVQX(K)/(ANORM*SINGVQX(1)) )
+              END DO
+              TMP_REZQ = MAX( TMP_REZQ, TMP )
+              IF ( TMP > TOL2 ) THEN
+                  NFAIL_REZQ = NFAIL_REZQ + 1
+                  WRITE(*,*) '................ DGEDMDQ FAILED!', &
+                      'Check the code for implementation errors.'
+                  STOP
+              END IF
+
+          END IF
+
+          DO i = 1, KQ
+              LAMBDAQ(i,1) = REIGQ(i)
+              LAMBDAQ(i,2) = IEIGQ(i)
+          END DO
+
+          DEALLOCATE(WORK)
+          DEALLOCATE(IWORK)
+      END IF ! TEST_QRDMD
+!======================================================================
+
+      END DO ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO ! WHTSVD LOOP
+      !write(*,*) 'WHTSVD loop completed'
+      END DO ! NRNK LOOP
+      !write(*,*) 'NRNK loop completed'
+      END DO ! SCALE LOOP
+      !write(*,*) 'SCALE loop completed'
+      END DO ! JOBF LOOP
+      !write(*,*) 'JOBREF loop completed'
+      END DO ! JOBZ LOOP
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE -6:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories  loop completed'
+
+      DEALLOCATE(A)
+      DEALLOCATE(AC)
+      DEALLOCATE(DA)
+      DEALLOCATE(DL)
+      DEALLOCATE(F)
+      DEALLOCATE(F1)
+      DEALLOCATE(F2)
+      DEALLOCATE(X)
+      DEALLOCATE(X0)
+      DEALLOCATE(SINGVX)
+      DEALLOCATE(SINGVQX)
+      DEALLOCATE(Y)
+      DEALLOCATE(Y0)
+      DEALLOCATE(Y1)
+      DEALLOCATE(Z)
+      DEALLOCATE(Z1)
+      DEALLOCATE(RES)
+      DEALLOCATE(RES1)
+      DEALLOCATE(RESEX)
+      DEALLOCATE(REIG)
+      DEALLOCATE(IEIG)
+      DEALLOCATE(REIGQ)
+      DEALLOCATE(IEIGQ)
+      DEALLOCATE(REIGA)
+      DEALLOCATE(IEIGA)
+      DEALLOCATE(VA)
+      DEALLOCATE(LAMBDA)
+      DEALLOCATE(LAMBDAQ)
+      DEALLOCATE(EIGA)
+      DEALLOCATE(W)
+      DEALLOCATE(AU)
+      DEALLOCATE(S)
+
+!............................................................
+      !     Generate random M-by-M matrix A. Use DLATMR from
+      END DO ! LLOOP
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for DGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>> DGEDMD :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>>>>>>>>> DGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for DGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> DGEDMD and DGEDMDQ computed singular &
+              &values test PASSED.'
+      ELSE
+          WRITE(*,*) 'DGEDMD and DGEDMDQ discrepancies in &
+              &the singular values unacceptable ', &
+              NFAIL_SVDIFF, ' times. Test FAILED.'
+          WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> DGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+         WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+         WRITE(*,*) '>>>>>>> DGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/schkdmd.f90 b/lapack-netlib/TESTING/EIG/schkdmd.f90
new file mode 100644
index 000000000..77e3e46c0
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/schkdmd.f90
@@ -0,0 +1,792 @@
+!     This is a test program for checking the implementations of
+!     the implementations of the following subroutines
+!
+!     SGEDMD  for computation of the
+!             Dynamic Mode Decomposition (DMD)
+!     SGEDMDQ for computation of a
+!             QR factorization based compressed DMD
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     ========================================================
+!     How to run the code (compiler, link info)
+!     ========================================================
+!     Compile as FORTRAN 90 (or later) and link with BLAS and
+!     LAPACK libraries.
+!     NOTE: The code is developed and tested on top of the
+!     Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!     using the Intel Fortran compiler.
+!
+!     For developers of the C++ implementation
+!     ========================================================
+!     See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!     Note on a development of the GPU HP implementation
+!     ========================================================
+!     Work in progress. See CUDA, MAGMA, SLATE.
+!     NOTE: The four SVD subroutines used in this code are
+!     included as a part of R&D and for the completeness.
+!     This was also an opportunity to test those SVD codes.
+!     If the scaling option is used all four are essentially
+!     equally good. For implementations on HP platforms,
+!     one can use whichever SVD is available.
+!...  .........................................................
+!     NOTE:
+!     When using the Intel MKL 2022.0.3 the subroutine xGESVDQ
+!     (optionally used in xGEDMD) may cause access violation
+!     error for x = S, D, C, Z, but only if called with the
+!     work space query. (At least in our Windows 10 MSVS 2019.)
+!     The problem can be mitigated by downloading the source
+!     code of xGESVDQ from the LAPACK repository and use it
+!     localy instead of the one in the MKL. This seems to
+!     indicate that the problem is indeed in the MKL.
+!     This problem did not appear whith Intel MKL 2022.2.0.
+!
+!     NOTE:
+!     xGESDD seems to have a problem with workspace. In some
+!     cases the length of the optimal workspace is returned
+!     smaller than the minimal workspace, as specified in the
+!     code. As a precaution, all optimal workspaces are
+!     set as MAX(minimal, optimal).
+!     Latest implementations of complex xGESDD have different
+!     length of the real worksapce. We use max value over
+!     two versions.
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+      use iso_fortran_env, only: real32
+      IMPLICIT NONE
+      integer, parameter :: WP = real32
+
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::          &
+                     A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,&
+                     Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::          &
+                     DA, DL, DR, REIG, REIGA, REIGQ, IEIG,   &
+                     IEIGA, IEIGQ,  RES, RES1, RESEX, SINGVX,&
+                     SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   ::   IWORK
+      REAL(KIND=WP) :: AB(2,2),   WDUMMY(2)
+      INTEGER       :: IDUMMY(2), ISEED(4), RJOBDATA(8)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ,  TMP_ZXW, &
+                       TMP_EX, XNORM, YNORM
+!............................................................
+      INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY,  &
+                 LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK
+      INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, KDIFF,     &
+                 NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ,     &
+                 NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, &
+                 NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD
+      INTEGER    iNRNK, iWHTSVD, K_TRAJ, LWMINOPT
+      CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN,   &
+                       SCALE, RESIDS, WANTQ, WANTR
+
+      LOGICAL          TEST_QRDMD
+!..... external subroutines (BLAS and LAPACK)
+      EXTERNAL SAXPY,  SGEEV, SGEMM, SGEMV, SLACPY, SLASCL
+      EXTERNAL SLARNV, SLATMR
+!.....external subroutines DMD package, part 1
+!     subroutines under test
+      EXTERNAL SGEDMD, SGEDMDQ
+
+!..... external functions (BLAS and LAPACK)
+      EXTERNAL         SLAMCH, SLANGE, SNRM2
+      REAL(KIND=WP) :: SLAMCH, SLANGE, SNRM2
+      EXTERNAL         LSAME
+      LOGICAL          LSAME
+
+      INTRINSIC ABS, INT, MIN, MAX
+!............................................................
+
+      ! The test is always in pairs : ( SGEDMD and SGEDMDQ )
+      ! because the test includes comparing the results (in pairs).
+!.....................................................................................
+      TEST_QRDMD = .TRUE. ! This code by default performs tests on SGEDMDQ
+                          ! Since the QR factorizations based algorithm is designed for
+                          ! single trajectory data, only single trajectory tests will
+                          ! be performed with xGEDMDQ.
+      WANTQ = 'Q'
+      WANTR = 'R'
+!.................................................................................
+
+      EPS = SLAMCH( 'P' )  ! machine precision SP
+
+      ! Global counters of failures of some particular tests
+      NFAIL      = 0
+      NFAIL_REZ  = 0
+      NFAIL_REZQ = 0
+      NFAIL_Z_XV = 0
+      NFAIL_F_QR = 0
+      NFAIL_AU   = 0
+      KDIFF      = 0
+      NFAIL_SVDIFF = 0
+      NFAIL_TOTAL  = 0
+      NFAILQ_TOTAL = 0
+
+
+      DO LLOOP = 1, 4
+
+      WRITE(*,*) 'L Loop Index = ', LLOOP
+
+      ! Set the dimensions of the problem ...
+      WRITE(*,*) 'M = '
+      READ(*,*) M
+      WRITE(*,*) M
+      ! ... and the number of snapshots.
+      WRITE(*,*) 'N = '
+      READ(*,*) N
+      WRITE(*,*) N
+
+      ! ... Test the dimensions
+      IF ( ( MIN(M,N) == 0 ) .OR. ( M < N )  ) THEN
+          WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.'
+          STOP
+      END IF
+!.............
+      ! The seed inside the LLOOP so that each pass can be reproduced easily.
+
+      ISEED(1) = 4
+      ISEED(2) = 3
+      ISEED(3) = 2
+      ISEED(4) = 1
+
+      LDA = M
+      LDF = M
+      LDX = MAX(M,N+1)
+      LDY = MAX(M,N+1)
+      LDW = N
+      LDZ = M
+      LDAU = MAX(M,N+1)
+      LDS = N
+
+      TMP_ZXW  = ZERO
+      TMP_AU   = ZERO
+      TMP_REZ  = ZERO
+      TMP_REZQ = ZERO
+      SVDIFF   = ZERO
+      TMP_EX   = ZERO
+
+      !
+      ! Test the subroutines on real data snapshots. All
+      ! computation is done in real arithmetic, even when
+      ! Koopman eigenvalues and modes are real.
+      !
+      ! Allocate memory space
+      ALLOCATE( A(LDA,M) )
+      ALLOCATE( AC(LDA,M) )
+      ALLOCATE( DA(M) )
+      ALLOCATE( DL(M) )
+      ALLOCATE( F(LDF,N+1) )
+      ALLOCATE( F1(LDF,N+1) )
+      ALLOCATE( F2(LDF,N+1) )
+      ALLOCATE( X(LDX,N) )
+      ALLOCATE( X0(LDX,N) )
+      ALLOCATE( SINGVX(N) )
+      ALLOCATE( SINGVQX(N) )
+      ALLOCATE( Y(LDY,N+1) )
+      ALLOCATE( Y0(LDY,N+1) )
+      ALLOCATE( Y1(M,N+1) )
+      ALLOCATE( Z(LDZ,N) )
+      ALLOCATE( Z1(LDZ,N) )
+      ALLOCATE( RES(N)  )
+      ALLOCATE( RES1(N) )
+      ALLOCATE( RESEX(N) )
+      ALLOCATE( REIG(N) )
+      ALLOCATE( IEIG(N) )
+      ALLOCATE( REIGQ(N) )
+      ALLOCATE( IEIGQ(N) )
+      ALLOCATE( REIGA(M) )
+      ALLOCATE( IEIGA(M) )
+      ALLOCATE( VA(LDA,M) )
+      ALLOCATE( LAMBDA(N,2) )
+      ALLOCATE( LAMBDAQ(N,2) )
+      ALLOCATE( EIGA(M,2) )
+      ALLOCATE( W(LDW,N) )
+      ALLOCATE( AU(LDAU,N) )
+      ALLOCATE( S(N,N) )
+
+      TOL  = M*EPS
+      ! This mimics O(M*N)*EPS bound for accumulated roundoff error.
+      ! The factor 10 is somewhat arbitrary.
+      TOL2 = 10*M*N*EPS
+
+!.............
+
+      DO K_TRAJ = 1, 2
+      !  Number of intial conditions in the simulation/trajectories (1 or 2)
+
+      COND = 1.0D8
+      DMAX = 1.0D2
+      RSIGN = 'F'
+      GRADE = 'N'
+      MODEL = 6
+      CONDL = 1.0D2
+      MODER = 6
+      CONDR = 1.0D2
+      PIVTNG = 'N'
+
+      ! Loop over all parameter MODE values for ZLATMR (+1,..,+6)
+      DO MODE = 1, 6
+
+      ALLOCATE( IWORK(2*M) )
+      ALLOCATE(DR(N))
+      CALL SLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, &
+                   DMAX, RSIGN, GRADE, DL, MODEL,  CONDL, &
+                   DR, MODER, CONDR, PIVTNG, IWORK, M, M, &
+                   ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO )
+      DEALLOCATE(IWORK)
+      DEALLOCATE(DR)
+
+      LWORK = 4*M+1
+      ALLOCATE(WORK(LWORK))
+      AC  = A
+      CALL SGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, &
+                  VA, M, WORK, LWORK, INFO ) ! LAPACK CALL
+      DEALLOCATE(WORK)
+      TMP = ZERO
+      DO i = 1, M
+          EIGA(i,1) = REIGA(i)
+          EIGA(i,2) = IEIGA(i)
+          TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2))
+      END DO
+
+      ! Scale A to have the desirable spectral radius.
+      CALL SLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO )
+      CALL SLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO )
+
+      ! Compute the norm of A
+      ANORM = SLANGE( 'F', N, N, A, M, WDUMMY )
+
+      IF ( K_TRAJ == 2 ) THEN
+          ! generate data with two inital conditions
+      CALL SLARNV(2, ISEED, M, F1(1,1) )
+      F1(1:M,1) = 1.0E-10*F1(1:M,1)
+      DO i = 1, N/2
+         CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, &
+              F1(1,i+1), 1 )
+      END DO
+      X0(1:M,1:N/2) = F1(1:M,1:N/2)
+      Y0(1:M,1:N/2) = F1(1:M,2:N/2+1)
+
+      CALL SLARNV(2, ISEED, M, F1(1,1) )
+      DO i = 1, N-N/2
+         CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, &
+              F1(1,i+1), 1 )
+      END DO
+      X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2)
+      Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1)
+      ELSE
+          ! single trajectory
+      CALL SLARNV(2, ISEED, M, F(1,1) )
+      DO i = 1, N
+         CALL SGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, &
+              F(1,i+1), 1 )
+      END DO
+      X0(1:M,1:N) = F(1:M,1:N)
+      Y0(1:M,1:N) = F(1:M,2:N+1)
+      END IF
+
+      XNORM = SLANGE( 'F', M, N, X0, LDX, WDUMMY )
+      YNORM = SLANGE( 'F', M, N, Y0, LDX, WDUMMY )
+!............................................................
+
+      DO iJOBZ = 1, 4
+
+          SELECT CASE ( iJOBZ )
+          CASE(1)
+              JOBZ   = 'V' ! Ritz vectors will be computed
+              RESIDS = 'R' ! Residuals will be computed
+          CASE(2)
+              JOBZ   = 'V'
+              RESIDS = 'N'
+          CASE(3)
+              JOBZ   = 'F' ! Ritz vectors in factored form
+              RESIDS = 'N'
+          CASE(4)
+              JOBZ   = 'N'
+              RESIDS = 'N'
+          END SELECT
+
+      DO iJOBREF = 1, 3
+
+          SELECT CASE ( iJOBREF )
+          CASE(1)
+              JOBREF = 'R' ! Data for refined Ritz vectors
+          CASE(2)
+              JOBREF = 'E' ! Exact DMD vectors
+          CASE(3)
+              JOBREF = 'N'
+          END SELECT
+
+      DO iSCALE = 1, 4
+
+          SELECT CASE ( iSCALE )
+          CASE(1)
+              SCALE = 'S' ! X data normalized
+          CASE(2)
+              SCALE = 'C' ! X normalized, consist. check
+          CASE(3)
+              SCALE = 'Y' ! Y data normalized
+          CASE(4)
+              SCALE = 'N'
+          END SELECT
+
+      DO iNRNK = -1, -2, -1
+          ! Two truncation strategies. The "-2" case for R&D
+          ! purposes only - it uses possibly low accuracy small
+          ! singular values, in which case the formulas used in
+          ! the DMD are highly sensitive.
+          NRNK   = iNRNK
+
+      DO iWHTSVD = 1, 4
+          ! Check all four options to compute the POD basis
+          ! via the SVD.
+          WHTSVD   = iWHTSVD
+
+      DO LWMINOPT = 1, 2
+          ! Workspace query for the minimal (1) and for the optimal
+          ! (2) workspace lengths determined by workspace query.
+
+       X(1:M,1:N) = X0(1:M,1:N)
+       Y(1:M,1:N) = Y0(1:M,1:N)
+
+       ! SGEDMD: Workspace query and workspace allocation
+       CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, &
+            N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, &
+            LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, &
+            IDUMMY, -1, INFO )
+
+       LIWORK = IDUMMY(1)
+       ALLOCATE( IWORK(LIWORK) )
+       LWORK = INT(WDUMMY(LWMINOPT))
+       ALLOCATE( WORK(LWORK) )
+
+       ! SGEDMD test: CALL SGEDMD
+       CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, &
+            N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, &
+            LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,&
+            IWORK, LIWORK, INFO )
+
+       SINGVX(1:N) = WORK(1:N)
+
+       !...... SGEDMD check point
+       IF ( LSAME(JOBZ,'V')  ) THEN
+          ! Check that Z = X*W, on return from SGEDMD
+          ! This checks that the returned aigenvectors in Z are
+          ! the product of the SVD'POD basis returned in X
+          ! and the eigenvectors of the rayleigh quotient
+          ! returned in W
+          CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, &
+                      ZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL SAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1)
+             TMP = MAX(TMP, SNRM2( M, Z1(1,i), 1 ) )
+          END DO
+          TMP_ZXW = MAX(TMP_ZXW, TMP )
+
+          IF ( TMP_ZXW > 10*M*EPS ) THEN
+              NFAIL_Z_XV = NFAIL_Z_XV + 1
+          END IF
+
+       END IF
+
+       !...... SGEDMD check point
+       IF ( LSAME(JOBREF,'R') ) THEN
+           ! The matrix A*U is returned for computing refined Ritz vectors.
+           ! Check that A*U is computed correctly using the formula
+           ! A*U = Y * V * inv(SIGMA). This depends on the
+           ! accuracy in the computed singular values and vectors of X.
+           ! See the paper for an error analysis.
+           ! Note that the left singular vectors of the input matrix X
+           ! are returned in the array X.
+           CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, &
+                      ZERO, Z1, LDZ )
+           TMP = ZERO
+           DO i = 1, K
+              CALL SAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1)
+              TMP = MAX( TMP, SNRM2( M, Z1(1,i),1 ) * &
+                       SINGVX(K)/(ANORM*SINGVX(1)) )
+           END DO
+           TMP_AU = MAX( TMP_AU, TMP )
+
+           IF ( TMP > TOL2 ) THEN
+               NFAIL_AU = NFAIL_AU + 1
+           END IF
+
+       ELSEIF ( LSAME(JOBREF,'E') ) THEN
+       ! The unscaled vectors of the Exact DMD are computed.
+       ! This option is included for the sake of completeness,
+       ! for users who prefer the Exact DMD vectors. The
+       ! returned vectors are in the real form, in the same way
+       ! as the Ritz vectors. Here we just save the vectors
+       ! and test them separately using a Matlab script.
+
+       CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M )
+       i=1
+       DO WHILE ( i <= K )
+       IF ( IEIG(i) == ZERO ) THEN
+        ! have a real eigenvalue with real eigenvector
+        CALL SAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 )
+        RESEX(i) = SNRM2( M, Y1(1,i), 1) / SNRM2(M,AU(1,i),1)
+        i = i + 1
+       ELSE
+       ! Have a complex conjugate pair
+       ! REIG(i) +- sqrt(-1)*IMEIG(i).
+       ! Since all computation is done in real
+       ! arithmetic, the formula for the residual
+       ! is recast for real representation of the
+       ! complex conjugate eigenpair. See the
+       ! description of RES.
+       AB(1,1) =  REIG(i)
+       AB(2,1) = -IEIG(i)
+       AB(1,2) =  IEIG(i)
+       AB(2,2) =  REIG(i)
+       CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), &
+                   M, AB, 2, ONE, Y1(1,i), M )
+       RESEX(i)   = SLANGE( 'F', M, 2, Y1(1,i), M, &
+                    WORK )/ SLANGE( 'F', M, 2, AU(1,i), M, &
+                    WORK )
+       RESEX(i+1) = RESEX(i)
+       i = i + 2
+       END IF
+       END DO
+
+       END IF
+
+      !...... SGEDMD check point
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by SGEDMD with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigencalues. (See the description of Z in SGEDMD,)
+          i = 1
+          DO WHILE ( i <= K )
+            IF ( IEIG(i) == ZERO ) THEN
+                ! have a real eigenvalue with real eigenvector
+                CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 )
+                RES1(i) = SNRM2( M, Y1(1,i), 1)
+                i = i + 1
+            ELSE
+               ! Have a complex conjugate pair
+               ! REIG(i) +- sqrt(-1)*IMEIG(i).
+               ! Since all computation is done in real
+               ! arithmetic, the formula for the residual
+               ! is recast for real representation of the
+               ! complex conjugate eigenpair. See the
+               ! description of RES.
+               AB(1,1) =  REIG(i)
+               AB(2,1) = -IEIG(i)
+               AB(1,2) =  IEIG(i)
+               AB(2,2) =  REIG(i)
+               CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                           M, AB, 2, ONE, Y1(1,i), M )
+               RES1(i)   = SLANGE( 'F', M, 2, Y1(1,i), M, &
+                                  WORK )
+               RES1(i+1) = RES1(i)
+               i = i + 2
+            END IF
+          END DO
+          TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                    SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_REZ = MAX( TMP_REZ, TMP )
+
+          IF ( TMP > TOL2 ) THEN
+              NFAIL_REZ = NFAIL_REZ + 1
+          END IF
+
+         IF ( LSAME(JOBREF,'E') ) THEN
+            TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) )
+          END DO
+          TMP_EX = MAX(TMP_EX,TMP)
+         END IF
+
+      END IF
+
+      ! ... store the results for inspection
+      DO i = 1, K
+          LAMBDA(i,1) = REIG(i)
+          LAMBDA(i,2) = IEIG(i)
+      END DO
+
+      DEALLOCATE(IWORK)
+      DEALLOCATE(WORK)
+
+      !======================================================================
+      !     Now test the SGEDMDQ, if requested.
+      !======================================================================
+      IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN
+          RJOBDATA(2) = 1
+          F1 = F
+
+          ! SGEDMDQ test: Workspace query and workspace allocation
+          CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, &
+               JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, &
+               LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ,   &
+               RES, AU, LDAU, W, LDW, S, LDS, WDUMMY,      &
+               -1, IDUMMY, -1, INFO )
+          LIWORK = IDUMMY(1)
+          ALLOCATE( IWORK(LIWORK) )
+          LWORK = INT(WDUMMY(LWMINOPT))
+          ALLOCATE(WORK(LWORK))
+
+          ! SGEDMDQ test: CALL SGEDMDQ
+          CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, &
+               JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, &
+               LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ,   &
+               RES, AU, LDAU, W, LDW, S, LDS,              &
+               WORK, LWORK, IWORK, LIWORK, INFO )
+
+          SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ)
+
+          !..... SGEDMDQ check point
+          IF ( KQ /= K ) THEN
+             KDIFF = KDIFF+1
+          END IF
+
+          TMP = ZERO
+          DO i = 1, MIN(K, KQ)
+             TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / &
+                                   SINGVX(1) )
+          END DO
+          SVDIFF = MAX( SVDIFF, TMP )
+          IF ( TMP > M*N*EPS ) THEN
+             NFAIL_SVDIFF = NFAIL_SVDIFF + 1
+          END IF
+
+          !..... SGEDMDQ check point
+          IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN
+             ! Check that the QR factors are computed and returned
+             ! as requested. The residual ||F-Q*R||_F / ||F||_F
+             ! is compared to M*N*EPS.
+             F2 = F
+             CALL SGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, &
+                         LDF, Y, LDY, ONE, F2, LDF )
+             TMP_FQR = SLANGE( 'F', M, N+1, F2, LDF, WORK ) / &
+                   SLANGE( 'F', M, N+1, F,  LDF, WORK )
+             IF ( TMP_FQR > TOL2 ) THEN
+                 NFAIL_F_QR = NFAIL_F_QR + 1
+             END IF
+          END IF
+
+          !..... SGEDMDQ checkpoint
+          IF ( LSAME(RESIDS, 'R') ) THEN
+              ! Compare the residuals returned by SGEDMDQ with the
+              ! explicitly computed residuals using the matrix A.
+              ! Compute explicitly Y1 = A*Z
+              CALL SGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M )
+              ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+              ! of the invariant subspaces that correspond to complex conjugate
+              ! pairs of eigencalues. (See the description of Z in SGEDMDQ)
+              i = 1
+              DO WHILE ( i <= KQ )
+                IF ( IEIGQ(i) == ZERO ) THEN
+                    ! have a real eigenvalue with real eigenvector
+                    CALL SAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 )
+                    ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i)
+                    RES1(i) = SNRM2( M, Y1(1,i), 1)
+                    i = i + 1
+                ELSE
+                   ! Have a complex conjugate pair
+                   ! REIG(i) +- sqrt(-1)*IMEIG(i).
+                   ! Since all computation is done in real
+                   ! arithmetic, the formula for the residual
+                   ! is recast for real representation of the
+                   ! complex conjugate eigenpair. See the
+                   ! description of RES.
+                   AB(1,1) =  REIGQ(i)
+                   AB(2,1) = -IEIGQ(i)
+                   AB(1,2) =  IEIGQ(i)
+                   AB(2,2) =  REIGQ(i)
+                   CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                               M, AB, 2, ONE, Y1(1,i), M )             ! BLAS CALL
+                   ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+                   RES1(i)   = SLANGE( 'F', M, 2, Y1(1,i), M, &
+                                      WORK )                           ! LAPACK CALL
+                   RES1(i+1) = RES1(i)
+                   i = i + 2
+                END IF
+              END DO
+              TMP = ZERO
+              DO i = 1, KQ
+              TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                  SINGVQX(K)/(ANORM*SINGVQX(1)) )
+              END DO
+              TMP_REZQ = MAX( TMP_REZQ, TMP )
+              IF ( TMP > TOL2 ) THEN
+                  NFAIL_REZQ = NFAIL_REZQ + 1
+              END IF
+
+          END IF
+
+          DO i = 1, KQ
+              LAMBDAQ(i,1) = REIGQ(i)
+              LAMBDAQ(i,2) = IEIGQ(i)
+          END DO
+
+      DEALLOCATE(WORK)
+      DEALLOCATE(IWORK)
+      END IF            ! TEST_QRDMD
+!======================================================================
+
+      END DO ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO ! WHTSVD LOOP
+      !write(*,*) 'WHTSVD loop completed'
+      END DO ! NRNK LOOP
+      !write(*,*) 'NRNK loop completed'
+      END DO ! SCALE LOOP
+      !write(*,*) 'SCALE loop completed'
+      END DO ! JOBF LOOP
+      !write(*,*) 'JOBREF loop completed'
+      END DO ! JOBZ LOOP
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE -6:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories  loop completed'
+
+      DEALLOCATE(A)
+      DEALLOCATE(AC)
+      DEALLOCATE(DA)
+      DEALLOCATE(DL)
+      DEALLOCATE(F)
+      DEALLOCATE(F1)
+      DEALLOCATE(F2)
+      DEALLOCATE(X)
+      DEALLOCATE(X0)
+      DEALLOCATE(SINGVX)
+      DEALLOCATE(SINGVQX)
+      DEALLOCATE(Y)
+      DEALLOCATE(Y0)
+      DEALLOCATE(Y1)
+      DEALLOCATE(Z)
+      DEALLOCATE(Z1)
+      DEALLOCATE(RES)
+      DEALLOCATE(RES1)
+      DEALLOCATE(RESEX)
+      DEALLOCATE(REIG)
+      DEALLOCATE(IEIG)
+      DEALLOCATE(REIGQ)
+      DEALLOCATE(IEIGQ)
+      DEALLOCATE(REIGA)
+      DEALLOCATE(IEIGA)
+      DEALLOCATE(VA)
+      DEALLOCATE(LAMBDA)
+      DEALLOCATE(LAMBDAQ)
+      DEALLOCATE(EIGA)
+      DEALLOCATE(W)
+      DEALLOCATE(AU)
+      DEALLOCATE(S)
+
+!............................................................
+      !     Generate random M-by-M matrix A. Use DLATMR from
+      END DO ! LLOOP
+
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for SGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>> SGEDMD :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>>>>>>>>> SGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for SGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> SGEDMD and SGEDMDQ computed singular &
+              &values test PASSED.'
+      ELSE
+          WRITE(*,*) 'SGEDMD and SGEDMDQ discrepancies in &
+              &the singular values unacceptable ', &
+              NFAIL_SVDIFF, ' times. Test FAILED.'
+          WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> SGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+         WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+         WRITE(*,*) '>>>>>>> SGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/zchkdmd.f90 b/lapack-netlib/TESTING/EIG/zchkdmd.f90
new file mode 100644
index 000000000..873d956c4
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/zchkdmd.f90
@@ -0,0 +1,745 @@
+!     This is a test program for checking the implementations of
+!     the implementations of the following subroutines
+!
+!     ZGEDMD,  for computation of the
+!              Dynamic Mode Decomposition (DMD)
+!     ZGEDMDQ, for computation of a
+!              QR factorization based compressed DMD
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     ========================================================
+!     How to run the code (compiler, link info)
+!     ========================================================
+!     Compile as FORTRAN 90 (or later) and link with BLAS and
+!     LAPACK libraries.
+!     NOTE: The code is developed and tested on top of the
+!     Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!     using the Intel Fortran compiler.
+!
+!     For developers of the C++ implementation
+!     ========================================================
+!     See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!     Note on a development of the GPU HP implementation
+!     ========================================================
+!     Work in progress. See CUDA, MAGMA, SLATE.
+!     NOTE: The four SVD subroutines used in this code are
+!     included as a part of R&D and for the completeness.
+!     This was also an opportunity to test those SVD codes.
+!     If the scaling option is used all four are essentially
+!     equally good. For implementations on HP platforms,
+!     one can use whichever SVD is available.
+!............................................................
+
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+      use iso_fortran_env, only: real64
+      IMPLICIT NONE
+      integer, parameter :: WP = real64
+
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+
+      COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   :: RES, &
+                     RES1, RESEX, SINGVX, SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   :: IWORK
+      REAL(KIND=WP) :: WDUMMY(2)
+      INTEGER       :: IDUMMY(4), ISEED(4)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS,       &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ,  TMP_ZXW, &
+                       TMP_EX
+
+!............................................................
+      COMPLEX(KIND=WP) :: ZMAX
+      INTEGER :: LZWORK
+      COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::  ZA, ZAC,  &
+                                 ZAU, ZF, ZF0, ZF1, ZS, ZW,       &
+                                 ZX, ZX0, ZY, ZY0, ZY1, ZZ, ZZ1
+      COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::  ZDA, ZDR, &
+                                       ZDL, ZEIGS, ZEIGSA, ZWORK
+      COMPLEX(KIND=WP) ::  ZDUMMY(22), ZDUM2X2(2,2)
+!............................................................
+      INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY,  &
+                 LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK, NRNKsp
+      INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j,     &
+                 NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ,     &
+                 NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL,  &
+                 NFAIL_Z_XV,  MODE, MODEL, MODER, WHTSVD,     &
+                 WHTSVDsp
+      INTEGER :: iNRNK, iWHTSVD,  K_TRAJ, LWMINOPT
+      CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN,   &
+                       SCALE, RESIDS, WANTQ, WANTR
+      LOGICAL :: TEST_QRDMD
+
+!.....external subroutines (BLAS and LAPACK)
+      EXTERNAL DAXPY,  DGEEV, DGEMM, DGEMV, DLACPY, DLASCL
+      EXTERNAL ZGEEV,  ZGEMV, ZLASCL
+      EXTERNAL ZLARNV, ZLATMR
+      EXTERNAL ZAXPY,  ZGEMM
+!.....external subroutines DMD package, part 1
+!     subroutines under test
+      EXTERNAL ZGEDMD, ZGEDMDQ
+!.....external functions (BLAS and LAPACK)
+      EXTERNAL         DLAMCH,  DZNRM2
+      REAL(KIND=WP) :: DLAMCH,  DZNRM2
+      REAL(KIND=WP) ::          ZLANGE
+      EXTERNAL IZAMAX
+      INTEGER  IZAMAX
+      EXTERNAL LSAME
+      LOGICAL  LSAME
+
+      INTRINSIC ABS, INT, MIN, MAX, SIGN
+!............................................................
+
+      ! The test is always in pairs : ( ZGEDMD and ZGEDMDQ )
+      ! because the test includes comparing the results (in pairs).
+!.....................................................................................
+      TEST_QRDMD = .TRUE. ! This code by default performs tests on ZGEDMDQ
+                          ! Since the QR factorizations based algorithm is designed for
+                          ! single trajectory data, only single trajectory tests will
+                          ! be performed with xGEDMDQ.
+      WANTQ = 'Q'
+      WANTR = 'R'
+!.................................................................................
+
+      EPS = DLAMCH( 'P' )  ! machine precision DP
+
+      ! Global counters of failures of some particular tests
+      NFAIL      = 0
+      NFAIL_REZ  = 0
+      NFAIL_REZQ = 0
+      NFAIL_Z_XV = 0
+      NFAIL_F_QR = 0
+      NFAIL_AU   = 0
+      NFAIL_SVDIFF = 0
+      NFAIL_TOTAL  = 0
+      NFAILQ_TOTAL = 0
+
+      DO LLOOP = 1, 4
+
+      WRITE(*,*) 'L Loop Index = ', LLOOP
+
+      ! Set the dimensions of the problem ...
+      WRITE(*,*) 'M = '
+      READ(*,*) M
+      WRITE(*,*) M
+      ! ... and the number of snapshots.
+      WRITE(*,*) 'N = '
+      READ(*,*) N
+      WRITE(*,*) N
+
+      ! ... Test the dimensions
+      IF ( ( MIN(M,N) == 0 ) .OR. ( M < N )  ) THEN
+          WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.'
+          STOP
+      END IF
+!.............
+      ! The seed inside the LLOOP so that each pass can be reproduced easily.
+      ISEED(1) = 4
+      ISEED(2) = 3
+      ISEED(3) = 2
+      ISEED(4) = 1
+
+      LDA  = M
+      LDF  = M
+      LDX  = M
+      LDY  = M
+      LDW  = N
+      LDZ  = M
+      LDAU = M
+      LDS  = N
+
+      TMP_ZXW  = ZERO
+      TMP_AU   = ZERO
+      TMP_REZ  = ZERO
+      TMP_REZQ = ZERO
+      SVDIFF   = ZERO
+      TMP_EX   = ZERO
+
+      ALLOCATE( ZA(LDA,M) )
+      ALLOCATE( ZAC(LDA,M) )
+      ALLOCATE( ZF(LDF,N+1) )
+      ALLOCATE( ZF0(LDF,N+1) )
+      ALLOCATE( ZF1(LDF,N+1) )
+      ALLOCATE( ZX(LDX,N) )
+      ALLOCATE( ZX0(LDX,N) )
+      ALLOCATE( ZY(LDY,N+1) )
+      ALLOCATE( ZY0(LDY,N+1) )
+      ALLOCATE( ZY1(LDY,N+1) )
+      ALLOCATE( ZAU(LDAU,N) )
+      ALLOCATE( ZW(LDW,N) )
+      ALLOCATE( ZS(LDS,N) )
+      ALLOCATE( ZZ(LDZ,N) )
+      ALLOCATE( ZZ1(LDZ,N) )
+      ALLOCATE( RES(N) )
+      ALLOCATE( RES1(N) )
+      ALLOCATE( RESEX(N) )
+      ALLOCATE( ZEIGS(N) )
+      ALLOCATE( SINGVX(N) )
+      ALLOCATE( SINGVQX(N) )
+
+      TOL  = M*EPS
+      ! This mimics O(M*N)*EPS bound for accumulated roundoff error.
+      ! The factor 10 is somewhat arbitrary.
+      TOL2 = 10*M*N*EPS
+
+!.............
+
+      DO K_TRAJ = 1, 2
+      !  Number of intial conditions in the simulation/trajectories (1 or 2)
+
+      COND = 1.0D4
+      ZMAX = (1.0D1,1.0D1)
+      RSIGN = 'F'
+      GRADE = 'N'
+      MODEL = 6
+      CONDL = 1.0D1
+      MODER = 6
+      CONDR = 1.0D1
+      PIVTNG = 'N'
+
+      ! Loop over all parameter MODE values for ZLATMR (+1,..,+6)
+      DO MODE = 1, 6
+
+      ALLOCATE( IWORK(2*M) )
+      ALLOCATE( ZDA(M) )
+      ALLOCATE( ZDL(M) )
+      ALLOCATE( ZDR(M) )
+
+      CALL ZLATMR( M, M, 'N', ISEED, 'N', ZDA, MODE, COND, &
+                   ZMAX, RSIGN, GRADE, ZDL, MODEL,  CONDL, &
+                   ZDR, MODER, CONDR, PIVTNG, IWORK, M, M, &
+                   ZERO, -ONE, 'N', ZA, LDA, IWORK(M+1), INFO )
+      DEALLOCATE( ZDR )
+      DEALLOCATE( ZDL )
+      DEALLOCATE( ZDA )
+      DEALLOCATE( IWORK )
+
+      LZWORK = MAX(1,2*M)
+      ALLOCATE( ZEIGSA(M) )
+      ALLOCATE( ZWORK(LZWORK) )
+      ALLOCATE( WORK(2*M) )
+      ZAC(1:M,1:M) = ZA(1:M,1:M)
+      CALL ZGEEV( 'N','N', M, ZAC, LDA, ZEIGSA, ZDUM2X2, 2, &
+                  ZDUM2X2, 2, ZWORK, LZWORK, WORK, INFO ) ! LAPACK CALL
+      DEALLOCATE(WORK)
+      DEALLOCATE(ZWORK)
+
+      TMP = ABS(ZEIGSA(IZAMAX(M, ZEIGSA, 1))) ! The spectral radius of ZA
+      ! Scale the matrix ZA to have unit spectral radius.
+      CALL ZLASCL( 'G',0, 0, TMP, ONE, M, M, &
+                   ZA, LDA, INFO )
+      CALL ZLASCL( 'G',0, 0, TMP, ONE, M, 1, &
+                   ZEIGSA, M, INFO )
+      ANORM = ZLANGE( 'F', M, M, ZA, LDA, WDUMMY )
+
+      IF ( K_TRAJ == 2 ) THEN
+          ! generate data as two trajectories
+          ! with two inital conditions
+          CALL ZLARNV(2, ISEED, M, ZF(1,1) )
+          DO i = 1, N/2
+             CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1,  &
+                  ZZERO, ZF(1,i+1), 1 )
+          END DO
+          ZX0(1:M,1:N/2) = ZF(1:M,1:N/2)
+          ZY0(1:M,1:N/2) = ZF(1:M,2:N/2+1)
+
+          CALL ZLARNV(2, ISEED, M, ZF(1,1) )
+          DO i = 1, N-N/2
+             CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1,  &
+                  ZZERO, ZF(1,i+1), 1 )
+          END DO
+          ZX0(1:M,N/2+1:N) = ZF(1:M,1:N-N/2)
+          ZY0(1:M,N/2+1:N) = ZF(1:M,2:N-N/2+1)
+      ELSE
+          CALL ZLARNV(2, ISEED, M, ZF(1,1) )
+          DO i = 1, N
+             CALL ZGEMV( 'N', M, M, ZONE, ZA, M, ZF(1,i), 1,  &
+                  ZZERO, ZF(1,i+1), 1 )
+          END DO
+          ZF0(1:M,1:N+1) = ZF(1:M,1:N+1)
+          ZX0(1:M,1:N) = ZF0(1:M,1:N)
+          ZY0(1:M,1:N) = ZF0(1:M,2:N+1)
+      END IF
+
+      DEALLOCATE( ZEIGSA )
+!........................................................................
+
+      DO iJOBZ = 1, 4
+
+          SELECT CASE ( iJOBZ )
+          CASE(1)
+              JOBZ   = 'V'
+              RESIDS = 'R'
+          CASE(2)
+              JOBZ   = 'V'
+              RESIDS = 'N'
+          CASE(3)
+              JOBZ   = 'F'
+              RESIDS = 'N'
+          CASE(4)
+              JOBZ   = 'N'
+              RESIDS = 'N'
+          END SELECT
+
+      DO iJOBREF = 1, 3
+
+          SELECT CASE ( iJOBREF )
+          CASE(1)
+              JOBREF = 'R'
+          CASE(2)
+              JOBREF = 'E'
+          CASE(3)
+              JOBREF = 'N'
+          END SELECT
+
+      DO iSCALE = 1, 4
+
+          SELECT CASE ( iSCALE )
+          CASE(1)
+              SCALE = 'S'
+          CASE(2)
+              SCALE = 'C'
+          CASE(3)
+              SCALE = 'Y'
+          CASE(4)
+              SCALE = 'N'
+          END SELECT
+
+      DO iNRNK = -1, -2, -1
+         NRNK   = iNRNK
+         NRNKsp = iNRNK
+
+      DO iWHTSVD = 1,  3
+         ! Check all four options to compute the POD basis
+         ! via the SVD.
+         WHTSVD   = iWHTSVD
+         WHTSVDsp = iWHTSVD
+
+      DO LWMINOPT = 1, 2
+         ! Workspace query for the minimal (1) and for the optimal
+         ! (2) workspace lengths determined by workspace query.
+
+      ! ZGEDMD is always tested and its results are also used for
+      ! comparisons with ZGEDMDQ.
+
+      ZX(1:M,1:N) = ZX0(1:M,1:N)
+      ZY(1:M,1:N) = ZY0(1:M,1:N)
+
+      CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,   &
+                   M,  N, ZX, LDX, ZY, LDY, NRNK, TOL,    &
+                   K, ZEIGS, ZZ, LDZ,  RES, ZAU, LDAU,    &
+                   ZW,  LDW, ZS, LDS,  ZDUMMY, -1,        &
+                   WDUMMY, -1, IDUMMY, -1, INFO )
+      IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) &
+                          .OR. ( INFO < 0 ) ) THEN
+           WRITE(*,*) 'Call to ZGEDMD workspace query failed. &
+                      &Check the calling sequence and the code.'
+           WRITE(*,*) 'The error code is ', INFO
+           WRITE(*,*) 'The input parameters were ',      &
+           SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,          &
+           M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS
+           STOP
+      END IF
+
+      LZWORK = INT(ZDUMMY(LWMINOPT))
+      LWORK  = INT(WDUMMY(1))
+      LIWORK = IDUMMY(1)
+
+      ALLOCATE(ZWORK(LZWORK))
+      ALLOCATE( WORK(LWORK))
+      ALLOCATE(IWORK(LIWORK))
+
+      CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,  &
+                   M,  N, ZX, LDX, ZY, LDY, NRNK, TOL,   &
+                   K, ZEIGS, ZZ, LDZ,  RES, ZAU, LDAU,   &
+                   ZW,  LDW,  ZS, LDS, ZWORK,  LZWORK,   &
+                   WORK, LWORK, IWORK, LIWORK, INFO )
+
+      IF ( INFO /= 0 ) THEN
+           WRITE(*,*) 'Call to ZGEDMD failed. &
+           &Check the calling sequence and the code.'
+           WRITE(*,*) 'The error code is ', INFO
+           WRITE(*,*) 'The input parameters were ',&
+           SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+           M, N, LDX, LDY, NRNK, TOL
+           STOP
+      END IF
+
+      SINGVX(1:N) = WORK(1:N)
+
+      !...... ZGEDMD check point
+      IF ( LSAME(JOBZ,'V')  ) THEN
+          ! Check that Z = X*W, on return from ZGEDMD
+          ! This checks that the returned eigenvectors in Z are
+          ! the product of the SVD'POD basis returned in X
+          ! and the eigenvectors of the rayleigh quotient
+          ! returned in W
+          CALL ZGEMM( 'N', 'N', M, K, K, ZONE, ZX, LDX, ZW, LDW, &
+                      ZZERO, ZZ1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL ZAXPY( M, -ZONE, ZZ(1,i), 1, ZZ1(1,i), 1)
+             TMP = MAX(TMP, DZNRM2( M, ZZ1(1,i), 1 ) )
+          END DO
+          TMP_ZXW = MAX(TMP_ZXW, TMP )
+          IF ( TMP_ZXW <= 10*M*EPS ) THEN
+              !WRITE(*,*) ' :) .... OK .........ZGEDMD PASSED.'
+          ELSE
+              NFAIL_Z_XV = NFAIL_Z_XV + 1
+              WRITE(*,*) ':( .................ZGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+      END IF
+
+
+      !...... ZGEDMD check point
+      IF ( LSAME(JOBREF,'R') ) THEN
+           ! The matrix A*U is returned for computing refined Ritz vectors.
+           ! Check that A*U is computed correctly using the formula
+           ! A*U = Y * V * inv(SIGMA). This depends on the
+           ! accuracy in the computed singular values and vectors of X.
+           ! See the paper for an error analysis.
+           ! Note that the left singular vectors of the input matrix X
+           ! are returned in the array X.
+           CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZX, LDX, &
+                      ZZERO, ZZ1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+            CALL ZAXPY( M, -ZONE, ZAU(1,i), 1, ZZ1(1,i), 1)
+            TMP = MAX( TMP, DZNRM2( M, ZZ1(1,i),1 ) * &
+                     SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_AU = MAX( TMP_AU, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) ':) .... OK .........ZGEDMD PASSED.'
+          ELSE
+              NFAIL_AU = NFAIL_AU + 1
+              WRITE(*,*) ':( .................ZGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+      ELSEIF ( LSAME(JOBREF,'E') ) THEN
+       ! The unscaled vectors of the Exact DMD are computed.
+       ! This option is included for the sake of completeness,
+       ! for users who prefer the Exact DMD vectors. The
+       ! returned vectors are in the real form, in the same way
+       ! as the Ritz vectors. Here we just save the vectors
+       ! and test them separately using a Matlab script.
+
+
+       CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZAU, LDAU, ZZERO, ZY1, LDY )
+
+               DO i=1, K
+                  ! have a real eigenvalue with real eigenvector
+                CALL ZAXPY( M, -ZEIGS(i), ZAU(1,i), 1, ZY1(1,i), 1 )
+                RESEX(i) = DZNRM2( M, ZY1(1,i), 1) / DZNRM2(M,ZAU(1,i),1)
+               END DO
+      END IF
+      !...... ZGEDMD check point
+
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by ZGEDMD with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigencalues. (See the description of Z in ZGEDMD,)
+
+          DO i=1, K
+                ! have a real eigenvalue with real eigenvector
+                CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 )
+                RES1(i) = DZNRM2( M, ZY1(1,i), 1)
+          END DO
+          TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                    SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_REZ = MAX( TMP_REZ, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) ':) .... OK ..........ZGEDMD PASSED.'
+          ELSE
+              NFAIL_REZ = NFAIL_REZ + 1
+              WRITE(*,*) ':( ..................ZGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+
+         IF ( LSAME(JOBREF,'E') ) THEN
+            TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) )
+          END DO
+          TMP_EX = MAX(TMP_EX,TMP)
+         END IF
+
+      END IF
+
+      DEALLOCATE(ZWORK)
+      DEALLOCATE(WORK)
+      DEALLOCATE(IWORK)
+
+      IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN
+
+      ZF(1:M,1:N+1) = ZF0(1:M,1:N+1)
+
+      CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, &
+                    WHTSVD, M, N+1, ZF, LDF,  ZX, LDX,  ZY, LDY,  &
+                    NRNK,  TOL, K, ZEIGS, ZZ, LDZ, RES,  ZAU,  &
+                    LDAU, ZW, LDW, ZS, LDS, ZDUMMY, -1,   &
+                    WDUMMY,  -1, IDUMMY, -1, INFO )
+
+      LZWORK = INT(ZDUMMY(LWMINOPT))
+      ALLOCATE( ZWORK(LZWORK) )
+      LIWORK = IDUMMY(1)
+      ALLOCATE(IWORK(LIWORK))
+      LWORK = INT(WDUMMY(1))
+      ALLOCATE(WORK(LWORK))
+
+      CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, &
+                    WHTSVD, M, N+1, ZF, LDF,  ZX, LDX,  ZY, LDY,  &
+                    NRNK,  TOL, KQ, ZEIGS, ZZ, LDZ, RES,  ZAU,  &
+                    LDAU, ZW, LDW, ZS, LDS, ZWORK, LZWORK,   &
+                    WORK,  LWORK, IWORK, LIWORK, INFO )
+
+      IF ( INFO /= 0 ) THEN
+             WRITE(*,*) 'Call to ZGEDMDQ failed. &
+             &Check the calling sequence and the code.'
+             WRITE(*,*) 'The error code is ', INFO
+             WRITE(*,*) 'The input parameters were ',&
+             SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, &
+             M, N, LDX, LDY, NRNK, TOL
+             STOP
+      END IF
+      SINGVQX(1:N) = WORK(1:N)
+
+      !..... ZGEDMDQ check point
+
+          IF ( 1 == 0 ) THEN
+              ! Comparison of ZGEDMD and ZGEDMDQ singular values disabled
+          TMP = ZERO
+          DO i = 1, MIN(K, KQ)
+             TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / &
+                                   SINGVX(1) )
+          END DO
+          SVDIFF = MAX( SVDIFF, TMP )
+          IF ( TMP > M*N*EPS ) THEN
+             WRITE(*,*) 'FAILED! Something was wrong with the run.'
+             NFAIL_SVDIFF = NFAIL_SVDIFF + 1
+             DO j =1, 3
+                 write(*,*) j, SINGVX(j), SINGVQX(j)
+                 read(*,*)
+             END DO
+
+          END IF
+          END IF
+
+      !..... ZGEDMDQ check point
+      IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN
+         ! Check that the QR factors are computed and returned
+         ! as requested. The residual ||F-Q*R||_F / ||F||_F
+         ! is compared to M*N*EPS.
+         ZF1(1:M,1:N+1) = ZF0(1:M,1:N+1)
+         CALL ZGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ZONE, ZF, &
+                     LDF, ZY, LDY, ZONE, ZF1, LDF )
+         TMP_FQR = ZLANGE( 'F', M, N+1, ZF1, LDF, WORK ) / &
+               ZLANGE( 'F', M, N+1, ZF0,  LDF, WORK )
+         IF ( TMP_FQR > TOL2 ) THEN
+              WRITE(*,*) 'FAILED! Something was wrong with the run.'
+             NFAIL_F_QR = NFAIL_F_QR + 1
+         ELSE
+             !WRITE(*,*) '........ PASSED.'
+         END IF
+      END IF
+
+      !..... ZGEDMDQ check point
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by ZGEDMDQ with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL ZGEMM( 'N', 'N', M, KQ, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigencalues. (See the description of Z in ZGEDMDQ)
+
+          DO i=1, KQ
+                ! have a real eigenvalue with real eigenvector
+                CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 )
+                ! Y(1:M,i) = Y(1:M,i) - REIG(i)*Z(1:M,i)
+                RES1(i) = DZNRM2( M, ZY1(1,i), 1)
+          END DO
+          TMP = ZERO
+          DO i = 1, KQ
+          TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+              SINGVQX(KQ)/(ANORM*SINGVQX(1)) )
+          END DO
+          TMP_REZQ = MAX( TMP_REZQ, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) '.... OK ........ ZGEDMDQ PASSED.'
+          ELSE
+              NFAIL_REZQ = NFAIL_REZQ + 1
+              WRITE(*,*) '................ ZGEDMDQ FAILED!', &
+                  'Check the code for implementation errors.'
+              STOP
+          END IF
+
+      END IF
+
+      DEALLOCATE( ZWORK )
+      DEALLOCATE( WORK  )
+      DEALLOCATE( IWORK )
+
+      END IF ! ZGEDMDQ
+
+!.......................................................................................................
+
+      END DO   ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO   ! iWHTSVD
+      !write(*,*) 'WHTSVD loop completed'
+      END DO   ! iNRNK  -2:-1
+      !write(*,*) 'NRNK loop completed'
+      END DO   ! iSCALE  1:4
+      !write(*,*) 'SCALE loop completed'
+      END DO
+      !write(*,*) 'JOBREF loop completed'
+      END DO   ! iJOBZ
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE -6:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories  loop completed'
+
+      DEALLOCATE( ZA )
+      DEALLOCATE( ZAC )
+      DEALLOCATE( ZZ )
+      DEALLOCATE( ZF )
+      DEALLOCATE( ZF0 )
+      DEALLOCATE( ZF1 )
+      DEALLOCATE( ZX )
+      DEALLOCATE( ZX0 )
+      DEALLOCATE( ZY )
+      DEALLOCATE( ZY0 )
+      DEALLOCATE( ZY1 )
+      DEALLOCATE( ZAU )
+      DEALLOCATE( ZW )
+      DEALLOCATE( ZS )
+      DEALLOCATE( ZZ1 )
+      DEALLOCATE( RES )
+      DEALLOCATE( RES1 )
+      DEALLOCATE( RESEX )
+      DEALLOCATE( ZEIGS )
+      DEALLOCATE( SINGVX )
+      DEALLOCATE( SINGVQX )
+
+      END DO ! LLOOP
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for ZGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+         WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+         WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+         WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+         NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+        WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+        WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+        WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+        WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+        WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)'
+        WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+        WRITE(*,*) '>>>> ZGEDMD :: ALL TESTS PASSED.'
+      ELSE
+        WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+        WRITE(*,*) '>>>>>>>>>>>>>> ZGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for ZGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> ZGEDMD and ZGEDMDQ computed singular &
+              &values test PASSED.'
+      ELSE
+         WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in &
+             &the singular values unacceptable ', &
+             NFAIL_SVDIFF, ' times. Test FAILED.'
+         WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+         WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+         NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> ZGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+         WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+         WRITE(*,*) '>>>>>>> ZGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END

From fa03e5497a11e4861cf8908aeb09249b39473c9c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Dec 2023 15:43:28 +0100
Subject: [PATCH 454/718] Add tests for the DMD functions (Reference-LAPACK PR
 736)

---
 lapack-netlib/TESTING/Makefile | 43 ++++++++++++++++++++++++++++++----
 lapack-netlib/TESTING/cdmd.in  | 11 +++++++++
 lapack-netlib/TESTING/ddmd.in  | 11 +++++++++
 lapack-netlib/TESTING/sdmd.in  | 11 +++++++++
 lapack-netlib/TESTING/zdmd.in  | 11 +++++++++
 5 files changed, 83 insertions(+), 4 deletions(-)
 create mode 100644 lapack-netlib/TESTING/cdmd.in
 create mode 100644 lapack-netlib/TESTING/ddmd.in
 create mode 100644 lapack-netlib/TESTING/sdmd.in
 create mode 100644 lapack-netlib/TESTING/zdmd.in

diff --git a/lapack-netlib/TESTING/Makefile b/lapack-netlib/TESTING/Makefile
index bdea2bfaa..3963260ac 100644
--- a/lapack-netlib/TESTING/Makefile
+++ b/lapack-netlib/TESTING/Makefile
@@ -61,6 +61,8 @@ SEIGTST= snep.out \
          scsd.out \
          slse.out
 
+SDMDEIGTST= sdmd.out
+
 CEIGTST= cnep.out \
          csep.out \
          cse2.out \
@@ -82,6 +84,8 @@ CEIGTST= cnep.out \
          ccsd.out \
          clse.out
 
+CDMDEIGTST= cdmd.out
+
 DEIGTST= dnep.out \
          dsep.out \
          dse2.out \
@@ -103,6 +107,8 @@ DEIGTST= dnep.out \
          dcsd.out \
          dlse.out
 
+DDMDEIGTST= ddmd.out
+
 ZEIGTST= znep.out \
          zsep.out \
          zse2.out \
@@ -124,6 +130,7 @@ ZEIGTST= znep.out \
          zcsd.out \
          zlse.out
 
+ZDMDEIGTST= zdmd.out
 
 SLINTST= stest.out
 
@@ -142,10 +149,10 @@ ZLINTST= ztest.out
 ZLINTSTPROTO= zctest.out ztest_rfp.out
 
 .PHONY: single complex double complex16
-single:         $(SLINTST) $(SEIGTST)
-complex:        $(CLINTST) $(CEIGTST)
-double:         $(DLINTST) $(DEIGTST)
-complex16:      $(ZLINTST) $(ZEIGTST)
+single:         $(SLINTST) $(SEIGTST) $(SDMDEIGTST)
+complex:        $(CLINTST) $(CEIGTST) $(CDMDEIGTST)
+double:         $(DLINTST) $(DEIGTST) $(DDMDEIGTST)
+complex16:      $(ZLINTST) $(ZEIGTST) $(ZDMDEIGTST)
 
 .PHONY: singleproto complexproto doubleproto complex16proto
 singleproto:    $(SLINTSTPROTO)
@@ -297,6 +304,10 @@ scsd.out: csd.in EIG/xeigtsts
 slse.out: lse.in EIG/xeigtsts
 	@echo LSE: Testing Constrained Linear Least Squares routines
 	./EIG/xeigtsts < lse.in > $@ 2>&1
+
+sdmd.out: sdmd.in EIG/xdmdeigtsts
+	@echo DMD: Testing Dynamic Mode Decomposition routines
+	./EIG/xdmdeigtsts < sdmd.in > $@ 2>&1
 #
 # ======== COMPLEX EIG TESTS ===========================
 
@@ -379,6 +390,10 @@ ccsd.out: csd.in EIG/xeigtstc
 clse.out: lse.in EIG/xeigtstc
 	@echo LSE: Testing Constrained Linear Least Squares routines
 	./EIG/xeigtstc < lse.in > $@ 2>&1
+
+cdmd.out: cdmd.in EIG/xdmdeigtstc
+	@echo DMD: Testing Dynamic Mode Decomposition routines
+	./EIG/xdmdeigtstc < cdmd.in > $@ 2>&1
 #
 # ======== DOUBLE EIG TESTS ===========================
 
@@ -461,6 +476,10 @@ dcsd.out: csd.in EIG/xeigtstd
 dlse.out: lse.in EIG/xeigtstd
 	@echo LSE: Testing Constrained Linear Least Squares routines
 	./EIG/xeigtstd < lse.in > $@ 2>&1
+
+ddmd.out: ddmd.in EIG/xdmdeigtstd
+	@echo DMD: Testing Dynamic Mode Decomposition routines
+	./EIG/xdmdeigtstd < ddmd.in > $@ 2>&1
 #
 # ======== COMPLEX16 EIG TESTS ===========================
 
@@ -543,6 +562,10 @@ zcsd.out: csd.in EIG/xeigtstz
 zlse.out: lse.in EIG/xeigtstz
 	@echo LSE: Testing Constrained Linear Least Squares routines
 	./EIG/xeigtstz < lse.in > $@ 2>&1
+
+zdmd.out: zdmd.in EIG/xdmdeigtstz
+	@echo DMD: Testing Dynamic Mode Decomposition routines
+	./EIG/xdmdeigtstz < zdmd.in > $@ 2>&1
 # ==============================================================================
 
 LIN/xlintsts: $(FRCLIN) $(FRC)
@@ -578,15 +601,27 @@ LIN/xlintstzc: $(FRCLIN) $(FRC)
 EIG/xeigtsts: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtsts
 
+EIG/xdmdeigtsts: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtsts
+
 EIG/xeigtstc: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstc
 
+EIG/xdmdeigtstc: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstc
+
 EIG/xeigtstd: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstd
 
+EIG/xdmdeigtstd: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstd
+
 EIG/xeigtstz: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstz
 
+EIG/xdmdeigtstz: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstz
+
 .PHONY: clean cleantest
 clean: cleantest
 cleantest:
diff --git a/lapack-netlib/TESTING/cdmd.in b/lapack-netlib/TESTING/cdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/cdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/ddmd.in b/lapack-netlib/TESTING/ddmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/ddmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/sdmd.in b/lapack-netlib/TESTING/sdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/sdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/zdmd.in b/lapack-netlib/TESTING/zdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/zdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20

From c5fa318addf536754df6e1c02d3a3cfb9e3becb9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Dec 2023 15:45:59 +0100
Subject: [PATCH 455/718] Add tests for DMD (Reference-LAPACK PR 736)

---
 lapack-netlib/lapack_testing.py | 85 +++++++++++++++++----------------
 1 file changed, 43 insertions(+), 42 deletions(-)

diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py
index 5582744a0..ae59926b8 100755
--- a/lapack-netlib/lapack_testing.py
+++ b/lapack-netlib/lapack_testing.py
@@ -1,31 +1,29 @@
-#! /usr/bin/env python
-# -*- coding: utf-8 -*-
+#!/usr/bin/env python3
 
 
 ###############################################################################
 # lapack_testing.py
 ###############################################################################
 
-from __future__ import print_function
 from subprocess import Popen, STDOUT, PIPE
 import os, sys, math
 import getopt
 # Arguments
 try:
     opts, args = getopt.getopt(sys.argv[1:], "hd:b:srep:t:n",
-                               ["help", "dir", "bin", "short", "run", "error","prec=","test=","number"])
+                               ["help", "dir=", "bin=", "short", "run", "error","prec=","test=","number"])
 
 except getopt.error as msg:
     print(msg)
     print("for help use --help")
     sys.exit(2)
 
-short_summary=0
-with_file=1
-just_errors = 0
+short_summary = False
+with_file = True
+just_errors = False
 prec='x'
 test='all'
-only_numbers=0
+only_numbers = False
 test_dir='TESTING'
 bin_dir='bin/Release'
 
@@ -34,10 +32,9 @@ for o, a in opts:
         print(sys.argv[0]+" [-h|--help] [-d dir |--dir dir] [-s |--short] [-r |--run] [-e |--error] [-p p |--prec p] [-t test |--test test] [-n | --number]")
         print("     - h is to print this message")
         print("     - r is to use to run the LAPACK tests then analyse the output (.out files). By default, the script will not run all the LAPACK tests")
-        print("     - d [dir] is to indicate where is the LAPACK testing directory (.out files). By default, the script will use .")
-        print("     - b [bin] is to indicate where is the LAPACK binary files are located. By default, the script will use .")
+        print("     - d [dir] indicates the location of the LAPACK testing directory (.out files). By default, the script will use {:s}.".format(test_dir))
+        print("     - b [bin] indicates the location of the LAPACK binary files. By default, the script will use {:s}.".format(bin_dir))
         print(" LEVEL OF OUTPUT")
-        print("     - x is to print a detailed summary")
         print("     - e is to print only the error summary")
         print("     - s is to print a short summary")
         print("     - n is to print the numbers of failing tests (turn on summary mode)")
@@ -63,15 +60,14 @@ for o, a in opts:
         print("            Will return the numbers of failed tests in REAL precision by running the LAPACK Tests then analyzing the output")
         print("     ./lapack_testing.py -n -p s -t eig ")
         print("            Will return the numbers of failed tests in REAL precision by analyzing only the LAPACK output of EIGEN testings")
-        print("Written by Julie Langou (June 2011) ")
         sys.exit(0)
     else:
         if o in ("-s", "--short"):
-            short_summary = 1
+            short_summary = True
         if o in ("-r", "--run"):
-            with_file = 0
+            with_file = False
         if o in ("-e", "--error"):
-            just_errors = 1
+            just_errors = True
         if o in ( '-p', '--prec' ):
             prec = a
         if o in ( '-b', '--bin' ):
@@ -81,12 +77,12 @@ for o, a in opts:
         if o in ( '-t', '--test' ):
             test = a
         if o in ( '-n', '--number' ):
-            only_numbers = 1
-            short_summary = 1
+            only_numbers = True
+            short_summary = True
 
 # process options
 
-abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir))
+abs_bin_dir=os.path.abspath(bin_dir)
 
 os.chdir(test_dir)
 
@@ -108,7 +104,7 @@ def run_summary_test( f, cmdline, short_summary):
     nb_test_illegal=0
     nb_test_info=0
 
-    if (with_file):
+    if with_file:
         if not os.path.exists(cmdline):
             error_message=cmdline+" file not found"
             r=1
@@ -145,16 +141,16 @@ def run_summary_test( f, cmdline, short_summary):
                 whereisrun=words_in_line.index("run)")
                 nb_test_run+=int(words_in_line[whereisrun-2])
             if (line.find("out of")!=-1):
-                if (short_summary==0): print(line, end=' ')
+                if not short_summary: print(line, end=' ')
                 whereisout= words_in_line.index("out")
                 nb_test_fail+=int(words_in_line[whereisout-1])
             if ((line.find("illegal")!=-1) or (line.find("Illegal")!=-1)):
-                if (short_summary==0):print(line, end=' ')
+                if not short_summary: print(line, end=' ')
                 nb_test_illegal+=1
             if (line.find(" INFO")!=-1):
-                if (short_summary==0):print(line, end=' ')
+                if not short_summary: print(line, end=' ')
                 nb_test_info+=1
-            if (with_file==1):
+            if with_file:
                 pipe.close()
 
     f.flush();
@@ -169,7 +165,7 @@ try:
 except IOError:
     f = sys.stdout
 
-if (short_summary==0):
+if not short_summary:
     print(" ")
     print("---------------- Testing LAPACK Routines ----------------")
     print(" ")
@@ -203,6 +199,8 @@ elif test=='mixed':
     range_prec=[1,3]
 elif test=='rfp':
     range_test=[18]
+elif test=='dmd':
+    range_test=[20]
 elif test=='eig':
     range_test=list(range(16))
 else:
@@ -219,7 +217,7 @@ for dtype in range_prec:
     letter = dtypes[0][dtype]
     name = dtypes[1][dtype]
 
-    if (short_summary==0):
+    if not short_summary:
         print(" ")
         print("------------------------- %s ------------------------" % name)
         print(" ")
@@ -231,19 +229,19 @@ for dtype in range_prec:
     letter+"gd",letter+"sb",letter+"sg",
     letter+"bb","glm","gqr",
     "gsv","csd","lse",
-    letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"),
+    letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"),
     ("Nonsymmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem-2-stage", "Singular-Value-Decomposition",
     "Eigen-Condition","Nonsymmetric-Eigenvalue","Nonsymmetric-Generalized-Eigenvalue-Problem",
     "Nonsymmetric-Generalized-Eigenvalue-Problem-driver", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Generalized-Problem",
     "Banded-Singular-Value-Decomposition-routines", "Generalized-Linear-Regression-Model-routines", "Generalized-QR-and-RQ-factorization-routines",
     "Generalized-Singular-Value-Decomposition-routines", "CS-Decomposition-routines", "Constrained-Linear-Least-Squares-routines",
-    "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines"),
+    "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines","Dynamic-Mode-Decomposition"),
     (letter+"nep", letter+"sep", letter+"se2", letter+"svd",
     letter+"ec",letter+"ed",letter+"gg",
     letter+"gd",letter+"sb",letter+"sg",
     letter+"bb",letter+"glm",letter+"gqr",
     letter+"gsv",letter+"csd",letter+"lse",
-    letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"),
+    letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"),
     )
 
 
@@ -252,22 +250,25 @@ for dtype in range_prec:
         # NEED TO SKIP SOME PRECISION (namely s and c) FOR PROTO MIXED PRECISION TESTING
         if dtest==17 and (letter=="s" or letter=="c"):
             continue
-        if (with_file==1):
+        if with_file:
             cmdbase=dtests[2][dtest]+".out"
         else:
             if dtest==16:
                 # LIN TESTS
-                cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             elif dtest==17:
                 # PROTO LIN TESTS
-                cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             elif dtest==18:
                 # PROTO LIN TESTS
-                cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+            elif dtest==20:
+                # DMD EIG TESTS
+                cmdbase="xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             else:
                 # EIG TESTS
-                cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
-        if (not just_errors and not short_summary):
+                cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+        if not just_errors and not short_summary:
             print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ')
         # Run the process: either to read the file or run the LAPACK testing
         nb_test = run_summary_test(f, cmdbase, short_summary)
@@ -277,19 +278,19 @@ for dtype in range_prec:
         list_results[3][dtype]+=nb_test[3]
         got_error=nb_test[1]+nb_test[2]+nb_test[3]
 
-        if (not short_summary):
-            if (nb_test[0]>0 and just_errors==0):
+        if not short_summary:
+            if nb_test[0] > 0 and not just_errors:
                 print("passed: "+str(nb_test[0]))
-            if (nb_test[1]>0):
+            if nb_test[1] > 0:
                 print("failing to pass the threshold: "+str(nb_test[1]))
-            if (nb_test[2]>0):
+            if nb_test[2] > 0:
                 print("Illegal Error: "+str(nb_test[2]))
-            if (nb_test[3]>0):
+            if nb_test[3] > 0:
                 print("Info Error: "+str(nb_test[3]))
-            if (got_error>0 and just_errors==1):
+            if got_error > 0 and just_errors:
                 print("ERROR IS LOCATED IN "+name+" "+dtests[1][dtest]+" [ "+cmdbase+" ]")
                 print("")
-            if (just_errors==0):
+            if not just_errors:
                 print("")
 #     elif (got_error>0):
 #        print dtests[2][dtest]+".out \t"+str(nb_test[1])+"\t"+str(nb_test[2])+"\t"+str(nb_test[3])
@@ -307,7 +308,7 @@ for dtype in range_prec:
     list_results[2][4]+=list_results[2][dtype]
     list_results[3][4]+=list_results[3][dtype]
 
-if only_numbers==1:
+if only_numbers:
     print(str(list_results[1][4])+"\n"+str(list_results[2][4]+list_results[3][4]))
 else:
     print(summary)

From 226a14c5499e46d65cfd42b7e3b04fdca0844244 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Dec 2023 15:50:06 +0100
Subject: [PATCH 456/718] Restore library path adjustments

---
 lapack-netlib/TESTING/EIG/Makefile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile
index 5de315b6e..4e7cf4629 100644
--- a/lapack-netlib/TESTING/EIG/Makefile
+++ b/lapack-netlib/TESTING/EIG/Makefile
@@ -135,28 +135,28 @@ complex: xeigtstc xdmdeigtstc
 double: xeigtstd xdmdeigtstd
 complex16: xeigtstz xdmdeigtstz
 
-xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB)
+xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 $(SDMDEIGTST): $(FRC)

From effb7af2a27f4f346f204223ce2372ba462412da Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Dec 2023 17:55:38 +0100
Subject: [PATCH 457/718] Fix memory leak (Reference-LAPACK PR 953)

---
 lapack-netlib/TESTING/LIN/cchkaa.F | 2 ++
 lapack-netlib/TESTING/LIN/dchkaa.F | 2 ++
 lapack-netlib/TESTING/LIN/schkaa.F | 2 ++
 lapack-netlib/TESTING/LIN/zchkaa.F | 2 ++
 4 files changed, 8 insertions(+)

diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F
index 474454a51..57d95c741 100644
--- a/lapack-netlib/TESTING/LIN/cchkaa.F
+++ b/lapack-netlib/TESTING/LIN/cchkaa.F
@@ -1232,6 +1232,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (WORK, STAT = AllocateStatus)
       DEALLOCATE (RWORK,  STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F
index 74077eb94..6582cac13 100644
--- a/lapack-netlib/TESTING/LIN/dchkaa.F
+++ b/lapack-netlib/TESTING/LIN/dchkaa.F
@@ -1076,6 +1076,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (WORK, STAT = AllocateStatus)
       DEALLOCATE (RWORK,  STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F
index 2b9f2ea45..036b13924 100644
--- a/lapack-netlib/TESTING/LIN/schkaa.F
+++ b/lapack-netlib/TESTING/LIN/schkaa.F
@@ -1070,6 +1070,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (WORK, STAT = AllocateStatus)
       DEALLOCATE (RWORK,  STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F
index 57d71833f..f1020f2d8 100644
--- a/lapack-netlib/TESTING/LIN/zchkaa.F
+++ b/lapack-netlib/TESTING/LIN/zchkaa.F
@@ -1268,6 +1268,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (RWORK, STAT = AllocateStatus)
       DEALLOCATE (WORK,  STAT = AllocateStatus)
 *

From 589f2b6466d7439220a72a99f5b385239bd37043 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 5 Dec 2023 20:10:20 +0100
Subject: [PATCH 458/718] Fix search phrase used to count successful tests
 (Reference-LAPACK PR 954)

---
 lapack-netlib/lapack_testing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py
index ae59926b8..96fbeb2a6 100755
--- a/lapack-netlib/lapack_testing.py
+++ b/lapack-netlib/lapack_testing.py
@@ -136,7 +136,7 @@ def run_summary_test( f, cmdline, short_summary):
         for line in pipe.readlines():
             f.write(str(line))
             words_in_line=line.split()
-            if (line.find("run")!=-1):
+            if (line.find("run)")!=-1):
 #                  print line
                 whereisrun=words_in_line.index("run)")
                 nb_test_run+=int(words_in_line[whereisrun-2])

From 5f51811728645baae32ad54f7b34a00259e6e8e8 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Tue, 5 Dec 2023 22:43:36 -0800
Subject: [PATCH 459/718] try at new threading model

---
 CONTRIBUTORS.md                   |   3 +
 common_thread.h                   |   5 +-
 driver/others/blas_server_win32.c | 139 ++++++++++++++++--------------
 3 files changed, 79 insertions(+), 68 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 71df13634..493747052 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -216,3 +216,6 @@ In chronological order:
   
 * Pablo Romero <https://github.com/pablorcum>
   * [2022-08] Fix building from sources for QNX
+
+* Mark Seminatore <https://github.com/mseminatore>
+  * [2023-11-09] Improve Windows threading performance scaling
\ No newline at end of file
diff --git a/common_thread.h b/common_thread.h
index 6e18d2a8e..9e7dae74a 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -111,8 +111,9 @@ typedef struct blas_queue {
   struct blas_queue *next;
 
 #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
-  CRITICAL_SECTION lock;
-  HANDLE finish;
+  // CRITICAL_SECTION lock;
+  // HANDLE finish;
+  volatile int finished;
 #else
   pthread_mutex_t	 lock;
   pthread_cond_t	 finished;
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 5bdfc1276..464e3fa03 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -51,15 +51,19 @@
 /* This is a thread implementation for Win32 lazy implementation */
 
 /* Thread server common information */
-typedef struct{
-  CRITICAL_SECTION lock;
-  HANDLE filled;
-  HANDLE killed;
-
-  blas_queue_t	*queue;    /* Parameter Pointer */
-  int		shutdown;  /* server shutdown flag */
-
-} blas_pool_t;
+//typedef struct{
+//  CRITICAL_SECTION lock;
+//  HANDLE filled;
+//  HANDLE killed;
+//
+//  blas_queue_t	*queue;    /* Parameter Pointer */
+//  int		shutdown;  /* server shutdown flag */
+//
+//} blas_pool_t;
+
+static blas_queue_t *work_queue = NULL;
+static HANDLE kickoff_event = NULL;
+static CRITICAL_SECTION queue_lock;
 
 /* We need this global for checking if initialization is finished.   */
 int blas_server_avail = 0;
@@ -67,7 +71,7 @@ int blas_server_avail = 0;
 /* Local Variables */
 static BLASULONG server_lock       = 0;
 
-static blas_pool_t   pool;
+//static blas_pool_t   pool;
 static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
 
@@ -209,7 +213,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
   void *buffer, *sa, *sb;
   blas_queue_t	*queue;
   DWORD action;
-  HANDLE handles[] = {pool.filled, pool.killed};
+  //HANDLE handles[] = {pool.filled, pool.killed};
 
   /* Each server needs each buffer */
   buffer   = blas_memory_alloc(2);
@@ -225,29 +229,38 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
 #endif
-
-    do {
-      action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
-    } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
-
-    if (action == WAIT_OBJECT_0 + 1) break;
+	// event raised when work is added to the queue
+	WaitForSingleObject(kickoff_event, INFINITE);
 
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
 #endif
 
-    EnterCriticalSection(&pool.lock);
+#if 1
+    EnterCriticalSection(&queue_lock);
 
-    queue = pool.queue;
-    if (queue) pool.queue = queue->next;
+    queue = work_queue;
+    if (queue)
+        work_queue = work_queue->next;
 
-    LeaveCriticalSection(&pool.lock);
+    LeaveCriticalSection(&queue_lock);
+#else
+    volatile work_queue_t* queue_next;
+
+    INT_PTR prev_value;
+    do {
+        queue = (volatile work_queue_t*)work_queue;
+        if (!queue)
+            break;
+
+        queue_next = (volatile work_queue_t*)queue->next;
+        prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
+    } while (prev_value != work_item);
+#endif
 
     if (queue)  {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 
-      if (pool.queue) SetEvent(pool.filled);
-
       sa = queue -> sa;
       sb = queue -> sb;
 
@@ -331,14 +344,6 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
 #endif
-
-    EnterCriticalSection(&queue->lock);
-
-    queue -> status = BLAS_STATUS_FINISHED;
-
-    LeaveCriticalSection(&queue->lock);
-
-    SetEvent(queue->finish);
   }
 
   /* Shutdown procedure */
@@ -366,13 +371,10 @@ int blas_thread_init(void){
 #endif
 
   if (!blas_server_avail){
+	// create the kickoff Event
+	kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
-    InitializeCriticalSection(&pool.lock);
-    pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-    pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
-
-    pool.shutdown = 0;
-    pool.queue    = NULL;
+    InitializeCriticalSection(&queue_lock);
 
     for(i = 0; i < blas_cpu_number - 1; i++){
       blas_threads[i] = CreateThread(NULL, 0,
@@ -409,8 +411,6 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
   current = queue;
 
   while (current) {
-    InitializeCriticalSection(&current -> lock);
-    current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
     current -> position = pos;
 
 #ifdef CONSISTENT_FPCSR
@@ -418,23 +418,32 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
     __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
 #endif
 
+	current->finished = 0;
     current = current -> next;
     pos ++;
   }
 
-  EnterCriticalSection(&pool.lock);
+  EnterCriticalSection(&queue_lock);
 
-  if (pool.queue) {
-    current = pool.queue;
-    while (current -> next) current = current -> next;
-    current -> next = queue;
-  } else {
-    pool.queue = queue;
+  if (!work_queue)
+  {
+      work_queue = queue;
   }
+  else
+  {
+	  blas_queue_t *next_item = work_queue;
 
-  LeaveCriticalSection(&pool.lock);
+      // find the end of the work queue
+      while (next_item)
+          next_item = next_item->next;
 
-  SetEvent(pool.filled);
+      // add new work to the end
+      next_item = queue;
+  }
+
+  LeaveCriticalSection(&queue_lock);
+
+  SetEvent(kickoff_event);
 
   return 0;
 }
@@ -449,21 +458,26 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Waiting Queue ..\n");
 #endif
+	  while (!queue->finished)
+		  YIELDING;
 
-      WaitForSingleObject(queue->finish, INFINITE);
-
-      CloseHandle(queue->finish);
-      DeleteCriticalSection(&queue -> lock);
-
-      queue = queue -> next;
-      num --;
+  	  queue = queue->next;
+      num--;
     }
 
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Completely Done.\n\n");
 #endif
+	// if work was added to the queue after this batch we can't sleep the worker threads
+	// by resetting the event
+	EnterCriticalSection(&queue_lock);
 
-  return 0;
+	if (work_queue == NULL)
+		ResetEvent(kickoff_event);
+
+	LeaveCriticalSection(&queue_lock);
+
+	return 0;
 }
 
 /* Execute Threads */
@@ -512,8 +526,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
   if (blas_server_avail){
 
-    SetEvent(pool.killed);
-
     for(i = 0; i < blas_num_threads - 1; i++){
       // Could also just use WaitForMultipleObjects
       DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
@@ -528,9 +540,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
       CloseHandle(blas_threads[i]);
     }
 
-    CloseHandle(pool.filled);
-    CloseHandle(pool.killed);
-
     blas_server_avail = 0;
   }
 
@@ -558,13 +567,11 @@ void goto_set_num_threads(int num_threads)
 
 		//increased_threads = 1;
 	    if (!blas_server_avail){
+			// create the kickoff Event
+			kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
-			InitializeCriticalSection(&pool.lock);
-			pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-			pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
+			InitializeCriticalSection(&queue_lock);
 
-			pool.shutdown = 0;
-			pool.queue    = NULL;
 			blas_server_avail = 1;
 		}
 

From 4ebf814b4258904d1f48bfe427a6727514d9efa6 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Tue, 5 Dec 2023 23:28:37 -0800
Subject: [PATCH 460/718] fix bug failing to mark task as finished.

---
 driver/others/blas_server_win32.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 464e3fa03..5af1f1a51 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -344,6 +344,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
 #endif
+	
+	queue->finished = 1;
+
   }
 
   /* Shutdown procedure */

From 993ede7c70658870921b446b145310f1b6c1edf6 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Mon, 27 Nov 2023 11:30:34 +0800
Subject: [PATCH 461/718] loongarch64: Add optimizations for scal.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/dscal_lasx.S          | 194 +++++++++++++++++++++
 kernel/loongarch64/dscal_lsx.S           | 205 +++++++++++++++++++++++
 kernel/loongarch64/sscal_lasx.S          | 188 +++++++++++++++++++++
 kernel/loongarch64/sscal_lsx.S           | 194 +++++++++++++++++++++
 6 files changed, 787 insertions(+)
 create mode 100644 kernel/loongarch64/dscal_lasx.S
 create mode 100644 kernel/loongarch64/dscal_lsx.S
 create mode 100644 kernel/loongarch64/sscal_lasx.S
 create mode 100644 kernel/loongarch64/sscal_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index b2a396674..e553c4b95 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -4,4 +4,7 @@ SDOTKERNEL  = dot_lsx.S
 DSDOTKERNEL = dot_lsx.S
 DDOTKERNEL  = dot_lsx.S
 
+SSCALKERNEL  = sscal_lsx.S
+DSCALKERNEL  = dscal_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 020a82303..4c0c1c2c8 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -4,6 +4,9 @@ SDOTKERNEL  = dot_lasx.S
 DSDOTKERNEL = dot_lasx.S
 DDOTKERNEL  = dot_lasx.S
 
+SSCALKERNEL  = sscal_lasx.S
+DSCALKERNEL  = dscal_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/dscal_lasx.S b/kernel/loongarch64/dscal_lasx.S
new file mode 100644
index 000000000..153662378
--- /dev/null
+++ b/kernel/loongarch64/dscal_lasx.S
@@ -0,0 +1,194 @@
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X  $r7
+#define INCX $r8
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16
+#define VX0 $xr12
+#define VX1 $xr13
+#define VT0 $xr14
+#define VT1 $xr15
+#define VALPHA $xr19
+#define a1 $f8
+#define a2 $f23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1
+    movgr2fr.d a2, TEMP
+    ffint.d.l a2, a2
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L20   //ALPHA==0
+    fcmp.ceq.d $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3
+    beq INCX, TEMP, .L30    //ALPHA！=0|1 and INCX==1
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA, TEMP
+    move XX, X
+    .align 3
+
+.L10:
+    bge $r0, I, .L32
+    .align 3
+.L11:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    xvfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvfmul.d VT1, VX1, VALPHA
+    xvstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 3
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    b .L32
+    .align 3
+
+.L20:
+    srai.d I, N, 3
+    beq INCX, TEMP, .L24
+    bge $r0, I, .L22
+    .align 3
+
+.L21:
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L23:
+    fst.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0
+    .align 3
+
+.L24:
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    .align 3
+.L25:
+    xvxor.v VX0, VX0, VX0
+    xvst VX0, X, 0 * SIZE
+    xvst VX0, X, 4 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L27:
+    fst.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0
+    .align 3
+
+.L30:
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA , TEMP
+    .align 3
+
+.L31:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VT0, VX0, VALPHA
+    xvfmul.d VT1, VX1, VALPHA
+    addi.d I, I, -1
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L31
+    .align 3
+
+.L32:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L33:
+    fld.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d a1, ALPHA, a1
+    fst.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0
+    .align 3
+
+.L999:
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dscal_lsx.S b/kernel/loongarch64/dscal_lsx.S
new file mode 100644
index 000000000..55f497752
--- /dev/null
+++ b/kernel/loongarch64/dscal_lsx.S
@@ -0,0 +1,205 @@
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X  $r7
+#define INCX $r8
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16
+#define VX0 $vr12
+#define VX1 $vr13
+#define VT0 $vr14
+#define VT1 $vr15
+#define VALPHA $vr19
+#define a1 $f8
+#define a2 $f23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1
+    movgr2fr.d a2, TEMP
+    ffint.d.l a2, a2
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L20   //ALPHA==0
+    fcmp.ceq.d $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3
+    beq INCX, TEMP, .L30    //ALPHA！=0|1 and INCX==1
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA, TEMP
+    move XX, X
+    .align 3
+
+.L10: //ALPHA！=0|1 and INCX!=1
+    bge $r0, I, .L32
+    .align 3
+
+.L11:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT1, VX1, VALPHA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT1, VX1, VALPHA
+    vstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    b .L32
+    .align 3
+
+.L20:
+    srai.d I, N, 3
+    beq INCX, TEMP, .L24
+    bge $r0, I, .L22
+    .align 3
+
+.L21:
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L23:
+    fst.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0
+    .align 3
+
+.L24:
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    .align 3
+.L25:
+    vxor.v VX0, VX0, VX0
+    vst VX0, X, 0 * SIZE
+    vst VX0, X, 2 * SIZE
+    vst VX0, X, 4 * SIZE
+    vst VX0, X, 6 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L27:
+    fst.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0
+    .align 3
+
+.L30:
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA , TEMP
+    .align 3
+
+.L31:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VT0, VX0, VALPHA
+    vfmul.d VT1, VX1, VALPHA
+    vld VX0, X, 4 * SIZE
+    vst VT0, X, 0 * SIZE
+    vst VT1, X, 2 * SIZE
+    vfmul.d VT0, VX0, VALPHA
+    vld VX1, X, 6 * SIZE
+    vst VT0, X, 4 * SIZE
+    vfmul.d VT1, VX1, VALPHA
+    vst VT1, X, 6 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L31
+    .align 3
+
+.L32:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L33:
+    fld.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d a1, ALPHA, a1
+    fst.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0
+    .align 3
+
+.L999:
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sscal_lasx.S b/kernel/loongarch64/sscal_lasx.S
new file mode 100644
index 000000000..329f24659
--- /dev/null
+++ b/kernel/loongarch64/sscal_lasx.S
@@ -0,0 +1,188 @@
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X  $r7
+#define INCX $r8
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16
+#define VX0 $xr12
+#define VX1 $xr13
+#define VT0 $xr14
+#define VT1 $xr15
+#define VALPHA $xr19
+#define a1 $f8
+#define a2 $f23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.s.l a1, a1
+    movgr2fr.d a2, TEMP
+    ffint.s.l a2, a2
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L20   //ALPHA==0
+    fcmp.ceq.s $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3
+    beq INCX, TEMP, .L30    //ALPHA！=0|1 and INCX==1
+    movfr2gr.s TEMP, ALPHA
+    xvreplgr2vr.w VALPHA, TEMP
+    move XX, X
+
+.L10:   //ALPHA！=0|1 and INCX!=1
+    bge $r0, I, .L32
+    .align 3
+.L11:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VT0, VX0, VALPHA
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    b .L32
+    .align 3
+
+.L20:
+    srai.d I, N, 3
+    beq INCX, TEMP, .L24
+    bge $r0, I, .L22
+    .align 3
+
+.L21:
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L23:
+    fst.s a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0
+    .align 3
+
+.L24:
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    .align 3
+.L25:
+    xvxor.v VX0, VX0, VX0
+    xvst VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L27:
+    fst.s a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0
+    .align 3
+
+.L30:
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    movfr2gr.s TEMP, ALPHA
+    xvreplgr2vr.w VALPHA , TEMP
+    .align 3
+
+.L31:
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvfmul.s VT0, VX0, VALPHA
+    xvst VT0, X, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L31
+    .align 3
+
+.L32:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L33:
+    fld.s a1, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.s a1, ALPHA, a1
+    fst.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0
+    .align 3
+
+.L999:
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sscal_lsx.S b/kernel/loongarch64/sscal_lsx.S
new file mode 100644
index 000000000..d0ea1307d
--- /dev/null
+++ b/kernel/loongarch64/sscal_lsx.S
@@ -0,0 +1,194 @@
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X  $r7
+#define INCX $r8
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16
+#define VX0 $vr12
+#define VX1 $vr13
+#define VT0 $vr14
+#define VT1 $vr15
+#define VALPHA $vr19
+#define a1 $f8
+#define a2 $f23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.s.l a1, a1
+    movgr2fr.d a2, TEMP
+    ffint.s.l a2, a2
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L20   //ALPHA==0
+    fcmp.ceq.s $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3
+    beq INCX, TEMP, .L30    //ALPHA！=0|1 and INCX==1
+    movfr2gr.s TEMP, ALPHA
+    vreplgr2vr.w VALPHA, TEMP
+    move XX, X
+    .align 3
+
+.L10:   //ALPHA！=0|1 and INCX!=1
+    bge $r0, I, .L32
+    .align 3
+.L11:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vfmul.s VT0, VX0, VALPHA
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vfmul.s VT1, VX1, VALPHA
+    vstelm.w VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 3
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    b .L32
+    .align 3
+
+.L20:
+    srai.d I, N, 3
+    beq INCX, TEMP, .L24
+    bge $r0, I, .L22
+    .align 3
+
+.L21:
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    fst.s a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L23:
+    fst.s a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0
+    .align 3
+
+.L24:
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    .align 3
+.L25:
+    vxor.v VX0, VX0, VX0
+    vst VX0, X, 0 * SIZE
+    vst VX0, X, 4 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L27:
+    fst.s a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0
+    .align 3
+
+.L30:
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    movfr2gr.s TEMP, ALPHA
+    vreplgr2vr.w VALPHA , TEMP
+    .align 3
+
+.L31:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VT0, VX0, VALPHA
+    vfmul.s VT1, VX1, VALPHA
+    addi.d I, I, -1
+    vst VT0, X, 0 * SIZE
+    vst VT1, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L31
+    .align 3
+
+.L32:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L33:
+    fld.s a1, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.s a1, ALPHA, a1
+    fst.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0
+    .align 3
+
+.L999:
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE

From 265b5f2e803d2d66b7fe75ec356dc14fcb5ef726 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 10:57:13 +0800
Subject: [PATCH 462/718] loongarch64: Add optimizations for amax.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/damax_lasx.S          | 183 ++++++++++++++++++++
 kernel/loongarch64/damax_lsx.S           | 145 ++++++++++++++++
 kernel/loongarch64/samax_lasx.S          | 208 +++++++++++++++++++++++
 kernel/loongarch64/samax_lsx.S           | 177 +++++++++++++++++++
 6 files changed, 719 insertions(+)
 create mode 100644 kernel/loongarch64/damax_lasx.S
 create mode 100644 kernel/loongarch64/damax_lsx.S
 create mode 100644 kernel/loongarch64/samax_lasx.S
 create mode 100644 kernel/loongarch64/samax_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index e553c4b95..a4d5f8f87 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -7,4 +7,7 @@ DDOTKERNEL  = dot_lsx.S
 SSCALKERNEL  = sscal_lsx.S
 DSCALKERNEL  = dscal_lsx.S
 
+SAMAXKERNEL =  samax_lsx.S
+DAMAXKERNEL =  damax_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 4c0c1c2c8..8c7481ae6 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -7,6 +7,9 @@ DDOTKERNEL  = dot_lasx.S
 SSCALKERNEL  = sscal_lasx.S
 DSCALKERNEL  = dscal_lasx.S
 
+SAMAXKERNEL =  samax_lasx.S
+DAMAXKERNEL =  damax_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S
new file mode 100644
index 000000000..c44ce4995
--- /dev/null
+++ b/kernel/loongarch64/damax_lasx.S
@@ -0,0 +1,183 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define x5 $xr13
+#define x6 $xr14
+#define x7 $xr15
+#define x8 $xr16
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr18
+#define VM3 $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    xvfmaxa.d VM1, VX1, VX0
+    addi.d X, X, 8 * SIZE
+    xvfmaxa.d VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfmaxa.d VM2, x3, x4
+    xvfmaxa.d VM0, VM1, VM2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    xvld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I
+    xvfmaxa.d m0, VX0, VX1 //patial repeat read
+    xvpickve.d x1, m0, 0
+    xvpickve.d x2, m0, 1
+    xvpickve.d x3, m0, 2
+    xvpickve.d x4, m0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfmaxa.d m0, x3, x4
+    xvfmaxa.d m0, m0, VM1
+    xvfmaxa.d VM0, m0, VM0
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmaxa.d VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the maxa value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    addi.d I, I, -1
+    xvfmaxa.d VM1, VX1, VX0
+    xvfmaxa.d VM0, VM1, VM0
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfmaxa.d VM2, x3, x4
+    xvfmaxa.d VM0, VM1, VM2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmaxa.d VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/damax_lsx.S b/kernel/loongarch64/damax_lsx.S
new file mode 100644
index 000000000..3e5a10525
--- /dev/null
+++ b/kernel/loongarch64/damax_lsx.S
@@ -0,0 +1,145 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define x1 $vr9
+#define x2 $vr10
+#define VX0 $vr20
+#define VX1 $vr21
+#define VM0 $vr22
+#define VM1 $vr23
+#define VM2 $vr18
+#define VM3 $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    addi.d I, I, -1
+    vfmaxa.d VM1, VX1, VX0
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmaxa.d VM2, VX1, VX0
+    vfmaxa.d VM3, VM1, VM2
+    addi.d X, X, 8 * SIZE
+    vfmaxa.d VM0, VM0, VM3
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    vfmaxa.d VM0, x1, x2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L14
+    .align 3
+
+.L13:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmaxa.d VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L13
+    .align 3
+
+.L14:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the maxa value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    vfmaxa.d VM0, x1, x2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmaxa.d VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/samax_lasx.S b/kernel/loongarch64/samax_lasx.S
new file mode 100644
index 000000000..a04e4a90b
--- /dev/null
+++ b/kernel/loongarch64/samax_lasx.S
@@ -0,0 +1,208 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define x5 $xr13
+#define x6 $xr14
+#define x7 $xr15
+#define x8 $xr16
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr19
+#define VM3 $xr18
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvfmaxa.s VM0, VM0, VX0
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvpickve.w x5, VM0, 4
+    xvpickve.w x6, VM0, 5
+    xvpickve.w x7, VM0, 6
+    xvpickve.w x8, VM0, 7
+    xvfmaxa.s VM3, x1, x2
+    xvfmaxa.s VM2, x3, x4
+    xvfmaxa.s VM1, x5, x6
+    xvfmaxa.s VM0, x7, x8
+    xvfmaxa.s VM2, VM2, VM3
+    xvfmaxa.s VM0, VM0, VM1
+    xvfmaxa.s VM0, VM0, VM2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    xvld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I
+    xvfmaxa.s m0, VX0, VX1 //patial repeat read
+    xvpickve.w x1, m0, 0
+    xvpickve.w x2, m0, 1
+    xvpickve.w x3, m0, 2
+    xvpickve.w x4, m0, 3
+    xvfmaxa.s m0, x1, x2
+    xvfmaxa.s VM1, x3, x4
+    xvfmaxa.s m0, m0, VM1
+    xvfmaxa.s VM0, m0, VM0
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmaxa.s VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the maxa value
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    addi.d I, I, -1
+    xvfmaxa.s VM0, VM0, VX0
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvpickve.w x5, VM0, 4
+    xvpickve.w x6, VM0, 5
+    xvpickve.w x7, VM0, 6
+    xvpickve.w x8, VM0, 7
+    xvfmaxa.s VM3, x1, x2
+    xvfmaxa.s VM2, x3, x4
+    xvfmaxa.s VM1, x5, x6
+    xvfmaxa.s VM0, x7, x8
+    xvfmaxa.s VM2, VM2, VM3
+    xvfmaxa.s VM0, VM0, VM1
+    xvfmaxa.s VM0, VM0, VM2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmaxa.s VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/samax_lsx.S b/kernel/loongarch64/samax_lsx.S
new file mode 100644
index 000000000..5c20dd96a
--- /dev/null
+++ b/kernel/loongarch64/samax_lsx.S
@@ -0,0 +1,177 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $vr8
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr20
+#define VX1 $vr21
+#define VM0 $vr22
+#define VM1 $vr23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vfmaxa.s VM1, VX0, VX1
+    addi.d X, X, 8 * SIZE
+    vfmaxa.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfmaxa.s VM0, x3, x4
+    vfmaxa.s VM0, VM0, VM1
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    vld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    vldx VX1, X, I
+    vfmaxa.s m0, VX0, VX1 //patial repeat read
+    vreplvei.w x1, m0, 0
+    vreplvei.w x2, m0, 1
+    vreplvei.w x3, m0, 2
+    vreplvei.w x4, m0, 3
+    vfmaxa.s m0, x1, x2
+    vfmaxa.s VM1, x3, x4
+    vfmaxa.s m0, m0, VM1
+    vfmaxa.s VM0, m0, VM0
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmaxa.s VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    addi.d I, I, -1
+    vfmaxa.s VM1, VX0, VX1
+    vfmaxa.s VM0, VM0, VM1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfmaxa.s VM0, x3, x4
+    vfmaxa.s VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmaxa.s VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From ff2ecc6cdad57c2ea00df641f4ef674eea7b2d10 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 11:08:09 +0800
Subject: [PATCH 463/718] loongarch64: Add optimization for amin.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/damin_lasx.S          | 178 +++++++++++++++++++
 kernel/loongarch64/damin_lsx.S           | 145 ++++++++++++++++
 kernel/loongarch64/samin_lasx.S          | 208 +++++++++++++++++++++++
 kernel/loongarch64/samin_lsx.S           | 177 +++++++++++++++++++
 6 files changed, 714 insertions(+)
 create mode 100644 kernel/loongarch64/damin_lasx.S
 create mode 100644 kernel/loongarch64/damin_lsx.S
 create mode 100644 kernel/loongarch64/samin_lasx.S
 create mode 100644 kernel/loongarch64/samin_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index a4d5f8f87..279ff6a9c 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -10,4 +10,7 @@ DSCALKERNEL  = dscal_lsx.S
 SAMAXKERNEL =  samax_lsx.S
 DAMAXKERNEL =  damax_lsx.S
 
+SAMINKERNEL =  samin_lsx.S
+DAMINKERNEL =  damin_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 8c7481ae6..83db79050 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -10,6 +10,9 @@ DSCALKERNEL  = dscal_lasx.S
 SAMAXKERNEL =  samax_lasx.S
 DAMAXKERNEL =  damax_lasx.S
 
+SAMINKERNEL =  samin_lasx.S
+DAMINKERNEL =  damin_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/damin_lasx.S b/kernel/loongarch64/damin_lasx.S
new file mode 100644
index 000000000..9d96dd997
--- /dev/null
+++ b/kernel/loongarch64/damin_lasx.S
@@ -0,0 +1,178 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvld VX1, X, 4 * SIZE
+    xvfmina.d VM1, VX1, VX0
+    addi.d X, X, 8 * SIZE
+    xvfmina.d VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmina.d VM1, x1, x2
+    xvfmina.d VM2, x3, x4
+    xvfmina.d VM0, VM1, VM2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    xvld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I
+    xvfmina.d m0, VX0, VX1 //patial repeat read
+    xvpickve.d x1, m0, 0
+    xvpickve.d x2, m0, 1
+    xvpickve.d x3, m0, 2
+    xvpickve.d x4, m0, 3
+    xvfmina.d VM1, x1, x2
+    xvfmina.d m0, x3, x4
+    xvfmina.d m0, m0, VM1
+    xvfmina.d VM0, m0, VM0
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmina.d VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the mina value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    addi.d I, I, -1
+    xvfmina.d VM1, VX1, VX0
+    xvfmina.d VM0, VM1, VM0
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmina.d VM1, x1, x2
+    xvfmina.d VM2, x3, x4
+    xvfmina.d VM0, VM1, VM2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmina.d VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/damin_lsx.S b/kernel/loongarch64/damin_lsx.S
new file mode 100644
index 000000000..c62fc2731
--- /dev/null
+++ b/kernel/loongarch64/damin_lsx.S
@@ -0,0 +1,145 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define x1 $vr9
+#define x2 $vr10
+#define VX0 $vr20
+#define VX1 $vr21
+#define VM0 $vr22
+#define VM1 $vr23
+#define VM2 $vr18
+#define VM3 $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    addi.d I, I, -1
+    vfmina.d VM1, VX1, VX0
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmina.d VM2, VX1, VX0
+    vfmina.d VM3, VM1, VM2
+    addi.d X, X, 8 * SIZE
+    vfmina.d VM0, VM0, VM3
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    vfmina.d VM0, x1, x2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L14
+    .align 3
+
+.L13:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmina.d VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L13
+    .align 3
+
+.L14:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the mina value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmina.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    addi.d I, I, -1
+    vfmina.d VM2, VX0, VX1
+    vfmina.d VM3, VM1, VM2
+    vfmina.d VM0, VM0, VM3
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    vfmina.d VM0, x1, x2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    vld x1, X, 0
+    vfmina.d VM0, VM0, x1
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/samin_lasx.S b/kernel/loongarch64/samin_lasx.S
new file mode 100644
index 000000000..8e96968fa
--- /dev/null
+++ b/kernel/loongarch64/samin_lasx.S
@@ -0,0 +1,208 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define x5 $xr13
+#define x6 $xr14
+#define x7 $xr15
+#define x8 $xr16
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr18
+#define VM3 $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvfmina.s VM0, VM0, VX0
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvpickve.w x5, VM0, 4
+    xvpickve.w x6, VM0, 5
+    xvpickve.w x7, VM0, 6
+    xvpickve.w x8, VM0, 7
+    xvfmina.s VM3, x1, x2
+    xvfmina.s VM2, x3, x4
+    xvfmina.s VM1, x5, x6
+    xvfmina.s VM0, x7, x8
+    xvfmina.s VM2, VM2, VM3
+    xvfmina.s VM0, VM0, VM1
+    xvfmina.s VM0, VM0, VM2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    xvld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I
+    xvfmina.s m0, VX0, VX1 //patial repeat read
+    xvpickve.w x1, m0, 0
+    xvpickve.w x2, m0, 1
+    xvpickve.w x3, m0, 2
+    xvpickve.w x4, m0, 3
+    xvfmina.s m0, x1, x2
+    xvfmina.s VM1, x3, x4
+    xvfmina.s m0, m0, VM1
+    xvfmina.s VM0, m0, VM0
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmina.s VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the mina value
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    addi.d I, I, -1
+    xvfmina.s VM0, VM0, VX0
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvpickve.w x5, VM0, 4
+    xvpickve.w x6, VM0, 5
+    xvpickve.w x7, VM0, 6
+    xvpickve.w x8, VM0, 7
+    xvfmina.s VM3, x1, x2
+    xvfmina.s VM2, x3, x4
+    xvfmina.s VM1, x5, x6
+    xvfmina.s VM0, x7, x8
+    xvfmina.s VM2, VM2, VM3
+    xvfmina.s VM0, VM0, VM1
+    xvfmina.s VM0, VM0, VM2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmina.s VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/samin_lsx.S b/kernel/loongarch64/samin_lsx.S
new file mode 100644
index 000000000..e222fbcd1
--- /dev/null
+++ b/kernel/loongarch64/samin_lsx.S
@@ -0,0 +1,177 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $vr8
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr20
+#define VX1 $vr21
+#define VM0 $vr22
+#define VM1 $vr23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vfmina.s VM1, VX0, VX1
+    addi.d X, X, 8 * SIZE
+    vfmina.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfmina.s VM0, x3, x4
+    vfmina.s VM0, VM0, VM1
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    vld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    vldx VX1, X, I
+    vfmina.s m0, VX0, VX1 //patial repeat read
+    vreplvei.w x1, m0, 0
+    vreplvei.w x2, m0, 1
+    vreplvei.w x3, m0, 2
+    vreplvei.w x4, m0, 3
+    vfmina.s m0, x1, x2
+    vfmina.s VM1, x3, x4
+    vfmina.s m0, m0, VM1
+    vfmina.s VM0, m0, VM0
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmina.s VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    addi.d I, I, -1
+    vfmina.s VM1, VX0, VX1
+    vfmina.s VM0, VM0, VM1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfmina.s VM0, x3, x4
+    vfmina.s VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmina.s VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fabs.s $f22, $f22
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From 346b384d1c6f0cb00170c1bf8ad3e8302f142237 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 11:30:02 +0800
Subject: [PATCH 464/718] loongarch64: Add optimization for max.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/dmax_lasx.S           | 175 +++++++++++++++++++
 kernel/loongarch64/dmax_lsx.S            | 141 ++++++++++++++++
 kernel/loongarch64/smax_lasx.S           | 205 +++++++++++++++++++++++
 kernel/loongarch64/smax_lsx.S            | 171 +++++++++++++++++++
 6 files changed, 698 insertions(+)
 create mode 100644 kernel/loongarch64/dmax_lasx.S
 create mode 100644 kernel/loongarch64/dmax_lsx.S
 create mode 100644 kernel/loongarch64/smax_lasx.S
 create mode 100644 kernel/loongarch64/smax_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 279ff6a9c..e00893b72 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -13,4 +13,7 @@ DAMAXKERNEL =  damax_lsx.S
 SAMINKERNEL =  samin_lsx.S
 DAMINKERNEL =  damin_lsx.S
 
+SMAXKERNEL  =  smax_lsx.S
+DMAXKERNEL  =  dmax_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 83db79050..f238436f5 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -13,6 +13,9 @@ DAMAXKERNEL =  damax_lasx.S
 SAMINKERNEL =  samin_lasx.S
 DAMINKERNEL =  damin_lasx.S
 
+SMAXKERNEL  =  smax_lasx.S
+DMAXKERNEL =   dmax_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/dmax_lasx.S b/kernel/loongarch64/dmax_lasx.S
new file mode 100644
index 000000000..46366d2ec
--- /dev/null
+++ b/kernel/loongarch64/dmax_lasx.S
@@ -0,0 +1,175 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    xvfmax.d VM1, VX1, VX0
+    addi.d X, X, 8 * SIZE
+    xvfmax.d VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmax.d VM1, x1, x2
+    xvfmax.d VM2, x3, x4
+    xvfmax.d VM0, VM1, VM2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    xvld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I
+    xvfmax.d m0, VX0, VX1 //patial repeat read
+    xvpickve.d x1, m0, 0
+    xvpickve.d x2, m0, 1
+    xvpickve.d x3, m0, 2
+    xvpickve.d x4, m0, 3
+    xvfmax.d VM1, x1, x2
+    xvfmax.d m0, x3, x4
+    xvfmax.d m0, m0, VM1
+    xvfmax.d VM0, m0, VM0
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmax.d VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the max value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    addi.d I, I, -1
+    xvfmax.d VM1, VX1, VX0
+    xvfmax.d VM0, VM1, VM0
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmax.d VM1, x1, x2
+    xvfmax.d VM2, x3, x4
+    xvfmax.d VM0, VM1, VM2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmax.d VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/dmax_lsx.S b/kernel/loongarch64/dmax_lsx.S
new file mode 100644
index 000000000..99e9b2f30
--- /dev/null
+++ b/kernel/loongarch64/dmax_lsx.S
@@ -0,0 +1,141 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define x1 $vr9
+#define x2 $vr10
+#define VX0 $vr20
+#define VX1 $vr21
+#define VM0 $vr22
+#define VM1 $vr23
+#define VM2 $vr19
+#define VM3 $vr18
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    addi.d I, I, -1
+    vfmax.d VM1, VX1, VX0
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmax.d VM2, VX1, VX0
+    vfmax.d VM3, VM1, VM2
+    addi.d X, X, 8 * SIZE
+    vfmax.d VM0, VM0, VM3
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.d x2, VM0, 1
+    vfmax.d VM0, VM0, x2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L14
+    .align 3
+
+.L13:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmax.d VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L13
+    .align 3
+
+.L14:
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the max value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmax.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    addi.d I, I, -1
+    vfmax.d VM2, VX0, VX1
+    vfmax.d VM3, VM1, VM2
+    vfmax.d VM0, VM0, VM3
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.d x2, VM0, 1
+    vfmax.d VM0, VM0, x2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmax.d VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/smax_lasx.S b/kernel/loongarch64/smax_lasx.S
new file mode 100644
index 000000000..8cceb9928
--- /dev/null
+++ b/kernel/loongarch64/smax_lasx.S
@@ -0,0 +1,205 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define x5 $xr13
+#define x6 $xr14
+#define x7 $xr15
+#define x8 $xr16
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr18
+#define VM3 $xr17
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvfmax.s VM0, VM0, VX0
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvpickve.w x5, VM0, 4
+    xvpickve.w x6, VM0, 5
+    xvpickve.w x7, VM0, 6
+    xvpickve.w x8, VM0, 7
+    xvfmax.s VM3, x1, x2
+    xvfmax.s VM2, x3, x4
+    xvfmax.s VM1, x5, x6
+    xvfmax.s VM0, x7, x8
+    xvfmax.s VM2, VM2, VM3
+    xvfmax.s VM0, VM0, VM1
+    xvfmax.s VM0, VM0, VM2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    xvld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I
+    xvfmax.s m0, VX0, VX1 //patial repeat read
+    xvpickve.w x1, m0, 0
+    xvpickve.w x2, m0, 1
+    xvpickve.w x3, m0, 2
+    xvpickve.w x4, m0, 3
+    xvfmax.s m0, x1, x2
+    xvfmax.s VM1, x3, x4
+    xvfmax.s m0, m0, VM1
+    xvfmax.s VM0, m0, VM0
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmax.s VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the max value
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    addi.d I, I, -1
+    xvfmax.s VM0, VM0, VX0
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvpickve.w x5, VM0, 4
+    xvpickve.w x6, VM0, 5
+    xvpickve.w x7, VM0, 6
+    xvpickve.w x8, VM0, 7
+    xvfmax.s VM3, x1, x2
+    xvfmax.s VM2, x3, x4
+    xvfmax.s VM1, x5, x6
+    xvfmax.s VM0, x7, x8
+    xvfmax.s VM2, VM2, VM3
+    xvfmax.s VM0, VM0, VM1
+    xvfmax.s VM0, VM0, VM2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmax.s VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/smax_lsx.S b/kernel/loongarch64/smax_lsx.S
new file mode 100644
index 000000000..71e6863b8
--- /dev/null
+++ b/kernel/loongarch64/smax_lsx.S
@@ -0,0 +1,171 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+#define I     $r12
+#define J     $r13
+#define t1    $r14
+#define t2    $r18
+#define t3    $r15
+#define t4    $r17
+#define TEMP  $r16
+#define m0    $vr8
+#define x1    $vr9
+#define x2    $vr10
+#define x3    $vr11
+#define x4    $vr12
+#define VX0   $vr20
+#define VX1   $vr21
+#define VM0   $vr22
+#define VM1   $vr23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vfmax.s VM1, VX0, VX1
+    addi.d X, X, 8 * SIZE
+    vfmax.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmax.s VM1, x2, x3
+    vfmax.s VM0, x4, VM0
+    vfmax.s VM0, VM0, VM1
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    vld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    vldx VX1, X, I
+    vfmax.s m0, VX0, VX1 //patial repeat read
+    vreplvei.w x2, m0, 1
+    vreplvei.w x3, m0, 2
+    vreplvei.w x4, m0, 3
+    vfmax.s x1, x2, x3
+    vfmax.s VM1, x4, m0
+    vfmax.s m0, x1, VM1
+    vfmax.s VM0, m0, VM0
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmax.s VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    addi.d I, I, -1
+    vfmax.s VM1, VX0, VX1
+    vfmax.s VM0, VM0, VM1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmax.s VM1, x2, x3
+    vfmax.s VM0, x4, VM0
+    vfmax.s VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmax.s VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From 702fc1d56dc8278a856c322a5fc1804067d01fd2 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 11:51:19 +0800
Subject: [PATCH 465/718] loongarch64: Add optimization for min.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/dmin_lasx.S           | 175 +++++++++++++++++++
 kernel/loongarch64/dmin_lsx.S            | 143 ++++++++++++++++
 kernel/loongarch64/smin_lasx.S           | 205 +++++++++++++++++++++++
 kernel/loongarch64/smin_lsx.S            | 174 +++++++++++++++++++
 6 files changed, 703 insertions(+)
 create mode 100644 kernel/loongarch64/dmin_lasx.S
 create mode 100644 kernel/loongarch64/dmin_lsx.S
 create mode 100644 kernel/loongarch64/smin_lasx.S
 create mode 100644 kernel/loongarch64/smin_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index e00893b72..0ff73c2db 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -16,4 +16,7 @@ DAMINKERNEL =  damin_lsx.S
 SMAXKERNEL  =  smax_lsx.S
 DMAXKERNEL  =  dmax_lsx.S
 
+SMINKERNEL  =  smin_lsx.S
+DMINKERNEL  =  dmin_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index f238436f5..71f53d9d7 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -16,6 +16,9 @@ DAMINKERNEL =  damin_lasx.S
 SMAXKERNEL  =  smax_lasx.S
 DMAXKERNEL =   dmax_lasx.S
 
+SMINKERNEL =   smin_lasx.S
+DMINKERNEL =   dmin_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/dmin_lasx.S b/kernel/loongarch64/dmin_lasx.S
new file mode 100644
index 000000000..e76056565
--- /dev/null
+++ b/kernel/loongarch64/dmin_lasx.S
@@ -0,0 +1,175 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    xvfmin.d VM1, VX1, VX0
+    addi.d X, X, 8 * SIZE
+    xvfmin.d VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmin.d VM1, x1, x2
+    xvfmin.d VM2, x3, x4
+    xvfmin.d VM0, VM1, VM2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    xvld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I
+    xvfmin.d m0, VX0, VX1 //patial repeat read
+    xvpickve.d x1, m0, 0
+    xvpickve.d x2, m0, 1
+    xvpickve.d x3, m0, 2
+    xvpickve.d x4, m0, 3
+    xvfmin.d VM1, x1, x2
+    xvfmin.d m0, x3, x4
+    xvfmin.d m0, m0, VM1
+    xvfmin.d VM0, m0, VM0
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvld x1, X, 0
+    xvfmin.d VM0, VM0, x1
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the min value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    addi.d I, I, -1
+    xvfmin.d VM1, VX1, VX0
+    xvfmin.d VM0, VM1, VM0
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmin.d VM1, x1, x2
+    xvfmin.d VM2, x3, x4
+    xvfmin.d VM0, VM1, VM2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvld x1, X, 0
+    xvfmin.d VM0, VM0, x1
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/dmin_lsx.S b/kernel/loongarch64/dmin_lsx.S
new file mode 100644
index 000000000..f076067a3
--- /dev/null
+++ b/kernel/loongarch64/dmin_lsx.S
@@ -0,0 +1,143 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define x1 $vr9
+#define x2 $vr10
+#define VX0 $vr20
+#define VX1 $vr21
+#define VM0 $vr22
+#define VM1 $vr23
+#define VM2 $vr18
+#define VM3 $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    addi.d I, I, -1
+    vfmin.d VM1, VX1, VX0
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmin.d VM2, VX1, VX0
+    vfmin.d VM3, VM1, VM2
+    addi.d X, X, 8 * SIZE
+    vfmin.d VM0, VM0, VM3
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    vfmin.d VM0, x1, x2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L14
+    .align 3
+
+.L13:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmin.d VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L13
+    .align 3
+
+.L14:
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the min value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmin.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    addi.d I, I, -1
+    vfmin.d VM2, VX0, VX1
+    vfmin.d VM3, VM1, VM2
+    vfmin.d VM0, VM0, VM3
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    vfmin.d VM0, x1, x2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmin.d VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/smin_lasx.S b/kernel/loongarch64/smin_lasx.S
new file mode 100644
index 000000000..f6bbbfb8d
--- /dev/null
+++ b/kernel/loongarch64/smin_lasx.S
@@ -0,0 +1,205 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define x5 $xr13
+#define x6 $xr14
+#define x7 $xr15
+#define x8 $xr16
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr18
+#define VM3 $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvfmin.s VM0, VM0, VX0
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvpickve.w x5, VM0, 4
+    xvpickve.w x6, VM0, 5
+    xvpickve.w x7, VM0, 6
+    xvpickve.w x8, VM0, 7
+    xvfmin.s VM3, x1, x2
+    xvfmin.s VM2, x3, x4
+    xvfmin.s VM1, x5, x6
+    xvfmin.s VM0, x7, x8
+    xvfmin.s VM2, VM2, VM3
+    xvfmin.s VM0, VM0, VM1
+    xvfmin.s VM0, VM0, VM2
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    xvld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I
+    xvfmin.s m0, VX0, VX1 //patial repeat read
+    xvpickve.w x1, m0, 0
+    xvpickve.w x2, m0, 1
+    xvpickve.w x3, m0, 2
+    xvpickve.w x4, m0, 3
+    xvfmin.s m0, x1, x2
+    xvfmin.s VM1, x3, x4
+    xvfmin.s m0, m0, VM1
+    xvfmin.s VM0, m0, VM0
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmin.s VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the min value
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    addi.d I, I, -1
+    xvfmin.s VM0, VM0, VX0
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvpickve.w x5, VM0, 4
+    xvpickve.w x6, VM0, 5
+    xvpickve.w x7, VM0, 6
+    xvpickve.w x8, VM0, 7
+    xvfmin.s VM3, x1, x2
+    xvfmin.s VM2, x3, x4
+    xvfmin.s VM1, x5, x6
+    xvfmin.s VM0, x7, x8
+    xvfmin.s VM2, VM2, VM3
+    xvfmin.s VM0, VM0, VM1
+    xvfmin.s VM0, VM0, VM2
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvld x1, X, 0
+    addi.d I, I, -1
+    xvfmin.s VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/smin_lsx.S b/kernel/loongarch64/smin_lsx.S
new file mode 100644
index 000000000..1cc3699b0
--- /dev/null
+++ b/kernel/loongarch64/smin_lsx.S
@@ -0,0 +1,174 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $vr8
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr20
+#define VX1 $vr21
+#define VM0 $vr22
+#define VM1 $vr23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    srai.d I, N, 3
+    bge $r0, I, .L12
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vfmin.s VM1, VX0, VX1
+    addi.d X, X, 8 * SIZE
+    vfmin.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmin.s VM1, x1, x2
+    vfmin.s VM0, x3, x4
+    vfmin.s VM0, VM0, VM1
+    .align 3
+
+.L12: //INCX==1 and N<8
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // 4<N<8
+    vld VX0, X, 0
+    slli.d J, J, 1 // 8
+    sub.d I, J, I
+    slli.d I, I, BASE_SHIFT
+    vldx VX1, X, I
+    vfmin.s m0, VX0, VX1 //patial repeat read
+    vreplvei.w x1, m0, 0
+    vreplvei.w x2, m0, 1
+    vreplvei.w x3, m0, 2
+    vreplvei.w x4, m0, 3
+    vfmin.s m0, x1, x2
+    vfmin.s VM1, x3, x4
+    vfmin.s m0, m0, VM1
+    vfmin.s VM0, m0, VM0
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: //INCX==1 and 0<=N<=4
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    vld x1, X, 0
+    addi.d I, I, -1
+    vfmin.s VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmin.s VM1, VX0, VX1
+    addi.d I, I, -1
+    vfmin.s VM0, VM0, VM1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmin.s VM1, x1, x2
+    vfmin.s VM0, x3, x4
+    vfmin.s VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    vld x1, X, 0
+    vfmin.s VM0, VM0, x1
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From e46b48e3727532327b6a6e8c9aeb777452cde1cf Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 11:56:41 +0800
Subject: [PATCH 466/718] loongarch64: Add optimizations for imax.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/idmax_lasx.S          | 273 +++++++++++++++++
 kernel/loongarch64/idmax_lsx.S           | 225 ++++++++++++++
 kernel/loongarch64/ismax_lasx.S          | 375 +++++++++++++++++++++++
 kernel/loongarch64/ismax_lsx.S           | 272 ++++++++++++++++
 6 files changed, 1151 insertions(+)
 create mode 100644 kernel/loongarch64/idmax_lasx.S
 create mode 100644 kernel/loongarch64/idmax_lsx.S
 create mode 100644 kernel/loongarch64/ismax_lasx.S
 create mode 100644 kernel/loongarch64/ismax_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 0ff73c2db..3c201ad89 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -19,4 +19,7 @@ DMAXKERNEL  =  dmax_lsx.S
 SMINKERNEL  =  smin_lsx.S
 DMINKERNEL  =  dmin_lsx.S
 
+ISMAXKERNEL =  ismax_lsx.S
+IDMAXKERNEL =  idmax_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 71f53d9d7..953793b9a 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -19,6 +19,9 @@ DMAXKERNEL =   dmax_lasx.S
 SMINKERNEL =   smin_lasx.S
 DMINKERNEL =   dmin_lasx.S
 
+ISMAXKERNEL =  ismax_lasx.S
+IDMAXKERNEL =  idmax_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/idmax_lasx.S b/kernel/loongarch64/idmax_lasx.S
new file mode 100644
index 000000000..bbfe0941a
--- /dev/null
+++ b/kernel/loongarch64/idmax_lasx.S
@@ -0,0 +1,273 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    xvfcmp.clt.d VT0, VX0, VX1
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfcmp.clt.d VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.d VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    xvfcmp.ceq.d VT0, VM0, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.d VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI2, VI1, VINC4
+    xvfcmp.clt.d VT0, VX0, VX1
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfcmp.clt.d VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.d VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    xvfcmp.ceq.d VT0, VM0, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.d VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    xvfcmp.ceq.d VT0, VM0, x2
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    xvfcmp.clt.d VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    xvfcmp.ceq.d VT0, VM0, x3
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    xvfcmp.clt.d VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    xvfcmp.ceq.d VT0, VM0, x4
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    xvfcmp.clt.d VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    fcmp.clt.d $fcc0, $f15, $f9
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/idmax_lsx.S b/kernel/loongarch64/idmax_lsx.S
new file mode 100644
index 000000000..1b4734bab
--- /dev/null
+++ b/kernel/loongarch64/idmax_lsx.S
@@ -0,0 +1,225 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC2 $vr17
+#define VINC4 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfcmp.clt.d VT0, VX0, VX1
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, VINC2
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfcmp.clt.d VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    vfcmp.clt.d VT0, x1, x3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    vfcmp.clt.d VT0, VM0, x1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    vfcmp.ceq.d VT0, x2, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L27
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfcmp.clt.d VT0, VX0, VX1
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfcmp.clt.d VT0, VX0, VX1
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    vfcmp.clt.d VT0, x1, x3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    vfcmp.clt.d VT0, VM0, x1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    vfcmp.ceq.d VT0, x2, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L27
+    .align 3
+
+.L26:
+    vfcmp.clt.d VT0, x1, x2
+    vbitsel.v VM0, x1, x2, VT0
+    vbitsel.v VI0, VI1, VI2, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    fcmp.clt.d $fcc0, $f15, $f9
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ismax_lasx.S b/kernel/loongarch64/ismax_lasx.S
new file mode 100644
index 000000000..843dd6c6a
--- /dev/null
+++ b/kernel/loongarch64/ismax_lasx.S
@@ -0,0 +1,375 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvadd.w VI1, VI1, VINC8
+    xvfcmp.clt.s VT0, VM0, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM0, VX0, VT0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8
+    xvfcmp.clt.s VT0, VM0, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM0, VX0, VT0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    fmov.s $f16, $f20
+    .align 3
+
+.L252:
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0
+    fmov.s $f13, $f15
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v x1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, x1
+    xvbitsel.v VM0, VM0, x1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    fcmp.clt.s $fcc0, $f15, $f13
+    fsel $f15, $f15, $f13, $fcc0
+    fsel $f20, $f20, $f16, $fcc0
+    movfr2gr.s i0, $f20
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    fcmp.clt.s $fcc0, $f15, $f9
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ismax_lsx.S b/kernel/loongarch64/ismax_lsx.S
new file mode 100644
index 000000000..33b326bbd
--- /dev/null
+++ b/kernel/loongarch64/ismax_lsx.S
@@ -0,0 +1,272 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC4 $vr17
+#define VINC8 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vadd.w VI1, VI1, VINC8
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, VINC4
+    vfcmp.clt.s VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    vfcmp.clt.s VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfcmp.clt.s VT0, x1, x2
+    vbitsel.v VM1, x1, x2, VT0
+    vbitsel.v VINC4, VI1, VI2, VT0
+    vfcmp.clt.s VT0, x3, x4
+    vbitsel.v VM0, x3, x4, VT0
+    vbitsel.v VINC8, VI3, VI4, VT0
+    vfcmp.clt.s VT0, VM0, VM1
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    vfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC4
+    vfcmp.clt.s VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    vfcmp.clt.s VT0, VM0, VM1
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfcmp.clt.s VT0, x1, x2
+    vbitsel.v VM1, x1, x2, VT0
+    vbitsel.v VINC4, VI1, VI2, VT0
+    vfcmp.clt.s VT0, x3, x4
+    vbitsel.v VM0, x3, x4, VT0
+    vbitsel.v VINC8, VI3, VI4, VT0
+    vfcmp.clt.s VT0, VM0, VM1
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    vfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    vfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    vfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    vfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    fcmp.clt.s $fcc0, $f15, $f9
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    add.d  X, X, INCX
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file

From e3fb2b5afa63fc0db472268adfd7038146ea6ac6 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 12:01:05 +0800
Subject: [PATCH 467/718] loongarch64: Add optimizations for imin.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/idmin_lasx.S          | 272 +++++++++++++++++
 kernel/loongarch64/idmin_lsx.S           | 225 ++++++++++++++
 kernel/loongarch64/ismin_lasx.S          | 374 +++++++++++++++++++++++
 kernel/loongarch64/ismin_lsx.S           | 271 ++++++++++++++++
 6 files changed, 1148 insertions(+)
 create mode 100644 kernel/loongarch64/idmin_lasx.S
 create mode 100644 kernel/loongarch64/idmin_lsx.S
 create mode 100644 kernel/loongarch64/ismin_lasx.S
 create mode 100644 kernel/loongarch64/ismin_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 3c201ad89..d0eab3d7c 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -22,4 +22,7 @@ DMINKERNEL  =  dmin_lsx.S
 ISMAXKERNEL =  ismax_lsx.S
 IDMAXKERNEL =  idmax_lsx.S
 
+ISMINKERNEL =  ismin_lsx.S
+IDMINKERNEL =  idmin_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 953793b9a..92389001b 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -22,6 +22,9 @@ DMINKERNEL =   dmin_lasx.S
 ISMAXKERNEL =  ismax_lasx.S
 IDMAXKERNEL =  idmax_lasx.S
 
+ISMINKERNEL =  ismin_lasx.S
+IDMINKERNEL =  idmin_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/idmin_lasx.S b/kernel/loongarch64/idmin_lasx.S
new file mode 100644
index 000000000..7930d4963
--- /dev/null
+++ b/kernel/loongarch64/idmin_lasx.S
@@ -0,0 +1,272 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    xvfcmp.clt.d VT0, VX1, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, VM1, VM0
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfcmp.clt.d VT0, x2, x1
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.d VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    xvfcmp.ceq.d VT0, VM0, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.d VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI2, VI1, VINC4
+    xvfcmp.clt.d VT0, VX1, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfcmp.clt.d VT0, x2, x1
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.d VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    xvfcmp.ceq.d VT0, VM0, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.d VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    xvfcmp.ceq.d VT0, VM0, x2
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    xvfcmp.clt.d VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    xvfcmp.ceq.d VT0, VM0, x3
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    xvfcmp.clt.d VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    xvfcmp.ceq.d VT0, VM0, x4
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    xvfcmp.clt.d VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    fcmp.clt.d $fcc0, $f9, $f15
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/idmin_lsx.S b/kernel/loongarch64/idmin_lsx.S
new file mode 100644
index 000000000..8b6edcbf0
--- /dev/null
+++ b/kernel/loongarch64/idmin_lsx.S
@@ -0,0 +1,225 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC2 $vr17
+#define VINC4 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfcmp.clt.d VT0, VX1, VX0
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, VINC2
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfcmp.clt.d VT0, VX1, VX0
+    addi.d I, I, -1
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    vfcmp.clt.d VT0, x3, x1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    vfcmp.clt.d VT0, x1, VM0
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    vfcmp.ceq.d VT0, x2, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L27
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfcmp.clt.d VT0, VX1, VX0
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfcmp.clt.d VT0, VX1, VX0
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    vfcmp.clt.d VT0, x3, x1
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    vfcmp.clt.d VT0, x1, VM0
+    addi.d I, I, -1
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    vfcmp.ceq.d VT0, x2, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L27
+    .align 3
+
+.L26:
+    vfcmp.clt.d VT0, x2, x1
+    vbitsel.v VM0, x1, x2, VT0
+    vbitsel.v VI0, VI1, VI2, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    fcmp.clt.d $fcc0, $f9, $f15
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ismin_lasx.S b/kernel/loongarch64/ismin_lasx.S
new file mode 100644
index 000000000..15f6e2ec9
--- /dev/null
+++ b/kernel/loongarch64/ismin_lasx.S
@@ -0,0 +1,374 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvadd.w VI1, VI1, VINC8
+    xvfcmp.clt.s VT0, VX0, VM0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM0, VX0, VT0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x2, x1
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, x1, VM0
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8
+    xvfcmp.clt.s VT0, VX0, VM0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM0, VX0, VT0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x2, x1
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    fmov.s $f16, $f20
+    .align 3
+
+.L252:
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0
+    fmov.s $f13, $f15
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfcmp.clt.s VT0, x2, x1
+    xvbitsel.v x1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, x1, VM0
+    xvbitsel.v VM0, VM0, x1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    fcmp.clt.s $fcc0, $f13, $f15
+    fsel $f15, $f15, $f13, $fcc0
+    fsel $f20, $f20, $f16, $fcc0
+    movfr2gr.s i0, $f20
+
+.L21:   //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    fcmp.clt.s $fcc0, $f9, $f15
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    add.d  X, X, INCX
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ismin_lsx.S b/kernel/loongarch64/ismin_lsx.S
new file mode 100644
index 000000000..f90ebbd57
--- /dev/null
+++ b/kernel/loongarch64/ismin_lsx.S
@@ -0,0 +1,271 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC4 $vr17
+#define VINC8 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vadd.w VI1, VI1, VINC8
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, VINC4
+    vfcmp.clt.s VT0, VX1, VX0
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    vfcmp.clt.s VT0, VM1, VM0
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfcmp.clt.s VT0, x2, x1
+    vbitsel.v VM1, x1, x2, VT0
+    vbitsel.v VINC4, VI1, VI2, VT0
+    vfcmp.clt.s VT0, x4, x3
+    vbitsel.v VM0, x3, x4, VT0
+    vbitsel.v VINC8, VI3, VI4, VT0
+    vfcmp.clt.s VT0, VM1, VM0
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    vfcmp.ceq.s VT0, x1, VM0
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC4
+    vfcmp.clt.s VT0, VX1, VX0
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    vfcmp.clt.s VT0, VM1, VM0
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfcmp.clt.s VT0, x2, x1
+    vbitsel.v VM1, x1, x2, VT0
+    vbitsel.v VINC4, VI1, VI2, VT0
+    vfcmp.clt.s VT0, x4, x3
+    vbitsel.v VM0, x3, x4, VT0
+    vbitsel.v VINC8, VI3, VI4, VT0
+    vfcmp.clt.s VT0, VM1, VM0
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    vfcmp.ceq.s VT0, x1, VM0
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    vfcmp.ceq.s VT0, x2, VM0
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    vfcmp.ceq.s VT0, x3, VM0
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    vfcmp.ceq.s VT0, x4, VM0
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    fcmp.clt.s $fcc0, $f9, $f15
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    add.d  X, X, INCX
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file

From be83f5e4e08bfc5a7d94d5ed659368ca077c4374 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 12:07:30 +0800
Subject: [PATCH 468/718] loongarch64: Add optimizations for iamax.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/idamax_lasx.S         | 275 +++++++++++++++++
 kernel/loongarch64/idamax_lsx.S          | 267 ++++++++++++++++
 kernel/loongarch64/isamax_lasx.S         | 378 +++++++++++++++++++++++
 kernel/loongarch64/isamax_lsx.S          | 275 +++++++++++++++++
 6 files changed, 1201 insertions(+)
 create mode 100644 kernel/loongarch64/idamax_lasx.S
 create mode 100644 kernel/loongarch64/idamax_lsx.S
 create mode 100644 kernel/loongarch64/isamax_lasx.S
 create mode 100644 kernel/loongarch64/isamax_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index d0eab3d7c..a4c8927ba 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -25,4 +25,7 @@ IDMAXKERNEL =  idmax_lsx.S
 ISMINKERNEL =  ismin_lsx.S
 IDMINKERNEL =  idmin_lsx.S
 
+ISAMAXKERNEL = isamax_lsx.S
+IDAMAXKERNEL = idamax_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 92389001b..835b027b8 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -25,6 +25,9 @@ IDMAXKERNEL =  idmax_lasx.S
 ISMINKERNEL =  ismin_lasx.S
 IDMINKERNEL =  idmin_lasx.S
 
+ISAMAXKERNEL = isamax_lasx.S
+IDAMAXKERNEL = idamax_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/idamax_lasx.S b/kernel/loongarch64/idamax_lasx.S
new file mode 100644
index 000000000..8248ee757
--- /dev/null
+++ b/kernel/loongarch64/idamax_lasx.S
@@ -0,0 +1,275 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    xvfmaxa.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmaxa.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.d VM0, x4, x3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    xvfcmp.ceq.d VT0, VM0, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.d VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI2, VI1, VINC4
+    xvfmaxa.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmaxa.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.d VM0, x4, x3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    xvfcmp.ceq.d VT0, VM0, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.d VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    xvfcmp.ceq.d VT0, VM0, x2
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    xvfcmp.clt.d VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    xvfcmp.ceq.d VT0, VM0, x3
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    xvfcmp.clt.d VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    xvfcmp.ceq.d VT0, VM0, x4
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    xvfcmp.clt.d VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    xvfmaxa.d VM1, x1, VM0
+    xvfcmp.ceq.d VT0, VM0, VM1
+    add.d  X, X, INCX
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/idamax_lsx.S b/kernel/loongarch64/idamax_lsx.S
new file mode 100644
index 000000000..fb2d5bac1
--- /dev/null
+++ b/kernel/loongarch64/idamax_lsx.S
@@ -0,0 +1,267 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC2 $vr17
+#define VINC4 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L11
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, VINC2
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmaxa.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    vbitsel.v x2, x4, x2, VT0
+    vfmaxa.d VM1, VM0, x3
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    vfcmp.ceq.d VT0, x2, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L16
+    vfcmp.clt.d VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L17
+    .align 3
+
+.L16:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L17:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L11: //INCX==1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L14
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L13:
+    fld.d $f9, X, 0
+    vfmaxa.d VM1, x1, VM0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    addi.d  X, X, SIZE
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L13
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L14:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmaxa.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    vbitsel.v x2, x4, x2, VT0
+    vfmaxa.d VM1, VM0, x3
+    vbitsel.v VM0, VM1, VM0, VT0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VI0, x2, VI0, VT0
+    addi.d I, I, -1
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    vfcmp.ceq.d VT0, x2, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L27
+    .align 3
+
+.L26:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: // N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    vfmaxa.d VM1, x1, VM0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    add.d  X, X, INCX
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/isamax_lasx.S b/kernel/loongarch64/isamax_lasx.S
new file mode 100644
index 000000000..2800b1d43
--- /dev/null
+++ b/kernel/loongarch64/isamax_lasx.S
@@ -0,0 +1,378 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvadd.w VI1, VI1, VINC8
+    xvfmaxa.s VM1, VX0, VM0
+    xvfcmp.ceq.s VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmaxa.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8
+    xvfmaxa.s VM1, VX0, VM0
+    xvfcmp.ceq.s VT0, VM1, VM0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmaxa.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfmaxa.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    fmov.s $f16, $f20
+    .align 3
+
+.L252:
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0
+    fmov.s $f13, $f15
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfmaxa.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    xvfmaxa.s VM0, VX0, VM0
+    xvfcmp.ceq.s VT0, VM0, VX0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    movfr2gr.s i0, $f20
+
+.L21: // N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.s $f9, X, 0
+    addi.d I, I, -1
+    xvfmaxa.s VM1, x1, VM0
+    xvfcmp.ceq.s VT0, VM0, VM1
+    add.d  X, X, INCX
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/isamax_lsx.S b/kernel/loongarch64/isamax_lsx.S
new file mode 100644
index 000000000..a18aa7354
--- /dev/null
+++ b/kernel/loongarch64/isamax_lsx.S
@@ -0,0 +1,275 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC4 $vr17
+#define VINC8 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vadd.w VI1, VI1, VINC8
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, VINC4
+    vfmaxa.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    addi.d I, I, -1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmaxa.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    vfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC4
+    vfmaxa.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmaxa.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    vfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    vfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    vfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    vfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.s $f9, X, 0
+    addi.d I, I, -1
+    vfmaxa.s VM1, x1, VM0
+    vfcmp.ceq.s VT0, VM0, VM1
+    add.d  X, X, INCX
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From 49829b2b7db5d4ad78447c47391e8c0c96ffe803 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 12:11:30 +0800
Subject: [PATCH 469/718] loongarch64: Add optimizations for iamin.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/idamin_lasx.S         | 275 +++++++++++++++++
 kernel/loongarch64/idamin_lsx.S          | 228 ++++++++++++++
 kernel/loongarch64/isamin_lasx.S         | 378 +++++++++++++++++++++++
 kernel/loongarch64/isamin_lsx.S          | 275 +++++++++++++++++
 6 files changed, 1162 insertions(+)
 create mode 100644 kernel/loongarch64/idamin_lasx.S
 create mode 100644 kernel/loongarch64/idamin_lsx.S
 create mode 100644 kernel/loongarch64/isamin_lasx.S
 create mode 100644 kernel/loongarch64/isamin_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index a4c8927ba..bff52ce93 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -28,4 +28,7 @@ IDMINKERNEL =  idmin_lsx.S
 ISAMAXKERNEL = isamax_lsx.S
 IDAMAXKERNEL = idamax_lsx.S
 
+ISAMINKERNEL = isamin_lsx.S
+IDAMINKERNEL = idamin_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 835b027b8..a08598cc5 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -28,6 +28,9 @@ IDMINKERNEL =  idmin_lasx.S
 ISAMAXKERNEL = isamax_lasx.S
 IDAMAXKERNEL = idamax_lasx.S
 
+ISAMINKERNEL = isamin_lasx.S
+IDAMINKERNEL = idamin_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/idamin_lasx.S b/kernel/loongarch64/idamin_lasx.S
new file mode 100644
index 000000000..6ef1e8903
--- /dev/null
+++ b/kernel/loongarch64/idamin_lasx.S
@@ -0,0 +1,275 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    xvfmina.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmina.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmina.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmina.d VM0, x4, x3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmina.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    xvfcmp.ceq.d VT0, VM0, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.d VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI2, VI1, VINC4
+    xvfmina.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmina.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmina.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmina.d VM0, x4, x3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmina.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    xvfcmp.ceq.d VT0, VM0, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.d VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    xvfcmp.ceq.d VT0, VM0, x2
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    xvfcmp.clt.d VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    xvfcmp.ceq.d VT0, VM0, x3
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    xvfcmp.clt.d VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    xvfcmp.ceq.d VT0, VM0, x4
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    xvfcmp.clt.d VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: // N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    xvfmina.d VM1, x1, VM0
+    xvfcmp.ceq.d VT0, VM0, VM1
+    add.d  X, X, INCX
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/idamin_lsx.S b/kernel/loongarch64/idamin_lsx.S
new file mode 100644
index 000000000..9eb9d883f
--- /dev/null
+++ b/kernel/loongarch64/idamin_lsx.S
@@ -0,0 +1,228 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC2 $vr17
+#define VINC4 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, VINC2
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmina.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    addi.d I, I, -1
+    vbitsel.v x2, x4, x2, VT0
+    vfmina.d VM1, VM0, x3
+    vfcmp.ceq.d VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    vfcmp.ceq.d VT0, x2, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L27
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmina.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    addi.d I, I, -1
+    vbitsel.v x2, x4, x2, VT0
+    vfmina.d VM1, VM0, x3
+    vbitsel.v VM0, VM1, VM0, VT0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VI0, x2, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.d $f17, TEMP
+    ffint.d.l $f17, $f17
+    vfcmp.ceq.d VT0, x2, x1
+    fcmp.ceq.d $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L27
+    .align 3
+
+.L26:
+    vfmina.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    vfmina.d VM1, x1, VM0
+    vfcmp.ceq.d VT0, VM0, VM1
+    add.d  X, X, INCX
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/isamin_lasx.S b/kernel/loongarch64/isamin_lasx.S
new file mode 100644
index 000000000..cbdf32530
--- /dev/null
+++ b/kernel/loongarch64/isamin_lasx.S
@@ -0,0 +1,378 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvadd.w VI1, VI1, VINC8
+    xvfmina.s VM1, VX0, VM0
+    xvfcmp.ceq.s VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmina.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmina.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmina.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8
+    xvfmina.s VM1, VX0, VM0
+    xvfcmp.ceq.s VT0, VM1, VM0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmina.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmina.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfmina.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    fmov.s $f16, $f20
+    .align 3
+
+.L252:
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0
+    fmov.s $f13, $f15
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfmina.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmina.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmina.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    xvfmina.s VM0, VX0, VM0
+    xvfcmp.ceq.s VT0, VM0, VX0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    movfr2gr.s i0, $f20
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.s $f9, X, 0
+    addi.d I, I, -1
+    xvfmina.s VM1, x1, VM0
+    xvfcmp.ceq.s VT0, VM0, VM1
+    add.d  X, X, INCX
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/isamin_lsx.S b/kernel/loongarch64/isamin_lsx.S
new file mode 100644
index 000000000..598888660
--- /dev/null
+++ b/kernel/loongarch64/isamin_lsx.S
@@ -0,0 +1,275 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC4 $vr17
+#define VINC8 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vadd.w VI1, VI1, VINC8
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, VINC4
+    vfmina.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    addi.d I, I, -1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmina.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    vfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC4
+    vfmina.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmina.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 //处理尾数相等时取最小序号
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17
+    vfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    vfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    vfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    vfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.s $f9, X, 0
+    addi.d I, I, -1
+    vfmina.s VM1, x1, VM0
+    vfcmp.ceq.s VT0, VM0, VM1
+    add.d  X, X, INCX
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file

From 174c25766b6b43e36a9ace6da26fe21f3a7305a9 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 12:15:46 +0800
Subject: [PATCH 470/718] loongarch64: Add optimizations for copy.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/dcopy_lasx.S          | 224 ++++++++++++++++++++++
 kernel/loongarch64/dcopy_lsx.S           | 232 +++++++++++++++++++++++
 kernel/loongarch64/scopy_lasx.S          | 216 +++++++++++++++++++++
 kernel/loongarch64/scopy_lsx.S           | 220 +++++++++++++++++++++
 6 files changed, 898 insertions(+)
 create mode 100644 kernel/loongarch64/dcopy_lasx.S
 create mode 100644 kernel/loongarch64/dcopy_lsx.S
 create mode 100644 kernel/loongarch64/scopy_lasx.S
 create mode 100644 kernel/loongarch64/scopy_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index bff52ce93..565bec0f2 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -31,4 +31,7 @@ IDAMAXKERNEL = idamax_lsx.S
 ISAMINKERNEL = isamin_lsx.S
 IDAMINKERNEL = idamin_lsx.S
 
+SCOPYKERNEL =  scopy_lsx.S
+DCOPYKERNEL =  dcopy_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index a08598cc5..a2443720b 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -31,6 +31,9 @@ IDAMAXKERNEL = idamax_lasx.S
 ISAMINKERNEL = isamin_lasx.S
 IDAMINKERNEL = idamin_lasx.S
 
+SCOPYKERNEL =  scopy_lasx.S
+DCOPYKERNEL =  dcopy_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S
new file mode 100644
index 000000000..9d7da4a80
--- /dev/null
+++ b/kernel/loongarch64/dcopy_lasx.S
@@ -0,0 +1,224 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $xr12
+#define VX1    $xr13
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    fst.d $f12, Y, 0 * SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 3
+    add.d Y, Y, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    fst.d $f12, Y, 0 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvst VX0, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvst VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bgez INCX, .L220
+    .align 3
+
+.L220:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    fld.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a3, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a4, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a3, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a4, X, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S
new file mode 100644
index 000000000..161655bbd
--- /dev/null
+++ b/kernel/loongarch64/dcopy_lsx.S
@@ -0,0 +1,232 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $vr12
+#define VX1    $vr13
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    addi.d  I, I, -1
+    vst VX0, Y, 4 * SIZE
+    vst VX1, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    fst.d $f12, Y, 0 * SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    fst.d $f12, Y, 0 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vst VX0, Y, 0 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX1, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vst VX0, Y, 4 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bgez INCX, .L220
+    .align 3
+
+.L220:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    fld.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a3, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a4, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a3, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d a4, X, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S
new file mode 100644
index 000000000..7db1e7cee
--- /dev/null
+++ b/kernel/loongarch64/scopy_lasx.S
@@ -0,0 +1,216 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $xr12
+#define VX1    $xr13
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+    addi.d  I, I, -1
+    xvst VX0, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    fld.s $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    fst.s $f12, Y, 0 * SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 4
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 6
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 7
+    add.d Y, Y, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.s $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    fst.s $f12, Y, 0 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvst VX0, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.s $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    fld.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a3, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a4, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a3, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a4, X, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.s $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S
new file mode 100644
index 000000000..32150d3d6
--- /dev/null
+++ b/kernel/loongarch64/scopy_lsx.S
@@ -0,0 +1,220 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $vr12
+#define VX1    $vr13
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    addi.d  I, I, -1
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    fld.s $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    fst.s $f12, Y, 0 * SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 3
+    add.d Y, Y, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.s $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    fst.s $f12, Y, 0 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vst VX0, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.s $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    fld.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a3, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a4, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a3, X, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s a4, X, 0 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.s $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From 360acc0a41dd2ec3fc461bac390c74c958525ae8 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 12:57:05 +0800
Subject: [PATCH 471/718] loongarch64: Add optimizations for swap.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/dswap_lasx.S          | 301 +++++++++++++++++++++
 kernel/loongarch64/dswap_lsx.S           | 317 +++++++++++++++++++++++
 kernel/loongarch64/sswap_lasx.S          | 286 ++++++++++++++++++++
 kernel/loongarch64/sswap_lsx.S           | 294 +++++++++++++++++++++
 6 files changed, 1204 insertions(+)
 create mode 100644 kernel/loongarch64/dswap_lasx.S
 create mode 100644 kernel/loongarch64/dswap_lsx.S
 create mode 100644 kernel/loongarch64/sswap_lasx.S
 create mode 100644 kernel/loongarch64/sswap_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 565bec0f2..879a6f68b 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -34,4 +34,7 @@ IDAMINKERNEL = idamin_lsx.S
 SCOPYKERNEL =  scopy_lsx.S
 DCOPYKERNEL =  dcopy_lsx.S
 
+SSWAPKERNEL =  sswap_lsx.S
+DSWAPKERNEL =  dswap_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index a2443720b..581cfdbbe 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -34,6 +34,9 @@ IDAMINKERNEL = idamin_lasx.S
 SCOPYKERNEL =  scopy_lasx.S
 DCOPYKERNEL =  dcopy_lasx.S
 
+SSWAPKERNEL =  sswap_lasx.S
+DSWAPKERNEL =  dswap_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S
new file mode 100644
index 000000000..221cb7fa2
--- /dev/null
+++ b/kernel/loongarch64/dswap_lasx.S
@@ -0,0 +1,301 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    addi.d  I, I, -1
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 4 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    fst.d $f14, X, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    xvstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    xvstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    xvstelm.d VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvstelm.d VX0, Y, 0, 3
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvst VX2, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    xvstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    xvstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    xvstelm.d VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvstelm.d VX1, Y, 0, 3
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    fst.d $f14, X, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    xvstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    xvstelm.d VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    xvstelm.d VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvstelm.d VX2, X, 0, 3
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvst VX0, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    xvstelm.d VX3, X, 0, 0
+    add.d X, X, INCY
+    ld.d t2, X, 0 * SIZE
+    xvstelm.d VX3, X, 0, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    xvstelm.d VX3, X, 0, 2
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvstelm.d VX3, X, 0, 3
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvst VX1, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    fst.d $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bgez INCX, .L220
+    //addi.d TEMP, N, -1
+    //mul.d TEMP, TEMP, INCX
+    //sub.d X, X, TEMP
+    .align 3
+
+.L220:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    fld.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d b1, Y, 0 * SIZE
+    fst.d a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d b2, Y, 0 * SIZE
+    fst.d a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d b3, Y, 0 * SIZE
+    fst.d a3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d b4, Y, 0 * SIZE
+    fst.d a4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d b1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.d b1, Y, 0 * SIZE
+    fst.d a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d b2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.d b2, Y, 0 * SIZE
+    fst.d a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d b3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.d b3, Y, 0 * SIZE
+    fst.d a3, Y, 0 * SIZE
+    fld.d a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d b4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.d b4, Y, 0 * SIZE
+    fst.d a4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d b1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.d b2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.d b3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.d b4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    fst.d $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S
new file mode 100644
index 000000000..7f7f585e1
--- /dev/null
+++ b/kernel/loongarch64/dswap_lsx.S
@@ -0,0 +1,317 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    addi.d  I, I, -1
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+    vst VX0, Y, 4 * SIZE
+    vst VX1, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    fst.d $f14, X, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vstelm.d VX0, Y, 0, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vstelm.d VX1, Y, 0, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vstelm.d VX0, Y, 0, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vstelm.d VX1, Y, 0, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    fst.d $f14, X, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    vld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCY
+    vst VX0, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCY
+    vst VX0, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    fst.d $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bgez INCX, .L220
+    //addi.d TEMP, N, -1
+    //mul.d TEMP, TEMP, INCX
+    //sub.d X, X, TEMP
+    .align 3
+
+.L220:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    fld.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.d b1, Y, 0 * SIZE
+    fst.d a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d b2, Y, 0 * SIZE
+    fst.d a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d b3, Y, 0 * SIZE
+    fst.d a3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d b4, Y, 0 * SIZE
+    fst.d a4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d b1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.d b1, Y, 0 * SIZE
+    fst.d a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d b2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.d b2, Y, 0 * SIZE
+    fst.d a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.d a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d b3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.d b3, Y, 0 * SIZE
+    fst.d a3, Y, 0 * SIZE
+    fld.d a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.d b4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.d b4, Y, 0 * SIZE
+    fst.d a4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.d b1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.d b2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.d b3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.d b4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.d $f12, Y, 0 * SIZE
+    fst.d $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S
new file mode 100644
index 000000000..7184eff45
--- /dev/null
+++ b/kernel/loongarch64/sswap_lasx.S
@@ -0,0 +1,286 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvst VX2, X, 0 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    fst.s $f14, X, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 3
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 4
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 5
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 6
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0, 7
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvst VX2, X, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    fst.s $f14, X, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    xvstelm.w VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    xvstelm.w VX2, X, 0, 1
+    add.d X, X, INCY
+    ld.w t3, X, 0 * SIZE
+    xvstelm.w VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvstelm.w VX2, X, 0, 3
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    xvstelm.w VX2, X, 0, 4
+    add.d X, X, INCY
+    ld.w t2, X, 0 * SIZE
+    xvstelm.w VX2, X, 0, 5
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    xvstelm.w VX2, X, 0, 6
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvstelm.w VX2, X, 0, 7
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvst VX1, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    fst.s $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    fld.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s b1, Y, 0 * SIZE
+    fst.s a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s b2, Y, 0 * SIZE
+    fst.s a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s b3, Y, 0 * SIZE
+    fst.s a3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s b4, Y, 0 * SIZE
+    fst.s a4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s b1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.s b1, Y, 0 * SIZE
+    fst.s a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s b2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.s b2, Y, 0 * SIZE
+    fst.s a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s b3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.s b3, Y, 0 * SIZE
+    fst.s a3, Y, 0 * SIZE
+    fld.s a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s b4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.s b4, Y, 0 * SIZE
+    fst.s a4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s b1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.s b2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.s b3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.s b4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    fst.s $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S
new file mode 100644
index 000000000..4f19a8024
--- /dev/null
+++ b/kernel/loongarch64/sswap_lsx.S
@@ -0,0 +1,294 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    addi.d  I, I, -1
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    fst.s $f14, X, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    vstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    vstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    vstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vstelm.w VX0, Y, 0, 3
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vst VX2, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    vstelm.w VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    vstelm.w VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    vstelm.w VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vstelm.w VX1, Y, 0, 3
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    fst.s $f14, X, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    vstelm.w VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    vstelm.w VX2, X, 0, 1
+    add.d X, X, INCY
+    ld.w t3, X, 0 * SIZE
+    vstelm.w VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vstelm.w VX2, X, 0, 3
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vst VX0, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    vstelm.w VX3, X, 0, 0
+    add.d X, X, INCY
+    ld.w t2, X, 0 * SIZE
+    vstelm.w VX3, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    vstelm.w VX3, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vstelm.w VX3, X, 0, 3
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vst VX1, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    fst.s $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    fld.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fld.s b1, Y, 0 * SIZE
+    fst.s a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s b2, Y, 0 * SIZE
+    fst.s a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s b3, Y, 0 * SIZE
+    fst.s a3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s b4, Y, 0 * SIZE
+    fst.s a4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s a1, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s b1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.s b1, Y, 0 * SIZE
+    fst.s a1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s a2, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s b2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.s b2, Y, 0 * SIZE
+    fst.s a2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fld.s a3, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s b3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.s b3, Y, 0 * SIZE
+    fst.s a3, Y, 0 * SIZE
+    fld.s a4, X, 0 * SIZE
+    add.d X, X, INCX
+    fst.s b4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fld.s b4, Y, 0 * SIZE
+    fst.s a4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    fst.s b1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.s b2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.s b3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    fst.s b4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fst.s $f12, Y, 0 * SIZE
+    fst.s $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From d4c96a35a80da33c9d71fc5dd291f05f55900e27 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 13:02:03 +0800
Subject: [PATCH 472/718] loongarch64: Add optimizations for axpy and axpby.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   6 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   6 +
 kernel/loongarch64/daxpby_lasx.S         | 629 ++++++++++++++++++++
 kernel/loongarch64/daxpby_lsx.S          | 693 +++++++++++++++++++++++
 kernel/loongarch64/daxpy_lasx.S          | 338 +++++++++++
 kernel/loongarch64/daxpy_lsx.S           | 365 ++++++++++++
 kernel/loongarch64/saxpby_lasx.S         | 597 +++++++++++++++++++
 kernel/loongarch64/saxpby_lsx.S          | 629 ++++++++++++++++++++
 kernel/loongarch64/saxpy_lasx.S          | 323 +++++++++++
 kernel/loongarch64/saxpy_lsx.S           | 338 +++++++++++
 10 files changed, 3924 insertions(+)
 create mode 100644 kernel/loongarch64/daxpby_lasx.S
 create mode 100644 kernel/loongarch64/daxpby_lsx.S
 create mode 100644 kernel/loongarch64/daxpy_lasx.S
 create mode 100644 kernel/loongarch64/daxpy_lsx.S
 create mode 100644 kernel/loongarch64/saxpby_lasx.S
 create mode 100644 kernel/loongarch64/saxpby_lsx.S
 create mode 100644 kernel/loongarch64/saxpy_lasx.S
 create mode 100644 kernel/loongarch64/saxpy_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 879a6f68b..a94303151 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -37,4 +37,10 @@ DCOPYKERNEL =  dcopy_lsx.S
 SSWAPKERNEL =  sswap_lsx.S
 DSWAPKERNEL =  dswap_lsx.S
 
+SAXPYKERNEL =  saxpy_lsx.S
+DAXPYKERNEL =  daxpy_lsx.S
+
+SAXPBYKERNEL = saxpby_lsx.S
+DAXPBYKERNEL = daxpby_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 581cfdbbe..4cfd53058 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -37,6 +37,12 @@ DCOPYKERNEL =  dcopy_lasx.S
 SSWAPKERNEL =  sswap_lasx.S
 DSWAPKERNEL =  dswap_lasx.S
 
+SAXPYKERNEL =  saxpy_lasx.S
+DAXPYKERNEL =  daxpy_lasx.S
+
+SAXPBYKERNEL = saxpby_lasx.S
+DAXPBYKERNEL = daxpby_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/daxpby_lasx.S
new file mode 100644
index 000000000..4b19703e7
--- /dev/null
+++ b/kernel/loongarch64/daxpby_lasx.S
@@ -0,0 +1,629 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define ALPHA  $f0
+#define X      $r5
+#define INCX   $r6
+#define BETA   $f1
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXA    $xr23
+#define VXB    $xr9
+#define VXZ    $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.d t1, ALPHA
+    xvreplgr2vr.d VXA, t1
+    movfr2gr.d t2, BETA
+    xvreplgr2vr.d VXB, t2
+    movfr2gr.d t3, a1
+    xvreplgr2vr.d VXZ, t3
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L110
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
+    b .L111 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L110:
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
+    b .L113 // ALPHA==0 BETA!=0
+    .align 3
+
+.L111: // ALPHA!=0 BETA!=0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfmul.d VX0, VX0, VXA
+    xvfmul.d VX1, VX1, VXA
+    xvfmadd.d VX2, VX2, VXB, VX0
+    xvfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // ALPHA!=0 BETA==0
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX0, VX0, VXA
+    xvfmul.d VX1, VX1, VXA
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // ALPHA==0 BETA!=0
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfmul.d VX2, VX2, VXB
+    xvfmul.d VX3, VX3, VXB
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // ALPHA==0 BETA==0
+    xvst VXZ, Y, 0 * SIZE
+    xvst VXZ, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L120
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
+    b .L121 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L120:
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
+    b .L123 // ALPHA==0 BETA!=0
+    .align 3
+
+.L121: // ALPHA!=0 BETA!=0
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, VX0, VXA
+    xvld VX1, X, 4 * SIZE
+    xvfmadd.d VX2, VX2, VXB, VX0
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvfmul.d VX1, VX1, VXA
+    xvfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // ALPHA!=0 BETA==0
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX0, VX0, VXA
+    xvfmul.d VX1, VX1, VXA
+    xvstelm.d VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // ALPHA==0 BETA!=0
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX2, VX2, VXB
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // ALPHA==0 BETA==0
+    xvstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L210
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
+    b .L211 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L210:
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
+    b .L213 // ALPHA==0 BETA!=0
+    .align 3
+
+.L211: // ALPHA!=0 BETA!=0
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VX0, VXA, VX0
+    xvfmadd.d VX2, VX2, VXB, VX0
+    xvld VX3, Y, 4 * SIZE
+    xvst VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VX1, VX1, VXA
+    xvfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // ALPHA!=0 BETA==0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VX0, VXA, VX0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvst VX0, Y, 0 * SIZE
+    xvfmul.d VX1, VX1, VXA
+    addi.d  I, I, -1
+    xvst VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // ALPHA==0 BETA!=0
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfmul.d VX2, VX2, VXB
+    xvfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // ALPHA==0 BETA==0
+    xvst VXZ, Y, 0 * SIZE
+    xvst VXZ, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L214
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L220
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
+    b .L221 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L220:
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
+    b .L223 // ALPHA==0 BETA!=0
+    .align 3
+
+.L221: // ALPHA!=0 BETA!=0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, VX0, VXA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    xvfmadd.d VX2, VX2, VXB, VX0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX1, VX1, VXA
+    xvfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // ALPHA!=0 BETA==0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VX0, VX0, VXA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    xvfmul.d VX1, VX1, VXA
+    addi.d  I, I, -1
+    xvstelm.d VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // ALPHA==0 BETA!=0
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX2, VX2, VXB
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // ALPHA==0 BETA==0
+    xvstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f13, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d $f12, $f12, ALPHA
+    fmadd.d $f13, $f13, BETA, $f12
+    fst.d $f13, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/daxpby_lsx.S
new file mode 100644
index 000000000..9aafbaf2a
--- /dev/null
+++ b/kernel/loongarch64/daxpby_lsx.S
@@ -0,0 +1,693 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define ALPHA  $f0
+#define X      $r5
+#define INCX   $r6
+#define BETA   $f1
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXA    $vr23
+#define VXB    $vr9
+#define VXZ    $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.d t1, ALPHA
+    vreplgr2vr.d VXA, t1
+    movfr2gr.d t2, BETA
+    vreplgr2vr.d VXB, t2
+    movfr2gr.d t3, a1
+    vreplgr2vr.d VXZ, t3
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L110
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
+    b .L111 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L110:
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
+    b .L113 // ALPHA==0 BETA!=0
+    .align 3
+
+.L111: // ALPHA!=0 BETA!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX2, VX2, VXB, VX0
+    vfmadd.d VX3, VX3, VXB, VX1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX2, VX2, VXB, VX0
+    vfmadd.d VX3, VX3, VXB, VX1
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // ALPHA!=0 BETA==0
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vfmul.d VX2, VX2, VXA
+    vfmul.d VX3, VX3, VXA
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // ALPHA==0 BETA!=0\
+    vld VX0, Y, 0 * SIZE
+    vld VX1, Y, 2 * SIZE
+    vfmul.d VX0, VX0, VXB
+    vfmul.d VX1, VX1, VXB
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VX2, VX2, VXB
+    vfmul.d VX3, VX3, VXB
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // ALPHA==0 BETA==0
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, Y, 2 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    vst VXZ, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L120
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
+    b .L121 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L120:
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
+    b .L123 // ALPHA==0 BETA!=0
+    .align 3
+
+.L121: // ALPHA!=0 BETA!=0
+    vld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    vfmul.d VX0, VX0, VXA
+    vld VX1, X, 2 * SIZE
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX1, VX1, VXA
+    vld VX0, X, 4 * SIZE
+    vfmadd.d VX3, VX3, VXB, VX1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX0, VX0, VXA
+    vld VX1, X, 6 * SIZE
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // ALPHA!=0 BETA==0
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vstelm.d VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vstelm.d VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // ALPHA==0 BETA!=0
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VX2, VX2, VXB
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX3, VX3, VXB
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX2, VX2, VXB
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // ALPHA==0 BETA==0
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L210
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
+    b .L211 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L210:
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
+    b .L213 // ALPHA==0 BETA!=0
+    .align 3
+
+.L211: // ALPHA!=0 BETA!=0
+    vld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX0, VXA, VX0
+    vld VX3, Y, 2 * SIZE
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX2, Y, 0 * SIZE
+    vfmul.d VX1, VXA, VX1
+    vld VX2, Y, 4 * SIZE
+    vfmadd.d VX3, VX3, VXB, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vst VX3, Y, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vld VX3, Y, 6 * SIZE
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX2, Y, 4 * SIZE
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // ALPHA!=0 BETA==0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX0, VXA, VX0
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 0 * SIZE
+    vfmul.d VX1, VXA, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vst VX1, Y, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 4 * SIZE
+    vfmul.d VX1, VX1, VXA
+    addi.d  I, I, -1
+    vst VX1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // ALPHA==0 BETA!=0
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfmul.d VX2, VX2, VXB
+    vfmul.d VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VX2, VX2, VXB
+    vfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // ALPHA==0 BETA==0
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, Y, 2 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    vst VXZ, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L214
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L220
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
+    b .L221 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L220:
+    fcmp.ceq.d $fcc0, BETA, a1
+    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
+    b .L223 // ALPHA==0 BETA!=0
+    .align 3
+
+.L221: // ALPHA!=0 BETA!=0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VX0, VXA
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX3, VX3, VXB, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VX0, VXA
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // ALPHA!=0 BETA==0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX0, VX0, VXA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX1, VX1, VXA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX0, VX0, VXA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX1, VX1, VXA
+    addi.d  I, I, -1
+    vstelm.d VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // ALPHA==0 BETA!=0
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VX2, VX2, VXB
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX3, VX3, VXB
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX2, VX2, VXB
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // ALPHA==0 BETA==0
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f13, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d $f12, $f12, ALPHA
+    fmadd.d $f13, $f13, BETA, $f12
+    fst.d $f13, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/daxpy_lasx.S
new file mode 100644
index 000000000..bafd871ab
--- /dev/null
+++ b/kernel/loongarch64/daxpy_lasx.S
@@ -0,0 +1,338 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHA  $f0
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXA    $xr23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1
+    movgr2fr.d a2, TEMP
+    ffint.d.l a2, a2
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L999
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.d t1, ALPHA
+    xvreplgr2vr.d VXA, t1
+
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L113
+    fcmp.ceq.d $fcc0, ALPHA, a2
+    bceqz $fcc0, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfadd.d VX2, VX0, VX2
+    xvfadd.d VX3, VX1, VX3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    b .L113
+    .align 3
+
+.L112:
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfmadd.d VX2, VX0, VXA, VX2
+    xvfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L112
+    .align 3
+
+.L113:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L114:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.d $f14, $f12, $f0, $f14
+    fst.d $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L114
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    move YY, Y
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmadd.d VX2, VX0, VXA, VX2
+    xvld VX1, X, 4 * SIZE
+    xvstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.d $f14, $f12, $f0, $f14
+    fst.d $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmadd.d VX2, VX0, VXA, VX2
+    xvld VX3, Y, 4 * SIZE
+    xvst VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.d $f14, $f12, $f0, $f14
+    fst.d $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    move YY, Y
+    .align 3
+
+.L222:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmadd.d VX2, VX0, VXA, VX2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.d $f14, $f12, $f0, $f14
+    fst.d $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    b .L999
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/daxpy_lsx.S
new file mode 100644
index 000000000..fc88f0bb9
--- /dev/null
+++ b/kernel/loongarch64/daxpy_lsx.S
@@ -0,0 +1,365 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHA  $f0
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXA    $vr23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1
+    movgr2fr.d a2, TEMP
+    ffint.d.l a2, a2
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L999
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.d t1, ALPHA
+    vreplgr2vr.d VXA, t1
+
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L113
+    fcmp.ceq.d $fcc0, ALPHA, a2
+    bceqz $fcc0, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfadd.d VX2, VX0, VX2
+    vfadd.d VX3, VX1, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfadd.d VX2, VX0, VX2
+    vfadd.d VX3, VX1, VX3
+    addi.d  I, I, -1
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    b .L113
+    .align 3
+
+.L112:
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfmadd.d VX2, VX0, VXA, VX2
+    vfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    vfmadd.d VX2, VX0, VXA, VX2
+    vfmadd.d VX3, VX1, VXA, VX3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L112
+    .align 3
+
+.L113:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L114:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.d $f14, $f12, $f0, $f14
+    fst.d $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L114
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    move YY, Y
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX2, VX0, VXA, VX2
+    vld VX1, X, 2 * SIZE
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX3, VX1, VXA, VX3
+    vld VX0, X, 4 * SIZE
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX2, VX0, VXA, VX2
+    vld VX1, X, 6 * SIZE
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX3, VX1, VXA, VX3
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.d $f14, $f12, $f0, $f14
+    fst.d $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    vld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmadd.d VX2, VX0, VXA, VX2
+    vld VX3, Y, 2 * SIZE
+    vst VX2, Y, 0 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmadd.d VX3, VX1, VXA, VX3
+    vld VX2, Y, 4 * SIZE
+    vst VX3, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmadd.d VX2, VX0, VXA, VX2
+    vld VX3, Y, 6 * SIZE
+    vst VX2, Y, 4 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.d $f14, $f12, $f0, $f14
+    fst.d $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    move YY, Y
+    .align 3
+
+.L222:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX2, VX0, VXA, VX2
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX3, VX1, VXA, VX3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX2, VX0, VXA, VX2
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.d $f14, $f12, $f0, $f14
+    fst.d $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/saxpby_lasx.S b/kernel/loongarch64/saxpby_lasx.S
new file mode 100644
index 000000000..c5d1ff402
--- /dev/null
+++ b/kernel/loongarch64/saxpby_lasx.S
@@ -0,0 +1,597 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define ALPHA  $f0
+#define X      $r5
+#define INCX   $r6
+#define BETA   $f1
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXA    $xr23
+#define VXB    $xr9
+#define VXZ    $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.s.l a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.s t1, ALPHA
+    xvreplgr2vr.w VXA, t1
+    movfr2gr.s t2, BETA
+    xvreplgr2vr.w VXB, t2
+    movfr2gr.s t3, a1
+    xvreplgr2vr.w VXZ, t3
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L110
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
+    b .L111 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L110:
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
+    b .L113 // ALPHA==0 BETA!=0
+    .align 3
+
+.L111: // ALPHA!=0 BETA!=0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvfmadd.s VX2, VX2, VXB, VX0
+    xvst VX2, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // ALPHA!=0 BETA==0
+    xvld VX0, X, 0 * SIZE
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvst VX0, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // ALPHA==0 BETA!=0
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VX2, VX2, VXB
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // ALPHA==0 BETA==0
+    xvst VXZ, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L120
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
+    b .L121 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L120:
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
+    b .L123 // ALPHA==0 BETA!=0
+    .align 3
+
+.L121: // ALPHA!=0 BETA!=0
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, VX0, VXA
+    xvfmadd.s VX2, VX2, VXB, VX0
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // ALPHA!=0 BETA==0
+    xvld VX0, X, 0 * SIZE
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // ALPHA==0 BETA!=0
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX2, VX2, VXB
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // ALPHA==0 BETA==0
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L210
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
+    b .L211 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L210:
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
+    b .L213 // ALPHA==0 BETA!=0
+    .align 3
+
+.L211: // ALPHA!=0 BETA!=0
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VXA, VX0
+    xvfmadd.s VX2, VX2, VXB, VX0
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // ALPHA!=0 BETA==0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VXA, VX0
+    addi.d  I, I, -1
+    xvst VX0, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // ALPHA==0 BETA!=0
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VX2, VX2, VXB
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // ALPHA==0 BETA==0
+    xvst VXZ, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L214
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L220
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
+    b .L221 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L220:
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
+    b .L223 // ALPHA==0 BETA!=0
+    .align 3
+
+.L221: // ALPHA!=0 BETA!=0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, VX0, VXA
+    xvfmadd.s VX2, VX2, VXB, VX0
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // ALPHA!=0 BETA==0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // ALPHA==0 BETA!=0
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX2, VX2, VXB
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // ALPHA==0 BETA==0
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f13, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmul.s $f12, $f12, ALPHA
+    fmadd.s $f13, $f13, BETA, $f12
+    fst.s $f13, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/saxpby_lsx.S b/kernel/loongarch64/saxpby_lsx.S
new file mode 100644
index 000000000..7f8cea2dd
--- /dev/null
+++ b/kernel/loongarch64/saxpby_lsx.S
@@ -0,0 +1,629 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define ALPHA  $f0
+#define X      $r5
+#define INCX   $r6
+#define BETA   $f1
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXA    $vr23
+#define VXB    $vr9
+#define VXZ    $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.s.l a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.s t1, ALPHA
+    vreplgr2vr.w VXA, t1
+    movfr2gr.s t2, BETA
+    vreplgr2vr.w VXB, t2
+    movfr2gr.s t3, a1
+    vreplgr2vr.w VXZ, t3
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L110
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
+    b .L111 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L110:
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
+    b .L113 // ALPHA==0 BETA!=0
+    .align 3
+
+.L111: // ALPHA!=0 BETA!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX2, VX2, VXB, VX0
+    vfmadd.s VX3, VX3, VXB, VX1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // ALPHA!=0 BETA==0
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // ALPHA==0 BETA!=0
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX2, VX2, VXB
+    vfmul.s VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // ALPHA==0 BETA==0
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L120
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
+    b .L121 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L120:
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
+    b .L123 // ALPHA==0 BETA!=0
+    .align 3
+
+.L121: // ALPHA!=0 BETA!=0
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VX0, VXA
+    vld VX1, X, 4 * SIZE
+    vfmadd.s VX2, VX2, VXB, VX0
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // ALPHA!=0 BETA==0
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // ALPHA==0 BETA!=0
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX2, VX2, VXB
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // ALPHA==0 BETA==0
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L210
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
+    b .L211 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L210:
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
+    b .L213 // ALPHA==0 BETA!=0
+    .align 3
+
+.L211: // ALPHA!=0 BETA!=0
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VXA, VX0
+    vld VX3, Y, 4 * SIZE
+    vfmadd.s VX2, VX2, VXB, VX0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX2, Y, 0 * SIZE
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // ALPHA!=0 BETA==0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VXA, VX0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX0, Y, 0 * SIZE
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vst VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // ALPHA==0 BETA!=0
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX2, VX2, VXB
+    vfmul.s VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // ALPHA==0 BETA==0
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L214
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L220
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
+    b .L221 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L220:
+    fcmp.ceq.s $fcc0, BETA, a1
+    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
+    b .L223 // ALPHA==0 BETA!=0
+    .align 3
+
+.L221: // ALPHA!=0 BETA!=0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VX0, VXA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vfmadd.s VX2, VX2, VXB, VX0
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vfmadd.s VX3, VX3, VXB, VX1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // ALPHA!=0 BETA==0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VX0, VXA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vstelm.w VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // ALPHA==0 BETA!=0
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX2, VX2, VXB
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // ALPHA==0 BETA==0
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f13, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmul.s $f12, $f12, ALPHA
+    fmadd.s $f13, $f13, BETA, $f12
+    fst.s $f13, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/saxpy_lasx.S b/kernel/loongarch64/saxpy_lasx.S
new file mode 100644
index 000000000..609e26328
--- /dev/null
+++ b/kernel/loongarch64/saxpy_lasx.S
@@ -0,0 +1,323 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHA  $f0
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXA    $xr23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.s.l a1, a1
+    movgr2fr.d a2, TEMP
+    ffint.s.l a2, a2
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L999
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.s t1, ALPHA
+    xvreplgr2vr.w VXA, t1
+
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L113
+    fcmp.ceq.s $fcc0, ALPHA, a2
+    bceqz $fcc0, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvfadd.s VX2, VX0, VX2
+    xvst VX2, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    b .L113
+    .align 3
+
+.L112:
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvfmadd.s VX2, VX0, VXA, VX2
+    xvst VX2, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L112
+    .align 3
+
+.L113:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L114:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.s $f14, $f12, $f0, $f14
+    fst.s $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L114
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    move YY, Y
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.s $f14, $f12, $f0, $f14
+    fst.s $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.s $f14, $f12, $f0, $f14
+    fst.s $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    move YY, Y
+    .align 3
+
+.L222:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.s $f14, $f12, $f0, $f14
+    fst.s $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/saxpy_lsx.S b/kernel/loongarch64/saxpy_lsx.S
new file mode 100644
index 000000000..f47415ed6
--- /dev/null
+++ b/kernel/loongarch64/saxpy_lsx.S
@@ -0,0 +1,338 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHA  $f0
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXA    $vr23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.s.l a1, a1
+    movgr2fr.d a2, TEMP
+    ffint.s.l a2, a2
+    fcmp.ceq.s $fcc0, ALPHA, a1
+    bcnez $fcc0, .L999
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.s t1, ALPHA
+    vreplgr2vr.w VXA, t1
+
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L113
+    fcmp.ceq.s $fcc0, ALPHA, a2
+    bceqz $fcc0, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfadd.s VX2, VX0, VX2
+    vfadd.s VX3, VX1, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L113
+    .align 3
+
+.L112:
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmadd.s VX2, VX0, VXA, VX2
+    vfmadd.s VX3, VX1, VXA, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L113
+    .align 3
+
+.L113:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L114:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.s $f14, $f12, $f0, $f14
+    fst.s $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L114
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    move YY, Y
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX2, VX0, VXA, VX2
+    vld VX1, X, 4 * SIZE
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.s $f14, $f12, $f0, $f14
+    fst.s $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmadd.s VX2, VX0, VXA, VX2
+    vld VX3, Y, 4 * SIZE
+    vst VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.s $f14, $f12, $f0, $f14
+    fst.s $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    move YY, Y
+    .align 3
+
+.L222:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX2, VX0, VXA, VX2
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    fmadd.s $f14, $f12, $f0, $f14
+    fst.s $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From c80e7e27d1b70d6f9cf80bd64bde096d0c3359e3 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 13:08:03 +0800
Subject: [PATCH 473/718] loongarch64: Add optimizations for sum and asum.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   6 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   6 +
 kernel/loongarch64/dasum_lasx.S          | 148 +++++++++++++++++++++
 kernel/loongarch64/dasum_lsx.S           | 158 +++++++++++++++++++++++
 kernel/loongarch64/dsum_lasx.S           | 125 ++++++++++++++++++
 kernel/loongarch64/dsum_lsx.S            | 123 ++++++++++++++++++
 kernel/loongarch64/sasum_lasx.S          | 157 ++++++++++++++++++++++
 kernel/loongarch64/sasum_lsx.S           | 148 +++++++++++++++++++++
 kernel/loongarch64/ssum_lasx.S           | 140 ++++++++++++++++++++
 kernel/loongarch64/ssum_lsx.S            | 125 ++++++++++++++++++
 10 files changed, 1136 insertions(+)
 create mode 100644 kernel/loongarch64/dasum_lasx.S
 create mode 100644 kernel/loongarch64/dasum_lsx.S
 create mode 100644 kernel/loongarch64/dsum_lasx.S
 create mode 100644 kernel/loongarch64/dsum_lsx.S
 create mode 100644 kernel/loongarch64/sasum_lasx.S
 create mode 100644 kernel/loongarch64/sasum_lsx.S
 create mode 100644 kernel/loongarch64/ssum_lasx.S
 create mode 100644 kernel/loongarch64/ssum_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index a94303151..5e83e67a4 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -43,4 +43,10 @@ DAXPYKERNEL =  daxpy_lsx.S
 SAXPBYKERNEL = saxpby_lsx.S
 DAXPBYKERNEL = daxpby_lsx.S
 
+SSUMKERNEL  =  ssum_lsx.S
+DSUMKERNEL  =  dsum_lsx.S
+
+SASUMKERNEL =  sasum_lsx.S
+DASUMKERNEL =  dasum_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 4cfd53058..2cc2edb74 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -43,6 +43,12 @@ DAXPYKERNEL =  daxpy_lasx.S
 SAXPBYKERNEL = saxpby_lasx.S
 DAXPBYKERNEL = daxpby_lasx.S
 
+SSUMKERNEL  =  ssum_lasx.S
+DSUMKERNEL  =  dsum_lasx.S
+
+SASUMKERNEL =  sasum_lasx.S
+DASUMKERNEL =  dasum_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/dasum_lasx.S b/kernel/loongarch64/dasum_lasx.S
new file mode 100644
index 000000000..49de98c40
--- /dev/null
+++ b/kernel/loongarch64/dasum_lasx.S
@@ -0,0 +1,148 @@
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define VT0    $xr23
+#define VT1    $xr22
+#define res1   $xr16
+#define res2   $xr17
+#define res0   $xr18
+#define neg1   $xr19
+
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d t1, -1
+    xvreplgr2vr.d neg1, t1
+    xvffint.d.l neg1, neg1
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    fld.d $f12, X, 0 * SIZE
+    fabs.d $f12, $f12
+    fadd.d $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.d $f12, X, 0 * SIZE
+    fabs.d $f12, $f12
+    fadd.d $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.d $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/dasum_lsx.S b/kernel/loongarch64/dasum_lsx.S
new file mode 100644
index 000000000..94750815e
--- /dev/null
+++ b/kernel/loongarch64/dasum_lsx.S
@@ -0,0 +1,158 @@
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define VT0    $vr23
+#define VT1    $vr22
+#define res1   $vr16
+#define res2   $vr17
+#define res0   $vr18
+#define neg1   $vr19
+
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d t1, -1
+    vreplgr2vr.d neg1, t1
+    vffint.d.l neg1, neg1
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    fld.d $f12, X, 0 * SIZE
+    fabs.d $f12, $f12
+    fadd.d $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.d $f12, X, 0 * SIZE
+    fabs.d $f12, $f12
+    fadd.d $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.d $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S
new file mode 100644
index 000000000..3c51dab60
--- /dev/null
+++ b/kernel/loongarch64/dsum_lasx.S
@@ -0,0 +1,125 @@
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define res1   $xr16
+#define res2   $xr17
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    fld.d $f12, X, 0 * SIZE
+    addi.d I, I, -1
+    fadd.d $f16, $f12, $f16
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.d $f12, X, 0 * SIZE
+    fadd.d $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.d $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S
new file mode 100644
index 000000000..402d087df
--- /dev/null
+++ b/kernel/loongarch64/dsum_lsx.S
@@ -0,0 +1,123 @@
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16
+#define res2   $vr17
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    fld.d $f12, X, 0 * SIZE
+    fadd.d $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.d $f12, X, 0 * SIZE
+    fadd.d $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.d $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sasum_lasx.S b/kernel/loongarch64/sasum_lasx.S
new file mode 100644
index 000000000..a452701aa
--- /dev/null
+++ b/kernel/loongarch64/sasum_lasx.S
@@ -0,0 +1,157 @@
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define VT0    $xr23
+#define VT1    $xr22
+#define res1   $xr16
+#define res2   $xr17
+#define res0   $xr18
+#define neg1   $xr19
+
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.w t1, -1
+    xvreplgr2vr.w neg1, t1
+    xvffint.s.w neg1, neg1
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    xvld VX0, X, 0 * SIZE
+    xvfmul.s VX2, neg1, VX0
+    xvfcmp.clt.s VT0, VX0, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvfadd.s res1, VX0, res1
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    fld.s $f12, X, 0 * SIZE
+    fabs.s $f12, $f12
+    fadd.s $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VX2, neg1, VX0
+    xvfcmp.clt.s VT0, VX0, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvfadd.s res1, VX0, res1
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.s $f12, X, 0 * SIZE
+    fabs.s $f12, $f12
+    fadd.s $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/sasum_lsx.S b/kernel/loongarch64/sasum_lsx.S
new file mode 100644
index 000000000..87026a144
--- /dev/null
+++ b/kernel/loongarch64/sasum_lsx.S
@@ -0,0 +1,148 @@
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define VT0    $vr23
+#define VT1    $vr22
+#define res1   $vr16
+#define res2   $vr17
+#define res0   $vr18
+#define neg1   $vr19
+
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.w t1, -1
+    vreplgr2vr.w neg1, t1
+    vffint.s.w neg1, neg1
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    fld.s $f12, X, 0 * SIZE
+    fabs.s $f12, $f12
+    fadd.s $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.s $f12, X, 0 * SIZE
+    fabs.s $f12, $f12
+    fadd.s $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S
new file mode 100644
index 000000000..7cf57bc77
--- /dev/null
+++ b/kernel/loongarch64/ssum_lasx.S
@@ -0,0 +1,140 @@
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define res1   $xr16
+#define res2   $xr17
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    xvld VX0, X, 0 * SIZE
+    xvfadd.s res1, VX0, res1
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    fld.s $f12, X, 0 * SIZE
+    fadd.s $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfadd.s res1, VX0, res1
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.s $f12, X, 0 * SIZE
+    fadd.s $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S
new file mode 100644
index 000000000..de63c69e3
--- /dev/null
+++ b/kernel/loongarch64/ssum_lsx.S
@@ -0,0 +1,125 @@
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16
+#define res2   $vr17
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    fld.s $f12, X, 0 * SIZE
+    fadd.s $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.s $f12, X, 0 * SIZE
+    fadd.s $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file

From f9b468990e2ca2af5ea3f5848dc23152aa0da5bb Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 13:12:29 +0800
Subject: [PATCH 474/718] loongarch64: Add optimizations for rot.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |    3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |    3 +
 kernel/loongarch64/drot_lasx.S           |  927 +++++++++++++++++++
 kernel/loongarch64/drot_lsx.S            | 1050 ++++++++++++++++++++++
 kernel/loongarch64/srot_lasx.S           |  863 ++++++++++++++++++
 kernel/loongarch64/srot_lsx.S            |  927 +++++++++++++++++++
 6 files changed, 3773 insertions(+)
 create mode 100644 kernel/loongarch64/drot_lasx.S
 create mode 100644 kernel/loongarch64/drot_lsx.S
 create mode 100644 kernel/loongarch64/srot_lasx.S
 create mode 100644 kernel/loongarch64/srot_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 5e83e67a4..026ea0d77 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -49,4 +49,7 @@ DSUMKERNEL  =  dsum_lsx.S
 SASUMKERNEL =  sasum_lsx.S
 DASUMKERNEL =  dasum_lsx.S
 
+SROTKERNEL  =  srot_lsx.S
+DROTKERNEL  =  drot_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 2cc2edb74..4905a50a9 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -49,6 +49,9 @@ DSUMKERNEL  =  dsum_lasx.S
 SASUMKERNEL =  sasum_lasx.S
 DASUMKERNEL =  dasum_lasx.S
 
+SROTKERNEL  =  srot_lasx.S
+DROTKERNEL  =  drot_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/drot_lasx.S b/kernel/loongarch64/drot_lasx.S
new file mode 100644
index 000000000..d3644b780
--- /dev/null
+++ b/kernel/loongarch64/drot_lasx.S
@@ -0,0 +1,927 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define C      $f0
+#define S      $f1
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VT0    $xr10
+#define VT1    $xr18
+#define VXC    $xr23
+#define VXS    $xr9
+#define VXZ    $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.d t1, C
+    xvreplgr2vr.d VXC, t1
+    movfr2gr.d t2, S
+    xvreplgr2vr.d VXS, t2
+    movfr2gr.d t3, a1
+    xvreplgr2vr.d VXZ, t3
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    fcmp.ceq.d $fcc0, C, a1
+    bcnez $fcc0, .L110
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfmul.d VT0, VX0, VXC
+    xvfmadd.d VT0, VX2, VXS, VT0
+    xvfmul.d VT1, VX0, VXS
+    xvfmsub.d VT1, VX2, VXC, VT1
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+    xvfmul.d VT0, VX1, VXC
+    xvfmadd.d VT0, VX3, VXS, VT0
+    xvfmul.d VT1, VX1, VXS
+    xvfmsub.d VT1, VX3, VXC, VT1
+    xvst VT0, X, 4 * SIZE
+    xvst VT1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfmul.d VT0, VX0, VXC
+    xvfmul.d VT1, VX2, VXC
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+    xvfmul.d VT0, VX1, VXC
+    xvfmul.d VT1, VX3, VXC
+    xvst VT0, X, 4 * SIZE
+    xvst VT1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfmul.d VT0, VX2, VXS
+    xvfmul.d VT1, VX0, VXS
+    xvfsub.d VT1, VXZ, VT1
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+    xvfmul.d VT0, VX3, VXS
+    xvfmul.d VT1, VX1, VXS
+    xvfsub.d VT1, VXZ, VT1
+    xvst VT0, X, 4 * SIZE
+    xvst VT1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0
+    xvst VXZ, X, 0 * SIZE
+    xvst VXZ, Y, 0 * SIZE
+    xvst VXZ, X, 4 * SIZE
+    xvst VXZ, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    fcmp.ceq.d $fcc0, C, a1
+    bcnez $fcc0, .L120
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L122 // C!=0 S==0
+    b .L121 // C!=0 S!=0
+    .align 3
+
+.L120:
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L124 // C==0 S==0
+    b .L123 // C==0 S!=0
+    .align 3
+
+.L121: // C!=0 S!=0
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX0, VXC
+    xvfmadd.d VT0, VX2, VXS, VT0
+    xvfmul.d VT1, VX0, VXS
+    xvfmsub.d VT1, VX2, VXC, VT1
+    xvld VX1, X, 4 * SIZE
+    xvst VT0, X, 0 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX1, VXC
+    xvfmadd.d VT0, VX3, VXS, VT0
+    xvfmul.d VT1, VX1, VXS
+    xvfmsub.d VT1, VX3, VXC, VT1
+    addi.d  I, I, -1
+    xvst VT0, X, 4 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // C!=0 S==0
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX0, VXC
+    xvfmul.d VT1, VX2, VXC
+    xvld VX1, X, 4 * SIZE
+    xvst VT0, X, 0 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX1, VXC
+    xvfmul.d VT1, VX3, VXC
+    addi.d  I, I, -1
+    xvst VT0, X, 4 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // C==0 S!=0
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX2, VXS
+    xvfmul.d VT1, VX0, VXS
+    xvfsub.d VT1, VXZ, VT1
+    xvst VT0, X, 0 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX3, VXS
+    xvfmul.d VT1, VX1, VXS
+    xvfsub.d VT1, VXZ, VT1
+    addi.d  I, I, -1
+    xvst VT0, X, 4 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // C==0 S==0
+    xvst VXZ, X, 0 * SIZE
+    xvst VXZ, X, 4 * SIZE
+    xvstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    move XX, X
+    fcmp.ceq.d $fcc0, C, a1
+    bcnez $fcc0, .L210
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L212 // C!=0 S==0
+    b .L211 // C!=0 S!=0
+    .align 3
+
+.L210:
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L214 // C==0 S==0
+    b .L213 // C==0 S!=0
+    .align 3
+
+.L211: // C!=0 S!=0
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VXC, VX0
+    xvfmadd.d VT0, VX2, VXS, VT0
+    xvfmul.d VT1, VXS, VX0
+    xvfmsub.d VT1, VX2, VXC, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VX1, VXC
+    xvfmadd.d VT0, VX3, VXS, VT0
+    xvfmul.d VT1, VX1, VXS
+    xvfmsub.d VT1, VX3, VXC, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // C!=0 S==0
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VXC, VX0
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvfmul.d VT1, VX2, VXC
+    xvst VT1, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VX1, VXC
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvfmul.d VT1, VX3, VXS
+    xvst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // C==0 S!=0
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VXS, VX2
+    xvfmul.d VT1, VXS, VX0
+    xvfsub.d VT1, VXZ, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VX3, VXS
+    xvfmul.d VT1, VX1, VXS
+    xvfsub.d VT1, VXZ, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // C==0 S==0
+    xvstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    xvstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    fcmp.ceq.d $fcc0, C, a1
+    bcnez $fcc0, .L220
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220:
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX0, VXC
+    xvfmadd.d VT0, VX2, VXS, VT0
+    xvfmul.d VT1, VX0, VXS
+    xvfmsub.d VT1, VX2, VXC, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX1, VXC
+    xvfmadd.d VT0, VX3, VXS, VT0
+    xvfmul.d VT1, VX0, VXS
+    xvfmsub.d VT1, VX3, VXC, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // C!=0 S==0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX0, VXC
+    xvfmul.d VT1, VX2, VXC
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX1, VXC
+    xvfmul.d VT1, VX3, VXC
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // C==0 S!=0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX2, VXS
+    xvfmul.d VT1, VX0, VXS
+    xvfsub.d VT1, VXZ, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX3, VXS
+    xvfmul.d VT1, VX0, VXS
+    xvfsub.d VT1, VXZ, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // C==0 S==0
+    xvstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f13, Y, 0 * SIZE
+    fmul.d $f10, $f12, C
+    fmadd.d $f10, $f13, S, $f10
+    fst.d $f10, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d $f20, $f12, S
+    fmsub.d $f20, $f13, C, $f20
+    fst.d $f20, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/drot_lsx.S b/kernel/loongarch64/drot_lsx.S
new file mode 100644
index 000000000..6db803b1c
--- /dev/null
+++ b/kernel/loongarch64/drot_lsx.S
@@ -0,0 +1,1050 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define C      $f0
+#define S      $f1
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VT0    $vr10
+#define VT1    $vr18
+#define VXC    $vr23
+#define VXS    $vr9
+#define VXZ    $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.d t1, C
+    vreplgr2vr.d VXC, t1
+    movfr2gr.d t2, S
+    vreplgr2vr.d VXS, t2
+    movfr2gr.d t3, a1
+    vreplgr2vr.d VXZ, t3
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    fcmp.ceq.d $fcc0, C, a1
+    bcnez $fcc0, .L110
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfmul.d VT0, VX0, VXC
+    vfmadd.d VT0, VX2, VXS, VT0
+    vfmul.d VT1, VX0, VXS
+    vfmsub.d VT1, VX2, VXC, VT1
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    vfmul.d VT0, VX1, VXC
+    vfmadd.d VT0, VX3, VXS, VT0
+    vfmul.d VT1, VX1, VXS
+    vfmsub.d VT1, VX3, VXC, VT1
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VT0, VX0, VXC
+    vfmadd.d VT0, VX2, VXS, VT0
+    vfmul.d VT1, VX0, VXS
+    vfmsub.d VT1, VX2, VXC, VT1
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+    vfmul.d VT0, VX1, VXC
+    vfmadd.d VT0, VX3, VXS, VT0
+    vfmul.d VT1, VX1, VXS
+    vfmsub.d VT1, VX3, VXC, VT1
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfmul.d VT0, VX0, VXC
+    vfmul.d VT1, VX2, VXC
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    vfmul.d VT0, VX1, VXC
+    vfmul.d VT1, VX3, VXC
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VT0, VX0, VXC
+    vfmul.d VT1, VX2, VXC
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+    vfmul.d VT0, VX1, VXC
+    vfmul.d VT1, VX3, VXC
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfmul.d VT0, VX2, VXS
+    vfmul.d VT1, VX0, VXS
+    vfsub.d VT1, VXZ, VT1
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    vfmul.d VT0, VX3, VXS
+    vfmul.d VT1, VX1, VXS
+    vfsub.d VT1, VXZ, VT1
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VT0, VX2, VXS
+    vfmul.d VT1, VX0, VXS
+    vfsub.d VT1, VXZ, VT1
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+    vfmul.d VT0, VX3, VXS
+    vfmul.d VT1, VX1, VXS
+    vfsub.d VT1, VXZ, VT1
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, X, 2 * SIZE
+    vst VXZ, Y, 2 * SIZE
+    vst VXZ, X, 4 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    vst VXZ, X, 6 * SIZE
+    vst VXZ, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    fcmp.ceq.d $fcc0, C, a1
+    bcnez $fcc0, .L120
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L122 // C!=0 S==0
+    b .L121 // C!=0 S!=0
+    .align 3
+
+.L120:
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L124 // C==0 S==0
+    b .L123 // C==0 S!=0
+    .align 3
+
+.L121: // C!=0 S!=0
+    vld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX0, VXC
+    vfmadd.d VT0, VX2, VXS, VT0
+    vfmul.d VT1, VX0, VXS
+    vfmsub.d VT1, VX2, VXC, VT1
+    vst VT0, X, 0 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX0, VXC
+    vfmadd.d VT0, VX2, VXS, VT0
+    vfmul.d VT1, VX0, VXS
+    vfmsub.d VT1, VX2, VXC, VT1
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX1, VXC
+    vfmadd.d VT0, VX3, VXS, VT0
+    vfmul.d VT1, VX1, VXS
+    vfmsub.d VT1, VX3, VXC, VT1
+    vst VT0, X, 4 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX1, VXC
+    vfmadd.d VT0, VX3, VXS, VT0
+    vfmul.d VT1, VX1, VXS
+    vfmsub.d VT1, VX3, VXC, VT1
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // C!=0 S==0
+    vld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX0, VXC
+    vfmul.d VT1, VX2, VXC
+    vst VT0, X, 0 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX0, VXC
+    vfmul.d VT1, VX2, VXC
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX1, VXC
+    vfmul.d VT1, VX3, VXC
+    vst VT0, X, 4 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX1, VXC
+    vfmul.d VT1, VX3, VXC
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // C==0 S!=0
+    vld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX2, VXS
+    vfmul.d VT1, VX0, VXS
+    vfsub.d VT1, VXZ, VT1
+    vst VT0, X, 0 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX2, VXS
+    vfmul.d VT1, VX0, VXS
+    vfsub.d VT1, VXZ, VT1
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX3, VXS
+    vfmul.d VT1, VX1, VXS
+    vfsub.d VT1, VXZ, VT1
+    vst VT0, X, 4 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX3, VXS
+    vfmul.d VT1, VX1, VXS
+    vfsub.d VT1, VXZ, VT1
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // C==0 S==0
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, X, 4 * SIZE
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    move XX, X
+    fcmp.ceq.d $fcc0, C, a1
+    bcnez $fcc0, .L210
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L212 // C!=0 S==0
+    b .L211 // C!=0 S!=0
+    .align 3
+
+.L210:
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L214 // C==0 S==0
+    b .L213 // C==0 S!=0
+    .align 3
+
+.L211: // C!=0 S!=0
+    vld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VXC, VX0
+    vfmadd.d VT0, VX2, VXS, VT0
+    vfmul.d VT1, VXS, VX0
+    vfmsub.d VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VXC, VX0
+    vfmadd.d VT0, VX2, VXS, VT0
+    vfmul.d VT1, VXS, VX0
+    vfmsub.d VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VX1, VXC
+    vfmadd.d VT0, VX3, VXS, VT0
+    vfmul.d VT1, VX1, VXS
+    vfmsub.d VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VX1, VXC
+    vfmadd.d VT0, VX3, VXS, VT0
+    vfmul.d VT1, VX1, VXS
+    vfmsub.d VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // C!=0 S==0
+    vld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VXC, VX0
+    vfmul.d VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VXC, VX0
+    vfmul.d VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VX1, VXC
+    vfmul.d VT1, VX3, VXS
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VX1, VXC
+    vfmul.d VT1, VX3, VXS
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    vst VT1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // C==0 S!=0
+    vld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VXS, VX2
+    vfmul.d VT1, VXS, VX0
+    vfsub.d VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VXS, VX2
+    vfmul.d VT1, VXS, VX0
+    vfsub.d VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VX3, VXS
+    vfmul.d VT1, VX1, VXS
+    vfsub.d VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VT0, VX3, VXS
+    vfmul.d VT1, VX1, VXS
+    vfsub.d VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // C==0 S==0
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    fcmp.ceq.d $fcc0, C, a1
+    bcnez $fcc0, .L220
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220:
+    fcmp.ceq.d $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX0, VXC
+    vfmadd.d VT0, VX2, VXS, VT0
+    vfmul.d VT1, VX0, VXS
+    vfmsub.d VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d  VT0, VX0, VXC
+    vfmadd.d VT0, VX2, VXS, VT0
+    vfmul.d  VT1, VX0, VXS
+    vfmsub.d VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX1, VXC
+    vfmadd.d VT0, VX3, VXS, VT0
+    vfmul.d VT1, VX0, VXS
+    vfmsub.d VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX1, VXC
+    vfmadd.d VT0, VX3, VXS, VT0
+    vfmul.d VT1, VX0, VXS
+    vfmsub.d VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // C!=0 S==0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX0, VXC
+    vfmul.d VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX0, VXC
+    vfmul.d VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX1, VXC
+    vfmul.d VT1, VX3, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX1, VXC
+    vfmul.d VT1, VX3, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // C==0 S!=0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX2, VXS
+    vfmul.d VT1, VX0, VXS
+    vfsub.d VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX2, VXS
+    vfmul.d VT1, VX0, VXS
+    vfsub.d VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX3, VXS
+    vfmul.d VT1, VX0, VXS
+    vfsub.d VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VT0, VX3, VXS
+    vfmul.d VT1, VX0, VXS
+    vfsub.d VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // C==0 S==0
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.d $f12, X, 0 * SIZE
+    fld.d $f13, Y, 0 * SIZE
+    fmul.d $f10, $f12, C
+    fmadd.d $f10, $f13, S, $f10
+    fst.d $f10, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d $f20, $f12, S
+    fmsub.d $f20, $f13, C, $f20
+    fst.d $f20, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/srot_lasx.S b/kernel/loongarch64/srot_lasx.S
new file mode 100644
index 000000000..9aeb4dcf5
--- /dev/null
+++ b/kernel/loongarch64/srot_lasx.S
@@ -0,0 +1,863 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define C      $f0
+#define S      $f1
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VT0    $xr10
+#define VT1    $xr18
+#define VXC    $xr23
+#define VXS    $xr9
+#define VXZ    $xr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.s.l a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.s t1, C
+    xvreplgr2vr.w VXC, t1
+    movfr2gr.s t2, S
+    xvreplgr2vr.w VXS, t2
+    movfr2gr.s t3, a1
+    xvreplgr2vr.w VXZ, t3
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    fcmp.ceq.s $fcc0, C, a1
+    bcnez $fcc0, .L110
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VT0, VX0, VXC
+    xvfmadd.s VT0, VX2, VXS, VT0
+    xvfmul.s VT1, VX0, VXS
+    xvfmsub.s VT1, VX2, VXC, VT1
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VT0, VX0, VXC
+    xvfmul.s VT1, VX2, VXC
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VT0, VX2, VXS
+    xvfmul.s VT1, VX0, VXS
+    xvfsub.s VT1, VXZ, VT1
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0
+    xvst VXZ, X, 0 * SIZE
+    xvst VXZ, Y, 0 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    fcmp.ceq.s $fcc0, C, a1
+    bcnez $fcc0, .L120
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L122 // C!=0 S==0
+    b .L121 // C!=0 S!=0
+    .align 3
+
+.L120:
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L124 // C==0 S==0
+    b .L123 // C==0 S!=0
+    .align 3
+
+.L121: // C!=0 S!=0
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX0, VXC
+    xvfmadd.s VT0, VX2, VXS, VT0
+    xvfmul.s VT1, VX0, VXS
+    xvfmsub.s VT1, VX2, VXC, VT1
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // C!=0 S==0
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX0, VXC
+    xvfmul.s VT1, VX2, VXC
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // C==0 S!=0
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX2, VXS
+    xvfmul.s VT1, VX0, VXS
+    xvfsub.s VT1, VXZ, VT1
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // C==0 S==0
+    xvst VXZ, X, 0 * SIZE
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    move XX, X
+    fcmp.ceq.s $fcc0, C, a1
+    bcnez $fcc0, .L210
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L212 // C!=0 S==0
+    b .L211 // C!=0 S!=0
+    .align 3
+
+.L210:
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L214 // C==0 S==0
+    b .L213 // C==0 S!=0
+    .align 3
+
+.L211: // C!=0 S!=0
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXC, VX0
+    xvfmadd.s VT0, VX2, VXS, VT0
+    xvfmul.s VT1, VX0, VXS
+    xvfmsub.s VT1, VX2, VXC, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // C!=0 S==0
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXC, VX0
+    xvfmul.s VT1, VX2, VXC
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // C==0 S!=0
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXS, VX2
+    xvfmul.s VT1, VXS, VX0
+    xvfsub.s VT1, VXZ, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // C==0 S==0
+    xvstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    xvstelm.w VXZ, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 7
+    add.d XX, XX, INCX
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    fcmp.ceq.s $fcc0, C, a1
+    bcnez $fcc0, .L220
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220:
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s  VT0, VX0, VXC
+    xvfmadd.s VT0, VX2, VXS, VT0
+    xvfmul.s  VT1, VX0, VXS
+    xvfmsub.s VT1, VX2, VXC, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // C!=0 S==0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX0, VXC
+    xvfmul.s VT1, VX2, VXC
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // C==0 S!=0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX2, VXS
+    xvfmul.s VT1, VX0, VXS
+    xvfsub.s VT1, VXZ, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // C==0 S==0
+    xvstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f13, Y, 0 * SIZE
+    fmul.s $f10, $f12, C
+    fmadd.s $f10, $f13, S, $f10
+    fst.s $f10, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.s $f20, $f12, S
+    fmsub.s $f20, $f13, C, $f20
+    fst.s $f20, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/srot_lsx.S b/kernel/loongarch64/srot_lsx.S
new file mode 100644
index 000000000..8822b58e4
--- /dev/null
+++ b/kernel/loongarch64/srot_lsx.S
@@ -0,0 +1,927 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define C      $f0
+#define S      $f1
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VT0    $vr10
+#define VT1    $vr18
+#define VXC    $vr23
+#define VXS    $vr9
+#define VXZ    $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.s.l a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    movfr2gr.s t1, C
+    vreplgr2vr.w VXC, t1
+    movfr2gr.s t2, S
+    vreplgr2vr.w VXS, t2
+    movfr2gr.s t3, a1
+    vreplgr2vr.w VXZ, t3
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    fcmp.ceq.s $fcc0, C, a1
+    bcnez $fcc0, .L110
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VT0, VX0, VXC
+    vfmadd.s VT0, VX2, VXS, VT0
+    vfmul.s VT1, VX0, VXS
+    vfmsub.s VT1, VX2, VXC, VT1
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    vfmul.s VT0, VX1, VXC
+    vfmadd.s VT0, VX3, VXS, VT0
+    vfmul.s VT1, VX1, VXS
+    vfmsub.s VT1, VX3, VXC, VT1
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VT0, VX0, VXC
+    vfmul.s VT1, VX2, VXC
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    vfmul.s VT0, VX1, VXC
+    vfmul.s VT1, VX3, VXC
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VT0, VX2, VXS
+    vfmul.s VT1, VX0, VXS
+    vfsub.s VT1, VXZ, VT1
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    vfmul.s VT0, VX3, VXS
+    vfmul.s VT1, VX1, VXS
+    vfsub.s VT1, VXZ, VT1
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, X, 4 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    fcmp.ceq.s $fcc0, C, a1
+    bcnez $fcc0, .L120
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L122 // C!=0 S==0
+    b .L121 // C!=0 S!=0
+    .align 3
+
+.L120:
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L124 // C==0 S==0
+    b .L123 // C==0 S!=0
+    .align 3
+
+.L121: // C!=0 S!=0
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX0, VXC
+    vfmadd.s VT0, VX2, VXS, VT0
+    vfmul.s VT1, VX0, VXS
+    vfmsub.s VT1, VX2, VXC, VT1
+    vst VT0, X, 0 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX1, VXC
+    vfmadd.s VT0, VX3, VXS, VT0
+    vfmul.s VT1, VX1, VXS
+    vfmsub.s VT1, VX3, VXC, VT1
+    vst VT0, X, 4 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // C!=0 S==0
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX0, VXC
+    vfmul.s VT1, VX2, VXC
+    vst VT0, X, 0 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX1, VXC
+    vfmul.s VT1, VX3, VXC
+    vst VT0, X, 4 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // C==0 S!=0
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX2, VXS
+    vfmul.s VT1, VX0, VXS
+    vfsub.s VT1, VXZ, VT1
+    vst VT0, X, 0 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX3, VXS
+    vfmul.s VT1, VX1, VXS
+    vfsub.s VT1, VXZ, VT1
+    vst VT0, X, 4 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // C==0 S==0
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, X, 4 * SIZE
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    move XX, X
+    fcmp.ceq.s $fcc0, C, a1
+    bcnez $fcc0, .L210
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L212 // C!=0 S==0
+    b .L211 // C!=0 S!=0
+    .align 3
+
+.L210:
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L214 // C==0 S==0
+    b .L213 // C==0 S!=0
+    .align 3
+
+.L211: // C!=0 S!=0
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VT0, VXC, VX0
+    vfmadd.s VT0, VX2, VXS, VT0
+    vfmul.s VT1, VXS, VX0
+    vfmsub.s VT1, VX2, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vfmul.s VT0, VX1, VXC
+    vfmadd.s VT0, VX3, VXS, VT0
+    vfmul.s VT1, VX1, VXS
+    vfmsub.s VT1, VX3, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // C!=0 S==0
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VT0, VXC, VX0
+    vfmul.s VT1, VX2, VXC
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vfmul.s VT0, VX1, VXC
+    vfmul.s VT1, VX3, VXS
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // C==0 S!=0
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VT0, VXS, VX2
+    vfmul.s VT1, VXS, VX0
+    vfsub.s VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vfmul.s VT0, VX3, VXS
+    vfmul.s VT1, VX1, VXS
+    vfsub.s VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // C==0 S==0
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    fcmp.ceq.s $fcc0, C, a1
+    bcnez $fcc0, .L220
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220:
+    fcmp.ceq.s $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX0, VXC
+    vfmadd.s VT0, VX2, VXS, VT0
+    vfmul.s VT1, VX0, VXS
+    vfmsub.s VT1, VX2, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX1, VXC
+    vfmadd.s VT0, VX3, VXS, VT0
+    vfmul.s VT1, VX0, VXS
+    vfmsub.s VT1, VX3, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // C!=0 S==0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX0, VXC
+    vfmul.s VT1, VX2, VXC
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX1, VXC
+    vfmul.s VT1, VX3, VXC
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // C==0 S!=0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX2, VXS
+    vfmul.s VT1, VX0, VXS
+    vfsub.s VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VT0, VX3, VXS
+    vfmul.s VT1, VX0, VXS
+    vfsub.s VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // C==0 S==0
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.s $f12, X, 0 * SIZE
+    fld.s $f13, Y, 0 * SIZE
+    fmul.s $f10, $f12, C
+    fmadd.s $f10, $f13, S, $f10
+    fst.s $f10, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.s $f20, $f12, S
+    fmsub.s $f20, $f13, C, $f20
+    fst.s $f20, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
\ No newline at end of file

From d32f38fb37c94afae20a662182fe7178c005a673 Mon Sep 17 00:00:00 2001
From: yancheng <yancheng@loongson.cn>
Date: Thu, 7 Dec 2023 13:15:55 +0800
Subject: [PATCH 475/718] loongarch64: Add optimizations for nrm2.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   3 +
 kernel/loongarch64/dnrm2_lasx.S          | 233 ++++++++++++++++++++++
 kernel/loongarch64/dnrm2_lsx.S           | 242 +++++++++++++++++++++++
 kernel/loongarch64/snrm2_lasx.S          | 143 ++++++++++++++
 kernel/loongarch64/snrm2_lsx.S           | 156 +++++++++++++++
 6 files changed, 780 insertions(+)
 create mode 100644 kernel/loongarch64/dnrm2_lasx.S
 create mode 100644 kernel/loongarch64/dnrm2_lsx.S
 create mode 100644 kernel/loongarch64/snrm2_lasx.S
 create mode 100644 kernel/loongarch64/snrm2_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 026ea0d77..1e4fa7a9d 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -52,4 +52,7 @@ DASUMKERNEL =  dasum_lsx.S
 SROTKERNEL  =  srot_lsx.S
 DROTKERNEL  =  drot_lsx.S
 
+SNRM2KERNEL =  snrm2_lsx.S
+DNRM2KERNEL =  dnrm2_lsx.S
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 4905a50a9..f00abcb32 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -52,6 +52,9 @@ DASUMKERNEL =  dasum_lasx.S
 SROTKERNEL  =  srot_lasx.S
 DROTKERNEL  =  drot_lasx.S
 
+SNRM2KERNEL =  snrm2_lasx.S
+DNRM2KERNEL =  dnrm2_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S
new file mode 100644
index 000000000..2a9c3cf7b
--- /dev/null
+++ b/kernel/loongarch64/dnrm2_lasx.S
@@ -0,0 +1,233 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define XX     $r19
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define VX0    $xr15
+#define VX1    $xr16
+#define VM0    $xr17
+#define VM1    $xr18
+#define VM2    $xr13
+#define VM3    $xr14
+#define res1   $xr19
+#define res2   $xr20
+#define VALPHA $xr21
+#define INF    $f23
+#define a1     $f22
+#define max    $f17
+#define ALPHA  $f12
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    xvxor.v   res1, res1, res1
+    xvxor.v   res2, res2, res2
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    move  XX, X
+    // Init INF
+    addi.d  TEMP, $r0,  0x7FF
+    slli.d  TEMP, TEMP, 52
+    MTC  INF, TEMP
+    li.d  TEMP, SIZE
+    slli.d INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+    bge $r0,  I, .L97
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmaxa.d VM1, VX1, VX0
+    xvfmaxa.d VM0, VM0, VM1
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the maxa value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L97
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t3, 2
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t3, 2
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96:
+    xvpickve.d VX0, VM0, 1
+    xvpickve.d VX1, VM0, 2
+    xvpickve.d VM3, VM0, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM2, VM3, VM0
+    xvfmaxa.d VM0, VM1, VM2
+    .align 3
+
+.L97:
+    andi I, N, 7
+    bge $r0, I, .L99
+    .align 3
+
+.L98:
+    xvld VX1, X, 0
+    xvfmaxa.d VM0, VM0, VX1
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:
+    fabs.d  max, max
+    lu12i.w  TEMP, 0x3f800 // 1
+    movgr2fr.d  a1, $r0
+    movgr2fr.w  ALPHA, TEMP
+    CMPEQ  $fcc0, max, a1
+    fcvt.d.s  ALPHA, ALPHA
+    bcnez  $fcc0, .L999
+    fdiv.d  ALPHA, ALPHA, max
+    CMPEQ  $fcc0, INF, ALPHA
+    bcnez  $fcc0, .L999
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA, TEMP
+
+.L100:
+    li.d TEMP, SIZE
+    bne INCX, TEMP, .L120
+    srai.d  I, N, 3
+    bge $r0, I, .L997
+    .align 3
+
+.L110:
+    xvld VX0, XX, 0 * SIZE
+    xvld VX1, XX, 4 * SIZE
+    xvfmul.d VM0, VX0, VALPHA
+    xvfmul.d VM1, VX1, VALPHA
+    xvfmadd.d res1, VM0, VM0, res1
+    xvfmadd.d res2, VM1, VM1, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    srai.d I, N, 3
+    bge $r0, I, .L997
+
+.L121:
+    ld.d t1, XX,  0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VM0, VX0, VALPHA
+    xvfmul.d VM1, VX1, VALPHA
+    xvfmadd.d res1, VM0, VM0, res1
+    xvfmadd.d res2, VM1, VM1, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996:
+    xvfadd.d res1, res1, res2
+    xvpickve.d VX0, res1, 1
+    xvpickve.d VX1, res1, 2
+    xvpickve.d VM0, res1, 3
+    xvfadd.d res1, VX0, res1
+    xvfadd.d VX1, VX1, VM0
+    xvfadd.d res1, VX1, res1
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.d $f15, XX, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d $f15, $f15, ALPHA
+    fmadd.d $f19, $f15, $f15, $f19
+    add.d XX, XX , INCX
+    blt $r0, I, .L998
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
+    jirl    $r0, $r1, 0x0
+    .align 3
+
+.L999:
+    fmov.d $f0, $f19
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S
new file mode 100644
index 000000000..e4615e18d
--- /dev/null
+++ b/kernel/loongarch64/dnrm2_lsx.S
@@ -0,0 +1,242 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define XX     $r19
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define VX0    $vr15
+#define VX1    $vr16
+#define VM0    $vr17
+#define VM1    $vr18
+#define VM2    $vr13
+#define VM3    $vr14
+#define res1   $vr19
+#define res2   $vr20
+#define VALPHA $vr21
+#define INF    $f23
+#define a1     $f22
+#define max    $f17
+#define ALPHA  $f12
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    vxor.v   res1, res1, res1
+    vxor.v   res2, res2, res2
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    move  XX, X
+    // Init INF
+    addi.d  TEMP, $r0,  0x7FF
+    slli.d  TEMP, TEMP, 52
+    MTC  INF, TEMP
+    li.d  TEMP, SIZE
+    slli.d INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+    bge $r0,  I, .L97
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmaxa.d VM1, VX1, VX0
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmaxa.d VM2, VX1, VX0
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the maxa value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L97
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96:
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    vfmaxa.d VM0, VX0, VX1
+    .align 3
+
+.L97:
+    andi I, N, 7
+    bge $r0, I, .L99
+    .align 3
+
+.L98:
+    vld VX1, X, 0
+    vfmaxa.d VM0, VM0, VX1
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:
+    fabs.d  max, max
+    lu12i.w  TEMP, 0x3f800 // 1
+    movgr2fr.d  a1, $r0
+    movgr2fr.w  ALPHA, TEMP
+    CMPEQ  $fcc0, max, a1
+    fcvt.d.s  ALPHA, ALPHA
+    bcnez  $fcc0, .L999
+    fdiv.d  ALPHA, ALPHA, max
+    CMPEQ  $fcc0, INF, ALPHA
+    bcnez  $fcc0, .L999
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA, TEMP
+
+.L100:
+    li.d TEMP, SIZE
+    bne INCX, TEMP, .L120
+    srai.d  I, N, 3
+    bge $r0, I, .L997
+    .align 3
+
+.L110:
+    vld VX0, XX, 0 * SIZE
+    vld VX1, XX, 2 * SIZE
+    vfmul.d VM0, VX0, VALPHA
+    vfmul.d VM1, VX1, VALPHA
+    vfmadd.d res1, VM0, VM0, res1
+    vfmadd.d res2, VM1, VM1, res2
+    vld VX0, XX, 4 * SIZE
+    vld VX1, XX, 6 * SIZE
+    vfmul.d VM0, VX0, VALPHA
+    vfmul.d VM1, VX1, VALPHA
+    vfmadd.d res1, VM0, VM0, res1
+    vfmadd.d res2, VM1, VM1, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    srai.d I, N, 3
+    bge $r0, I, .L997
+
+.L121:
+    ld.d t1, XX,  0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM0, VX0, VALPHA
+    ld.d t1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    vfmul.d VM1, VX1, VALPHA
+    ld.d t2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    vfmadd.d res1, VM0, VM0, res1
+    vfmadd.d res2, VM1, VM1, res2
+    ld.d t3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM0, VX0, VALPHA
+    vfmul.d VM1, VX1, VALPHA
+    vfmadd.d res1, VM0, VM0, res1
+    vfmadd.d res2, VM1, VM1, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996:
+    vfadd.d res1, res1, res2
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.d $f15, XX, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d $f15, $f15, ALPHA
+    fmadd.d $f19, $f15, $f15, $f19
+    add.d XX, XX , INCX
+    blt $r0, I, .L998
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
+    jirl    $r0, $r1, 0x0
+    .align 3
+
+.L999:
+    fmov.d $f0, $f19
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S
new file mode 100644
index 000000000..274908c14
--- /dev/null
+++ b/kernel/loongarch64/snrm2_lasx.S
@@ -0,0 +1,143 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define VX0    $xr15
+#define VX1    $xr16
+#define VX2    $xr17
+#define VX3    $xr18
+#define res1   $xr19
+#define res2   $xr20
+
+   PROLOGUE
+
+#ifdef F_INTERFACE
+   LDINT   N,     0(N)
+   LDINT   INCX,  0(INCX)
+#endif
+
+   xvxor.v res1, res1, res1
+   xvxor.v res2, res2, res2
+   bge $r0, N, .L999
+   beq $r0, INCX, .L999
+   li.d  TEMP, SIZE
+   slli.d INCX, INCX, BASE_SHIFT
+   srai.d I, N, 3
+   bne INCX, TEMP, .L20
+   bge $r0,  I, .L997
+   .align 3
+
+.L10:
+   xvld VX0, X, 0 * SIZE
+   xvld VX1, X, 0 * SIZE
+   xvfcvtl.d.s VX0, VX0
+   xvfcvth.d.s VX1, VX1
+   xvfmadd.d res1, VX0, VX0, res1
+   xvfmadd.d res2, VX1, VX1, res2
+   addi.d I, I, -1
+   addi.d X, X, 8 * SIZE 
+   blt $r0, I, .L10
+   .align 3
+   b .L996
+
+.L20:
+   bge $r0, I, .L997
+   .align 3
+
+.L21:
+   ld.w t1, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t2, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t4, X, 0 * SIZE
+   add.d X, X, INCX
+   xvinsgr2vr.w VX0, t1, 0
+   xvinsgr2vr.w VX0, t2, 1
+   xvinsgr2vr.w VX0, t3, 2
+   xvinsgr2vr.w VX0, t4, 3
+   ld.w t1, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t2, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t4, X, 0 * SIZE
+   add.d X, X, INCX
+   xvinsgr2vr.w VX0, t1, 4
+   xvinsgr2vr.w VX0, t2, 5
+   xvinsgr2vr.w VX0, t3, 6
+   xvinsgr2vr.w VX0, t4, 7
+   ld.w t1, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t2, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t4, X, 0 * SIZE
+   add.d X, X, INCX
+   xvinsgr2vr.w VX1, t1, 0
+   xvinsgr2vr.w VX1, t2, 1
+   xvinsgr2vr.w VX1, t3, 2
+   xvinsgr2vr.w VX1, t4, 3
+   ld.w t1, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t2, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t4, X, 0 * SIZE
+   add.d X, X, INCX
+   xvinsgr2vr.w VX1, t1, 4
+   xvinsgr2vr.w VX1, t2, 5
+   xvinsgr2vr.w VX1, t3, 6
+   xvinsgr2vr.w VX1, t4, 7
+   xvfcvtl.d.s VX0, VX0
+   xvfcvth.d.s VX1, VX1
+   xvfmadd.d res1, VX0, VX0, res1
+   xvfmadd.d res2, VX1, VX1, res2
+   addi.d  I, I, -1
+   blt $r0, I, .L21
+   b .L996
+
+.L996:
+   xvfadd.d res1, res1, res2
+   xvpickve.w VX1, res1, 1
+   xvpickve.w VX2, res1, 2
+   xvpickve.w VX3, res1, 3
+   xvfadd.s res1, VX1, res1
+   xvfadd.s res1, VX2, res1
+   xvfadd.s res1, VX3, res1
+   .align 3
+
+.L997:
+   andi I, N, 7
+   bge $r0, I, .L999
+   .align 3
+
+.L998:
+   fld.s $f15, X, 0 * SIZE
+   addi.d I, I, -1
+   fcvt.d.s $f15, $f15
+   fmadd.d $f19, $f15, $f15, $f19
+   add.d X, X, INCX
+   blt $r0, I, .L998
+   .align 3
+
+.L999:
+   fsqrt.d $f19, $f19
+   move $r4, $r17
+   fcvt.s.d $f0, $f19
+   jirl $r0, $r1, 0x0
+
+   EPILOGUE
diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S
new file mode 100644
index 000000000..17d017900
--- /dev/null
+++ b/kernel/loongarch64/snrm2_lsx.S
@@ -0,0 +1,156 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define VX0    $vr15
+#define VX1    $vr16
+#define VX2    $vr17
+#define VX3    $vr18
+#define res1   $vr19
+#define res2   $vr20
+
+   PROLOGUE
+
+#ifdef F_INTERFACE
+   LDINT   N,     0(N)
+   LDINT   INCX,  0(INCX)
+#endif
+
+   vxor.v res1, res1, res1
+   vxor.v res2, res2, res2
+   bge $r0, N, .L999
+   beq $r0, INCX, .L999
+   li.d  TEMP, SIZE
+   slli.d INCX, INCX, BASE_SHIFT
+   srai.d I, N, 3
+   bne INCX, TEMP, .L20
+   bge $r0,  I, .L997
+   .align 3
+
+.L10:
+   vld VX0, X, 0 * SIZE
+   vld VX1, X, 0 * SIZE
+   vfcvtl.d.s VX0, VX0
+   vfcvth.d.s VX1, VX1
+   vfmadd.d res1, VX0, VX0, res1
+   vfmadd.d res2, VX1, VX1, res2
+   vld VX2, X, 4 * SIZE
+   vld VX3, X, 4 * SIZE
+   vfcvtl.d.s VX2, VX2
+   vfcvth.d.s VX3, VX3
+   vfmadd.d res1, VX2, VX2, res1
+   vfmadd.d res2, VX3, VX3, res2
+   addi.d I, I, -1
+   addi.d X, X, 8 * SIZE 
+   blt $r0, I, .L10
+   b .L996
+   .align 3
+   
+
+.L20:
+   bge $r0, I, .L997
+   .align 3
+
+.L21:
+   ld.w t1, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t2, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t4, X, 0 * SIZE
+   add.d X, X, INCX
+   vinsgr2vr.w VX0, t1, 0
+   vinsgr2vr.w VX0, t2, 1
+   vinsgr2vr.w VX0, t3, 2
+   vinsgr2vr.w VX0, t4, 3  
+   ld.w t1, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t2, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t4, X, 0 * SIZE
+   add.d X, X, INCX
+   vinsgr2vr.w VX1, t1, 0
+   vinsgr2vr.w VX1, t2, 1
+   vinsgr2vr.w VX1, t3, 2
+   vinsgr2vr.w VX1, t4, 3
+   vfcvtl.d.s VX0, VX0
+   vfcvth.d.s VX1, VX1
+   vfmadd.d res1, VX0, VX0, res1
+   vfmadd.d res2, VX1, VX1, res2
+   ld.w t1, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t2, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t4, X, 0 * SIZE
+   add.d X, X, INCX
+   vinsgr2vr.w VX2, t1, 0
+   vinsgr2vr.w VX2, t2, 1
+   vinsgr2vr.w VX2, t3, 2
+   vinsgr2vr.w VX2, t4, 3  
+   ld.w t1, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t2, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   add.d X, X, INCX
+   ld.w t4, X, 0 * SIZE
+   add.d X, X, INCX
+   vinsgr2vr.w VX3, t1, 0
+   vinsgr2vr.w VX3, t2, 1
+   vinsgr2vr.w VX3, t3, 2
+   vinsgr2vr.w VX3, t4, 3
+   vfcvtl.d.s VX2, VX2
+   vfcvth.d.s VX3, VX3
+   vfmadd.d res1, VX2, VX2, res1
+   vfmadd.d res2, VX3, VX3, res2
+   addi.d  I, I, -1
+   blt $r0, I, .L21
+   b .L996
+   .align 3
+
+.L996:
+   vfadd.d res1, res1, res2
+   vreplvei.w VX1, res1, 1
+   vreplvei.w VX2, res1, 2
+   vreplvei.w VX3, res1, 3
+   vfadd.s res1, VX1, res1
+   vfadd.s res1, VX2, res1
+   vfadd.s res1, VX3, res1
+   .align 3
+
+.L997:
+   andi I, N, 7
+   bge $r0, I, .L999
+   .align 3
+
+.L998:
+   fld.s $f15, X, 0 * SIZE
+   addi.d I, I, -1
+   fcvt.d.s $f15, $f15
+   fmadd.d $f19, $f15, $f15, $f19
+   add.d X, X, INCX
+   blt $r0, I, .L998
+   .align 3
+
+.L999:
+   fsqrt.d $f19, $f19
+   move $r4, $r17
+   fcvt.s.d $f0, $f19
+   jirl $r0, $r1, 0x0
+   .align 3
+
+   EPILOGUE

From edac80d7e8ba97e39002f223628a956456356fa9 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Thu, 7 Dec 2023 14:59:27 -0800
Subject: [PATCH 476/718] some cleanup, dynamically scale threads, add missing
 WIN_CASE defn

---
 driver/others/blas_server_win32.c | 70 ++++++++++++++++++++++---------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 5af1f1a51..40ff85abc 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -51,15 +51,6 @@
 /* This is a thread implementation for Win32 lazy implementation */
 
 /* Thread server common information */
-//typedef struct{
-//  CRITICAL_SECTION lock;
-//  HANDLE filled;
-//  HANDLE killed;
-//
-//  blas_queue_t	*queue;    /* Parameter Pointer */
-//  int		shutdown;  /* server shutdown flag */
-//
-//} blas_pool_t;
 
 static blas_queue_t *work_queue = NULL;
 static HANDLE kickoff_event = NULL;
@@ -71,11 +62,19 @@ int blas_server_avail = 0;
 /* Local Variables */
 static BLASULONG server_lock       = 0;
 
-//static blas_pool_t   pool;
 static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
+static volatile int thread_target;	// target num of live threads, volatile for cross-thread reads
 
-
+#if defined (__GNUC__) && (__GNUC__ < 6)
+	#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
+#else
+	#if defined(_WIN64)
+		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
+	#else
+		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
+	#endif
+#endif
 
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 
@@ -206,14 +205,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 static DWORD WINAPI blas_thread_server(void *arg){
 
   /* Thread identifier */
-#ifdef SMP_DEBUG
   BLASLONG  cpu = (BLASLONG)arg;
-#endif
 
   void *buffer, *sa, *sb;
   blas_queue_t	*queue;
-  DWORD action;
-  //HANDLE handles[] = {pool.filled, pool.killed};
 
   /* Each server needs each buffer */
   buffer   = blas_memory_alloc(2);
@@ -232,6 +227,12 @@ static DWORD WINAPI blas_thread_server(void *arg){
 	// event raised when work is added to the queue
 	WaitForSingleObject(kickoff_event, INFINITE);
 
+	if (cpu > thread_target - 2)
+	{
+		//printf("thread [%d] exiting.\n", cpu);
+		break;	// excess thread, so worker thread exits
+	}
+
 #ifdef SMP_DEBUG
     fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
 #endif
@@ -245,17 +246,17 @@ static DWORD WINAPI blas_thread_server(void *arg){
 
     LeaveCriticalSection(&queue_lock);
 #else
-    volatile work_queue_t* queue_next;
+    volatile blas_queue_t* queue_next;
 
     INT_PTR prev_value;
     do {
-        queue = (volatile work_queue_t*)work_queue;
+        queue = (volatile blas_queue_t*)work_queue;
         if (!queue)
             break;
 
-        queue_next = (volatile work_queue_t*)queue->next;
+        queue_next = (volatile blas_queue_t*)queue->next;
         prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
-    } while (prev_value != work_item);
+    } while (prev_value != queue);
 #endif
 
     if (queue)  {
@@ -377,9 +378,13 @@ int blas_thread_init(void){
 	// create the kickoff Event
 	kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
+	thread_target = blas_cpu_number;
+
     InitializeCriticalSection(&queue_lock);
 
     for(i = 0; i < blas_cpu_number - 1; i++){
+	  //printf("thread_init: creating thread [%d]\n", i);
+
       blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,
 				     0, &blas_threads_id[i]);
@@ -564,10 +569,36 @@ void goto_set_num_threads(int num_threads)
 
 	if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
 
+	if (blas_server_avail && num_threads < blas_num_threads)	{
+		LOCK_COMMAND(&server_lock);
+
+		thread_target = num_threads;
+		
+		SetEvent(kickoff_event);
+
+		for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
+			//printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
+
+			WaitForSingleObject(blas_threads[i], INFINITE);
+
+			//printf("set_num_threads: thread [%d] has quit.\n", i);
+
+			CloseHandle(blas_threads[i]);
+		}
+
+		blas_num_threads = num_threads;
+		
+		ResetEvent(kickoff_event);
+
+		UNLOCK_COMMAND(&server_lock);
+	}
+
 	if (num_threads > blas_num_threads) {
 
 		LOCK_COMMAND(&server_lock);
 
+		thread_target = num_threads;
+
 		//increased_threads = 1;
 	    if (!blas_server_avail){
 			// create the kickoff Event
@@ -579,6 +610,7 @@ void goto_set_num_threads(int num_threads)
 		}
 
 		for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
+			//printf("set_num_threads: creating thread [%d]\n", i);
 
 			blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,

From 4e738e561a95707e5ae41465af4896b74b2ca138 Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Fri, 8 Dec 2023 12:36:08 -0600
Subject: [PATCH 477/718] Replace two vector loads with one vector pair load
 and fix endianess of stores.

---
 kernel/power/sgemm_tcopy_16_power8.S         |   3 +
 kernel/power/sgemm_tcopy_macros_16_power10.S | 323 +++++++++++++++++++
 kernel/power/sgemm_tcopy_macros_16_power8.S  |   6 +
 3 files changed, 332 insertions(+)
 create mode 100644 kernel/power/sgemm_tcopy_macros_16_power10.S

diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S
index b9f6d63fb..6d2c6a555 100644
--- a/kernel/power/sgemm_tcopy_16_power8.S
+++ b/kernel/power/sgemm_tcopy_16_power8.S
@@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define o0	0
 
+#ifdef POWER10
+#include "sgemm_tcopy_macros_16_power10.S"
+#endif
 #include "sgemm_tcopy_macros_16_power8.S"
 
 #define STACKSIZE 144
diff --git a/kernel/power/sgemm_tcopy_macros_16_power10.S b/kernel/power/sgemm_tcopy_macros_16_power10.S
new file mode 100644
index 000000000..dca37e48a
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_macros_16_power10.S
@@ -0,0 +1,323 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*	 LAPACK-TEST		: OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
+.macro COPY_4x16
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+
+    lxvpx       vs36,   o0,     A1
+    lxvpx       vs38,   o32,    A1
+
+    lxvpx       vs40,   o0,     A2
+    lxvpx       vs42,   o32,    A2
+
+    lxvpx       vs44,   o0,     A3
+    lxvpx       vs46,   o32,    A3
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0,     T1
+    stxvx       vs33,   o16,    T1
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0,     T1
+    stxvx       vs32,   o16,    T1
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs36,   o0,     T1
+    stxvx       vs37,   o16,    T1
+    stxvx       vs38,   o32,    T1
+    stxvx       vs39,   o48,    T1
+#else
+    stxvx       vs37,   o0,     T1
+    stxvx       vs36,   o16,    T1
+    stxvx       vs39,   o32,    T1
+    stxvx       vs38,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs40,   o0,     T1
+    stxvx       vs41,   o16,    T1
+    stxvx       vs42,   o32,    T1
+    stxvx       vs43,   o48,    T1
+#else
+    stxvx       vs41,   o0,     T1
+    stxvx       vs40,   o16,    T1
+    stxvx       vs43,   o32,    T1
+    stxvx       vs42,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs44,   o0,     T1
+    stxvx       vs45,   o16,    T1
+    stxvx       vs46,   o32,    T1
+    stxvx       vs47,   o48,    T1
+#else
+    stxvx       vs45,   o0,     T1
+    stxvx       vs44,   o16,    T1
+    stxvx       vs47,   o32,    T1
+    stxvx       vs46,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
+.macro COPY_4x8
+#endif
+
+    lxvpx       vs32,   o0,     A0
+
+    lxvpx       vs34,   o0,     A1
+
+    lxvpx       vs36,   o0,     A2
+
+    lxvpx       vs38,   o0,     A3
+
+    mr      T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0,     T1
+    stxvx       vs33,   o16,    T1
+
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0,     T1
+    stxvx       vs32,   o16,    T1
+
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+    addi        T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs36,   o0,     T1
+    stxvx       vs37,   o16,    T1
+
+    stxvx       vs38,   o32,    T1
+    stxvx       vs39,   o48,    T1
+#else
+    stxvx       vs37,   o0,     T1
+    stxvx       vs36,   o16,    T1
+
+    stxvx       vs39,   o32,    T1
+    stxvx       vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
+.macro COPY_2x16
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+
+    lxvpx       vs36,   o0,     A1
+    lxvpx       vs38,   o32,    A1
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0,     T1
+    stxvx       vs33,   o16,    T1
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0,     T1
+    stxvx       vs32,   o16,    T1
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs36,   o0, T1
+    stxvx       vs37,   o16,    T1
+    stxvx       vs38,   o32,    T1
+    stxvx       vs39,   o48,    T1
+#else
+    stxvx       vs37,   o0, T1
+    stxvx       vs36,   o16,    T1
+    stxvx       vs39,   o32,    T1
+    stxvx       vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
+.macro COPY_2x8
+#endif
+
+    lxvpx       vs32,   o0,     A0
+
+    lxvpx       vs34,   o0,     A1
+
+    mr      T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0,     T1
+    stxvx       vs33,   o16,    T1
+
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0,     T1
+    stxvx       vs32,   o16,    T1
+
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
+.macro COPY_1x16
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0, T1
+    stxvx       vs33,   o16,    T1
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0, T1
+    stxvx       vs32,   o16,    T1
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
+.macro COPY_1x8
+#endif
+
+    lxvpx       vs32,   o0,     A0
+
+    mr      T1, BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0, T1
+    stxvx       vs33,   o16,    T1
+#else
+    stxvx       vs33,   o0, T1
+    stxvx       vs32,   o16,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S
index ed592a604..af237d5ee 100644
--- a/kernel/power/sgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/sgemm_tcopy_macros_16_power8.S
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_4x16', `
 #else
@@ -141,6 +142,7 @@ define(`COPY_4x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
@@ -264,6 +266,7 @@ define(`COPY_4x1', `
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_2x16', `
 #else
@@ -329,6 +332,7 @@ define(`COPY_2x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
@@ -418,6 +422,7 @@ define(`COPY_2x1', `
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_1x16', `
 #else
@@ -465,6 +470,7 @@ define(`COPY_1x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=4

From c732f275a27cbd9044d8409c2dd13e1e32f675ca Mon Sep 17 00:00:00 2001
From: barracuda156 <vital.had@gmail.com>
Date: Mon, 11 Dec 2023 21:05:31 +0800
Subject: [PATCH 478/718] system_check.cmake: fix arch detection for Darwin
 PowerPC

---
 cmake/system_check.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index c9671b379..e94497a04 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64)
   endif()
 elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
   set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
   set(POWER 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
   set(MIPS64 1)
@@ -109,7 +109,7 @@ else()
 endif ()
 
 if (NOT BINARY)
-  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
+  if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
     set(BINARY 64)
   else ()
     set(BINARY 32)

From 9dbc8129b3987fd038585904e612a8ff0f62f947 Mon Sep 17 00:00:00 2001
From: barracuda156 <vital.had@gmail.com>
Date: Mon, 11 Dec 2023 21:09:06 +0800
Subject: [PATCH 479/718] cpuid_power.c: add CPU_SUBTYPE_POWERPC_7400 case

---
 cpuid_power.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpuid_power.c b/cpuid_power.c
index 2526e8d0e..1ced8930a 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -160,6 +160,7 @@ int detect(void){
   infoCount = HOST_BASIC_INFO_COUNT;
   host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);
 
+  if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
   if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
   if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970)  return CPUTYPE_PPC970;
 

From 330101e0b38b95337818a27b29300f1637c1cce5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 11 Dec 2023 21:52:00 +0100
Subject: [PATCH 480/718] Add complex type definitions for MSVC

---
 lapack-netlib/LAPACKE/include/lapack.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index 28f8ad655..dac6b22be 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -37,7 +37,15 @@
 */
 
 #ifndef LAPACK_COMPLEX_CUSTOM
+#if defined(_MSC_VER)
+    #define _CRT_USE_C_COMPLEX_H
+    #include <complex.h>
+    #define LAPACK_COMPLEX_CUSTOM
+    #define lapack_complex_float _Fcomplex
+    #define lapack_complex_double _Dcomplex
+#endif
 
+#else
 /* Complex type (single precision) */
 #ifndef lapack_complex_float
 #ifndef __cplusplus
@@ -74,6 +82,7 @@
 #define lapack_complex_double_imag(z)       (cimag(z))
 #endif
 
+#endif
 #endif /* LAPACK_COMPLEX_CUSTOM */
 
 

From 6bd7c54af5ecc2004b8a6df0157fe72d55530927 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Mon, 11 Dec 2023 15:13:04 -0800
Subject: [PATCH 481/718] introduce MT_TRACE to clean up SMP_DEBUG code

---
 driver/others/blas_server_win32.c | 60 ++++++++++++-------------------
 1 file changed, 23 insertions(+), 37 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 40ff85abc..5820a55f4 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -48,6 +48,12 @@
 #endif
 #endif
 
+#ifdef SMP_DEBUG
+#   define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
+#else
+#   define MT_TRACE(...)
+#endif
+
 /* This is a thread implementation for Win32 lazy implementation */
 
 /* Thread server common information */
@@ -213,29 +219,24 @@ static DWORD WINAPI blas_thread_server(void *arg){
   /* Each server needs each buffer */
   buffer   = blas_memory_alloc(2);
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
-#endif
+  MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
 
   while (1){
 
     /* Waiting for Queue */
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
-#endif
+    MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
+
 	// event raised when work is added to the queue
 	WaitForSingleObject(kickoff_event, INFINITE);
 
 	if (cpu > thread_target - 2)
 	{
-		//printf("thread [%d] exiting.\n", cpu);
+		//MT_TRACE("thread [%d] exiting.\n", cpu);
 		break;	// excess thread, so worker thread exits
 	}
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
-#endif
+    MT_TRACE("Server[%2ld] Got it.\n", cpu);
 
 #if 1
     EnterCriticalSection(&queue_lock);
@@ -270,10 +271,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
       __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
 #endif
 
-#ifdef SMP_DEBUG
-      fprintf(STDERR, "Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
+      MT_TRACE("Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
 	      cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
-#endif
 
       // fprintf(stderr, "queue start[%ld]!!!\n", cpu);
 
@@ -342,19 +341,14 @@ static DWORD WINAPI blas_thread_server(void *arg){
 		continue; //if queue == NULL
 	}
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
-#endif
+    MT_TRACE("Server[%2ld] Finished!\n", cpu);
 	
 	queue->finished = 1;
-
   }
 
   /* Shutdown procedure */
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Server[%2ld] Shutdown!\n",  cpu);
-#endif
+  MT_TRACE("Server[%2ld] Shutdown!\n",  cpu);
 
   blas_memory_free(buffer);
 
@@ -369,10 +363,7 @@ int blas_thread_init(void){
 
   LOCK_COMMAND(&server_lock);
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
-	  blas_cpu_number);
-#endif
+  MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
 
   if (!blas_server_avail){
 	// create the kickoff Event
@@ -383,7 +374,7 @@ int blas_thread_init(void){
     InitializeCriticalSection(&queue_lock);
 
     for(i = 0; i < blas_cpu_number - 1; i++){
-	  //printf("thread_init: creating thread [%d]\n", i);
+	  //MT_TRACE("thread_init: creating thread [%d]\n", i);
 
       blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,
@@ -458,14 +449,10 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
 int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Synchronization Waiting.\n");
-#endif
+    MT_TRACE("Synchronization Waiting.\n");
 
     while (num){
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Waiting Queue ..\n");
-#endif
+      MT_TRACE("Waiting Queue ..\n");
 	  while (!queue->finished)
 		  YIELDING;
 
@@ -473,9 +460,8 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
       num--;
     }
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Completely Done.\n\n");
-#endif
+    MT_TRACE("Completely Done.\n\n");
+
 	// if work was added to the queue after this batch we can't sleep the worker threads
 	// by resetting the event
 	EnterCriticalSection(&queue_lock);
@@ -577,11 +563,11 @@ void goto_set_num_threads(int num_threads)
 		SetEvent(kickoff_event);
 
 		for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
-			//printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
+			//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
 
 			WaitForSingleObject(blas_threads[i], INFINITE);
 
-			//printf("set_num_threads: thread [%d] has quit.\n", i);
+			//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
 
 			CloseHandle(blas_threads[i]);
 		}
@@ -610,7 +596,7 @@ void goto_set_num_threads(int num_threads)
 		}
 
 		for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
-			//printf("set_num_threads: creating thread [%d]\n", i);
+			//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
 
 			blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,

From dcf6999c4e3b6af70d2822b9c2629e2df91378e4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Dec 2023 11:27:17 +0100
Subject: [PATCH 482/718] remove extraneous endif

---
 lapack-netlib/LAPACKE/include/lapack.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index dac6b22be..b9f81792d 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -43,7 +43,6 @@
     #define LAPACK_COMPLEX_CUSTOM
     #define lapack_complex_float _Fcomplex
     #define lapack_complex_double _Dcomplex
-#endif
 
 #else
 /* Complex type (single precision) */

From dcdc35127276b4a261292bf03ca8570e89ee105a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Dec 2023 23:06:22 +0100
Subject: [PATCH 483/718] Add MSVC-compatible complex types

---
 lapack-netlib/LAPACKE/include/lapacke_config.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h
index c64fc4416..798a5eb2e 100644
--- a/lapack-netlib/LAPACKE/include/lapacke_config.h
+++ b/lapack-netlib/LAPACKE/include/lapacke_config.h
@@ -68,6 +68,17 @@ extern "C" {
 #endif
 
 #ifndef LAPACK_COMPLEX_CUSTOM
+#if defined(_MSC_VER)
+    #define _CRT_USE_C_COMPLEX_H
+    #include <complex.h>
+    #define LAPACK_COMPLEX_CUSTOM
+    #define lapack_complex_float _Fcomplex
+    #define lapack_complex_double _Dcomplex
+    #define lapack_complex_float_real(z)       (creal(z))
+    #define lapack_complex_float_imag(z)       (cimag(z))
+    #define lapack_complex_double_real(z)       (creal(z))
+    #define lapack_complex_double_imag(z)       (cimag(z))
+#else
 
 #if defined(LAPACK_COMPLEX_STRUCTURE)
 
@@ -109,6 +120,7 @@ typedef struct { double real, imag; } _lapack_complex_double;
 #define lapack_complex_double_real(z)       (creal(z))
 #define lapack_complex_double_imag(z)       (cimag(z))
 
+#endif
 #endif
 
 lapack_complex_float lapack_make_complex_float( float re, float im );

From aa46f1e4e721550a13524ee890c0d26077c0a9eb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Dec 2023 23:07:48 +0100
Subject: [PATCH 484/718] revert addition of MSVC-compatible complex (moved to
 lapacke_config.h)

---
 lapack-netlib/LAPACKE/include/lapack.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index b9f81792d..28f8ad655 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -37,14 +37,7 @@
 */
 
 #ifndef LAPACK_COMPLEX_CUSTOM
-#if defined(_MSC_VER)
-    #define _CRT_USE_C_COMPLEX_H
-    #include <complex.h>
-    #define LAPACK_COMPLEX_CUSTOM
-    #define lapack_complex_float _Fcomplex
-    #define lapack_complex_double _Dcomplex
 
-#else
 /* Complex type (single precision) */
 #ifndef lapack_complex_float
 #ifndef __cplusplus
@@ -81,7 +74,6 @@
 #define lapack_complex_double_imag(z)       (cimag(z))
 #endif
 
-#endif
 #endif /* LAPACK_COMPLEX_CUSTOM */
 
 

From a8d3619f65e65cb5a7e24f148a4924339a1a702f Mon Sep 17 00:00:00 2001
From: barracuda156 <vital.had@gmail.com>
Date: Wed, 13 Dec 2023 19:42:56 +0800
Subject: [PATCH 485/718] cc.cmake: add optflags for G5 and G4 kernels

---
 cmake/cc.cmake | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 7b4ef8947..ac8661a7b 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -282,6 +282,18 @@ if (${CORE} STREQUAL POWER8)
   endif ()
 endif ()
 
+if (${CORE} STREQUAL PPC970)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math")
+  endif ()
+endif ()
+
+if (${CORE} STREQUAL PPCG4)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math")
+  endif ()
+endif ()
+
 if (NOT DYNAMIC_ARCH)
 	if (HAVE_AVX2)
         set (CCOMMON_OPT  "${CCOMMON_OPT} -mavx2")

From d9653af01851f52470f7b65d8650f6f2f5431b6e Mon Sep 17 00:00:00 2001
From: barracuda156 <vital.had@gmail.com>
Date: Wed, 13 Dec 2023 19:23:50 +0800
Subject: [PATCH 486/718] KERNEL.PPC970, KERNEL.PPCG4: unbreak CMake parsing

Fixes: https://github.com/OpenMathLib/OpenBLAS/issues/4366
---
 kernel/power/KERNEL.PPC970 | 8 ++++----
 kernel/power/KERNEL.PPCG4  | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970
index a99fb7d96..fee5fa529 100644
--- a/kernel/power/KERNEL.PPC970
+++ b/kernel/power/KERNEL.PPC970
@@ -1,11 +1,11 @@
 ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
 SGEMMKERNEL    =  gemm_kernel.S
-SGEMMINCOPY    =  
-SGEMMITCOPY    =  
+SGEMMINCOPY    =
+SGEMMITCOPY    =
 SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =  
-SGEMMITCOPYOBJ =  
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 else
diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4
index 1bdd3119e..c73601cee 100644
--- a/kernel/power/KERNEL.PPCG4
+++ b/kernel/power/KERNEL.PPCG4
@@ -96,9 +96,9 @@ CGEMMINCOPY    =
 CGEMMONCOPY    =
 CGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c
 CGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c
-CGEMMINCOPYOBJ =  
+CGEMMINCOPYOBJ =
 #cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  
+CGEMMITCOPYOBJ =
 #cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)

From 981e315b30bb50cbb2b7375665f0f11d0d567703 Mon Sep 17 00:00:00 2001
From: barracuda156 <vital.had@gmail.com>
Date: Thu, 14 Dec 2023 12:01:31 +0800
Subject: [PATCH 487/718] cc.cmake: use -force_cpusubtype_ALL for Darwin PPC

---
 cmake/cc.cmake | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index ac8661a7b..d5bf3b7ee 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -286,12 +286,18 @@ if (${CORE} STREQUAL PPC970)
   if (NOT DYNAMIC_ARCH)
     set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math")
   endif ()
+  if (APPLE)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
 endif ()
 
 if (${CORE} STREQUAL PPCG4)
   if (NOT DYNAMIC_ARCH)
     set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math")
   endif ()
+  if (APPLE)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
 endif ()
 
 if (NOT DYNAMIC_ARCH)

From f06b5355667434f29258c3664bc5ea517988040b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Dec 2023 09:58:44 +0100
Subject: [PATCH 488/718] Use C kernel for dgemv_t due to limitations of the
 old assembly one

---
 kernel/x86_64/KERNEL | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index bea7036c2..f8278c3b4 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -405,7 +405,7 @@ DGEMVNKERNEL = dgemv_n.S
 endif
 
 ifndef DGEMVTKERNEL
-DGEMVTKERNEL = dgemv_t.S
+DGEMVTKERNEL = dgemv_t_4.c
 endif
 
 ifndef CGEMVNKERNEL

From 544cb8630049650cec9bf4c582fda43f334d5c59 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Dec 2023 14:03:59 +0100
Subject: [PATCH 489/718] Mention C906V instruction set limitation and update
 DYNAMIC_ARCH lists

---
 README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index aaadd0d9c..b8d66ed42 100644
--- a/README.md
+++ b/README.md
@@ -196,7 +196,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   ```sh
   make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
   ```
-  (also known to work on C906)
+  (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
 
 ### Support for multiple targets in a single library
 
@@ -207,9 +207,11 @@ For **x86_64**, the list of targets this activates contains Prescott, Core2, Neh
 `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
 Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
 
-On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.
 
-For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
+For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
+
+on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support.
 
 The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
 common code in the library, usually you will want to set this to the oldest model you expect to encounter.

From 8c143331b0216809343d056b084e29a0ab3311a7 Mon Sep 17 00:00:00 2001
From: barracuda156 <vital.had@gmail.com>
Date: Fri, 15 Dec 2023 22:55:52 +0800
Subject: [PATCH 490/718] PPC970: drop -mcpu=970 which seems to produce faulty
 code

Fixes: https://github.com/OpenMathLib/OpenBLAS/issues/4376
---
 cmake/cc.cmake | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index d5bf3b7ee..00952e810 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -282,18 +282,21 @@ if (${CORE} STREQUAL POWER8)
   endif ()
 endif ()
 
+# With -mcpu=970 added it compiles, but library is broken, at least on macOS. If someone
+# tests on *BSD or Linux and adds this flag, please make sure it is not used for macOS case.
 if (${CORE} STREQUAL PPC970)
   if (NOT DYNAMIC_ARCH)
-    set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=970 -mtune=970 -maltivec -fno-fast-math")
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math")
   endif ()
   if (APPLE)
     set (CCOMMON_OPT  "${CCOMMON_OPT} -force_cpusubtype_ALL")
   endif ()
 endif ()
 
+# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency?
 if (${CORE} STREQUAL PPCG4)
   if (NOT DYNAMIC_ARCH)
-    set (CCOMMON_OPT  "${CCOMMON_OPT} -mcpu=G4 -mtune=G4 -maltivec -fno-fast-math")
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math")
   endif ()
   if (APPLE)
     set (CCOMMON_OPT  "${CCOMMON_OPT} -force_cpusubtype_ALL")

From dab0da8243ce03d2a8d76e614259c099bf87b819 Mon Sep 17 00:00:00 2001
From: Darshan Patel <darshan.patel@fujitsu.com>
Date: Tue, 19 Dec 2023 13:56:55 +0530
Subject: [PATCH 491/718] Update GEMM param for NEOVERSEV1

---
 param.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/param.h b/param.h
index ee4640f57..1ec2d16dd 100644
--- a/param.h
+++ b/param.h
@@ -3396,13 +3396,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define ZGEMM_DEFAULT_UNROLL_N  4
 #define ZGEMM_DEFAULT_UNROLL_MN 16
 
-#define SGEMM_DEFAULT_P 128
-#define DGEMM_DEFAULT_P 160
+#define SGEMM_DEFAULT_P 240
+#define DGEMM_DEFAULT_P 240
 #define CGEMM_DEFAULT_P 128
 #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_Q 352
-#define DGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 640
+#define DGEMM_DEFAULT_Q 320
 #define CGEMM_DEFAULT_Q 224
 #define ZGEMM_DEFAULT_Q 112
 

From 7a4fef4f604db2a5e4e0c4ffaebea220a0646ab1 Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Fri, 15 Dec 2023 12:50:48 +0000
Subject: [PATCH 492/718] Tweak SVE dot kernel

This changes the SVE dot kernel to only predicate when necessary as well
as streamlining the assembly a bit. The benchmarks seem to indicate this
can improve performance by ~33%.
---
 kernel/arm64/dot_kernel_sve.c | 100 +++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 26 deletions(-)

diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c
index 9c057551e..133de4ab3 100644
--- a/kernel/arm64/dot_kernel_sve.c
+++ b/kernel/arm64/dot_kernel_sve.c
@@ -1,4 +1,5 @@
 /***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
 Copyright (c) 2022, Arm Ltd
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
@@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <arm_sve.h>
 
 #ifdef DOUBLE
-#define SVE_TYPE svfloat64_t
-#define SVE_ZERO svdup_f64(0.0)
-#define SVE_WHILELT svwhilelt_b64
-#define SVE_ALL svptrue_b64()
-#define SVE_WIDTH svcntd()
+#define DTYPE "d"
+#define WIDTH "d"
+#define SHIFT "3"
 #else
-#define SVE_TYPE svfloat32_t
-#define SVE_ZERO svdup_f32(0.0)
-#define SVE_WHILELT svwhilelt_b32
-#define SVE_ALL svptrue_b32()
-#define SVE_WIDTH svcntw()
+#define DTYPE "s"
+#define WIDTH "w"
+#define SHIFT "2"
 #endif
 
-static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
-        SVE_TYPE acc_a = SVE_ZERO;
-        SVE_TYPE acc_b = SVE_ZERO;
+#define COUNT \
+"        cnt"WIDTH"    x9                                   \n"
+#define SETUP_TRUE \
+"        ptrue   p0."DTYPE"                              \n"
+#define OFFSET_INPUTS                                     \
+"        add     x12, %[X_], x9, lsl #"SHIFT"               \n" \
+"        add     x13, %[Y_], x9, lsl #"SHIFT"               \n"
+#define TAIL_WHILE                                        \
+"        whilelo p1."DTYPE", x8, x0                         \n"
+#define UPDATE(pg, x,y,out)                               \
+"        ld1"WIDTH"    { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"]  \n" \
+"        ld1"WIDTH"    { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"]  \n" \
+"        fmla    "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE"      \n"
+#define SUM_VECTOR(v) \
+"        faddv   "DTYPE""v", p0, z"v"."DTYPE"                     \n"
+#define RET \
+"        fadd    %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0                     \n"
 
-        BLASLONG sve_width = SVE_WIDTH;
+#define DOT_KERNEL                                        \
+        COUNT                                             \
+"        mov     z1.d, #0                             \n" \
+"        mov     z0.d, #0                             \n" \
+"        mov     x8, #0                               \n" \
+"        movi    d1, #0x0                             \n" \
+        SETUP_TRUE                                        \
+"        neg     x10, x9, lsl #1                      \n" \
+"        ands    x11, x10, x0                         \n" \
+"        b.eq    .Lskip_2x                            \n" \
+        OFFSET_INPUTS                                     \
+".Lvector_2x:                                         \n" \
+        UPDATE("p0", "%[X_]", "%[Y_]", "z1") \
+        UPDATE("p0", "x12", "x13", "z0") \
+"        sub     x8, x8, x10                          \n" \
+"        cmp     x8, x11                              \n" \
+"        b.lo    .Lvector_2x                          \n" \
+        SUM_VECTOR("1") \
+".Lskip_2x:                                           \n" \
+"        neg     x10, x9                              \n" \
+"        and     x10, x10, x0                         \n" \
+"        cmp     x8, x10                              \n" \
+"        b.hs    .Ltail                               \n" \
+".Lvector_1x:                                         \n" \
+        UPDATE("p0", "%[X_]", "%[Y_]", "z0")              \
+"        add     x8, x8, x9                           \n" \
+"        cmp     x8, x10                              \n" \
+"        b.lo    .Lvector_1x                          \n" \
+".Ltail:                                              \n" \
+"        cmp     x10, x0                              \n" \
+"        b.eq    .Lend                                \n" \
+        TAIL_WHILE                                        \
+        UPDATE("p1", "%[X_]", "%[Y_]", "z0")              \
+".Lend:                                               \n" \
+        SUM_VECTOR("0") \
+        RET
 
-        for (BLASLONG i = 0; i < n; i += sve_width * 2) {
-                svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
-                svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
+static
+FLOAT
+dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y)
+{
+  FLOAT ret;
 
-                SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
-                SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
-                SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
-                SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
+  asm(DOT_KERNEL
+      :
+        [RET_] "=&w" (ret)
+      :
+        [N_] "r" (n),
+        [X_] "r" (x),
+        [Y_] "r" (y)
+      :);
 
-                acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a);
-                acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b);
-        }
-
-        return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b);
+  return ret;
 }

From 60e66725e40ef5c6b5a8b2214c3b3a633f699c9b Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Tue, 19 Dec 2023 13:11:06 +0000
Subject: [PATCH 493/718] Use numeric labels to allow repeated inlining

---
 kernel/arm64/dot_kernel_sve.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c
index 133de4ab3..16f4cd537 100644
--- a/kernel/arm64/dot_kernel_sve.c
+++ b/kernel/arm64/dot_kernel_sve.c
@@ -67,31 +67,31 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         SETUP_TRUE                                        \
 "        neg     x10, x9, lsl #1                      \n" \
 "        ands    x11, x10, x0                         \n" \
-"        b.eq    .Lskip_2x                            \n" \
+"        b.eq    2f // skip_2x                        \n" \
         OFFSET_INPUTS                                     \
-".Lvector_2x:                                         \n" \
+"1: // vector_2x                                      \n" \
         UPDATE("p0", "%[X_]", "%[Y_]", "z1") \
         UPDATE("p0", "x12", "x13", "z0") \
 "        sub     x8, x8, x10                          \n" \
 "        cmp     x8, x11                              \n" \
-"        b.lo    .Lvector_2x                          \n" \
+"        b.lo    1b // vector_2x                      \n" \
         SUM_VECTOR("1") \
-".Lskip_2x:                                           \n" \
+"2: // skip_2x                                        \n" \
 "        neg     x10, x9                              \n" \
 "        and     x10, x10, x0                         \n" \
 "        cmp     x8, x10                              \n" \
-"        b.hs    .Ltail                               \n" \
-".Lvector_1x:                                         \n" \
+"        b.hs    4f // tail                           \n" \
+"3: // vector_1x                                      \n" \
         UPDATE("p0", "%[X_]", "%[Y_]", "z0")              \
 "        add     x8, x8, x9                           \n" \
 "        cmp     x8, x10                              \n" \
-"        b.lo    .Lvector_1x                          \n" \
-".Ltail:                                              \n" \
+"        b.lo    3b // vector_1x                      \n" \
+"4: // tail                                           \n" \
 "        cmp     x10, x0                              \n" \
-"        b.eq    .Lend                                \n" \
+"        b.eq    5f // end                            \n" \
         TAIL_WHILE                                        \
         UPDATE("p1", "%[X_]", "%[Y_]", "z0")              \
-".Lend:                                               \n" \
+"5: // end                                            \n" \
         SUM_VECTOR("0") \
         RET
 

From bb8b91e9f2c8176e7a6388fe7a949744771bb29d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 19 Dec 2023 23:13:02 +0100
Subject: [PATCH 494/718] restore OpenBLAS-specific test paths

---
 lapack-netlib/lapack_testing.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py
index 96fbeb2a6..fea0ff765 100755
--- a/lapack-netlib/lapack_testing.py
+++ b/lapack-netlib/lapack_testing.py
@@ -255,19 +255,19 @@ for dtype in range_prec:
         else:
             if dtest==16:
                 # LIN TESTS
-                cmdbase="xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="LIN/xlintst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             elif dtest==17:
                 # PROTO LIN TESTS
-                cmdbase="xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="LIN/xlintst"+letter+dtypes[0][dtype-1]+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             elif dtest==18:
                 # PROTO LIN TESTS
-                cmdbase="xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             elif dtest==20:
                 # DMD EIG TESTS
-                cmdbase="xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="EIG/xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             else:
                 # EIG TESTS
-                cmdbase="xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+                cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
         if not just_errors and not short_summary:
             print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ')
         # Run the process: either to read the file or run the LAPACK testing

From e67a0eaaf9f02e37270ac03747b333b28dac61e7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 19 Dec 2023 23:15:11 +0100
Subject: [PATCH 495/718] Restore OpenBLAS-specific build rule changes

---
 lapack-netlib/TESTING/EIG/Makefile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile
index 4e7cf4629..9cf0fc95e 100644
--- a/lapack-netlib/TESTING/EIG/Makefile
+++ b/lapack-netlib/TESTING/EIG/Makefile
@@ -136,28 +136,28 @@ double: xeigtstd xdmdeigtstd
 complex16: xeigtstz xdmdeigtstz
 
 xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
-	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 $(SDMDEIGTST): $(FRC)
 $(CDMDEIGTST): $(FRC)

From a7ed60bfe93a73df853be560244aa33a774088ac Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 21 Dec 2023 20:05:23 +0100
Subject: [PATCH 496/718] Add lower limit for multithreading

---
 interface/lapack/gesv.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c
index 175350329..546c2bed2 100644
--- a/interface/lapack/gesv.c
+++ b/interface/lapack/gesv.c
@@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
 
 #ifdef SMP
   args.common = NULL;
-  args.nthreads = num_cpu_avail(4);
+#ifndef DOUBLE
+  if (args.m*args.n < 40000)
+#else
+  if (args.m*args.n < 10000)
+#endif
+	args.nthreads=1;
+  else
+         args.nthreads = num_cpu_avail(4);
 
   if (args.nthreads == 1) {
 #endif

From ecae1389dff8bcb6643685aedb1fbaeb9b26d79d Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Sat, 23 Dec 2023 12:21:48 +0000
Subject: [PATCH 497/718] Reduce duplication in kernel definitions

These files are exactly the same, so I believe we can reduce these files
down. Other files require a slightly more complex unpicking.
---
 kernel/arm64/KERNEL.A64FX         | 207 +-----------------------------
 kernel/arm64/KERNEL.CORTEXA55     | 197 +---------------------------
 kernel/arm64/KERNEL.THUNDERX3T110 | 185 +-------------------------
 3 files changed, 3 insertions(+), 586 deletions(-)

diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX
index ccbce27e1..bc5999097 100644
--- a/kernel/arm64/KERNEL.A64FX
+++ b/kernel/arm64/KERNEL.A64FX
@@ -1,206 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = daxpy_thunderx2t99.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-SASUMKERNEL    = sasum_thunderx2t99.c
-DASUMKERNEL    = dasum_thunderx2t99.c
-CASUMKERNEL    = casum_thunderx2t99.c
-ZASUMKERNEL    = zasum_thunderx2t99.c
-
-SCOPYKERNEL    = copy_thunderx2t99.c
-DCOPYKERNEL    = copy_thunderx2t99.c
-CCOPYKERNEL    = copy_thunderx2t99.c
-ZCOPYKERNEL    = copy_thunderx2t99.c
-
-SSWAPKERNEL    = swap_thunderx2t99.S
-DSWAPKERNEL    = swap_thunderx2t99.S
-CSWAPKERNEL    = swap_thunderx2t99.S
-ZSWAPKERNEL    = swap_thunderx2t99.S
-
-ISAMAXKERNEL   = iamax_thunderx2t99.c
-IDAMAXKERNEL   = iamax_thunderx2t99.c
-ICAMAXKERNEL   = izamax_thunderx2t99.c
-IZAMAXKERNEL   = izamax_thunderx2t99.c
-
-SNRM2KERNEL    = scnrm2_thunderx2t99.c
-DNRM2KERNEL    = dznrm2_thunderx2t99.c
-CNRM2KERNEL    = scnrm2_thunderx2t99.c
-ZNRM2KERNEL    = dznrm2_thunderx2t99.c
-
-DDOTKERNEL     = dot.c
-SDOTKERNEL     = dot.c
-CDOTKERNEL     = zdot_thunderx2t99.c
-ZDOTKERNEL     = zdot_thunderx2t99.c
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMITCOPY    =  gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMITCOPY    =  gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55
index e2e006770..574e98b8c 100644
--- a/kernel/arm64/KERNEL.CORTEXA55
+++ b/kernel/arm64/KERNEL.CORTEXA55
@@ -1,196 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL   = ../generic/dot.c
-else
-SDOTKERNEL   = dot.S
-endif
-DDOTKERNEL   = dot.S
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL   = zdot.S
-ZDOTKERNEL   = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL  = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-else
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-endif
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
-DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
-CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
-ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+include $(KERNELDIR)/KERNEL.CORTEXA53
diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110
index 41cedc851..5d3bd69f7 100644
--- a/kernel/arm64/KERNEL.THUNDERX3T110
+++ b/kernel/arm64/KERNEL.THUNDERX3T110
@@ -1,184 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = daxpy_thunderx2t99.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-SASUMKERNEL    = sasum_thunderx2t99.c
-DASUMKERNEL    = dasum_thunderx2t99.c
-CASUMKERNEL    = casum_thunderx2t99.c
-ZASUMKERNEL    = zasum_thunderx2t99.c
-
-SCOPYKERNEL    = copy_thunderx2t99.c
-DCOPYKERNEL    = copy_thunderx2t99.c
-CCOPYKERNEL    = copy_thunderx2t99.c
-ZCOPYKERNEL    = copy_thunderx2t99.c
-
-SSWAPKERNEL    = swap_thunderx2t99.S
-DSWAPKERNEL    = swap_thunderx2t99.S
-CSWAPKERNEL    = swap_thunderx2t99.S
-ZSWAPKERNEL    = swap_thunderx2t99.S
-
-ISAMAXKERNEL   = iamax_thunderx2t99.c
-IDAMAXKERNEL   = iamax_thunderx2t99.c
-ICAMAXKERNEL   = izamax_thunderx2t99.c
-IZAMAXKERNEL   = izamax_thunderx2t99.c
-
-SNRM2KERNEL    = scnrm2_thunderx2t99.c
-CNRM2KERNEL    = scnrm2_thunderx2t99.c
-#DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-#ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-DNRM2KERNEL    = dznrm2_thunderx2t99.c
-ZNRM2KERNEL    = dznrm2_thunderx2t99.c
-
-
-DDOTKERNEL     = dot.c
-SDOTKERNEL     = dot.c
-CDOTKERNEL     = zdot_thunderx2t99.c
-ZDOTKERNEL     = zdot_thunderx2t99.c
-DSDOTKERNEL    = dot.S
-
-ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
-DGEMMKERNEL    = dgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
-SGEMMKERNEL    =  sgemm_kernel_16x4_thunderx2t99.S
-endif
-
-ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
-CGEMMKERNEL    =  cgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
-ZGEMMKERNEL    =  zgemm_kernel_4x4_thunderx2t99.S
-endif
+include $(KERNELDIR)/KERNEL.THUNDERX2T99

From dc20a7818899cd6fbcf192b93e1940b353f83a0f Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Sat, 23 Dec 2023 12:19:33 +0000
Subject: [PATCH 498/718] Use functionally equivalent dynamic targets

Similar to `drivers/other/dynamic.c`, I've looked for functionally
equivalent targets and mapped them in the default DYNAMIC_ARCH build.
Users can still build specific cores using DYNAMIC_LIST.
---
 Makefile.system               | 4 ----
 driver/others/dynamic_arm64.c | 8 ++++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index ff06e503c..e602eaf05 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -677,16 +677,12 @@ ifeq ($(ARCH), arm64)
 DYNAMIC_CORE =  ARMV8
 DYNAMIC_CORE += CORTEXA53
 DYNAMIC_CORE += CORTEXA57
-DYNAMIC_CORE += CORTEXA72
-DYNAMIC_CORE += CORTEXA73
 DYNAMIC_CORE += NEOVERSEN1
 ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
 DYNAMIC_CORE += ARMV8SVE
 endif
-DYNAMIC_CORE += CORTEXA55
-DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 530d18115..b5fb8161d 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -122,10 +122,11 @@ extern gotoblas_t  gotoblas_CORTEXA55;
 #endif
 #else
 extern gotoblas_t  gotoblas_CORTEXA53;
+#define gotoblas_CORTEXA55 gotoblas_CORTEXA53
 extern gotoblas_t  gotoblas_CORTEXA57;
-extern gotoblas_t  gotoblas_CORTEXA72;
-extern gotoblas_t  gotoblas_CORTEXA73;
-extern gotoblas_t  gotoblas_FALKOR;
+#define gotoblas_CORTEXA72 gotoblas_CORTEXA57
+#define gotoblas_CORTEXA73 gotoblas_CORTEXA57
+#define gotoblas_FALKOR gotoblas_CORTEXA57
 extern gotoblas_t  gotoblas_THUNDERX;
 extern gotoblas_t  gotoblas_THUNDERX2T99;
 extern gotoblas_t  gotoblas_TSV110;
@@ -141,7 +142,6 @@ extern gotoblas_t  gotoblas_ARMV8SVE;
 #define gotoblas_ARMV8SVE   gotoblas_ARMV8
 #endif
 extern gotoblas_t  gotoblas_THUNDERX3T110;
-extern gotoblas_t  gotoblas_CORTEXA55;
 #endif
 
 extern void openblas_warning(int verbose, const char * msg);

From 8ce44c18a0e81527f7e9db98ee6276af4a408b8f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 19:24:10 +0100
Subject: [PATCH 499/718] Handle corner cases of LWORK (Reference-LAPACK PR
 942)

---
 lapack-netlib/TESTING/EIG/cerrst.f | 12 ++++++------
 lapack-netlib/TESTING/EIG/chkxer.f |  2 +-
 lapack-netlib/TESTING/EIG/derrst.f |  4 ++--
 lapack-netlib/TESTING/EIG/serrst.f |  4 ++--
 lapack-netlib/TESTING/EIG/zerrst.f | 12 ++++++------
 5 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f
index 1748a2aad..d23eb14ea 100644
--- a/lapack-netlib/TESTING/EIG/cerrst.f
+++ b/lapack-netlib/TESTING/EIG/cerrst.f
@@ -748,17 +748,17 @@
          CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
+     $                M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ),
+     $                M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1,
+     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0,
      $                INFO )
          CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK )
          NT = NT + 12
@@ -830,19 +830,19 @@
          INFOT = 18
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0, 0.0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
+     $                M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0, 0.0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ),
+     $                M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0, 0.0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1,
+     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0,
      $                INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          NT = NT + 13
diff --git a/lapack-netlib/TESTING/EIG/chkxer.f b/lapack-netlib/TESTING/EIG/chkxer.f
index fd00bb65a..70caf7e0a 100644
--- a/lapack-netlib/TESTING/EIG/chkxer.f
+++ b/lapack-netlib/TESTING/EIG/chkxer.f
@@ -61,7 +61,7 @@
       RETURN
 *
  9999 FORMAT( ' *** Illegal value of parameter number ', I2,
-     $      ' not detected by ', A6, ' ***' )
+     $      ' not detected by ', A, ' ***' )
 *
 *     End of CHKXER
 *
diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f
index 059538644..7d111e2e0 100644
--- a/lapack-netlib/TESTING/EIG/derrst.f
+++ b/lapack-netlib/TESTING/EIG/derrst.f
@@ -735,12 +735,12 @@
          CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N,
+     $                M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N,
      $                INFO )
          CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1,
+     $                M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0,
      $                INFO )
          CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
          NT = NT + 11
diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f
index b87fc42ef..408346382 100644
--- a/lapack-netlib/TESTING/EIG/serrst.f
+++ b/lapack-netlib/TESTING/EIG/serrst.f
@@ -733,12 +733,12 @@
          CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N,
+     $                M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N,
      $                INFO )
          CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1,
+     $                M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0,
      $                INFO )
          CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
          NT = NT + 11
diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f
index d7b41c053..31881c4de 100644
--- a/lapack-netlib/TESTING/EIG/zerrst.f
+++ b/lapack-netlib/TESTING/EIG/zerrst.f
@@ -748,17 +748,17 @@
          CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
+     $                M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ),
+     $                M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1,
+     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0,
      $                INFO )
          CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK )
          NT = NT + 12
@@ -830,19 +830,19 @@
          INFOT = 18
          CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
+     $                M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ),
+     $                M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1,
+     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0,
      $                INFO )
          CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          NT = NT + 13

From 5c11b2ff41607ef923593ff73f0317aa93be30a5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 19:27:20 +0100
Subject: [PATCH 500/718] Handle corner cases of LWORK (Reference-LAPACK PR
 942)

---
 lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f |  7 +++----
 lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f |  4 ++--
 lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f |  5 ++---
 lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f |  4 ++--
 lapack-netlib/TESTING/LIN/schksy_aa_2stage.f |  5 ++---
 lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f |  4 ++--
 lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f | 11 ++++++-----
 lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f |  4 ++--
 8 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f
index 30a61261f..d79978e55 100644
--- a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f
@@ -433,9 +433,9 @@
 *                 block factorization, LWORK is the length of AINV.
 *
                   SRNAMT = 'CHETRF_AA_2STAGE'
-                  LWORK = MIN(N*NB, 3*NMAX*NMAX)
-                  CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, 
-     $                                   AINV, (3*NB+1)*N, 
+                  LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX)
+                  CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA,
+     $                                   AINV, MAX( 1, (3*NB+1)*N ),
      $                                   IWORK, IWORK( 1+N ),
      $                                   WORK, LWORK,
      $                                   INFO )
@@ -517,7 +517,6 @@ c                  NT = 1
                      CALL CLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                      SRNAMT = 'CHETRS_AA_2STAGE'
-                     LWORK = MAX( 1, 3*N-2 )
                      CALL CHETRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $                            AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $                            X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f
index 51cef512d..83e8a17b0 100644
--- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f
@@ -400,9 +400,9 @@
 *                    Factor the matrix and solve the system using CHESV_AA.
 *
                      SRNAMT = 'CHESV_AA_2STAGE '
-                     LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                     LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX)
                      CALL CHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                                 AINV, (3*NB+1)*N, 
+     $                                 AINV, MAX( 1, (3*NB+1)*N ),
      $                                 IWORK, IWORK( 1+N ),
      $                                 X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
index bc4e77a5a..1940351a4 100644
--- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
@@ -421,9 +421,9 @@
 *                 block factorization, LWORK is the length of AINV.
 *
                   SRNAMT = 'DSYTRF_AA_2STAGE'
-                  LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                  LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                   CALL DSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, 
-     $                                   AINV, (3*NB+1)*N, 
+     $                                   AINV, MAX( 1, (3*NB+1)*N ),
      $                                   IWORK, IWORK( 1+N ),
      $                                   WORK, LWORK,
      $                                   INFO )
@@ -503,7 +503,6 @@ c                  NT = 1
                      CALL DLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                      SRNAMT = 'DSYTRS_AA_2STAGE'
-                     LWORK = MAX( 1, 3*N-2 )
                      CALL DSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $                            AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $                            X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
index 91c9e8e9a..d04106ae3 100644
--- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
@@ -400,9 +400,9 @@
 *                    Factor the matrix and solve the system using DSYSV_AA.
 *
                      SRNAMT = 'DSYSV_AA_2STAGE '
-                     LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                     LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                      CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                                 AINV, (3*NB+1)*N, 
+     $                                 AINV, MAX( 1, (3*NB+1)*N ),
      $                                 IWORK, IWORK( 1+N ),
      $                                 X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
index d3c27ae56..6490cd7c3 100644
--- a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
@@ -423,9 +423,9 @@
 *                 block factorization, LWORK is the length of AINV.
 *
                   SRNAMT = 'SSYTRF_AA_2STAGE'
-                  LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                  LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                   CALL SSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, 
-     $                                   AINV, (3*NB+1)*N, 
+     $                                   AINV, MAX( 1, (3*NB+1)*N ),
      $                                   IWORK, IWORK( 1+N ),
      $                                   WORK, LWORK,
      $                                   INFO )
@@ -505,7 +505,6 @@
                      CALL SLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                      SRNAMT = 'SSYTRS_AA_2STAGE'
-                     LWORK = MAX( 1, 3*N-2 )
                      CALL SSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $                            AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $                            X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
index aff32bce9..319b90805 100644
--- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
@@ -400,9 +400,9 @@
 *                    Factor the matrix and solve the system using SSYSV_AA.
 *
                      SRNAMT = 'SSYSV_AA_2STAGE '
-                     LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                     LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                      CALL SSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                                 AINV, (3*NB+1)*N, 
+     $                                 AINV, MAX( 1, (3*NB+1)*N ),
      $                                 IWORK, IWORK( 1+N ),
      $                                 X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
index 381fac9f2..51082f1d0 100644
--- a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
@@ -8,7 +8,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, 
+*       SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL,
 *                             NNS, NSVAL, THRESH, TSTERR, NMAX, A,
 *                             AFAC, AINV, B, X, XACT, WORK, RWORK,
 *                             IWORK, NOUT )
@@ -185,7 +185,8 @@
       LOGICAL            DOTYPE( * )
       INTEGER            IWORK( * ), NBVAL( * ), NSVAL( * ), NVAL( * )
       COMPLEX*16         A( * ), AFAC( * ), AINV( * ), B( * ),
-     $                   RWORK( * ), WORK( * ), X( * ), XACT( * )
+     $                   WORK( * ), X( * ), XACT( * )
+      DOUBLE PRECISION   RWORK( * )
 *     ..
 *
 *  =====================================================================
@@ -430,9 +431,9 @@
 *                 block factorization, LWORK is the length of AINV.
 *
                   SRNAMT = 'ZHETRF_AA_2STAGE'
-                  LWORK = MIN(N*NB, 3*NMAX*NMAX)
-                  CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, 
-     $                                   AINV, (3*NB+1)*N, 
+                  LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
+                  CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA,
+     $                                   AINV, MAX( 1, (3*NB+1)*N ),
      $                                   IWORK, IWORK( 1+N ),
      $                                   WORK, LWORK,
      $                                   INFO )
diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
index 9401867e0..fcd774491 100644
--- a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
@@ -400,9 +400,9 @@
 *                    Factor the matrix and solve the system using ZHESV_AA.
 *
                      SRNAMT = 'ZHESV_AA_2STAGE '
-                     LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                     LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                      CALL ZHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                                 AINV, (3*NB+1)*N, 
+     $                                 AINV, MAX( 1, (3*NB+1)*N ),
      $                                 IWORK, IWORK( 1+N ),
      $                                 X, LDA, WORK, LWORK, INFO )
 *

From 0814491d968da09eebee620fe573750a4b703944 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 19:37:03 +0100
Subject: [PATCH 501/718] Handle corner cases of LWORK (Reference-LAPACK PR
 942)

---
 lapack-netlib/SRC/cgebrd.f           | 32 ++++++----
 lapack-netlib/SRC/cgehrd.f           | 20 +++---
 lapack-netlib/SRC/cgelq.f            |  8 +--
 lapack-netlib/SRC/cgelqf.f           | 20 +++---
 lapack-netlib/SRC/cgemlq.f           | 35 +++++++----
 lapack-netlib/SRC/cgemqr.f           | 31 ++++++---
 lapack-netlib/SRC/cgeqlf.f           | 12 ++--
 lapack-netlib/SRC/cgeqp3rk.f         | 24 +++----
 lapack-netlib/SRC/cgeqr.f            | 23 ++++---
 lapack-netlib/SRC/cgeqrfp.f          | 26 +++++---
 lapack-netlib/SRC/cgesvdx.f          | 14 ++---
 lapack-netlib/SRC/cgesvj.f           | 70 ++++++++++++---------
 lapack-netlib/SRC/cgetri.f           |  6 +-
 lapack-netlib/SRC/cgetsls.f          |  7 ++-
 lapack-netlib/SRC/cgetsqrhrt.f       | 27 +++++---
 lapack-netlib/SRC/cgges3.f           | 40 +++++++-----
 lapack-netlib/SRC/cggev3.f           | 28 +++++----
 lapack-netlib/SRC/cgghd3.f           | 22 ++++---
 lapack-netlib/SRC/cggqrf.f           |  6 +-
 lapack-netlib/SRC/cggrqf.f           |  6 +-
 lapack-netlib/SRC/cggsvd3.f          |  4 +-
 lapack-netlib/SRC/cggsvp3.f          |  4 +-
 lapack-netlib/SRC/cheevd.f           | 11 ++--
 lapack-netlib/SRC/cheevr.f           | 31 +++++----
 lapack-netlib/SRC/cheevr_2stage.f    | 57 ++++++++++-------
 lapack-netlib/SRC/cheevx.f           |  6 +-
 lapack-netlib/SRC/chesv_aa.f         | 15 ++---
 lapack-netlib/SRC/chesv_aa_2stage.f  | 24 +++----
 lapack-netlib/SRC/chesvx.f           | 11 ++--
 lapack-netlib/SRC/chetrd_2stage.f    | 94 +++++++++++++++-------------
 lapack-netlib/SRC/chetrd_hb2st.F     | 40 +++++++-----
 lapack-netlib/SRC/chetrd_he2hb.f     | 20 +++---
 lapack-netlib/SRC/chetrf.f           |  8 +--
 lapack-netlib/SRC/chetrf_aa.f        | 29 ++++++---
 lapack-netlib/SRC/chetrf_aa_2stage.f | 25 ++++----
 lapack-netlib/SRC/chetrf_rk.f        | 10 +--
 lapack-netlib/SRC/chetrf_rook.f      |  6 +-
 lapack-netlib/SRC/chetri2.f          | 30 +++++----
 lapack-netlib/SRC/chetri_3.f         | 21 ++++---
 lapack-netlib/SRC/chetrs_aa.f        | 27 +++++---
 lapack-netlib/SRC/clamswlq.f         | 66 +++++++++++--------
 lapack-netlib/SRC/clamtsqr.f         | 74 ++++++++++++----------
 lapack-netlib/SRC/claswlq.f          | 80 +++++++++++++----------
 lapack-netlib/SRC/clatrs3.f          | 32 +++++++---
 lapack-netlib/SRC/clatsqr.f          | 89 +++++++++++++++-----------
 lapack-netlib/SRC/dsytrf.f           |  5 +-
 lapack-netlib/SRC/ssytrd_sb2st.F     | 40 +++++++-----
 47 files changed, 783 insertions(+), 533 deletions(-)

diff --git a/lapack-netlib/SRC/cgebrd.f b/lapack-netlib/SRC/cgebrd.f
index 5687161a5..5920b1cf5 100644
--- a/lapack-netlib/SRC/cgebrd.f
+++ b/lapack-netlib/SRC/cgebrd.f
@@ -123,7 +123,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,M,N).
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise.
 *>          For optimum performance LWORK >= (M+N)*NB, where NB
 *>          is the optimal blocksize.
 *>
@@ -148,7 +149,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gebrd
 *
 *> \par Further Details:
 *  =====================
@@ -225,8 +226,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB,
-     $                   NBMIN, NX, WS
+      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT,
+     $                   MINMN, NB, NBMIN, NX, WS
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEBD2, CGEMM, CLABRD, XERBLA
@@ -236,16 +237,24 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
       INFO = 0
-      NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) )
-      LWKOPT = ( M+N )*NB
-      WORK( 1 ) = REAL( LWKOPT )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = MAX( M, N )
+         NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) )
+         LWKOPT = ( M+N )*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -253,7 +262,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.LT.0 ) THEN
@@ -265,7 +274,6 @@
 *
 *     Quick return if possible
 *
-      MINMN = MIN( M, N )
       IF( MINMN.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -284,7 +292,7 @@
 *        Determine when to switch from blocked to unblocked code.
 *
          IF( NX.LT.MINMN ) THEN
-            WS = ( M+N )*NB
+            WS = LWKOPT
             IF( LWORK.LT.WS ) THEN
 *
 *              Not enough work space for the optimal NB, consider using
@@ -343,7 +351,7 @@
 *
       CALL CGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ),
      $             TAUQ( I ), TAUP( I ), WORK, IINFO )
-      WORK( 1 ) = WS
+      WORK( 1 ) = SROUNDUP_LWORK( WS )
       RETURN
 *
 *     End of CGEBRD
diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f
index f407f931a..7ba87cc01 100644
--- a/lapack-netlib/SRC/cgehrd.f
+++ b/lapack-netlib/SRC/cgehrd.f
@@ -89,7 +89,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (LWORK)
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -222,13 +222,19 @@
          INFO = -8
       END IF
 *
+      NH = IHI - ILO + 1
       IF( INFO.EQ.0 ) THEN
 *
 *        Compute the workspace requirements
 *
-         NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) )
-         LWKOPT = N*NB + TSIZE
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         IF( NH.LE.1 ) THEN
+            LWKOPT = 1
+         ELSE
+            NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI,
+     $                              -1 ) )
+            LWKOPT = N*NB + TSIZE
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -249,7 +255,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -269,7 +274,7 @@
 *
 *           Determine if workspace is large enough for blocked code
 *
-            IF( LWORK.LT.N*NB+TSIZE ) THEN
+            IF( LWORK.LT.LWKOPT ) THEN
 *
 *              Not enough workspace to use optimal NB:  determine the
 *              minimum value of NB, and reduce NB or force use of
@@ -345,7 +350,8 @@
 *     Use unblocked code to reduce the rest of the matrix
 *
       CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO )
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f
index ff482bc42..24aaa982e 100644
--- a/lapack-netlib/SRC/cgelq.f
+++ b/lapack-netlib/SRC/cgelq.f
@@ -98,7 +98,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -295,9 +295,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+          WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
         ELSE
-          WORK( 1 ) = SROUNDUP_LWORK(LWREQ)
+          WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -322,7 +322,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWREQ)
+      WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f
index 75f5bc960..3847a958a 100644
--- a/lapack-netlib/SRC/cgelqf.f
+++ b/lapack-netlib/SRC/cgelqf.f
@@ -93,7 +93,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,M).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise.
 *>          For optimum performance LWORK >= M*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -175,9 +176,8 @@
 *     Test the input arguments
 *
       INFO = 0
+      K = MIN( M, N )
       NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 )
-      LWKOPT = M*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -185,19 +185,25 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
-         INFO = -7
+      ELSE IF( .NOT.LQUERY ) THEN
+         IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
+     $      INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CGELQF', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
+         IF( K.EQ.0 ) THEN
+            LWKOPT = 1
+         ELSE
+            LWKOPT = M*NB
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -267,7 +273,7 @@
      $   CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGELQF
diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f
index e0cf78bc0..e5b02b669 100644
--- a/lapack-netlib/SRC/cgemlq.f
+++ b/lapack-netlib/SRC/cgemlq.f
@@ -110,16 +110,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -143,7 +144,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -159,11 +160,13 @@
 *>  block sizes MB and NB returned by ILAENV, CGELQ will use either
 *>  CLASWLQ (if the matrix is wide-and-short) or CGELQT to compute
 *>  the LQ factorization.
-*>  This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to 
+*>  This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in CLAMSWLQ or CGEMLQT.
 *> \endverbatim
 *>
+*> \ingroup gemlq
+*>
 *  =====================================================================
       SUBROUTINE CGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -185,11 +188,12 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLAMSWLQ, CGEMLQT, XERBLA
@@ -201,7 +205,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -216,6 +220,13 @@
         LW = M * MB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN
@@ -244,12 +255,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = REAL( LW )
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -261,7 +272,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -274,7 +285,7 @@
      $                 MB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = REAL( LW )
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f
index ea9de146e..0b7dd9dd7 100644
--- a/lapack-netlib/SRC/cgemqr.f
+++ b/lapack-netlib/SRC/cgemqr.f
@@ -111,16 +111,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +145,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -166,6 +167,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup gemqr
+*>
 *  =====================================================================
       SUBROUTINE CGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -187,11 +190,12 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEMQRT, CLAMTSQR, XERBLA
@@ -203,7 +207,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -218,6 +222,13 @@
         LW = MB * NB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, MB - K ).EQ.0 ) THEN
@@ -251,7 +262,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -263,7 +274,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -276,7 +287,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f
index 918bbddad..6c67344c5 100644
--- a/lapack-netlib/SRC/cgeqlf.f
+++ b/lapack-netlib/SRC/cgeqlf.f
@@ -88,7 +88,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -187,10 +188,11 @@
             NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
-         IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
-            INFO = -7
+         IF( .NOT.LQUERY ) THEN
+            IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) )
+     $         INFO = -7
          END IF
       END IF
 *
@@ -277,7 +279,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGEQLF
diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f
index 587860684..731c44edb 100644
--- a/lapack-netlib/SRC/cgeqp3rk.f
+++ b/lapack-netlib/SRC/cgeqp3rk.f
@@ -428,7 +428,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*.          LWORK >= N+NRHS-1
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= N+NRHS-1, otherwise.
 *>          For optimal performance LWORK >= NB*( N+NRHS+1 ),
 *>          where NB is the optimal block size for CGEQP3RK returned
 *>          by ILAENV. Minimal block size MINNB=2.
@@ -627,8 +628,9 @@
 *     .. External Functions ..
       LOGICAL            SISNAN
       INTEGER            ISAMAX, ILAENV
-      REAL               SLAMCH, SCNRM2
-      EXTERNAL           SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV
+      REAL               SLAMCH, SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          CMPLX, MAX, MIN
@@ -703,7 +705,7 @@
 *
             LWKOPT = 2*N + NB*( N+NRHS+1 )
          END IF
-         WORK( 1 ) = CMPLX( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
          IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
             INFO = -15
@@ -726,7 +728,7 @@
          K = 0
          MAXC2NRMK = ZERO
          RELMAXC2NRMK = ZERO
-         WORK( 1 ) = CMPLX( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -778,7 +780,7 @@
 *
 *        Array TAU is not set and contains undefined elements.
 *
-         WORK( 1 ) = CMPLX( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -797,7 +799,7 @@
             TAU( J ) = CZERO
          END DO
 *
-         WORK( 1 ) = CMPLX( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
 *
       END IF
@@ -828,7 +830,7 @@
          DO J = 1, MINMN
             TAU( J ) = CZERO
          END DO
-         WORK( 1 ) = CMPLX( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -873,7 +875,7 @@
             TAU( J ) = CZERO
          END DO
 *
-         WORK( 1 ) = CMPLX( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -991,7 +993,7 @@
 *
 *              Return from the routine.
 *
-               WORK( 1 ) = CMPLX( LWKOPT )
+               WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
                RETURN
 *
@@ -1082,7 +1084,7 @@
 *
       END IF
 *
-      WORK( 1 ) = CMPLX( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f
index d10e3da65..3617594d0 100644
--- a/lapack-netlib/SRC/cgeqr.f
+++ b/lapack-netlib/SRC/cgeqr.f
@@ -99,7 +99,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -168,6 +168,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup geqr
+*>
 *  =====================================================================
       SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -188,11 +190,12 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLATSQR, CGEQRT, XERBLA
@@ -244,8 +247,10 @@
 *
 *     Determine if the workspace size satisfies minimal size
 *
+      LWMIN = MAX( 1, N )
+      LWREQ = MAX( 1, N*NB )
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N )
+      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ )
      $    .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN
@@ -253,7 +258,7 @@
           NB = 1
           MB = M
         END IF
-        IF( LWORK.LT.NB*N ) THEN
+        IF( LWORK.LT.LWREQ ) THEN
           LMINWS = .TRUE.
           NB = 1
         END IF
@@ -268,7 +273,7 @@
       ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -282,9 +287,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = MAX( 1, N )
+          WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
         ELSE
-          WORK( 1 ) = MAX( 1, NB*N )
+          WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -309,7 +314,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, NB*N )
+      WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f
index eaf98ddf3..5b6226c67 100644
--- a/lapack-netlib/SRC/cgeqrfp.f
+++ b/lapack-netlib/SRC/cgeqrfp.f
@@ -97,7 +97,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -162,8 +163,8 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB,
-     $                   NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQR2P, CLARFB, CLARFT, XERBLA
@@ -182,8 +183,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -191,7 +200,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -203,7 +212,6 @@
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -211,7 +219,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
@@ -273,7 +281,7 @@
      $   CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGEQRFP
diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f
index fbdb121ca..e1856a65f 100644
--- a/lapack-netlib/SRC/cgesvdx.f
+++ b/lapack-netlib/SRC/cgesvdx.f
@@ -208,7 +208,7 @@
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
-*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
@@ -261,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsing
+*> \ingroup gesvdx
 *
 *  =====================================================================
       SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU,
@@ -312,8 +312,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -448,7 +448,7 @@
             END IF
          END IF
          MAXWRK = MAX( MAXWRK, MINWRK )
-         WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO )
+         WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -19
@@ -464,7 +464,7 @@
 *
 *     Quick return if possible
 *
-      IF( M.EQ.0 .OR. N.EQ.0 ) THEN
+      IF( MINMN.EQ.0 ) THEN
          RETURN
       END IF
 *
@@ -846,7 +846,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO )
+      WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f
index 149cf5e48..b9c8f1709 100644
--- a/lapack-netlib/SRC/cgesvj.f
+++ b/lapack-netlib/SRC/cgesvj.f
@@ -208,15 +208,17 @@
 *> \verbatim
 *>          CWORK is COMPLEX array, dimension (max(1,LWORK))
 *>          Used as workspace.
-*>          If on entry LWORK = -1, then a workspace query is assumed and
-*>          no computation is done; CWORK(1) is set to the minial (and optimal)
-*>          length of CWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER.
-*>          Length of CWORK, LWORK >= M+N.
+*>          Length of CWORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; CWORK(1) is set to the minial (and optimal)
+*>          length of CWORK.
 *> \endverbatim
 *>
 *> \param[in,out] RWORK
@@ -247,15 +249,17 @@
 *>          RWORK(6) = the largest absolute value over all sines of the
 *>                    Jacobi rotation angles in the last sweep. It can be
 *>                    useful for a post festum analysis.
-*>         If on entry LRWORK = -1, then a workspace query is assumed and
-*>         no computation is done; RWORK(1) is set to the minial (and optimal)
-*>         length of RWORK.
 *> \endverbatim
 *>
 *> \param[in] LRWORK
 *> \verbatim
 *>         LRWORK is INTEGER
-*>         Length of RWORK, LRWORK >= MAX(6,N).
+*>         Length of RWORK.
+*>         LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise
+*>
+*>         If on entry LRWORK = -1, then a workspace query is assumed and
+*>         no computation is done; RWORK(1) is set to the minial (and optimal)
+*>         length of RWORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -276,7 +280,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -374,16 +378,17 @@
       PARAMETER  ( NSWEEP = 30 )
 *     ..
 *     .. Local Scalars ..
-      COMPLEX AAPQ, OMPQ
-      REAL    AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
-     $        BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
-     $        MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
-     $        SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
-      INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
-     $        ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
-     $        N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND
-      LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
-     $        RSVEC, UCTOL, UPPER
+      COMPLEX    AAPQ, OMPQ
+      REAL       AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
+     $           BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
+     $           MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
+     $           SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
+      INTEGER    BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
+     $           ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
+     $           N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND,
+     $           MINMN, LWMIN, LRWMIN
+      LOGICAL    APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
+     $           RSVEC, UCTOL, UPPER
 *     ..
 *     ..
 *     .. Intrinsic Functions ..
@@ -398,8 +403,8 @@
       INTEGER            ISAMAX
       EXTERNAL           ISAMAX
 *     from LAPACK
-      REAL               SLAMCH
-      EXTERNAL           SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK
       LOGICAL            LSAME
       EXTERNAL           LSAME
 *     ..
@@ -422,7 +427,16 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
-      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+      ELSE
+         LWMIN  = M + N
+         LRWMIN = MAX( 6, N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
@@ -442,9 +456,9 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( LWORK.LT.( M+N ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
-      ELSE IF( LRWORK.LT.MAX( N, 6 ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -15
       ELSE
          INFO = 0
@@ -454,15 +468,15 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CGESVJ', -INFO )
          RETURN
-      ELSE IF ( LQUERY ) THEN
-         CWORK(1) = M + N
-         RWORK(1) = MAX( N, 6 )
+      ELSE IF( LQUERY ) THEN
+         CWORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+         RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
          RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN
+      IF( MINMN.EQ.0 ) RETURN
 *
 *     Set numerical parameters
 *     The stopping criterion for Jacobi rotations is
diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f
index 2060d1444..2eb3da7ab 100644
--- a/lapack-netlib/SRC/cgetri.f
+++ b/lapack-netlib/SRC/cgetri.f
@@ -153,8 +153,8 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      LWKOPT = MAX( 1, N*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -252,7 +252,7 @@
      $      CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 )
    60 CONTINUE
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGETRI
diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f
index b4bb7562f..3f43dc8de 100644
--- a/lapack-netlib/SRC/cgetsls.f
+++ b/lapack-netlib/SRC/cgetsls.f
@@ -127,7 +127,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed.
 *>          If LWORK = -1, the routine calculates optimal size of WORK for the
 *>          optimal performance and returns this value in WORK(1).
@@ -229,7 +229,10 @@
 *
 *     Determine the optimum and minimum LWORK
 *
-       IF( M.GE.N ) THEN
+       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
+         WSIZEO = 1
+         WSIZEM = 1
+       ELSE IF ( M.GE.N ) THEN
          CALL CGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 )
          TSZO = INT( TQ( 1 ) )
          LWO  = INT( WORKQ( 1 ) )
diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f
index 4e4dc1d4a..087e9bc7f 100644
--- a/lapack-netlib/SRC/cgetsqrhrt.f
+++ b/lapack-netlib/SRC/cgetsqrhrt.f
@@ -131,13 +131,15 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
-*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          If MIN(M,N) = 0, LWORK >= 1, else
+*>          LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
-*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ).
+*>
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
@@ -160,7 +162,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup comlpexOTHERcomputational
+*> \ingroup getsqrhrt
 *
 *> \par Contributors:
 *  ==================
@@ -200,6 +202,10 @@
       INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
      $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL,
      $                   XERBLA
@@ -212,7 +218,7 @@
 *     Test the input arguments
 *
       INFO = 0
-      LQUERY  = LWORK.EQ.-1
+      LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
       ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
@@ -225,7 +231,7 @@
          INFO = -5
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -7
-      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
          INFO = -9
       ELSE
 *
@@ -263,8 +269,9 @@
             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
             LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+            LWORKOPT = MAX( 1, LWORKOPT )
 *
-            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+            IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN
                INFO = -11
             END IF
 *
@@ -277,14 +284,14 @@
          CALL XERBLA( 'CGETSQRHRT', -INFO )
          RETURN
       ELSE IF ( LQUERY ) THEN
-         WORK( 1 ) = CMPLX( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
       IF( MIN( M, N ).EQ.0 ) THEN
-         WORK( 1 ) = CMPLX( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
@@ -341,9 +348,9 @@
          END IF
       END DO
 *
-      WORK( 1 ) = CMPLX( LWORKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
       RETURN
 *
 *     End of CGETSQRHRT
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/cgges3.f b/lapack-netlib/SRC/cgges3.f
index aac9f9510..c1ca79688 100644
--- a/lapack-netlib/SRC/cgges3.f
+++ b/lapack-netlib/SRC/cgges3.f
@@ -215,7 +215,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= MAX(1,2*N).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -260,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup gges3
 *
 *  =====================================================================
       SUBROUTINE CGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B,
@@ -300,7 +301,8 @@
       LOGICAL            CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL,
      $                   LQUERY, WANTST
       INTEGER            I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT,
-     $                   ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT
+     $                   ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT,
+     $                   LWKMIN
       REAL               ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL,
      $                   PVSR, SMLNUM
 *     ..
@@ -310,13 +312,12 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY,
-     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -353,6 +354,8 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
+*
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -369,7 +372,7 @@
          INFO = -14
       ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN
          INFO = -16
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
@@ -377,29 +380,33 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 1,  N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) )
          CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
-         LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          IF( ILVSL ) THEN
             CALL CUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1,
      $                   IERR )
-            LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          END IF
          CALL CGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL,
      $                LDVSL, VSR, LDVSR, WORK, -1, IERR )
-         LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          CALL CLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB,
      $                ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1,
      $                RWORK, 0, IERR )
-         LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          IF( WANTST ) THEN
             CALL CTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB,
      $                   ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM,
      $                   PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR )
-            LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          END IF
-         WORK( 1 ) = CMPLX( LWKOPT )
       END IF
 
 *
@@ -422,7 +429,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -585,7 +591,7 @@
 *
    30 CONTINUE
 *
-      WORK( 1 ) = CMPLX( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f
index 9483ecdeb..d2b75aebc 100644
--- a/lapack-netlib/SRC/cggev3.f
+++ b/lapack-netlib/SRC/cggev3.f
@@ -174,7 +174,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= MAX(1,2*N).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -208,7 +209,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup ggev3
 *
 *  =====================================================================
       SUBROUTINE CGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA,
@@ -243,7 +244,7 @@
       CHARACTER          CHTEMP
       INTEGER            ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO,
      $                   IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR,
-     $                   LWKOPT
+     $                   LWKOPT, LWKMIN
       REAL               ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS,
      $                   SMLNUM, TEMP
       COMPLEX            X
@@ -253,13 +254,12 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY,
-     $                   CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, MAX, REAL, SQRT
@@ -301,6 +301,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -315,7 +316,7 @@
          INFO = -11
       ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -15
       END IF
 *
@@ -323,7 +324,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( N,  N+INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) )
          CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
          LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
@@ -348,7 +349,11 @@
      $                   RWORK, 0, IERR )
             LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
          END IF
-         WORK( 1 ) = CMPLX( LWKOPT )
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -368,7 +373,6 @@
       EPS = SLAMCH( 'E' )*SLAMCH( 'B' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -549,7 +553,7 @@
       IF( ILBSCL )
      $   CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
 *
-      WORK( 1 ) = CMPLX( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CGGEV3
diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f
index 1074b4828..f7175a72c 100644
--- a/lapack-netlib/SRC/cgghd3.f
+++ b/lapack-netlib/SRC/cgghd3.f
@@ -180,14 +180,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (LWORK)
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in]  LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= 1.
+*>          The length of the array WORK. LWORK >= 1.
 *>          For optimum performance LWORK >= 6*N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -212,7 +212,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup gghd3
 *
 *> \par Further Details:
 *  =====================
@@ -265,7 +265,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGGHRD, CLARTG, CLASET, CUNM22, CROT, CGEMM,
@@ -280,8 +281,13 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CGGHD3', ' ', N, ILO, IHI, -1 )
-      LWKOPT = MAX( 6*N*NB, 1 )
-      WORK( 1 ) = CMPLX( LWKOPT )
+      NH = IHI - ILO + 1
+      IF( NH.LE.1 ) THEN
+         LWKOPT = 1
+      ELSE
+         LWKOPT = 6*N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
       INITZ = LSAME( COMPZ, 'I' )
@@ -330,7 +336,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = CONE
          RETURN
@@ -888,7 +893,8 @@
       IF ( JCOL.LT.IHI )
      $   CALL CGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
-      WORK( 1 ) = CMPLX( LWKOPT )
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f
index 29b0bf4af..309f170e8 100644
--- a/lapack-netlib/SRC/cggqrf.f
+++ b/lapack-netlib/SRC/cggqrf.f
@@ -251,8 +251,8 @@
       NB2 = ILAENV( 1, 'CGERQF', ' ', N, P, -1, -1 )
       NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P)*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -288,7 +288,7 @@
 *     RQ factorization of N-by-P matrix B: B = T*Z.
 *
       CALL CGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO )
-      WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
+      WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f
index 273ab3ef7..8470a1ce2 100644
--- a/lapack-netlib/SRC/cggrqf.f
+++ b/lapack-netlib/SRC/cggrqf.f
@@ -250,8 +250,8 @@
       NB2 = ILAENV( 1, 'CGEQRF', ' ', P, N, -1, -1 )
       NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P)*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -288,7 +288,7 @@
 *     QR factorization of P-by-N matrix B: B = Z*T
 *
       CALL CGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO )
-      WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
+      WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cggsvd3.f b/lapack-netlib/SRC/cggsvd3.f
index f248aebd5..4c4b85bae 100644
--- a/lapack-netlib/SRC/cggsvd3.f
+++ b/lapack-netlib/SRC/cggsvd3.f
@@ -278,7 +278,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -333,7 +333,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsing
+*> \ingroup ggsvd3
 *
 *> \par Contributors:
 *  ==================
diff --git a/lapack-netlib/SRC/cggsvp3.f b/lapack-netlib/SRC/cggsvp3.f
index 008a053a2..e19f7efd5 100644
--- a/lapack-netlib/SRC/cggsvp3.f
+++ b/lapack-netlib/SRC/cggsvp3.f
@@ -233,7 +233,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -256,7 +256,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ggsvp3
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f
index b5ca804eb..9b62a2df6 100644
--- a/lapack-netlib/SRC/cheevd.f
+++ b/lapack-netlib/SRC/cheevd.f
@@ -116,8 +116,7 @@
 *>
 *> \param[out] RWORK
 *> \verbatim
-*>          RWORK is REAL array,
-*>                                         dimension (LRWORK)
+*>          RWORK is REAL array, dimension (MAX(1,LRWORK))
 *>          On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
 *> \endverbatim
 *>
@@ -282,8 +281,8 @@
             LROPT = LRWMIN
             LIOPT = LIWMIN
          END IF
-         WORK( 1 ) = SROUNDUP_LWORK(LOPT)
-         RWORK( 1 ) = LROPT
+         WORK( 1 ) = SROUNDUP_LWORK( LOPT )
+         RWORK( 1 ) = SROUNDUP_LWORK( LROPT )
          IWORK( 1 ) = LIOPT
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -378,8 +377,8 @@
          CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LOPT)
-      RWORK( 1 ) = LROPT
+      WORK( 1 ) = SROUNDUP_LWORK( LOPT )
+      RWORK( 1 ) = SROUNDUP_LWORK( LROPT )
       IWORK( 1 ) = LIOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f
index 05c5e66be..ad5c8cd4a 100644
--- a/lapack-netlib/SRC/cheevr.f
+++ b/lapack-netlib/SRC/cheevr.f
@@ -272,7 +272,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,2*N).
+*>          The length of the array WORK.
+*>          If N <= 1, LWORK >= 1, else LWORK >= 2*N.
 *>          For optimal efficiency, LWORK >= (NB+1)*N,
 *>          where NB is the max of the blocksize for CHETRD and for
 *>          CUNMTR as returned by ILAENV.
@@ -294,7 +295,8 @@
 *> \param[in] LRWORK
 *> \verbatim
 *>          LRWORK is INTEGER
-*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>          The length of the array RWORK.
+*>          If N <= 1, LRWORK >= 1, else LRWORK >= 24*N.
 *>
 *>          If LRWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -313,7 +315,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -417,9 +420,15 @@
       LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR.
      $         ( LIWORK.EQ.-1 ) )
 *
-      LRWMIN = MAX( 1, 24*N )
-      LIWMIN = MAX( 1, 10*N )
-      LWMIN = MAX( 1, 2*N )
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = 2*N
+         LRWMIN = 24*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN
@@ -454,8 +463,8 @@
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) )
          LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
-         RWORK( 1 ) = LRWMIN
+         WORK( 1 )  = SROUNDUP_LWORK( LWKOPT )
+         RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -483,7 +492,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 2
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = REAL( A( 1, 1 ) )
@@ -710,8 +719,8 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
-      RWORK( 1 ) = LRWMIN
+      WORK( 1 )  = SROUNDUP_LWORK( LWKOPT )
+      RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f
index 0332a09bc..e06925fcd 100644
--- a/lapack-netlib/SRC/cheevr_2stage.f
+++ b/lapack-netlib/SRC/cheevr_2stage.f
@@ -265,7 +265,7 @@
 *>          indicating the nonzero elements in Z. The i-th eigenvector
 *>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
 *>          ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal
-*>          matrix). The support of the eigenvectors of A is typically 
+*>          matrix). The support of the eigenvectors of A is typically
 *>          1:N because of the unitary transformations applied by CUNMTR.
 *>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
 *> \endverbatim
@@ -279,12 +279,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -310,7 +311,8 @@
 *> \param[in] LRWORK
 *> \verbatim
 *>          LRWORK is INTEGER
-*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>          The length of the array RWORK.
+*>          If N <= 1, LRWORK >= 1, else LRWORK >= 24*N.
 *>
 *>          If LRWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -329,7 +331,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -354,7 +357,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevr_2stage
 *
 *> \par Contributors:
 *  ==================
@@ -382,7 +385,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -390,11 +393,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -443,8 +446,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV, ILAENV2STAGE
-      REAL               SLAMCH, CLANSY
-      EXTERNAL           LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE
+      REAL               SLAMCH, CLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK 
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
@@ -472,9 +476,16 @@
       IB     = ILAENV2STAGE( 2, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
       LHTRD  = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
       LWTRD  = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
-      LWMIN  = N + LHTRD + LWTRD
-      LRWMIN = MAX( 1, 24*N )
-      LIWMIN = MAX( 1, 10*N )
+*
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = N + LHTRD + LWTRD
+         LRWMIN = 24*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
@@ -506,8 +517,8 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 )  = LWMIN
-         RWORK( 1 ) = LRWMIN
+         WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
+         RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -535,7 +546,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 2
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = REAL( A( 1, 1 ) )
@@ -643,9 +654,9 @@
 *
 *     Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
 *
-      CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), 
+      CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ),
      $                    RWORK( INDRE ), WORK( INDTAU ),
-     $                    WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWK ), LLWORK, IINFO )
 *
 *     If all eigenvalues are desired
@@ -666,7 +677,7 @@
             CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
             CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 )
 *
-            IF (ABSTOL .LE. TWO*N*EPS) THEN
+            IF ( ABSTOL .LE. TWO*N*EPS ) THEN
                TRYRAC = .TRUE.
             ELSE
                TRYRAC = .FALSE.
@@ -765,8 +776,8 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 )  = LWMIN
-      RWORK( 1 ) = LRWMIN
+      WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
+      RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f
index e91599a44..a8a2bde63 100644
--- a/lapack-netlib/SRC/cheevx.f
+++ b/lapack-netlib/SRC/cheevx.f
@@ -348,14 +348,14 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWKMIN = 1
-            WORK( 1 ) = LWKMIN
+            LWKOPT = 1
          ELSE
             LWKMIN = 2*N
             NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
             NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) )
-            LWKOPT = MAX( 1, ( NB + 1 )*N )
-            WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+            LWKOPT = ( NB + 1 )*N
          END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -17
diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f
index 53ecc0a16..0f41c9332 100644
--- a/lapack-netlib/SRC/chesv_aa.f
+++ b/lapack-netlib/SRC/chesv_aa.f
@@ -177,7 +177,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS
+      INTEGER            LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -197,6 +197,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N, 3*N-2 )
       IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -207,18 +208,18 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
-         LWKOPT_HETRF = INT( WORK(1) )
+         LWKOPT_HETRF = INT( WORK( 1 ) )
          CALL CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
      $                   -1, INFO )
-         LWKOPT_HETRS = INT( WORK(1) )
-         LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS )
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         LWKOPT_HETRS = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -240,7 +241,7 @@
 *
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f
index 12950c4af..05ebd9253 100644
--- a/lapack-netlib/SRC/chesv_aa_2stage.f
+++ b/lapack-netlib/SRC/chesv_aa_2stage.f
@@ -99,14 +99,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX array, dimension (LTB)
+*>          TB is COMPLEX array, dimension (MAX(1,LTB)).
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -146,14 +146,15 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX workspace of size LWORK
+*>          WORK is COMPLEX workspace of size (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -203,7 +204,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            LWKOPT
+      INTEGER            LWKMIN, LWKOPT
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -225,6 +226,7 @@
       UPPER = LSAME( UPLO, 'U' )
       WQUERY = ( LWORK.EQ.-1 )
       TQUERY = ( LTB.EQ.-1 )
+      LWKMIN = MAX( 1, N )
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -233,18 +235,19 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
-      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
-      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN
          INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
-         LWKOPT = INT( WORK(1) )
+         LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -254,7 +257,6 @@
          RETURN
       END IF
 *
-*
 *     Compute the factorization A = U**H*T*U or A = L*T*L**H.
 *
       CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
@@ -268,7 +270,7 @@
 *
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f
index c23a35ce7..bdaad55ec 100644
--- a/lapack-netlib/SRC/chesvx.f
+++ b/lapack-netlib/SRC/chesvx.f
@@ -307,7 +307,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, NOFACT
-      INTEGER            LWKOPT, NB
+      INTEGER            LWKMIN, LWKOPT, NB
       REAL               ANORM
 *     ..
 *     .. External Functions ..
@@ -329,6 +329,7 @@
       INFO = 0
       NOFACT = LSAME( FACT, 'N' )
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
       IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN
          INFO = -1
       ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) )
@@ -346,17 +347,17 @@
          INFO = -11
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = MAX( 1, 2*N )
+         LWKOPT = LWKMIN
          IF( NOFACT ) THEN
             NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
          END IF
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -405,7 +406,7 @@
       IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
      $   INFO = N + 1
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f
index f5ad35f27..ec7075798 100644
--- a/lapack-netlib/SRC/chetrd_2stage.f
+++ b/lapack-netlib/SRC/chetrd_2stage.f
@@ -4,23 +4,23 @@
 *
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download CHETRD_2STAGE + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
+*> Download CHETRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_2stage.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*       SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
 *                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
 *       IMPLICIT NONE
@@ -34,7 +34,7 @@
 *       COMPLEX            A( LDA, * ), TAU( * ),
 *                          HOUS2( * ), WORK( * )
 *       ..
-*  
+*
 *
 *> \par Purpose:
 *  =============
@@ -52,11 +52,11 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Housholder representation,
 *>                  in particular for the second stage (Band to
 *>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate Q1 Q2 or to apply Q1 Q2,
 *>                  then LHOUS2 is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -86,7 +86,7 @@
 *>          triangular part of A is not referenced.
 *>          On exit, if UPLO = 'U', the band superdiagonal
 *>          of A are overwritten by the corresponding elements of the
-*>          internal band-diagonal matrix AB, and the elements above 
+*>          internal band-diagonal matrix AB, and the elements above
 *>          the KD superdiagonal, with the array TAU, represent the unitary
 *>          matrix Q1 as a product of elementary reflectors; if UPLO
 *>          = 'L', the diagonal and band subdiagonal of A are over-
@@ -117,13 +117,13 @@
 *> \param[out] TAU
 *> \verbatim
 *>          TAU is COMPLEX array, dimension (N-KD)
-*>          The scalar factors of the elementary reflectors of 
+*>          The scalar factors of the elementary reflectors of
 *>          the first stage (see Further Details).
 *> \endverbatim
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is COMPLEX array, dimension (LHOUS2)
+*>          HOUS2 is COMPLEX array, dimension (MAX(1,LHOUS2))
 *>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
@@ -132,6 +132,8 @@
 *> \verbatim
 *>          LHOUS2 is INTEGER
 *>          The dimension of the array HOUS2.
+*>          LHOUS2 >= 1.
+*>
 *>          If LWORK = -1, or LHOUS2=-1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
@@ -143,13 +145,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (LWORK)
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
 *>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -157,9 +162,9 @@
 *>          message related to LWORK is issued by XERBLA.
 *>          LWORK = MAX(1, dimension) where
 *>          dimension   = max(stage1,stage2) + (KD+1)*N
-*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                        + max(2*KD*KD, KD*NTHREADS) 
-*>                        + (KD+1)*N 
+*>                      = N*KD + N*max(KD+1,FACTOPTNB)
+*>                        + max(2*KD*KD, KD*NTHREADS)
+*>                        + (KD+1)*N
 *>          where KD is the blocking size of the reduction,
 *>          FACTOPTNB is the blocking used by the QR or LQ
 *>          algorithm, usually FACTOPTNB=128 is a good choice
@@ -177,12 +182,12 @@
 *  Authors:
 *  ========
 *
-*> \author Univ. of Tennessee 
-*> \author Univ. of California Berkeley 
-*> \author Univ. of Colorado Denver 
-*> \author NAG Ltd. 
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +207,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -210,16 +215,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
      $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
       IMPLICIT NONE
@@ -250,7 +255,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -265,10 +271,13 @@
 *
       KD     = ILAENV2STAGE( 1, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 )
       IB     = ILAENV2STAGE( 2, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.LSAME( VECT, 'N' ) ) THEN
          INFO = -1
@@ -285,8 +294,8 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         HOUS2( 1 ) = LHMIN
-         WORK( 1 )  = LWMIN
+         HOUS2( 1 ) = SROUNDUP_LWORK( LHMIN )
+         WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -309,14 +318,14 @@
       LWRK  = LWORK-LDAB*N
       ABPOS = 1
       WPOS  = ABPOS + LDAB*N
-      CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
      $                   TAU, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CHETRD_HE2HB', -INFO )
          RETURN
       END IF
-      CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
      $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CHETRD_HB2ST', -INFO )
@@ -324,8 +333,7 @@
       END IF
 *
 *
-      HOUS2( 1 ) = LHMIN
-      WORK( 1 )  = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CHETRD_2STAGE
diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F
index 3688e40a3..b0d3e45fb 100644
--- a/lapack-netlib/SRC/chetrd_hb2st.F
+++ b/lapack-netlib/SRC/chetrd_hb2st.F
@@ -132,15 +132,17 @@
 *>
 *> \param[out] HOUS
 *> \verbatim
-*>          HOUS is COMPLEX array, dimension LHOUS, that
-*>          store the Householder representation.
+*>          HOUS is COMPLEX array, dimension (MAX(1,LHOUS))
+*>          Stores the Householder representation.
 *> \endverbatim
 *>
 *> \param[in] LHOUS
 *> \verbatim
 *>          LHOUS is INTEGER
-*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array HOUS.
+*>          If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS array, returns
 *>          this value as the first entry of the HOUS array, and no error
@@ -152,14 +154,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension LWORK.
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK)).
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -262,7 +267,7 @@
       INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+     $                   NBTILES, TTYPE, TID, NTHREADS,
      $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SICEV, SIZETAU, LDV, LHMIN, LWMIN
@@ -286,7 +291,6 @@
 *     Determine the minimal workspace size required.
 *     Test the input parameters
 *
-      DEBUG   = 0
       INFO    = 0
       AFTERS1 = LSAME( STAGE1, 'Y' )
       WANTQ   = LSAME( VECT, 'V' )
@@ -295,9 +299,14 @@
 *
 *     Determine the block size, the workspace size and the hous size.
 *
-      IB     = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      IB       = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
          INFO = -1
@@ -318,8 +327,8 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         HOUS( 1 ) = LHMIN
-         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+         HOUS( 1 ) = SROUNDUP_LWORK( LHMIN )
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -575,8 +584,7 @@ C                 END IF
   170     CONTINUE
       ENDIF
 *
-      HOUS( 1 ) = LHMIN
-      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CHETRD_HB2ST
diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f
index 090f02100..42e71e0b2 100644
--- a/lapack-netlib/SRC/chetrd_he2hb.f
+++ b/lapack-netlib/SRC/chetrd_he2hb.f
@@ -123,8 +123,8 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (LWORK)
-*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, or if LWORK = -1,
 *>          WORK(1) returns the size of LWORK.
 *> \endverbatim
 *>
@@ -132,7 +132,9 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK which should be calculated
-*>          by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          by a workspace query.
+*>          If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY).
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -294,8 +296,12 @@
       INFO   = 0
       UPPER  = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
-      
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -314,7 +320,7 @@
          CALL XERBLA( 'CHETRD_HE2HB', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
          RETURN
       END IF
 *
@@ -507,7 +513,7 @@
 
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CHETRD_HE2HB
diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f
index 0c596ffe7..2836e30bc 100644
--- a/lapack-netlib/SRC/chetrf.f
+++ b/lapack-netlib/SRC/chetrf.f
@@ -107,7 +107,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK. LWORK >= 1. For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *> \endverbatim
 *>
@@ -228,8 +228,8 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -347,7 +347,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CHETRF
diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f
index 0547a4eab..51410a6ed 100644
--- a/lapack-netlib/SRC/chetrf_aa.f
+++ b/lapack-netlib/SRC/chetrf_aa.f
@@ -101,8 +101,10 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= 2*N. For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -128,7 +130,7 @@
 *> \ingroup hetrf_aa
 *
 *  =====================================================================
-      SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -152,7 +154,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL      LQUERY, UPPER
-      INTEGER      J, LWKOPT
+      INTEGER      J, LWKMIN, LWKOPT
       INTEGER      NB, MJ, NJ, K1, K2, J1, J2, J3, JB
       COMPLEX      ALPHA
 *     ..
@@ -179,19 +181,26 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.( 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = (NB+1)*N
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -203,11 +212,11 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
           RETURN
       ENDIF
       IPIV( 1 ) = 1
-      IF ( N.EQ.1 ) THEN
+      IF( N.EQ.1 ) THEN
          A( 1, 1 ) = REAL( A( 1, 1 ) )
          RETURN
       END IF
@@ -460,7 +469,7 @@
       END IF
 *
    20 CONTINUE
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CHETRF_AA
diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f
index 400efdf26..a79343753 100644
--- a/lapack-netlib/SRC/chetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/chetrf_aa_2stage.f
@@ -87,14 +87,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX array, dimension (LTB)
+*>          TB is COMPLEX array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -121,14 +121,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX workspace of size LWORK
+*>          WORK is COMPLEX workspace of size (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used
+*>          to select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -188,7 +188,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
       
 *     ..
 *     .. External Subroutines ..
@@ -213,9 +214,9 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -6
-      ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN
          INFO = -10
       END IF
 *
@@ -229,10 +230,10 @@
       NB = ILAENV( 1, 'CHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 )
       IF( INFO.EQ.0 ) THEN
          IF( TQUERY ) THEN
-            TB( 1 ) = (3*NB+1)*N
+            TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) )
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) )
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
@@ -241,7 +242,7 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
          RETURN
       ENDIF
 *
diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f
index ef442c937..a13c740e3 100644
--- a/lapack-netlib/SRC/chetrf_rk.f
+++ b/lapack-netlib/SRC/chetrf_rk.f
@@ -177,14 +177,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension ( MAX(1,LWORK) ).
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned
 *>          by ILAENV.
 *>
@@ -311,8 +311,8 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -488,7 +488,7 @@
 *
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CHETRF_RK
diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f
index 1593c2edc..df0323520 100644
--- a/lapack-netlib/SRC/chetrf_rook.f
+++ b/lapack-netlib/SRC/chetrf_rook.f
@@ -122,7 +122,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -264,7 +264,7 @@
 *
          NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -387,7 +387,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CHETRF_ROOK
diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f
index 2865a6440..f15065ae7 100644
--- a/lapack-netlib/SRC/chetri2.f
+++ b/lapack-netlib/SRC/chetri2.f
@@ -88,16 +88,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (N+NB+1)*(NB+3)
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          WORK is size >= (N+NB+1)*(NB+3)
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>           calculates:
+*>          calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
 *>              - and no error message related to LWORK is issued by XERBLA.
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetri2
 *
 *  =====================================================================
       SUBROUTINE CHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -147,7 +147,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRI2X, CHETRI, XERBLA
@@ -159,9 +160,13 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+*
 *     Get blocksize
+*
       NBMAX = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
-      IF ( NBMAX .GE. N ) THEN
+      IF( N.EQ.0 ) THEN
+         MINSIZE = 1
+      ELSE IF( NBMAX.GE.N ) THEN
          MINSIZE = N
       ELSE
          MINSIZE = (N+NBMAX+1)*(NBMAX+3)
@@ -173,28 +178,29 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
-*
-*     Quick return if possible
-*
 *
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CHETRI2', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK(1)=MINSIZE
+         WORK( 1 ) = SROUNDUP_LWORK( MINSIZE )
          RETURN
       END IF
+*
+*     Quick return if possible
+*
       IF( N.EQ.0 )
      $   RETURN
 
-      IF( NBMAX .GE. N ) THEN
+      IF( NBMAX.GE.N ) THEN
          CALL CHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL CHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
 *
 *     End of CHETRI2
diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f
index deda63598..ccfce5070 100644
--- a/lapack-netlib/SRC/chetri_3.f
+++ b/lapack-netlib/SRC/chetri_3.f
@@ -119,16 +119,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (N+NB+1)*(NB+3).
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= (N+NB+1)*(NB+3).
+*>          The length of WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>
-*>          If LDWORK = -1, then a workspace query is assumed;
+*>          If LWORK = -1, then a workspace query is assumed;
 *>          the routine only calculates the optimal size of the optimal
 *>          size of the WORK array, returns this value as the first
 *>          entry of the WORK array, and no error message related to
@@ -209,8 +210,13 @@
 *
 *     Determine the block size
 *
-      NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) )
-      LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      IF( N.EQ.0 ) THEN
+         LWKOPT = 1
+      ELSE
+         NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) )
+         LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
@@ -218,7 +224,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
          INFO = -8
       END IF
 *
@@ -226,7 +232,6 @@
          CALL XERBLA( 'CHETRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -237,7 +242,7 @@
 *
       CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f
index 879549106..07179ab92 100644
--- a/lapack-netlib/SRC/chetrs_aa.f
+++ b/lapack-netlib/SRC/chetrs_aa.f
@@ -105,7 +105,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -151,24 +157,30 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME,SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
 *     ..
 *     .. Executable Statements ..
 *
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -179,21 +191,20 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CHETRS_AA', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         LWKOPT = (3*N-2)
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKMIN )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 .OR. NRHS.EQ.0 )
+      IF( MIN( N, NRHS ).EQ.0 )
      $   RETURN
 *
       IF( UPPER ) THEN
diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f
index 5daf60bf6..8f474a3ab 100644
--- a/lapack-netlib/SRC/clamswlq.f
+++ b/lapack-netlib/SRC/clamswlq.f
@@ -127,17 +127,20 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If SIDE = 'L', LWORK >= max(1,NB) * MB;
-*>          if SIDE = 'R', LWORK >= max(1,M) * MB.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,NB*MB).
+*>          If SIDE = 'R', LWORK >= max(1,M*MB).
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -193,91 +196,100 @@
 *>
 *  =====================================================================
       SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $    LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      COMPLEX           A( LDA, * ), WORK( * ), C(LDC, * ),
-     $      T( LDT, * )
+      COMPLEX            A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       REAL               SROUNDUP_LWORK
       EXTERNAL           LSAME, SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
-      EXTERNAL    CTPMLQT, CGEMLQT, XERBLA
+      EXTERNAL           CTPMLQT, CGEMLQT, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * MB
       ELSE
         LW = M * MB
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( K.LT.0 ) THEN
         INFO = -5
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
         INFO = -4
-      ELSE IF( K.LT.MB .OR. MB.LT.1) THEN
+      ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN
         INFO = -6
       ELSE IF( LDA.LT.MAX( 1, K ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, MB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+      END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLAMSWLQ', -INFO )
-        WORK(1) = SROUNDUP_LWORK(LW)
         RETURN
-      ELSE IF (LQUERY) THEN
-        WORK(1) = SROUNDUP_LWORK(LW)
+      ELSE IF( LQUERY ) THEN
         RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
       IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN
         CALL CGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
       END IF
 *
@@ -404,7 +416,7 @@
 *
       END IF
 *
-      WORK(1) = SROUNDUP_LWORK(LW)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CLAMSWLQ
diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f
index 05021e642..13625087f 100644
--- a/lapack-netlib/SRC/clamtsqr.f
+++ b/lapack-netlib/SRC/clamtsqr.f
@@ -128,22 +128,24 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*NB).
+*>          If SIDE = 'R', LWORK >= max(1,MB*NB).
 *>
-*>          If SIDE = 'L', LWORK >= max(1,N)*NB;
-*>          if SIDE = 'R', LWORK >= max(1,MB)*NB.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -195,45 +197,47 @@
 *>
 *  =====================================================================
       SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $        LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      COMPLEX        A( LDA, * ), WORK( * ), C(LDC, * ),
-     $                T( LDT, * )
+      COMPLEX            A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR, Q
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, Q, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       REAL               SROUNDUP_LWORK
       EXTERNAL           LSAME, SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
-      EXTERNAL   CGEMQRT, CTPMQRT, XERBLA
+      EXTERNAL           CGEMQRT, CTPMQRT, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * NB
         Q = M
       ELSE
@@ -241,11 +245,17 @@
         Q = N
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
@@ -256,38 +266,38 @@
         INFO = -7
       ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, NB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0)  THEN
-          WORK(1) = SROUNDUP_LWORK(LW)
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLAMTSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
 *
       IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
         CALL CGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
       IF(LEFT.AND.NOTRAN) THEN
 *
@@ -412,7 +422,7 @@
 *
       END IF
 *
-      WORK(1) = SROUNDUP_LWORK(LW)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CLAMTSQR
diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f
index 12e8373df..2044e055c 100644
--- a/lapack-netlib/SRC/claswlq.f
+++ b/lapack-netlib/SRC/claswlq.f
@@ -96,22 +96,24 @@
 *>          The leading dimension of the array T.  LDT >= MB.
 *> \endverbatim
 *>
-*>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= MB*M.
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -163,33 +165,35 @@
 *>
 *  =====================================================================
       SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
-     $                  INFO)
+     $                    INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
+      INTEGER            INFO, LDA, M, N, MB, NB, LWORK, LDT
 *     ..
 *     .. Array Arguments ..
-      COMPLEX           A( LDA, * ), WORK( * ), T( LDT, *)
+      COMPLEX            A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       INTEGER            ILAENV
       REAL               SROUNDUP_LWORK
       EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           CGELQT, CTPLQT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -200,12 +204,19 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = M*MB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
       ELSE IF( N.LT.0 .OR. N.LT.M ) THEN
         INFO = -2
-      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN
+      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN
         INFO = -3
       ELSE IF( NB.LE.0 ) THEN
         INFO = -4
@@ -213,60 +224,61 @@
         INFO = -6
       ELSE IF( LDT.LT.MB ) THEN
         INFO = -8
-      ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-      WORK(1) = SROUNDUP_LWORK(MB*M)
+*
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLASWLQ', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The LQ Decomposition
 *
-       IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN
+      IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN
         CALL CGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO)
         RETURN
-       END IF
+      END IF
 *
-       KK = MOD((N-M),(NB-M))
-       II=N-KK+1
+      KK = MOD((N-M),(NB-M))
+      II = N-KK+1
 *
-*      Compute the LQ factorization of the first block A(1:M,1:NB)
+*     Compute the LQ factorization of the first block A(1:M,1:NB)
 *
-       CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
-       CTR = 1
+      CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
+      CTR = 1
 *
-       DO I = NB+1, II-NB+M , (NB-M)
+      DO I = NB+1, II-NB+M , (NB-M)
 *
-*      Compute the QR factorization of the current block A(1:M,I:I+NB-M)
+*       Compute the QR factorization of the current block A(1:M,I:I+NB-M)
 *
-         CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
+        CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
      $                  LDA, T(1,CTR*M+1),
      $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CTR = CTR + 1
+      END DO
 *
 *     Compute the QR factorization of the last block A(1:M,II:N)
 *
-       IF (II.LE.N) THEN
+      IF( II.LE.N ) THEN
         CALL CTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ),
      $                  LDA, T(1,CTR*M+1), LDT,
      $                  WORK, INFO )
-       END IF
+      END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(M * MB)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CLASWLQ
diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f
index 0502f6898..354141a8b 100644
--- a/lapack-netlib/SRC/clatrs3.f
+++ b/lapack-netlib/SRC/clatrs3.f
@@ -152,13 +152,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK).
+*>          WORK is REAL array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal size of
 *>          WORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
+*> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
@@ -166,6 +170,7 @@
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -182,7 +187,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup latrs3
 *> \par Further Details:
 *  =====================
 *  \verbatim
@@ -257,15 +262,16 @@
       LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
       INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
      $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
-     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
+     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
       REAL               ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
      $                   SCAMIN, SMLNUM, TMAX
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANGE, SLARMM
-      EXTERNAL           ILAENV, LSAME, SLAMCH, CLANGE, SLARMM
+      REAL               SLAMCH, CLANGE, SLARMM, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, CLANGE, SLARMM,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLATRS, CSSCAL, XERBLA
@@ -296,15 +302,24 @@
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
+*
       LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
       LDS = NBA
+*
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
+*
       LANRM = NBA * NBA
       AWRK = LSCALE
-      WORK( 1 ) = LSCALE + LANRM
+*
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = LSCALE + LANRM
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
 *     Test the input parameters.
 *
@@ -326,7 +341,7 @@
          INFO = -8
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -10
-      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
+      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN
          INFO = -14
       END IF
       IF( INFO.NE.0 ) THEN
@@ -659,6 +674,9 @@
             END IF
          END DO
       END DO
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+*
       RETURN
 *
 *     End of CLATRS3
diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f
index cd2cb4aa7..67403693f 100644
--- a/lapack-netlib/SRC/clatsqr.f
+++ b/lapack-netlib/SRC/clatsqr.f
@@ -101,15 +101,18 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= NB*N.
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -165,32 +168,34 @@
 *>
 *  =====================================================================
       SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
-     $                    LWORK, INFO)
+     $                    LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
+      INTEGER            INFO, LDA, M, N, MB, NB, LDT, LWORK
 *     ..
 *     .. Array Arguments ..
-      COMPLEX           A( LDA, * ), WORK( * ), T(LDT, *)
+      COMPLEX            A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, LWMIN, MINMN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       REAL               SROUNDUP_LWORK
       EXTERNAL           LSAME, SROUNDUP_LWORK
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
-      EXTERNAL    CGEQRT, CTPQRT, XERBLA
+      EXTERNAL           CGEQRT, CTPQRT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -201,6 +206,13 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = N*NB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -208,64 +220,65 @@
         INFO = -2
       ELSE IF( MB.LT.1 ) THEN
         INFO = -3
-      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN
+      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN
         INFO = -4
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
       ELSE IF( LDT.LT.NB ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-        WORK(1) = SROUNDUP_LWORK(NB*N)
+*
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLATSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The QR Decomposition
 *
-       IF ((MB.LE.N).OR.(MB.GE.M)) THEN
-         CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO)
-         RETURN
-       END IF
-       KK = MOD((M-N),(MB-N))
-       II=M-KK+1
+      IF ( (MB.LE.N) .OR. (MB.GE.M) ) THEN
+        CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO )
+        RETURN
+      END IF
+      KK = MOD((M-N),(MB-N))
+      II = M-KK+1
 *
-*      Compute the QR factorization of the first block A(1:MB,1:N)
+*     Compute the QR factorization of the first block A(1:MB,1:N)
 *
-       CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
-       CTR = 1
+      CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = MB+1, II-MB+N ,  (MB-N)
+      DO I = MB+1, II-MB+N, (MB-N)
 *
-*      Compute the QR factorization of the current block A(I:I+MB-N,1:N)
+*       Compute the QR factorization of the current block A(I:I+MB-N,1:N)
 *
-         CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
+        CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
      $                 T(1,CTR * N + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+     $                 LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
-*      Compute the QR factorization of the last block A(II:M,1:N)
+*     Compute the QR factorization of the last block A(II:M,1:N)
 *
-       IF (II.LE.M) THEN
-         CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
+      IF( II.LE.M ) THEN
+        CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
      $                 T(1, CTR * N + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+     $                 WORK, INFO )
+      END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(N*NB)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CLATSQR
diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f
index aee9b3f6a..2a1a2d4dc 100644
--- a/lapack-netlib/SRC/dsytrf.f
+++ b/lapack-netlib/SRC/dsytrf.f
@@ -107,7 +107,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -135,7 +135,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -352,6 +352,7 @@
       END IF
 *
    40 CONTINUE
+*
       WORK( 1 ) = LWKOPT
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F
index 32bae26dc..111eaa93e 100644
--- a/lapack-netlib/SRC/ssytrd_sb2st.F
+++ b/lapack-netlib/SRC/ssytrd_sb2st.F
@@ -132,15 +132,17 @@
 *>
 *> \param[out] HOUS
 *> \verbatim
-*>          HOUS is REAL array, dimension LHOUS, that
-*>          store the Householder representation.
+*>          HOUS is REAL array, dimension (MAX(1,LHOUS))
+*>          Stores the Householder representation.
 *> \endverbatim
 *>
 *> \param[in] LHOUS
 *> \verbatim
 *>          LHOUS is INTEGER
-*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array HOUS.
+*>          If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension)
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS array, returns
 *>          this value as the first entry of the HOUS array, and no error
@@ -152,14 +154,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension LWORK.
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array WORK.
+*>          IF N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension)
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -261,7 +266,7 @@
       INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
+     $                   NBTILES, TTYPE, TID, NTHREADS,
      $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SISEV, SIZETAU, LDV, LHMIN, LWMIN
@@ -283,7 +288,6 @@
 *     Determine the minimal workspace size required.
 *     Test the input parameters
 *
-      DEBUG   = 0
       INFO    = 0
       AFTERS1 = LSAME( STAGE1, 'Y' )
       WANTQ   = LSAME( VECT, 'V' )
@@ -292,9 +296,14 @@
 *
 *     Determine the block size, the workspace size and the hous size.
 *
-      IB     = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      IB       = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
          INFO = -1
@@ -315,8 +324,8 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         HOUS( 1 ) = LHMIN
-         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+         HOUS( 1 ) = SROUNDUP_LWORK( LHMIN )
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -544,8 +553,7 @@
   170     CONTINUE
       ENDIF
 *
-      HOUS( 1 ) = LHMIN
-      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SSYTRD_SB2ST

From 29d6024ec534858294e648d189f9026302a903b4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 19:44:11 +0100
Subject: [PATCH 502/718] Handle corner cases of LWORK (Reference-LAPACK PR
 942)

---
 lapack-netlib/SRC/dgebrd.f           |  26 ++++--
 lapack-netlib/SRC/dgehrd.f           |  24 +++--
 lapack-netlib/SRC/dgelq.f            |   4 +-
 lapack-netlib/SRC/dgelqf.f           |  20 ++--
 lapack-netlib/SRC/dgelsd.f           |   7 +-
 lapack-netlib/SRC/dgemlq.f           |  32 ++++---
 lapack-netlib/SRC/dgemqr.f           |  32 ++++---
 lapack-netlib/SRC/dgeqlf.f           |  10 +-
 lapack-netlib/SRC/dgeqp3rk.f         |   3 +-
 lapack-netlib/SRC/dgeqr.f            |  20 ++--
 lapack-netlib/SRC/dgeqrfp.f          |  24 +++--
 lapack-netlib/SRC/dgerqf.f           |   4 +-
 lapack-netlib/SRC/dgesvj.f           |  32 +++++--
 lapack-netlib/SRC/dgetri.f           |   5 +-
 lapack-netlib/SRC/dgetsls.f          |  12 ++-
 lapack-netlib/SRC/dgetsqrhrt.f       |  18 ++--
 lapack-netlib/SRC/dgges.f            |  12 +--
 lapack-netlib/SRC/dgges3.f           |  39 +++++---
 lapack-netlib/SRC/dggev3.f           |  37 ++++----
 lapack-netlib/SRC/dgghd3.f           |  17 ++--
 lapack-netlib/SRC/dggqrf.f           |   5 +-
 lapack-netlib/SRC/dggrqf.f           |   4 +-
 lapack-netlib/SRC/dggsvd3.f          |   4 +-
 lapack-netlib/SRC/dggsvp3.f          |   4 +-
 lapack-netlib/SRC/dlamswlq.f         |  62 ++++++++-----
 lapack-netlib/SRC/dlamtsqr.f         |  74 ++++++++-------
 lapack-netlib/SRC/dlaswlq.f          |  92 ++++++++++--------
 lapack-netlib/SRC/dlatrs3.f          |  27 +++++-
 lapack-netlib/SRC/dlatsqr.f          |  92 ++++++++++--------
 lapack-netlib/SRC/dsyev_2stage.f     |  22 ++---
 lapack-netlib/SRC/dsyevd.f           |   5 +-
 lapack-netlib/SRC/dsyevr.f           |  21 +++--
 lapack-netlib/SRC/dsyevr_2stage.f    |  40 ++++----
 lapack-netlib/SRC/dsyevx.f           |   6 +-
 lapack-netlib/SRC/dsysv_aa.f         |  13 +--
 lapack-netlib/SRC/dsysv_aa_2stage.f  |  24 ++---
 lapack-netlib/SRC/dsysvx.f           |   9 +-
 lapack-netlib/SRC/dsytrd.f           |   4 +-
 lapack-netlib/SRC/dsytrd_2stage.f    |  89 ++++++++++--------
 lapack-netlib/SRC/dsytrd_sb2st.F     | 134 ++++++++++++++-------------
 lapack-netlib/SRC/dsytrd_sy2sb.f     |  18 ++--
 lapack-netlib/SRC/dsytrf_aa.f        |  27 ++++--
 lapack-netlib/SRC/dsytrf_aa_2stage.f |  22 ++---
 lapack-netlib/SRC/dsytrf_rk.f        |   6 +-
 lapack-netlib/SRC/dsytrf_rook.f      |   4 +-
 lapack-netlib/SRC/dsytri2.f          |  27 +++---
 lapack-netlib/SRC/dsytri_3.f         |  21 +++--
 lapack-netlib/SRC/dsytrs_aa.f        |  27 ++++--
 48 files changed, 751 insertions(+), 510 deletions(-)

diff --git a/lapack-netlib/SRC/dgebrd.f b/lapack-netlib/SRC/dgebrd.f
index 0f0d1651a..ac11d48a0 100644
--- a/lapack-netlib/SRC/dgebrd.f
+++ b/lapack-netlib/SRC/dgebrd.f
@@ -122,7 +122,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,M,N).
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise.
 *>          For optimum performance LWORK >= (M+N)*NB, where NB
 *>          is the optimal blocksize.
 *>
@@ -147,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gebrd
 *
 *> \par Further Details:
 *  =====================
@@ -223,8 +224,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB,
-     $                   NBMIN, NX, WS
+      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT,
+     $                   MINMN, NB, NBMIN, NX, WS
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DGEBD2, DGEMM, DLABRD, XERBLA
@@ -241,9 +242,17 @@
 *     Test the input parameters
 *
       INFO = 0
-      NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) )
-      LWKOPT = ( M+N )*NB
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = MAX( M, N )
+         NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) )
+         LWKOPT = ( M+N )*NB
+      ENDIF
       WORK( 1 ) = DBLE( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -251,7 +260,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.LT.0 ) THEN
@@ -263,7 +272,6 @@
 *
 *     Quick return if possible
 *
-      MINMN = MIN( M, N )
       IF( MINMN.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -282,7 +290,7 @@
 *        Determine when to switch from blocked to unblocked code.
 *
          IF( NX.LT.MINMN ) THEN
-            WS = ( M+N )*NB
+            WS = LWKOPT
             IF( LWORK.LT.WS ) THEN
 *
 *              Not enough work space for the optimal NB, consider using
diff --git a/lapack-netlib/SRC/dgehrd.f b/lapack-netlib/SRC/dgehrd.f
index a40c61cb6..d95bbd182 100644
--- a/lapack-netlib/SRC/dgehrd.f
+++ b/lapack-netlib/SRC/dgehrd.f
@@ -89,7 +89,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gehrd
 *
 *> \par Further Details:
 *  =====================
@@ -173,7 +173,7 @@
       INTEGER            IHI, ILO, INFO, LDA, LWORK, N
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION  A( LDA, * ), TAU( * ), WORK( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
@@ -182,7 +182,7 @@
       INTEGER            NBMAX, LDT, TSIZE
       PARAMETER          ( NBMAX = 64, LDT = NBMAX+1,
      $                     TSIZE = LDT*NBMAX )
-      DOUBLE PRECISION  ZERO, ONE
+      DOUBLE PRECISION   ZERO, ONE
       PARAMETER          ( ZERO = 0.0D+0,
      $                     ONE = 1.0D+0 )
 *     ..
@@ -190,7 +190,7 @@
       LOGICAL            LQUERY
       INTEGER            I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB,
      $                   NBMIN, NH, NX
-      DOUBLE PRECISION  EI
+      DOUBLE PRECISION   EI
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DAXPY, DGEHD2, DGEMM, DLAHR2, DLARFB, DTRMM,
@@ -221,12 +221,18 @@
          INFO = -8
       END IF
 *
+      NH = IHI - ILO + 1
       IF( INFO.EQ.0 ) THEN
 *
 *        Compute the workspace requirements
 *
-         NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, -1 ) )
-         LWKOPT = N*NB + TSIZE
+         IF( NH.LE.1 ) THEN
+            LWKOPT = 1
+         ELSE
+            NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI,
+     $                              -1 ) )
+            LWKOPT = N*NB + TSIZE
+         ENDIF
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -248,7 +254,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -268,7 +273,7 @@
 *
 *           Determine if workspace is large enough for blocked code
 *
-            IF( LWORK.LT.N*NB+TSIZE ) THEN
+            IF( LWORK.LT.LWKOPT ) THEN
 *
 *              Not enough workspace to use optimal NB:  determine the
 *              minimum value of NB, and reduce NB or force use of
@@ -344,6 +349,7 @@
 *     Use unblocked code to reduce the rest of the matrix
 *
       CALL DGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO )
+*
       WORK( 1 ) = LWKOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f
index 013b6c356..255e8732f 100644
--- a/lapack-netlib/SRC/dgelq.f
+++ b/lapack-netlib/SRC/dgelq.f
@@ -98,7 +98,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -166,6 +166,8 @@
 *>  the LQ factorization.
 *> \endverbatim
 *>
+*> \ingroup gelq
+*>
 *  =====================================================================
       SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f
index ed3372f96..f0eb00a55 100644
--- a/lapack-netlib/SRC/dgelqf.f
+++ b/lapack-netlib/SRC/dgelqf.f
@@ -93,7 +93,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,M).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise.
 *>          For optimum performance LWORK >= M*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -118,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gelqf
 *
 *> \par Further Details:
 *  =====================
@@ -174,9 +175,8 @@
 *     Test the input arguments
 *
       INFO = 0
+      K = MIN( M, N )
       NB = ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 )
-      LWKOPT = M*NB
-      WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -184,19 +184,25 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
-         INFO = -7
+      ELSE IF( .NOT.LQUERY ) THEN
+         IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
+     $      INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DGELQF', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
+         IF( K.EQ.0 ) THEN
+            LWKOPT = 1
+         ELSE
+            LWKOPT = M*NB
+         END IF
+         WORK( 1 ) = LWKOPT
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f
index b1f45a2c6..7dc564f48 100644
--- a/lapack-netlib/SRC/dgelsd.f
+++ b/lapack-netlib/SRC/dgelsd.f
@@ -188,7 +188,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsolve
+*> \ingroup gelsd
 *
 *> \par Contributors:
 *  ==================
@@ -228,7 +228,7 @@
       DOUBLE PRECISION   ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEBRD, DGELQF, DGEQRF, DLABAD, DLACPY, DLALSD,
+      EXTERNAL           DGEBRD, DGELQF, DGEQRF, DLACPY, DLALSD,
      $                   DLASCL, DLASET, DORMBR, DORMLQ, DORMQR, XERBLA
 *     ..
 *     .. External Functions ..
@@ -276,7 +276,7 @@
      $       LOG( TWO ) ) + 1, 0 )
 *
       IF( INFO.EQ.0 ) THEN
-         MAXWRK = 0
+         MAXWRK = 1
          LIWORK = 3*MINMN*NLVL + 11*MINMN
          MM = M
          IF( M.GE.N .AND. M.GE.MNTHR ) THEN
@@ -372,7 +372,6 @@
       SFMIN = DLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max entry outside range [SMLNUM,BIGNUM].
 *
diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f
index 3ba209105..757683f46 100644
--- a/lapack-netlib/SRC/dgemlq.f
+++ b/lapack-netlib/SRC/dgemlq.f
@@ -111,16 +111,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +145,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -160,11 +161,13 @@
 *>  block sizes MB and NB returned by ILAENV, DGELQ will use either
 *>  DLASWLQ (if the matrix is wide-and-short) or DGELQT to compute
 *>  the LQ factorization.
-*>  This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to 
+*>  This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in DLAMSWLQ or DGEMLQT.
 *> \endverbatim
 *>
+*> \ingroup gemlq
+*>
 *  =====================================================================
       SUBROUTINE DGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -186,7 +189,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -202,7 +205,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -217,6 +220,13 @@
         LW = M * MB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN
@@ -245,12 +255,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -262,7 +272,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -275,7 +285,7 @@
      $                 MB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = LWMIN
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f
index 022cf21e4..608815483 100644
--- a/lapack-netlib/SRC/dgemqr.f
+++ b/lapack-netlib/SRC/dgemqr.f
@@ -111,16 +111,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +145,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -160,12 +161,14 @@
 *>  block sizes MB and NB returned by ILAENV, DGEQR will use either
 *>  DLATSQR (if the matrix is tall-and-skinny) or DGEQRT to compute
 *>  the QR factorization.
-*>  This version of DGEMQR will use either DLAMTSQR or DGEMQRT to 
+*>  This version of DGEMQR will use either DLAMTSQR or DGEMQRT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in DLATMSQR or DGEMQRT.
 *>
 *> \endverbatim
 *>
+*> \ingroup gemqr
+*>
 *  =====================================================================
       SUBROUTINE DGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -187,7 +190,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -203,7 +206,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -218,6 +221,13 @@
         LW = MB * NB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, MB - K ).EQ.0 ) THEN
@@ -246,12 +256,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -263,7 +273,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -276,7 +286,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = LWMIN
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/dgeqlf.f b/lapack-netlib/SRC/dgeqlf.f
index b8ac0b1a0..a72d9dc76 100644
--- a/lapack-netlib/SRC/dgeqlf.f
+++ b/lapack-netlib/SRC/dgeqlf.f
@@ -88,7 +88,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -113,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup geqlf
 *
 *> \par Further Details:
 *  =====================
@@ -188,8 +189,9 @@
          END IF
          WORK( 1 ) = LWKOPT
 *
-         IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
-            INFO = -7
+         IF( .NOT.LQUERY ) THEN
+            IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) )
+     $         INFO = -7
          END IF
       END IF
 *
diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f
index 117a68287..b8e41b39c 100644
--- a/lapack-netlib/SRC/dgeqp3rk.f
+++ b/lapack-netlib/SRC/dgeqp3rk.f
@@ -427,7 +427,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*.          LWORK >= (3*N + NRHS - 1)
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= (3*N+NRHS-1), otherwise.
 *>          For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )),
 *>          where NB is the optimal block size for DGEQP3RK returned
 *>          by ILAENV. Minimal block size MINNB=2.
diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f
index eac8930ce..6ed8f211f 100644
--- a/lapack-netlib/SRC/dgeqr.f
+++ b/lapack-netlib/SRC/dgeqr.f
@@ -99,7 +99,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -168,6 +168,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup geqr
+*>
 *  =====================================================================
       SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -188,7 +190,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -244,8 +246,10 @@
 *
 *     Determine if the workspace size satisfies minimal size
 *
+      LWMIN = MAX( 1, N )
+      LWREQ = MAX( 1, N*NB )
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N )
+      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ )
      $    .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN
@@ -253,7 +257,7 @@
           NB = 1
           MB = M
         END IF
-        IF( LWORK.LT.NB*N ) THEN
+        IF( LWORK.LT.LWREQ ) THEN
           LMINWS = .TRUE.
           NB = 1
         END IF
@@ -268,7 +272,7 @@
       ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -282,9 +286,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = MAX( 1, N )
+          WORK( 1 ) = LWMIN
         ELSE
-          WORK( 1 ) = MAX( 1, NB*N )
+          WORK( 1 ) = LWREQ
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -309,7 +313,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, NB*N )
+      WORK( 1 ) = LWREQ
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f
index 46d2ee479..aa757e96c 100644
--- a/lapack-netlib/SRC/dgeqrfp.f
+++ b/lapack-netlib/SRC/dgeqrfp.f
@@ -97,7 +97,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -122,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup geqrfp
 *
 *> \par Further Details:
 *  =====================
@@ -162,8 +163,8 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB,
-     $                   NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DGEQR2P, DLARFB, DLARFT, XERBLA
@@ -181,8 +182,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
       WORK( 1 ) = LWKOPT
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -190,7 +199,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -202,7 +211,6 @@
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -210,7 +218,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
diff --git a/lapack-netlib/SRC/dgerqf.f b/lapack-netlib/SRC/dgerqf.f
index cca9d6367..435239cc7 100644
--- a/lapack-netlib/SRC/dgerqf.f
+++ b/lapack-netlib/SRC/dgerqf.f
@@ -114,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gerqf
 *
 *> \par Further Details:
 *  =====================
@@ -189,7 +189,7 @@
          END IF
          WORK( 1 ) = LWKOPT
 *
-         IF ( .NOT.LQUERY ) THEN
+         IF( .NOT.LQUERY ) THEN
             IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
      $         INFO = -7
          END IF
diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f
index 5fdb21e45..198bfb0a5 100644
--- a/lapack-netlib/SRC/dgesvj.f
+++ b/lapack-netlib/SRC/dgesvj.f
@@ -208,7 +208,7 @@
 *>
 *> \param[in,out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On entry :
 *>          If JOBU = 'C' :
 *>          WORK(1) = CTOL, where CTOL defines the threshold for convergence.
@@ -239,7 +239,12 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          length of WORK, WORK >= MAX(6,M+N)
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; WORK(1) is set to the minial (and optimal)
+*>          length of WORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -260,7 +265,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -365,9 +370,9 @@
       INTEGER            BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
      $                   ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
      $                   N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP,
-     $                   SWBAND
-      LOGICAL            APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK,
-     $                   RSVEC, UCTOL, UPPER
+     $                   SWBAND, MINMN, LWMIN
+      LOGICAL            APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE,
+     $                   ROTOK, RSVEC, UCTOL, UPPER
 *     ..
 *     .. Local Arrays ..
       DOUBLE PRECISION   FASTR( 5 )
@@ -408,6 +413,14 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 6, M+N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
@@ -427,7 +440,7 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
       ELSE
          INFO = 0
@@ -437,11 +450,14 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DGESVJ', -INFO )
          RETURN
+      ELSE IF( LQUERY ) THEN
+         WORK( 1 ) = LWMIN
+         RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN
+      IF( MINMN.EQ.0 ) RETURN
 *
 *     Set numerical parameters
 *     The stopping criterion for Jacobi rotations is
diff --git a/lapack-netlib/SRC/dgetri.f b/lapack-netlib/SRC/dgetri.f
index 92ef90c18..7b5a3a1b6 100644
--- a/lapack-netlib/SRC/dgetri.f
+++ b/lapack-netlib/SRC/dgetri.f
@@ -107,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup getri
 *
 *  =====================================================================
       SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -151,8 +151,9 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 )
-      LWKOPT = N*NB
+      LWKOPT = MAX( 1, N*NB )
       WORK( 1 ) = LWKOPT
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f
index 25f4c12c2..73b505ff7 100644
--- a/lapack-netlib/SRC/dgetsls.f
+++ b/lapack-netlib/SRC/dgetsls.f
@@ -127,7 +127,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed.
 *>          If LWORK = -1, the routine calculates optimal size of WORK for the
 *>          optimal performance and returns this value in WORK(1).
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsolve
+*> \ingroup getsls
 *
 *  =====================================================================
       SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
@@ -189,7 +189,7 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       DOUBLE PRECISION   DLAMCH, DLANGE
-      EXTERNAL           LSAME, DLABAD, DLAMCH, DLANGE
+      EXTERNAL           LSAME, DLAMCH, DLANGE
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DGEQR, DGEMQR, DLASCL, DLASET,
@@ -226,7 +226,10 @@
 *
 *     Determine the optimum and minimum LWORK
 *
-       IF( M.GE.N ) THEN
+       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
+         WSIZEM = 1
+         WSIZEO = 1
+       ELSE IF( M.GE.N ) THEN
          CALL DGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 )
          TSZO = INT( TQ( 1 ) )
          LWO  = INT( WORKQ( 1 ) )
@@ -294,7 +297,6 @@
 *
        SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
        BIGNUM = ONE / SMLNUM
-       CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f
index 668deeba8..682c7c30f 100644
--- a/lapack-netlib/SRC/dgetsqrhrt.f
+++ b/lapack-netlib/SRC/dgetsqrhrt.f
@@ -130,14 +130,17 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          If MIN(M,N) = 0, LWORK >= 1, else
+*>          LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
-*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ).
+*>
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
@@ -160,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup getsqrhrt
 *
 *> \par Contributors:
 *  ==================
@@ -212,7 +215,7 @@
 *     Test the input arguments
 *
       INFO = 0
-      LQUERY  = LWORK.EQ.-1
+      LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
       ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
@@ -225,7 +228,7 @@
          INFO = -5
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -7
-      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
          INFO = -9
       ELSE
 *
@@ -263,8 +266,9 @@
             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
             LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+            LWORKOPT = MAX( 1, LWORKOPT )
 *
-            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+            IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN
                INFO = -11
             END IF
 *
@@ -346,4 +350,4 @@
 *
 *     End of DGETSQRHRT
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/dgges.f b/lapack-netlib/SRC/dgges.f
index 31db23715..b9ffc7982 100644
--- a/lapack-netlib/SRC/dgges.f
+++ b/lapack-netlib/SRC/dgges.f
@@ -234,8 +234,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If N = 0, LWORK >= 1, else LWORK >= 8*N+16.
-*>          For good performance , LWORK must generally be larger.
+*>          If N = 0, LWORK >= 1, else LWORK >= MAX(8*N,6*N+16).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -275,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEeigen
+*> \ingroup gges
 *
 *  =====================================================================
       SUBROUTINE DGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB,
@@ -321,9 +321,8 @@
       DOUBLE PRECISION   DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLABAD,
-     $                   DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN,
-     $                   XERBLA
+      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLACPY,
+     $                   DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -431,7 +430,6 @@
       EPS = DLAMCH( 'P' )
       SAFMIN = DLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL DLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/dgges3.f b/lapack-netlib/SRC/dgges3.f
index 7b00d294a..2ef55951a 100644
--- a/lapack-netlib/SRC/dgges3.f
+++ b/lapack-netlib/SRC/dgges3.f
@@ -234,6 +234,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= 6*N+16.
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -273,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEeigen
+*> \ingroup gges3
 *
 *  =====================================================================
       SUBROUTINE DGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B,
@@ -309,7 +311,8 @@
       LOGICAL            CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL,
      $                   LQUERY, LST2SL, WANTST
       INTEGER            I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT,
-     $                   ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT
+     $                   ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT,
+     $                   LWKMIN
       DOUBLE PRECISION   ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL,
      $                   PVSR, SAFMAX, SAFMIN, SMLNUM
 *     ..
@@ -318,9 +321,8 @@
       DOUBLE PRECISION   DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD,
-     $                   DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN,
-     $                   XERBLA
+      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY,
+     $                   DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -362,6 +364,12 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 6*N+16
+      END IF
+*
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -378,7 +386,7 @@
          INFO = -15
       ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN
          INFO = -17
-      ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -19
       END IF
 *
@@ -386,29 +394,33 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 6*N+16, 3*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) )
          CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          IF( ILVSL ) THEN
             CALL DORGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR )
-            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          END IF
          CALL DGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL,
      $                LDVSL, VSR, LDVSR, WORK, -1, IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          CALL DLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB,
      $                ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR,
      $                WORK, -1, 0, IERR )
-         LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          IF( WANTST ) THEN
             CALL DTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR,
      $                   SDIM, PVSL, PVSR, DIF, WORK, -1, IDUM, 1,
      $                   IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = LWKOPT
          END IF
-         WORK( 1 ) = LWKOPT
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -430,7 +442,6 @@
       EPS = DLAMCH( 'P' )
       SAFMIN = DLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL DLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/dggev3.f b/lapack-netlib/SRC/dggev3.f
index 4bbe8a40f..b970c04c4 100644
--- a/lapack-netlib/SRC/dggev3.f
+++ b/lapack-netlib/SRC/dggev3.f
@@ -188,7 +188,9 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          LWORK is INTEGER
+*>          LWORK is INTEGER.
+*>          The dimension of the array WORK. LWORK >= MAX(1,8*N).
+*>          For good performance, LWORK should generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -217,7 +219,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEeigen
+*> \ingroup ggev3
 *
 *  =====================================================================
       SUBROUTINE DGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR,
@@ -248,7 +250,8 @@
       LOGICAL            ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY
       CHARACTER          CHTEMP
       INTEGER            ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO,
-     $                   IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT
+     $                   IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT,
+     $                   LWKMIN
       DOUBLE PRECISION   ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS,
      $                   SMLNUM, TEMP
 *     ..
@@ -256,9 +259,8 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD,
-     $                   DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGEVC,
-     $                   XERBLA
+      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY,
+     $                   DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -299,6 +301,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 8*N )
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -313,7 +316,7 @@
          INFO = -12
       ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN
          INFO = -14
-      ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -16
       END IF
 *
@@ -321,13 +324,13 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX(1, 8*N, 3*N+INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) )
          CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, -1,
      $                IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          IF( ILVL ) THEN
             CALL DORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR )
-            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          END IF
          IF( ILV ) THEN
             CALL DGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL,
@@ -336,18 +339,21 @@
             CALL DLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR,
      $                   WORK, -1, 0, IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          ELSE
             CALL DGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL,
      $                   VR, LDVR, WORK, -1, IERR )
-            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
             CALL DLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR,
      $                   WORK, -1, 0, IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = LWKOPT
          END IF
-
-         WORK( 1 ) = LWKOPT
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -367,7 +373,6 @@
       EPS = DLAMCH( 'P' )
       SMLNUM = DLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/dgghd3.f b/lapack-netlib/SRC/dgghd3.f
index 43d7a77df..21a668573 100644
--- a/lapack-netlib/SRC/dgghd3.f
+++ b/lapack-netlib/SRC/dgghd3.f
@@ -179,14 +179,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
-*> \param[in]  LWORK
+*> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= 1.
+*>          The length of the array WORK. LWORK >= 1.
 *>          For optimum performance LWORK >= 6*N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -211,7 +211,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup gghd3
 *
 *> \par Further Details:
 *  =====================
@@ -275,7 +275,12 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'DGGHD3', ' ', N, ILO, IHI, -1 )
-      LWKOPT = MAX( 6*N*NB, 1 )
+      NH = IHI - ILO + 1
+      IF( NH.LE.1 ) THEN
+         LWKOPT = 1
+      ELSE
+         LWKOPT = 6*N*NB
+      END IF
       WORK( 1 ) = DBLE( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
@@ -325,7 +330,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = ONE
          RETURN
@@ -885,6 +889,7 @@
       IF ( JCOL.LT.IHI )
      $   CALL DGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
+*
       WORK( 1 ) = DBLE( LWKOPT )
 *
       RETURN
diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f
index 39d27a5c9..edac7f22f 100644
--- a/lapack-netlib/SRC/dggqrf.f
+++ b/lapack-netlib/SRC/dggqrf.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup ggqrf
 *
 *> \par Further Details:
 *  =====================
@@ -250,7 +250,7 @@
       NB2 = ILAENV( 1, 'DGERQF', ' ', N, P, -1, -1 )
       NB3 = ILAENV( 1, 'DORMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
@@ -287,6 +287,7 @@
 *     RQ factorization of N-by-P matrix B: B = T*Z.
 *
       CALL DGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO )
+*
       WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
 *
       RETURN
diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f
index ddf4104c5..3b1024c1c 100644
--- a/lapack-netlib/SRC/dggrqf.f
+++ b/lapack-netlib/SRC/dggrqf.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup ggrqf
 *
 *> \par Further Details:
 *  =====================
@@ -249,7 +249,7 @@
       NB2 = ILAENV( 1, 'DGEQRF', ' ', P, N, -1, -1 )
       NB3 = ILAENV( 1, 'DORMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/dggsvd3.f b/lapack-netlib/SRC/dggsvd3.f
index 503f0d8cc..ee4d11e86 100644
--- a/lapack-netlib/SRC/dggsvd3.f
+++ b/lapack-netlib/SRC/dggsvd3.f
@@ -278,7 +278,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -328,7 +328,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsing
+*> \ingroup ggsvd3
 *
 *> \par Contributors:
 *  ==================
diff --git a/lapack-netlib/SRC/dggsvp3.f b/lapack-netlib/SRC/dggsvp3.f
index 4e1db3117..485d95b36 100644
--- a/lapack-netlib/SRC/dggsvp3.f
+++ b/lapack-netlib/SRC/dggsvp3.f
@@ -227,7 +227,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -250,7 +250,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup ggsvp3
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f
index 70e78f4b1..07ef1bd57 100644
--- a/lapack-netlib/SRC/dlamswlq.f
+++ b/lapack-netlib/SRC/dlamswlq.f
@@ -127,17 +127,20 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If SIDE = 'L', LWORK >= max(1,NB) * MB;
-*>          if SIDE = 'R', LWORK >= max(1,M) * MB.
+*>
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,NB*MB).
+*>          If SIDE = 'R', LWORK >= max(1,M*MB).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -189,29 +192,31 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamswlq
+*>
 *  =====================================================================
       SUBROUTINE DLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $    LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ),
-     $      T( LDT, * )
+      DOUBLE PRECISION   A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, CTR, LW
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, CTR, LW, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -223,52 +228,60 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * MB
       ELSE
         LW = M * MB
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
 *
       INFO = 0
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( K.LT.0 ) THEN
         INFO = -5
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
         INFO = -4
-      ELSE IF( K.LT.MB .OR. MB.LT.1) THEN
+      ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN
         INFO = -6
       ELSE IF( LDA.LT.MAX( 1, K ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, MB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
+      END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLAMSWLQ', -INFO )
-        WORK(1) = LW
         RETURN
-      ELSE IF (LQUERY) THEN
-        WORK(1) = LW
+      ELSE IF( LQUERY ) THEN
         RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -402,7 +415,8 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
+*
       RETURN
 *
 *     End of DLAMSWLQ
diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f
index 962a31476..023db5ac9 100644
--- a/lapack-netlib/SRC/dlamtsqr.f
+++ b/lapack-netlib/SRC/dlamtsqr.f
@@ -128,22 +128,24 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*NB).
+*>          If SIDE = 'R', LWORK >= max(1,MB*NB).
 *>
-*>          If SIDE = 'L', LWORK >= max(1,N)*NB;
-*>          if SIDE = 'R', LWORK >= max(1,MB)*NB.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -191,29 +193,31 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamtsqr
+*>
 *  =====================================================================
       SUBROUTINE DLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $        LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ),
-     $                T( LDT, * )
+      DOUBLE PRECISION   A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR, Q
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, Q, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -225,12 +229,13 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * NB
         Q = M
       ELSE
@@ -238,11 +243,17 @@
         Q = N
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
@@ -253,38 +264,38 @@
         INFO = -7
       ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, NB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0)  THEN
-          WORK(1) = LW
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLAMTSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
 *
       IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
         CALL DGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
       IF(LEFT.AND.NOTRAN) THEN
 *
@@ -410,7 +421,8 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
+*
       RETURN
 *
 *     End of DLAMTSQR
diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f
index c95c94cbc..636c12dc8 100644
--- a/lapack-netlib/SRC/dlaswlq.f
+++ b/lapack-netlib/SRC/dlaswlq.f
@@ -99,19 +99,22 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= MB*M.
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -159,33 +162,37 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup laswlq
+*>
 *  =====================================================================
       SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
-     $                  INFO)
+     $                    INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
+      INTEGER            INFO, LDA, M, N, MB, NB, LWORK, LDT
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION  A( LDA, * ), WORK( * ), T( LDT, *)
+      DOUBLE PRECISION   A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           DGELQT, DTPLQT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -196,12 +203,19 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = M*MB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
       ELSE IF( N.LT.0 .OR. N.LT.M ) THEN
         INFO = -2
-      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN
+      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN
         INFO = -3
       ELSE IF( NB.LT.0 ) THEN
         INFO = -4
@@ -209,60 +223,62 @@
         INFO = -6
       ELSE IF( LDT.LT.MB ) THEN
         INFO = -8
-      ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-      WORK(1) = MB*M
+*
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLASWLQ', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The LQ Decomposition
 *
-       IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN
-        CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO)
+      IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN
+        CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
-       KK = MOD((N-M),(NB-M))
-       II=N-KK+1
+      KK = MOD((N-M),(NB-M))
+      II = N-KK+1
 *
-*      Compute the LQ factorization of the first block A(1:M,1:NB)
+*     Compute the LQ factorization of the first block A(1:M,1:NB)
 *
-       CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
-       CTR = 1
+      CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = NB+1, II-NB+M , (NB-M)
+      DO I = NB+1, II-NB+M, (NB-M)
 *
-*      Compute the QR factorization of the current block A(1:M,I:I+NB-M)
+*       Compute the QR factorization of the current block A(1:M,I:I+NB-M)
 *
-         CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
-     $                  LDA, T(1, CTR * M + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
+     $                 LDA, T(1, CTR * M + 1),
+     $                 LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
 *     Compute the QR factorization of the last block A(1:M,II:N)
 *
-       IF (II.LE.N) THEN
+      IF( II.LE.N ) THEN
         CALL DTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ),
-     $                  LDA, T(1, CTR * M + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+     $                 LDA, T(1, CTR * M + 1), LDT,
+     $                 WORK, INFO )
+      END IF
+*
+      WORK( 1 ) = LWMIN
 *
-      WORK( 1 ) = M * MB
       RETURN
 *
 *     End of DLASWLQ
diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f
index e6d78b672..d18675b2d 100644
--- a/lapack-netlib/SRC/dlatrs3.f
+++ b/lapack-netlib/SRC/dlatrs3.f
@@ -151,13 +151,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK).
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal size of
 *>          WORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
+*> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
@@ -165,6 +169,7 @@
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -181,7 +186,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup latrs3
 *> \par Further Details:
 *  =====================
 *  \verbatim
@@ -253,7 +258,7 @@
       LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
       INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
      $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
-     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
+     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
       DOUBLE PRECISION   ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
      $                   SCAMIN, SMLNUM, TMAX
 *     ..
@@ -292,15 +297,24 @@
 *     row. WORK( I+KK*LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
+*
       LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
       LDS = NBA
+*
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
+*
       LANRM = NBA * NBA
       AWRK = LSCALE
-      WORK( 1 ) = LSCALE + LANRM
+*
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = LSCALE + LANRM
+      END IF
+      WORK( 1 ) = LWMIN
 *
 *     Test the input parameters
 *
@@ -322,7 +336,7 @@
          INFO = -8
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -10
-      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
+      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN
          INFO = -14
       END IF
       IF( INFO.NE.0 ) THEN
@@ -649,6 +663,9 @@
             END IF
          END DO
       END DO
+*
+      WORK( 1 ) = LWMIN
+*
       RETURN
 *
 *     End of DLATRS3
diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f
index 94a04be02..0000aab68 100644
--- a/lapack-netlib/SRC/dlatsqr.f
+++ b/lapack-netlib/SRC/dlatsqr.f
@@ -101,15 +101,18 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= NB*N.
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -161,27 +164,29 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup latsqr
+*>
 *  =====================================================================
       SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
-     $                    LWORK, INFO)
+     $                    LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
+      INTEGER            INFO, LDA, M, N, MB, NB, LDT, LWORK
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION  A( LDA, * ), WORK( * ), T(LDT, *)
+      DOUBLE PRECISION   A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
@@ -198,6 +203,13 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = N*NB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -205,65 +217,67 @@
         INFO = -2
       ELSE IF( MB.LT.1 ) THEN
         INFO = -3
-      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN
+      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN
         INFO = -4
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
       ELSE IF( LDT.LT.NB ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-        WORK(1) = NB*N
+*
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
       END IF
+*
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLATSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The QR Decomposition
 *
-       IF ((MB.LE.N).OR.(MB.GE.M)) THEN
-         CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO)
-         RETURN
-       END IF
+      IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN
+        CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO )
+        RETURN
+      END IF
 *
-       KK = MOD((M-N),(MB-N))
-       II=M-KK+1
+      KK = MOD((M-N),(MB-N))
+      II = M-KK+1
 *
-*      Compute the QR factorization of the first block A(1:MB,1:N)
+*     Compute the QR factorization of the first block A(1:MB,1:N)
 *
-       CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
 *
-       CTR = 1
-       DO I = MB+1, II-MB+N ,  (MB-N)
+      CTR = 1
+      DO I = MB+1, II-MB+N, (MB-N)
 *
-*      Compute the QR factorization of the current block A(I:I+MB-N,1:N)
+*       Compute the QR factorization of the current block A(I:I+MB-N,1:N)
 *
-         CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
-     $                 T(1, CTR * N + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
+     $                T(1, CTR * N + 1),
+     $                LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
-*      Compute the QR factorization of the last block A(II:M,1:N)
+*     Compute the QR factorization of the last block A(II:M,1:N)
 *
-       IF (II.LE.M) THEN
-         CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
-     $                 T(1, CTR * N + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+      IF( II.LE.M ) THEN
+        CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
+     $                T(1, CTR * N + 1), LDT,
+     $                WORK, INFO )
+      END IF
 *
-      WORK( 1 ) = N*NB
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of DLATSQR
diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f
index 50d51d992..286366bfe 100644
--- a/lapack-netlib/SRC/dsyev_2stage.f
+++ b/lapack-netlib/SRC/dsyev_2stage.f
@@ -20,7 +20,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+*       SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
 *                                INFO )
 *
 *       IMPLICIT NONE
@@ -97,7 +97,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension LWORK
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -105,12 +105,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 2*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -143,7 +143,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -161,7 +161,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -169,16 +169,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
 *  =====================================================================
-      SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+      SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
      $                         INFO )
 *
       IMPLICIT NONE
@@ -305,7 +305,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
-     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call DSTERF.  For eigenvectors, first call
diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f
index b27f4cdc7..adcfcb373 100644
--- a/lapack-netlib/SRC/dsyevd.f
+++ b/lapack-netlib/SRC/dsyevd.f
@@ -96,8 +96,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array,
-*>                                         dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -160,7 +159,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heevd
 *
 *> \par Contributors:
 *  ==================
diff --git a/lapack-netlib/SRC/dsyevr.f b/lapack-netlib/SRC/dsyevr.f
index 698691533..8647b0162 100644
--- a/lapack-netlib/SRC/dsyevr.f
+++ b/lapack-netlib/SRC/dsyevr.f
@@ -271,7 +271,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,26*N).
+*>          The dimension of the array WORK.
+*>          If N <= 1, LWORK >= 1, else LWORK >= 26*N.
 *>          For optimal efficiency, LWORK >= (NB+6)*N,
 *>          where NB is the max of the blocksize for DSYTRD and DORMTR
 *>          returned by ILAENV.
@@ -285,13 +286,14 @@
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
-*>          On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
 *> \endverbatim
 *>
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the IWORK array,
@@ -315,7 +317,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heevr
 *
 *> \par Contributors:
 *  ==================
@@ -390,8 +392,13 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) )
 *
-      LWMIN = MAX( 1, 26*N )
-      LIWMIN = MAX( 1, 10*N )
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = 26*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN
@@ -450,7 +457,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 7
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = A( 1, 1 )
diff --git a/lapack-netlib/SRC/dsyevr_2stage.f b/lapack-netlib/SRC/dsyevr_2stage.f
index 09242bbd3..63d5e3159 100644
--- a/lapack-netlib/SRC/dsyevr_2stage.f
+++ b/lapack-netlib/SRC/dsyevr_2stage.f
@@ -263,7 +263,7 @@
 *>          indicating the nonzero elements in Z. The i-th eigenvector
 *>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
 *>          ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal
-*>          matrix). The support of the eigenvectors of A is typically 
+*>          matrix). The support of the eigenvectors of A is typically
 *>          1:N because of the orthogonal transformations applied by DORMTR.
 *>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
 *> \endverbatim
@@ -277,12 +277,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 5*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 5*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -300,13 +301,14 @@
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
-*>          On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
 *> \endverbatim
 *>
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the IWORK array,
@@ -330,7 +332,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heevr_2stage
 *
 *> \par Contributors:
 *  ==================
@@ -358,7 +360,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -366,11 +368,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -444,8 +446,14 @@
       IB     = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
       LHTRD  = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
       LWTRD  = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
-      LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
-      LIWMIN = MAX( 1, 10*N )
+*
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
@@ -484,7 +492,7 @@
 *         NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 )
 *         NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) )
 *         LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWMIN
+         WORK( 1 )  = LWMIN
          IWORK( 1 ) = LIWMIN
       END IF
 *
@@ -504,7 +512,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 7
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = A( 1, 1 )
@@ -608,7 +616,7 @@
 *     Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
 *
 *
-      CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+      CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
      $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
      $                    LHTRD, WORK( INDWK ), LLWORK, IINFO )
 *
@@ -727,7 +735,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 )  = LWMIN
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/dsyevx.f b/lapack-netlib/SRC/dsyevx.f
index 99719874b..fd6a78e32 100644
--- a/lapack-netlib/SRC/dsyevx.f
+++ b/lapack-netlib/SRC/dsyevx.f
@@ -244,7 +244,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heevx
 *
 *  =====================================================================
       SUBROUTINE DSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU,
@@ -338,14 +338,14 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWKMIN = 1
-            WORK( 1 ) = LWKMIN
+            LWKOPT = 1
          ELSE
             LWKMIN = 8*N
             NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 )
             NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) )
             LWKOPT = MAX( LWKMIN, ( NB + 3 )*N )
-            WORK( 1 ) = LWKOPT
          END IF
+         WORK( 1 ) = LWKOPT
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -17
diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f
index 8dab5a384..0a96ecd7e 100644
--- a/lapack-netlib/SRC/dsysv_aa.f
+++ b/lapack-netlib/SRC/dsysv_aa.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -177,7 +177,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS
+      INTEGER            LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -196,6 +196,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N, 3*N-2 )
       IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -206,17 +207,17 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
-         LWKOPT_SYTRF = INT( WORK(1) )
+         LWKOPT_SYTRF = INT( WORK( 1 ) )
          CALL DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
      $                   -1, INFO )
-         LWKOPT_SYTRS = INT( WORK(1) )
-         LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
+         LWKOPT_SYTRS = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f
index 72fbe1e9a..90dd0a38a 100644
--- a/lapack-netlib/SRC/dsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/dsysv_aa_2stage.f
@@ -101,14 +101,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is DOUBLE PRECISION array, dimension (LTB)
+*>          TB is DOUBLE PRECISION array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -148,14 +148,15 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION workspace of size LWORK
+*>          WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -179,7 +180,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYsolve
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -205,7 +206,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            LWKOPT
+      INTEGER            LWKMIN, LWKOPT
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -226,6 +227,7 @@
       UPPER = LSAME( UPLO, 'U' )
       WQUERY = ( LWORK.EQ.-1 )
       TQUERY = ( LTB.EQ.-1 )
+      LWKMIN = MAX( 1, N )
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -234,18 +236,19 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
-      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
-      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN
          INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
-         LWKOPT = INT( WORK(1) )
+         LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) )
+         WORK( 1 ) = LWKOPT
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -255,7 +258,6 @@
          RETURN
       END IF
 *
-*
 *     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
diff --git a/lapack-netlib/SRC/dsysvx.f b/lapack-netlib/SRC/dsysvx.f
index a30831e72..b2b8210ca 100644
--- a/lapack-netlib/SRC/dsysvx.f
+++ b/lapack-netlib/SRC/dsysvx.f
@@ -275,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE DSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -305,7 +305,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, NOFACT
-      INTEGER            LWKOPT, NB
+      INTEGER            LWKMIN, LWKOPT, NB
       DOUBLE PRECISION   ANORM
 *     ..
 *     .. External Functions ..
@@ -327,6 +327,7 @@
       INFO = 0
       NOFACT = LSAME( FACT, 'N' )
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 3*N )
       IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN
          INFO = -1
       ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) )
@@ -344,12 +345,12 @@
          INFO = -11
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = MAX( 1, 3*N )
+         LWKOPT = LWKMIN
          IF( NOFACT ) THEN
             NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
diff --git a/lapack-netlib/SRC/dsytrd.f b/lapack-netlib/SRC/dsytrd.f
index 3dcfc3db2..58d4b633b 100644
--- a/lapack-netlib/SRC/dsytrd.f
+++ b/lapack-netlib/SRC/dsytrd.f
@@ -139,7 +139,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrd
 *
 *> \par Further Details:
 *  =====================
@@ -247,7 +247,7 @@
 *        Determine the block size.
 *
          NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f
index 8ae77d3e4..a88ac1c73 100644
--- a/lapack-netlib/SRC/dsytrd_2stage.f
+++ b/lapack-netlib/SRC/dsytrd_2stage.f
@@ -4,23 +4,23 @@
 *
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download DSYTRD_2STAGE + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
+*> Download DSYTRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd_2stage.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*       SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
 *                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
 *       IMPLICIT NONE
@@ -34,7 +34,7 @@
 *       DOUBLE PRECISION   A( LDA, * ), TAU( * ),
 *                          HOUS2( * ), WORK( * )
 *       ..
-*  
+*
 *
 *> \par Purpose:
 *  =============
@@ -52,11 +52,11 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Housholder representation,
 *>                  in particular for the second stage (Band to
 *>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate Q1 Q2 or to apply Q1 Q2,
 *>                  then LHOUS2 is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -86,7 +86,7 @@
 *>          triangular part of A is not referenced.
 *>          On exit, if UPLO = 'U', the band superdiagonal
 *>          of A are overwritten by the corresponding elements of the
-*>          internal band-diagonal matrix AB, and the elements above 
+*>          internal band-diagonal matrix AB, and the elements above
 *>          the KD superdiagonal, with the array TAU, represent the orthogonal
 *>          matrix Q1 as a product of elementary reflectors; if UPLO
 *>          = 'L', the diagonal and band subdiagonal of A are over-
@@ -117,13 +117,13 @@
 *> \param[out] TAU
 *> \verbatim
 *>          TAU is DOUBLE PRECISION array, dimension (N-KD)
-*>          The scalar factors of the elementary reflectors of 
+*>          The scalar factors of the elementary reflectors of
 *>          the first stage (see Further Details).
 *> \endverbatim
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2)
+*>          HOUS2 is DOUBLE PRECISION array, dimension (MAX(1,LHOUS2))
 *>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
@@ -132,6 +132,8 @@
 *> \verbatim
 *>          LHOUS2 is INTEGER
 *>          The dimension of the array HOUS2.
+*>          LHOUS2 >= 1.
+*>
 *>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
@@ -143,23 +145,26 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>          LWORK = MAX(1, dimension) where
 *>          dimension   = max(stage1,stage2) + (KD+1)*N
-*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                        + max(2*KD*KD, KD*NTHREADS) 
-*>                        + (KD+1)*N 
+*>                      = N*KD + N*max(KD+1,FACTOPTNB)
+*>                        + max(2*KD*KD, KD*NTHREADS)
+*>                        + (KD+1)*N
 *>          where KD is the blocking size of the reduction,
 *>          FACTOPTNB is the blocking used by the QR or LQ
 *>          algorithm, usually FACTOPTNB=128 is a good choice
@@ -177,12 +182,12 @@
 *  Authors:
 *  ========
 *
-*> \author Univ. of Tennessee 
-*> \author Univ. of California Berkeley 
-*> \author Univ. of Colorado Denver 
-*> \author NAG Ltd. 
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +207,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -210,16 +215,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
      $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
       IMPLICIT NONE
@@ -265,10 +270,13 @@
 *
       KD     = ILAENV2STAGE( 1, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 )
       IB     = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.LSAME( VECT, 'N' ) ) THEN
          INFO = -1
@@ -309,14 +317,14 @@
       LWRK  = LWORK-LDAB*N
       ABPOS = 1
       WPOS  = ABPOS + LDAB*N
-      CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
      $                   TAU, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DSYTRD_SY2SB', -INFO )
          RETURN
       END IF
-      CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
      $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DSYTRD_SB2ST', -INFO )
@@ -324,8 +332,7 @@
       END IF
 *
 *
-      HOUS2( 1 ) = LHMIN
-      WORK( 1 )  = LWMIN
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of DSYTRD_2STAGE
diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F
index bb74dd491..04d03d587 100644
--- a/lapack-netlib/SRC/dsytrd_sb2st.F
+++ b/lapack-netlib/SRC/dsytrd_sb2st.F
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
 *                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *       #if defined(_OPENMP)
@@ -53,12 +53,12 @@
 *> \param[in] STAGE1
 *> \verbatim
 *>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
 *>                  from dense to band using the dsytrd_sy2sb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone. 
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the dsytrd_sy2sb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other term this routine is called as standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the dsytrd_sy2sb
 *>                  routine has been called to produce AB (e.g., AB is
 *>                  the output of dsytrd_sy2sb.
 *> \endverbatim
@@ -66,10 +66,10 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Housholder representation,
 *>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
 *>                  then LHOUS is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -132,34 +132,39 @@
 *>
 *> \param[out] HOUS
 *> \verbatim
-*>          HOUS is DOUBLE PRECISION array, dimension LHOUS, that
-*>          store the Householder representation.
+*>          HOUS is DOUBLE PRECISION array, dimension (MAX(1,LHOUS))
+*>          Stores the Householder representation.
 *> \endverbatim
 *>
 *> \param[in] LHOUS
 *> \verbatim
 *>          LHOUS is INTEGER
-*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array HOUS.
+*>          If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS array, returns
 *>          this value as the first entry of the HOUS array, and no error
 *>          message related to LHOUS is issued by XERBLA.
 *>          LHOUS = MAX(1, dimension) where
 *>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'     
+*>          not available now if VECT='H'
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension LWORK.
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -188,7 +193,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup real16OTHERcomputational
+*> \ingroup hetrd_hb2st
 *
 *> \par Further Details:
 *  =====================
@@ -208,7 +213,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -216,16 +221,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+      SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
      $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 #if defined(_OPENMP)
@@ -258,11 +263,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   NBTILES, TTYPE, TID, NTHREADS,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SIDEV, SIZETAU, LDV, LHMIN, LWMIN
 *     ..
@@ -274,7 +279,7 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
+      INTEGER            ILAENV2STAGE
       EXTERNAL           LSAME, ILAENV2STAGE
 *     ..
 *     .. Executable Statements ..
@@ -282,7 +287,6 @@
 *     Determine the minimal workspace size required.
 *     Test the input parameters
 *
-      DEBUG   = 0
       INFO    = 0
       AFTERS1 = LSAME( STAGE1, 'Y' )
       WANTQ   = LSAME( VECT, 'V' )
@@ -291,9 +295,14 @@
 *
 *     Determine the block size, the workspace size and the hous size.
 *
-      IB     = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      IB       = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
          INFO = -1
@@ -355,7 +364,7 @@
           ABDPOS   = KD + 1
           ABOFDPOS = KD
       ELSE
-          APOS     = INDA 
+          APOS     = INDA
           AWPOS    = INDA + KD + 1
           DPOS     = APOS
           OFDPOS   = DPOS + 1
@@ -363,11 +372,11 @@
           ABOFDPOS = 2
 
       ENDIF
-*      
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     real because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     real because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
 *     in a parallel environment it might need two cores for D and E
 *
       IF( KD.EQ.0 ) THEN
@@ -382,17 +391,17 @@
           WORK( 1 ) = 1
           RETURN
       END IF
-*      
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already Tridiagonal. We have to make diagonal
 *     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is  
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the hermetian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
 *     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-*      
+*     or not this might complicate the story.
+*
       IF( KD.EQ.1 ) THEN
           DO 50 I = 1, N
               D( I ) = ( AB( ABDPOS, I ) )
@@ -413,7 +422,7 @@
           RETURN
       END IF
 *
-*     Main code start here. 
+*     Main code start here.
 *     Reduce the symmetric band of A to a tridiagonal matrix.
 *
       THGRSIZ   = N
@@ -422,7 +431,7 @@
       NBTILES   = CEILING( REAL(N)/REAL(KD) )
       STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
       THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
-*      
+*
       CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
       CALL DLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
 *
@@ -431,7 +440,7 @@
 *
 #if defined(_OPENMP)
 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
-!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
 !$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
 !$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
 !$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
@@ -440,7 +449,7 @@
 #endif
 *
 *     main bulge chasing loop
-*      
+*
       DO 100 THGRID = 1, THGRNB
           STT  = (THGRID-1)*THGRSIZ+1
           THED = MIN( (STT + THGRSIZ -1), (N-1))
@@ -451,7 +460,7 @@
                   ST = STT
                   DO 130 SWEEPID = ST, ED
                       DO 140 K = 1, GRSIZ
-                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
      $                           + (M-1)*GRSIZ + K
                           IF ( MYID.EQ.1 ) THEN
                               TTYPE = 1
@@ -477,16 +486,16 @@
                           ENDIF
 *
 *                         Call the kernel
-*                             
+*
 #if defined(_OPENMP) &&  _OPENMP >= 201307
-                          IF( TTYPE.NE.1 ) THEN      
+                          IF( TTYPE.NE.1 ) THEN
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(in:WORK(MYID-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
@@ -494,20 +503,20 @@
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
                           ENDIF
 #else
-                          CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                          CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
-     $                         WORK ( INDA ), LDA, 
+     $                         WORK ( INDA ), LDA,
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                         WORK( INDW ) )
-#endif 
+#endif
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1
                               EXIT
@@ -522,14 +531,14 @@
 !$OMP END MASTER
 !$OMP END PARALLEL
 #endif
-*      
+*
 *     Copy the diagonal from A to D. Note that D is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
       DO 150 I = 1, N
           D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
   150 CONTINUE
-*      
+*
 *     Copy the off diagonal from A to E. Note that E is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
@@ -543,11 +552,10 @@
   170     CONTINUE
       ENDIF
 *
-      HOUS( 1 ) = LHMIN
       WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of DSYTRD_SB2ST
 *
       END
-      
+
diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f
index 1660b5c7e..38acc71f1 100644
--- a/lapack-netlib/SRC/dsytrd_sy2sb.f
+++ b/lapack-netlib/SRC/dsytrd_sy2sb.f
@@ -123,8 +123,8 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
-*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, or if LWORK = -1,
 *>          WORK(1) returns the size of LWORK.
 *> \endverbatim
 *>
@@ -132,7 +132,9 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK which should be calculated
-*>          by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          by a workspace query.
+*>          If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY)
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -158,7 +160,7 @@
 *> \author Univ. of Colorado Denver 
 *> \author NAG Ltd. 
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrd_he2hb
 *
 *> \par Further Details:
 *  =====================
@@ -293,8 +295,12 @@
       INFO   = 0
       UPPER  = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
-      
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', ' ', N, KD, -1, -1 )
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f
index 9a0b26ce5..924d4c165 100644
--- a/lapack-netlib/SRC/dsytrf_aa.f
+++ b/lapack-netlib/SRC/dsytrf_aa.f
@@ -101,8 +101,10 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= MAX(1,2*N). For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -125,10 +127,10 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
-      SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -152,7 +154,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            J, LWKOPT
+      INTEGER            J, LWKMIN, LWKOPT
       INTEGER            NB, MJ, NJ, K1, K2, J1, J2, J3, JB
       DOUBLE PRECISION   ALPHA
 *     ..
@@ -179,18 +181,25 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = (NB+1)*N
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -203,11 +212,11 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
           RETURN
       ENDIF
       IPIV( 1 ) = 1
-      IF ( N.EQ.1 ) THEN
+      IF( N.EQ.1 ) THEN
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f
index c65bd86e6..fae95bab2 100644
--- a/lapack-netlib/SRC/dsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f
@@ -87,14 +87,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is DOUBLE PRECISION array, dimension (LTB)
+*>          TB is DOUBLE PRECISION array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -121,14 +121,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION workspace of size LWORK
+*>          WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used
+*>          to select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -211,9 +211,9 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -6
-      ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN
          INFO = -10
       END IF
 *
@@ -227,10 +227,10 @@
       NB = ILAENV( 1, 'DSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 )
       IF( INFO.EQ.0 ) THEN
          IF( TQUERY ) THEN
-            TB( 1 ) = (3*NB+1)*N
+            TB( 1 ) = MAX( 1, (3*NB+1)*N )
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = MAX( 1, N*NB )
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
@@ -239,7 +239,7 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
          RETURN
       ENDIF
 *
diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f
index 086586968..0717eb076 100644
--- a/lapack-netlib/SRC/dsytrf_rk.f
+++ b/lapack-netlib/SRC/dsytrf_rk.f
@@ -177,14 +177,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension ( MAX(1,LWORK) ).
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned
 *>          by ILAENV.
 *>
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/dsytrf_rook.f b/lapack-netlib/SRC/dsytrf_rook.f
index 2f00d1802..316663485 100644
--- a/lapack-netlib/SRC/dsytrf_rook.f
+++ b/lapack-netlib/SRC/dsytrf_rook.f
@@ -118,7 +118,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -146,7 +146,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f
index dbcdcdb58..5960d3992 100644
--- a/lapack-netlib/SRC/dsytri2.f
+++ b/lapack-netlib/SRC/dsytri2.f
@@ -88,16 +88,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          WORK is size >= (N+NB+1)*(NB+3)
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>           calculates:
+*>          calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
 *>              - and no error message related to LWORK is issued by XERBLA.
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetri2
 *
 *  =====================================================================
       SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -159,9 +159,13 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+*
 *     Get blocksize
+*
       NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 )
-      IF ( NBMAX .GE. N ) THEN
+      IF( N.EQ.0 ) THEN
+         MINSIZE = 1
+      ELSE IF( NBMAX.GE.N ) THEN
          MINSIZE = N
       ELSE
          MINSIZE = (N+NBMAX+1)*(NBMAX+3)
@@ -173,28 +177,29 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
-*
-*     Quick return if possible
-*
 *
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DSYTRI2', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK(1)=MINSIZE
+         WORK( 1 ) = MINSIZE
          RETURN
       END IF
+*
+*     Quick return if possible
+*
       IF( N.EQ.0 )
      $   RETURN
 
-      IF( NBMAX .GE. N ) THEN
+      IF( NBMAX.GE.N ) THEN
          CALL DSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL DSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
 *
 *     End of DSYTRI2
diff --git a/lapack-netlib/SRC/dsytri_3.f b/lapack-netlib/SRC/dsytri_3.f
index 86d69cdfd..50834c605 100644
--- a/lapack-netlib/SRC/dsytri_3.f
+++ b/lapack-netlib/SRC/dsytri_3.f
@@ -119,16 +119,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3).
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= (N+NB+1)*(NB+3).
+*>          The length of WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>
-*>          If LDWORK = -1, then a workspace query is assumed;
+*>          If LWORK = -1, then a workspace query is assumed;
 *>          the routine only calculates the optimal size of the optimal
 *>          size of the WORK array, returns this value as the first
 *>          entry of the WORK array, and no error message related to
@@ -152,7 +153,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetri_3
 *
 *> \par Contributors:
 *  ==================
@@ -208,8 +209,13 @@
 *
 *     Determine the block size
 *
-      NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) )
-      LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      IF( N.EQ.0 ) THEN
+         LWKOPT = 1
+      ELSE
+         NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) )
+         LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      END IF
+      WORK( 1 ) = LWKOPT
 *
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
@@ -217,7 +223,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
          INFO = -8
       END IF
 *
@@ -225,7 +231,6 @@
          CALL XERBLA( 'DSYTRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f
index 26b11a2a0..f0016cb7f 100644
--- a/lapack-netlib/SRC/dsytrs_aa.f
+++ b/lapack-netlib/SRC/dsytrs_aa.f
@@ -105,7 +105,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -123,7 +129,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -151,7 +157,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -161,13 +167,19 @@
       EXTERNAL           DLACPY, DGTSV, DSWAP, DTRSM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
 *     ..
 *     .. Executable Statements ..
 *
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -178,21 +190,20 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DSYTRS_AA', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = LWKMIN
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 .OR. NRHS.EQ.0 )
+      IF( MIN( N, NRHS ).EQ.0 )
      $   RETURN
 *
       IF( UPPER ) THEN

From c082669ad46b0f87be6730249b937def5060f80c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 20:05:03 +0100
Subject: [PATCH 503/718] Handle corner cases of LWORK (Reference-LAPACK PR
 942)

---
 lapack-netlib/SRC/sgebrd.f           | 28 +++++---
 lapack-netlib/SRC/sgehrd.f           | 26 +++++---
 lapack-netlib/SRC/sgelq.f            |  8 +--
 lapack-netlib/SRC/sgelqf.f           | 20 ++++--
 lapack-netlib/SRC/sgemlq.f           | 24 ++++---
 lapack-netlib/SRC/sgemqr.f           | 22 +++++--
 lapack-netlib/SRC/sgeqlf.f           |  8 ++-
 lapack-netlib/SRC/sgeqp3rk.f         | 24 +++----
 lapack-netlib/SRC/sgeqr.f            | 22 ++++---
 lapack-netlib/SRC/sgeqrfp.f          | 28 +++++---
 lapack-netlib/SRC/sgesvj.f           | 36 ++++++++---
 lapack-netlib/SRC/sgetri.f           | 10 +--
 lapack-netlib/SRC/sgetsls.f          |  7 +-
 lapack-netlib/SRC/sgetsqrhrt.f       | 16 +++--
 lapack-netlib/SRC/sgges3.f           | 24 +++++--
 lapack-netlib/SRC/sggev3.f           | 27 +++++---
 lapack-netlib/SRC/sgghd3.f           | 15 +++--
 lapack-netlib/SRC/sggqrf.f           |  9 ++-
 lapack-netlib/SRC/sggrqf.f           |  2 +-
 lapack-netlib/SRC/sggsvd3.f          |  2 +-
 lapack-netlib/SRC/sggsvp3.f          |  5 +-
 lapack-netlib/SRC/slamswlq.f         | 64 ++++++++++++-------
 lapack-netlib/SRC/slamtsqr.f         | 76 +++++++++++++---------
 lapack-netlib/SRC/slaswlq.f          | 92 +++++++++++++++------------
 lapack-netlib/SRC/slatrs3.f          | 32 +++++++---
 lapack-netlib/SRC/slatsqr.f          | 95 ++++++++++++++++------------
 lapack-netlib/SRC/ssyevd.f           |  7 +-
 lapack-netlib/SRC/ssyevr.f           | 19 ++++--
 lapack-netlib/SRC/ssyevr_2stage.f    | 22 +++++--
 lapack-netlib/SRC/ssyevx.f           |  6 +-
 lapack-netlib/SRC/ssysv_aa.f         | 18 +++---
 lapack-netlib/SRC/ssysv_aa_2stage.f  | 27 ++++----
 lapack-netlib/SRC/ssysvx.f           |  7 +-
 lapack-netlib/SRC/ssytrd_2stage.f    | 86 +++++++++++++------------
 lapack-netlib/SRC/ssytrd_sy2sb.f     | 18 ++++--
 lapack-netlib/SRC/ssytrf.f           |  5 +-
 lapack-netlib/SRC/ssytrf_aa.f        | 36 +++++++----
 lapack-netlib/SRC/ssytrf_aa_2stage.f | 18 +++---
 lapack-netlib/SRC/ssytrf_rk.f        |  8 +--
 lapack-netlib/SRC/ssytrf_rook.f      |  7 +-
 lapack-netlib/SRC/ssytri2.f          | 32 ++++++----
 lapack-netlib/SRC/ssytri_3.f         | 21 +++---
 lapack-netlib/SRC/ssytrs_aa.f        | 30 ++++++---
 43 files changed, 671 insertions(+), 418 deletions(-)

diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f
index 2d0c6d651..b33ad0b1f 100644
--- a/lapack-netlib/SRC/sgebrd.f
+++ b/lapack-netlib/SRC/sgebrd.f
@@ -122,7 +122,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,M,N).
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise.
 *>          For optimum performance LWORK >= (M+N)*NB, where NB
 *>          is the optimal blocksize.
 *>
@@ -223,8 +224,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB,
-     $                   NBMIN, NX, WS
+      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT,
+     $                   MINMN, NB, NBMIN, NX, WS
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEBD2, SGEMM, SLABRD, XERBLA
@@ -242,9 +243,16 @@
 *     Test the input parameters
 *
       INFO = 0
-      NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) )
-      LWKOPT = ( M+N )*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = MAX( M, N )
+         NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) )
+         LWKOPT = ( M+N )*NB
+      ENDIF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -252,7 +260,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.LT.0 ) THEN
@@ -264,7 +272,6 @@
 *
 *     Quick return if possible
 *
-      MINMN = MIN( M, N )
       IF( MINMN.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -283,7 +290,7 @@
 *        Determine when to switch from blocked to unblocked code.
 *
          IF( NX.LT.MINMN ) THEN
-            WS = ( M+N )*NB
+            WS = LWKOPT
             IF( LWORK.LT.WS ) THEN
 *
 *              Not enough work space for the optimal NB, consider using
@@ -342,7 +349,8 @@
 *
       CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ),
      $             TAUQ( I ), TAUP( I ), WORK, IINFO )
-      WORK( 1 ) = SROUNDUP_LWORK(WS)
+*
+      WORK( 1 ) = SROUNDUP_LWORK( WS )
       RETURN
 *
 *     End of SGEBRD
diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f
index 47733d947..cfa17e156 100644
--- a/lapack-netlib/SRC/sgehrd.f
+++ b/lapack-netlib/SRC/sgehrd.f
@@ -89,7 +89,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -173,7 +173,7 @@
       INTEGER            IHI, ILO, INFO, LDA, LWORK, N
 *     ..
 *     .. Array Arguments ..
-      REAL              A( LDA, * ), TAU( * ), WORK( * )
+      REAL               A( LDA, * ), TAU( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
@@ -182,7 +182,7 @@
       INTEGER            NBMAX, LDT, TSIZE
       PARAMETER          ( NBMAX = 64, LDT = NBMAX+1,
      $                     TSIZE = LDT*NBMAX )
-      REAL              ZERO, ONE
+      REAL               ZERO, ONE
       PARAMETER          ( ZERO = 0.0E+0,
      $                     ONE = 1.0E+0 )
 *     ..
@@ -190,7 +190,7 @@
       LOGICAL            LQUERY
       INTEGER            I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB,
      $                   NBMIN, NH, NX
-      REAL              EI
+      REAL               EI
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SAXPY, SGEHD2, SGEMM, SLAHR2, SLARFB, STRMM,
@@ -222,13 +222,19 @@
          INFO = -8
       END IF
 *
+      NH = IHI - ILO + 1
       IF( INFO.EQ.0 ) THEN
 *
 *       Compute the workspace requirements
 *
-         NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) )
-         LWKOPT = N*NB + TSIZE
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         IF( NH.LE.1 ) THEN
+            LWKOPT = 1
+         ELSE
+            NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI,
+     $                              -1 ) )
+            LWKOPT = N*NB + TSIZE
+         ENDIF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -249,7 +255,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -269,7 +274,7 @@
 *
 *           Determine if workspace is large enough for blocked code
 *
-            IF( LWORK.LT.N*NB+TSIZE ) THEN
+            IF( LWORK.LT.LWKOPT ) THEN
 *
 *              Not enough workspace to use optimal NB:  determine the
 *              minimum value of NB, and reduce NB or force use of
@@ -345,7 +350,8 @@
 *     Use unblocked code to reduce the rest of the matrix
 *
       CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO )
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f
index 74c7cc267..75f02675d 100644
--- a/lapack-netlib/SRC/sgelq.f
+++ b/lapack-netlib/SRC/sgelq.f
@@ -98,7 +98,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -295,9 +295,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+          WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
         ELSE
-          WORK( 1 ) = SROUNDUP_LWORK(LWREQ)
+          WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -322,7 +322,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWREQ)
+      WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
       RETURN
 *
 *     End of SGELQ
diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f
index 1ceec4742..3b3913d84 100644
--- a/lapack-netlib/SRC/sgelqf.f
+++ b/lapack-netlib/SRC/sgelqf.f
@@ -93,7 +93,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,M).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise.
 *>          For optimum performance LWORK >= M*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -175,9 +176,8 @@
 *     Test the input arguments
 *
       INFO = 0
+      K = MIN( M, N )
       NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 )
-      LWKOPT = M*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -185,19 +185,25 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
-         INFO = -7
+      ELSE IF( .NOT.LQUERY ) THEN
+         IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
+     $      INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SGELQF', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
+         IF( K.EQ.0 ) THEN
+            LWKOPT = 1
+         ELSE
+            LWKOPT = M*NB
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -267,7 +273,7 @@
      $   CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of SGELQF
diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f
index 83536825c..7e4d9bf65 100644
--- a/lapack-netlib/SRC/sgemlq.f
+++ b/lapack-netlib/SRC/sgemlq.f
@@ -110,13 +110,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
 *>          value as WORK(1), and no error message related to WORK
@@ -187,7 +188,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -207,7 +208,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -222,6 +223,13 @@
         LW = M * MB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN
@@ -250,12 +258,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = SROUNDUP_LWORK( LW )
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -267,7 +275,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -280,7 +288,7 @@
      $                 MB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK( LW )
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f
index 3207f8bfd..19bf467b8 100644
--- a/lapack-netlib/SRC/sgemqr.f
+++ b/lapack-netlib/SRC/sgemqr.f
@@ -189,12 +189,13 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
+      EXTERNAL           LSAME
       REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMQRT, SLAMTSQR, XERBLA
@@ -206,7 +207,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -221,6 +222,13 @@
         LW = MB * NB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, MB - K ).EQ.0 ) THEN
@@ -249,12 +257,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = SROUNDUP_LWORK(LW)
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -266,7 +274,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -279,7 +287,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LW)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f
index b1266c89e..14942b765 100644
--- a/lapack-netlib/SRC/sgeqlf.f
+++ b/lapack-netlib/SRC/sgeqlf.f
@@ -88,7 +88,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -189,8 +190,9 @@
          END IF
          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
-         IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
-            INFO = -7
+         IF( .NOT.LQUERY ) THEN
+            IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) )
+     $         INFO = -7
          END IF
       END IF
 *
diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f
index bb5da72dc..d3a335b88 100644
--- a/lapack-netlib/SRC/sgeqp3rk.f
+++ b/lapack-netlib/SRC/sgeqp3rk.f
@@ -427,7 +427,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*.          LWORK >= (3*N + NRHS - 1)
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= (3*N+NRHS-1), otherwise.
 *>          For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )),
 *>          where NB is the optimal block size for SGEQP3RK returned
 *>          by ILAENV. Minimal block size MINNB=2.
@@ -618,8 +619,9 @@
 *     .. External Functions ..
       LOGICAL            SISNAN
       INTEGER            ISAMAX, ILAENV
-      REAL               SLAMCH, SNRM2
-      EXTERNAL           SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV
+      REAL               SLAMCH, SNRM2, SROUNDUP_LWORK
+      EXTERNAL           SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL, MAX, MIN
@@ -696,7 +698,7 @@
 *
             LWKOPT = 2*N + NB*( N+NRHS+1 )
          END IF
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
          IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
             INFO = -15
@@ -719,7 +721,7 @@
          K = 0
          MAXC2NRMK = ZERO
          RELMAXC2NRMK = ZERO
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -772,7 +774,7 @@
 *
 *        Array TAU is not set and contains undefined elements.
 *
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -791,7 +793,7 @@
             TAU( J ) = ZERO
          END DO
 *
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
 *
       END IF
@@ -822,7 +824,7 @@
          DO J = 1, MINMN
             TAU( J ) = ZERO
          END DO
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -867,7 +869,7 @@
             TAU( J ) = ZERO
          END DO
 *
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -985,7 +987,7 @@
 *
 *              Return from the routine.
 *
-               WORK( 1 ) = REAL( LWKOPT )
+               WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
                RETURN
 *
@@ -1072,7 +1074,7 @@
 *
       END IF
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f
index 6f41a92ea..79a515e1c 100644
--- a/lapack-netlib/SRC/sgeqr.f
+++ b/lapack-netlib/SRC/sgeqr.f
@@ -99,7 +99,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -168,6 +168,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup geqr
+*>
 *  =====================================================================
       SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -188,11 +190,13 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLATSQR, SGEQRT, XERBLA
@@ -244,8 +248,10 @@
 *
 *     Determine if the workspace size satisfies minimal size
 *
+      LWMIN = MAX( 1, N )
+      LWREQ = MAX( 1, N*NB )
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N )
+      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ )
      $    .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN
@@ -253,7 +259,7 @@
           NB = 1
           MB = M
         END IF
-        IF( LWORK.LT.NB*N ) THEN
+        IF( LWORK.LT.LWREQ ) THEN
           LMINWS = .TRUE.
           NB = 1
         END IF
@@ -268,7 +274,7 @@
       ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -282,9 +288,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = MAX( 1, N )
+          WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
         ELSE
-          WORK( 1 ) = MAX( 1, NB*N )
+          WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -309,7 +315,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, NB*N )
+      WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f
index d1ee2a828..37747c512 100644
--- a/lapack-netlib/SRC/sgeqrfp.f
+++ b/lapack-netlib/SRC/sgeqrfp.f
@@ -97,7 +97,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -162,8 +163,8 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB,
-     $                   NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEQR2P, SLARFB, SLARFT, XERBLA
@@ -173,8 +174,9 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
+      EXTERNAL           ILAENV
       REAL               SROUNDUP_LWORK
-      EXTERNAL           ILAENV, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -182,8 +184,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -191,7 +201,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -211,7 +221,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
@@ -273,7 +283,7 @@
      $   CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of SGEQRFP
diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f
index 3f53a5a15..36aed2853 100644
--- a/lapack-netlib/SRC/sgesvj.f
+++ b/lapack-netlib/SRC/sgesvj.f
@@ -208,7 +208,7 @@
 *>
 *> \param[in,out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *>          On entry,
 *>          If JOBU = 'C' :
 *>          WORK(1) = CTOL, where CTOL defines the threshold for convergence.
@@ -239,7 +239,12 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>         length of WORK, WORK >= MAX(6,M+N)
+*>          Length of WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; WORK(1) is set to the minial (and optimal)
+*>          length of WORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -260,7 +265,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -351,9 +356,9 @@
       INTEGER            BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
      $                   ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
      $                   N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP,
-     $                   SWBAND
-      LOGICAL            APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK,
-     $                   RSVEC, UCTOL, UPPER
+     $                   SWBAND, MINMN, LWMIN
+      LOGICAL            APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE,
+     $                   ROTOK, RSVEC, UCTOL, UPPER
 *     ..
 *     .. Local Arrays ..
       REAL               FASTR( 5 )
@@ -369,8 +374,8 @@
       INTEGER            ISAMAX
       EXTERNAL           ISAMAX
 *     from LAPACK
-      REAL               SLAMCH
-      EXTERNAL           SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK
       LOGICAL            LSAME
       EXTERNAL           LSAME
 *     ..
@@ -394,6 +399,14 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 6, M+N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
@@ -413,7 +426,7 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
       ELSE
          INFO = 0
@@ -423,11 +436,14 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SGESVJ', -INFO )
          RETURN
+      ELSE IF( LQUERY ) THEN
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+         RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN
+      IF( MINMN.EQ.0 ) RETURN
 *
 *     Set numerical parameters
 *     The stopping criterion for Jacobi rotations is
diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f
index fe71bc4a5..7b06bb63d 100644
--- a/lapack-netlib/SRC/sgetri.f
+++ b/lapack-netlib/SRC/sgetri.f
@@ -137,8 +137,9 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
+      EXTERNAL           ILAENV
       REAL               SROUNDUP_LWORK
-      EXTERNAL           ILAENV, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA
@@ -152,8 +153,9 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      LWKOPT = MAX( 1, N*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -251,7 +253,7 @@
      $      CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 )
    60 CONTINUE
 *
-      WORK( 1 ) = SROUNDUP_LWORK(IWS)
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of SGETRI
diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f
index d89c6a4e6..08a427a8b 100644
--- a/lapack-netlib/SRC/sgetsls.f
+++ b/lapack-netlib/SRC/sgetsls.f
@@ -127,7 +127,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed.
 *>          If LWORK = -1, the routine calculates optimal size of WORK for the
 *>          optimal performance and returns this value in WORK(1).
@@ -226,7 +226,10 @@
 *
 *     Determine the optimum and minimum LWORK
 *
-       IF( M.GE.N ) THEN
+       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
+         WSIZEO = 1
+         WSIZEM = 1
+       ELSE IF( M.GE.N ) THEN
          CALL SGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 )
          TSZO = INT( TQ( 1 ) )
          LWO  = INT( WORKQ( 1 ) )
diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f
index d80ff4da8..7ade8a66c 100644
--- a/lapack-netlib/SRC/sgetsqrhrt.f
+++ b/lapack-netlib/SRC/sgetsqrhrt.f
@@ -130,14 +130,17 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          If MIN(M,N) = 0, LWORK >= 1, else
+*>          LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
-*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ).
+*>
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
@@ -216,7 +219,7 @@
 *     Test the input arguments
 *
       INFO = 0
-      LQUERY  = LWORK.EQ.-1
+      LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
       ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
@@ -229,7 +232,7 @@
          INFO = -5
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -7
-      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
          INFO = -9
       ELSE
 *
@@ -267,8 +270,9 @@
             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
             LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+            LWORKOPT = MAX( 1, LWORKOPT )
 *
-            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+            IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN
                INFO = -11
             END IF
 *
@@ -350,4 +354,4 @@
 *
 *     End of SGETSQRHRT
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f
index e35d4955a..e90cd6947 100644
--- a/lapack-netlib/SRC/sgges3.f
+++ b/lapack-netlib/SRC/sgges3.f
@@ -234,6 +234,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= 6*N+16.
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -309,7 +311,8 @@
       LOGICAL            CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL,
      $                   LQUERY, LST2SL, WANTST
       INTEGER            I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT,
-     $                   ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT
+     $                   ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT,
+     $                   LWKMIN
       REAL               ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL,
      $                   PVSR, SAFMAX, SAFMIN, SMLNUM
 *     ..
@@ -361,6 +364,12 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 6*N+16
+      END IF
+*
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -377,7 +386,7 @@
          INFO = -15
       ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN
          INFO = -17
-      ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -19
       END IF
 *
@@ -385,7 +394,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 6*N+16, 3*N+INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) )
          CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
          LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
@@ -407,7 +416,11 @@
      $                   IERR )
             LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          END IF
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -421,6 +434,7 @@
 *
       IF( N.EQ.0 ) THEN
          SDIM = 0
+         WORK( 1 ) = 1
          RETURN
       END IF
 *
@@ -657,7 +671,7 @@
 *
    40 CONTINUE
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f
index c82d2187f..d788d1147 100644
--- a/lapack-netlib/SRC/sggev3.f
+++ b/lapack-netlib/SRC/sggev3.f
@@ -189,6 +189,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK >= MAX(1,8*N).
+*>          For good performance, LWORK should generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -248,7 +250,8 @@
       LOGICAL            ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY
       CHARACTER          CHTEMP
       INTEGER            ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO,
-     $                   IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT
+     $                   IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT,
+     $                   LWKMIN
       REAL               ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS,
      $                   SMLNUM, TEMP
 *     ..
@@ -298,6 +301,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 8*N )
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -312,7 +316,7 @@
          INFO = -12
       ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN
          INFO = -14
-      ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -16
       END IF
 *
@@ -320,28 +324,31 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 1, 8*N, 3*N+INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) )
          CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          CALL SGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, LDVL,
      $                VR, LDVR, WORK, -1, IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          IF( ILVL ) THEN
             CALL SORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR )
-            LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
             CALL SLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR,
      $                   WORK, -1, 0, IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          ELSE
             CALL SLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR,
      $                   WORK, -1, 0, IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          END IF
-         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
-*
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f
index 9c5858b5a..01e57088a 100644
--- a/lapack-netlib/SRC/sgghd3.f
+++ b/lapack-netlib/SRC/sgghd3.f
@@ -179,14 +179,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
-*> \param[in]  LWORK
+*> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= 1.
+*>          The length of the array WORK. LWORK >= 1.
 *>          For optimum performance LWORK >= 6*N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -276,7 +276,12 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 )
-      LWKOPT = MAX( 6*N*NB, 1 )
+      NH = IHI - ILO + 1
+      IF( NH.LE.1 ) THEN
+         LWKOPT = 1
+      ELSE
+         LWKOPT = 6*N*NB
+      END IF
       WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
@@ -326,7 +331,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = ONE
          RETURN
@@ -886,6 +890,7 @@
       IF ( JCOL.LT.IHI )
      $   CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
+*
       WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f
index ebb42a899..d32b48410 100644
--- a/lapack-netlib/SRC/sggqrf.f
+++ b/lapack-netlib/SRC/sggqrf.f
@@ -236,8 +236,9 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
+      EXTERNAL           ILAENV
       REAL               SROUNDUP_LWORK
-      EXTERNAL           ILAENV, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -251,8 +252,9 @@
       NB2 = ILAENV( 1, 'SGERQF', ' ', N, P, -1, -1 )
       NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -289,6 +291,7 @@
 *
       CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO )
       LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) )
+*
       WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f
index 2163f1ef8..b3842ec2a 100644
--- a/lapack-netlib/SRC/sggrqf.f
+++ b/lapack-netlib/SRC/sggrqf.f
@@ -250,7 +250,7 @@
       NB2 = ILAENV( 1, 'SGEQRF', ' ', P, N, -1, -1 )
       NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P)*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f
index 053fff5de..cee630593 100644
--- a/lapack-netlib/SRC/sggsvd3.f
+++ b/lapack-netlib/SRC/sggsvd3.f
@@ -278,7 +278,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f
index a463b9064..8e90d770c 100644
--- a/lapack-netlib/SRC/sggsvp3.f
+++ b/lapack-netlib/SRC/sggsvp3.f
@@ -227,7 +227,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -300,8 +300,9 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
+      EXTERNAL           LSAME
       REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT,
diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f
index d4996b1f2..432afaded 100644
--- a/lapack-netlib/SRC/slamswlq.f
+++ b/lapack-netlib/SRC/slamswlq.f
@@ -127,17 +127,20 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If SIDE = 'L', LWORK >= max(1,NB) * MB;
-*>          if SIDE = 'R', LWORK >= max(1,M) * MB.
+*>
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,NB*MB).
+*>          If SIDE = 'R', LWORK >= max(1,M*MB).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -189,33 +192,38 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamswlq
+*>
 *  =====================================================================
       SUBROUTINE SLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $    LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      REAL              A( LDA, * ), WORK( * ), C(LDC, * ),
-     $      T( LDT, * )
+      REAL               A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           STPMLQT, SGEMLQT, XERBLA
 *     ..
@@ -223,52 +231,60 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * MB
       ELSE
         LW = M * MB
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
 *
       INFO = 0
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( K.LT.0 ) THEN
         INFO = -5
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
         INFO = -4
-      ELSE IF( K.LT.MB .OR. MB.LT.1) THEN
+      ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN
         INFO = -6
       ELSE IF( LDA.LT.MAX( 1, K ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, MB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+      END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLAMSWLQ', -INFO )
-        WORK(1) = LW
         RETURN
-      ELSE IF (LQUERY) THEN
-        WORK(1) = LW
+      ELSE IF( LQUERY ) THEN
         RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -402,7 +418,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLAMSWLQ
diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f
index 960b794de..f9b167aea 100644
--- a/lapack-netlib/SRC/slamtsqr.f
+++ b/lapack-netlib/SRC/slamtsqr.f
@@ -128,22 +128,24 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*NB).
+*>          If SIDE = 'R', LWORK >= max(1,MB*NB).
 *>
-*>          If SIDE = 'L', LWORK >= max(1,N)*NB;
-*>          if SIDE = 'R', LWORK >= max(1,MB)*NB.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -191,33 +193,38 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamtsqr
+*>
 *  =====================================================================
       SUBROUTINE SLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $        LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      REAL              A( LDA, * ), WORK( * ), C(LDC, * ),
-     $                T( LDT, * )
+      REAL               A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR, Q
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, Q, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMQRT, STPMQRT, XERBLA
 *     ..
@@ -225,12 +232,13 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * NB
         Q = M
       ELSE
@@ -238,11 +246,17 @@
         Q = N
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
@@ -253,38 +267,38 @@
         INFO = -7
       ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, NB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN. AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0)  THEN
-          WORK(1) = LW
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLAMTSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
 *
       IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
         CALL SGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
       IF(LEFT.AND.NOTRAN) THEN
 *
@@ -410,7 +424,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLAMTSQR
diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f
index 685f823a0..594c646db 100644
--- a/lapack-netlib/SRC/slaswlq.f
+++ b/lapack-netlib/SRC/slaswlq.f
@@ -96,22 +96,24 @@
 *>          The leading dimension of the array T.  LDT >= MB.
 *> \endverbatim
 *>
-*>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= MB * M.
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -163,32 +165,35 @@
 *>
 *  =====================================================================
       SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
-     $                  INFO)
+     $                    INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
+      INTEGER            INFO, LDA, M, N, MB, NB, LWORK, LDT
 *     ..
 *     .. Array Arguments ..
-      REAL              A( LDA, * ), WORK( * ), T( LDT, *)
+      REAL               A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
+      EXTERNAL           LSAME
       REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -199,12 +204,19 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = M*MB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
       ELSE IF( N.LT.0 .OR. N.LT.M ) THEN
         INFO = -2
-      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN
+      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN
         INFO = -3
       ELSE IF( NB.LE.0 ) THEN
         INFO = -4
@@ -212,60 +224,60 @@
         INFO = -6
       ELSE IF( LDT.LT.MB ) THEN
         INFO = -8
-      ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-      WORK(1) = MB*M
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLASWLQ', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The LQ Decomposition
 *
-       IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN
-        CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO)
+      IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN
+        CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
-       KK = MOD((N-M),(NB-M))
-       II=N-KK+1
+      KK = MOD((N-M),(NB-M))
+      II = N-KK+1
 *
-*      Compute the LQ factorization of the first block A(1:M,1:NB)
+*     Compute the LQ factorization of the first block A(1:M,1:NB)
 *
-       CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
-       CTR = 1
+      CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = NB+1, II-NB+M , (NB-M)
+      DO I = NB+1, II-NB+M, (NB-M)
 *
-*      Compute the QR factorization of the current block A(1:M,I:I+NB-M)
+*       Compute the QR factorization of the current block A(1:M,I:I+NB-M)
 *
-         CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
-     $                  LDA, T(1, CTR * M + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
+     $                 LDA, T(1, CTR * M + 1),
+     $                 LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
 *     Compute the QR factorization of the last block A(1:M,II:N)
 *
-       IF (II.LE.N) THEN
+      IF( II.LE.N ) THEN
         CALL STPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ),
-     $                  LDA, T(1, CTR * M + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+     $                 LDA, T(1, CTR * M + 1), LDT,
+     $                 WORK, INFO )
+      END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(M * MB)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLASWLQ
diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f
index 8f0c4bf16..17052289e 100644
--- a/lapack-netlib/SRC/slatrs3.f
+++ b/lapack-netlib/SRC/slatrs3.f
@@ -151,13 +151,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK).
-*>          On exit, if INFO = 0, WORK(1) returns the optimal size of
-*>          WORK.
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
+*> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
@@ -165,6 +168,7 @@
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -181,7 +185,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup latrs3
 *> \par Further Details:
 *  =====================
 *  \verbatim
@@ -253,7 +257,7 @@
       LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
       INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
      $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
-     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
+     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
       REAL               ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
      $                   SCAMIN, SMLNUM, TMAX
 *     ..
@@ -264,7 +268,8 @@
       EXTERNAL           ILAENV, LSAME, SLAMCH, SLANGE, SLARMM
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLATRS, SSCAL, XERBLA
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SLATRS, SSCAL, SROUNDUP_LWORK, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN
@@ -292,15 +297,24 @@
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
+*
       LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
       LDS = NBA
+*
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
+*
       LANRM = NBA * NBA
       AWRK = LSCALE
-      WORK( 1 ) = LSCALE + LANRM
+*
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = LSCALE + LANRM
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
 *     Test the input parameters.
 *
@@ -322,7 +336,7 @@
          INFO = -8
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -10
-      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
+      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN
          INFO = -14
       END IF
       IF( INFO.NE.0 ) THEN
@@ -650,6 +664,8 @@
          END DO
       END DO
       RETURN
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
 *     End of SLATRS3
 *
diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f
index 86733bb15..4730815b5 100644
--- a/lapack-netlib/SRC/slatsqr.f
+++ b/lapack-netlib/SRC/slatsqr.f
@@ -101,15 +101,18 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= NB*N.
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -161,33 +164,39 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup latsqr
+*>
 *  =====================================================================
       SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
-     $                    LWORK, INFO)
+     $                    LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
+      INTEGER            INFO, LDA, M, N, MB, NB, LDT, LWORK
 *     ..
 *     .. Array Arguments ..
-      REAL  A( LDA, * ), WORK( * ), T(LDT, *)
+      REAL               A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           SGEQRT, STPQRT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -198,6 +207,13 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = N*NB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -205,64 +221,65 @@
         INFO = -2
       ELSE IF( MB.LT.1 ) THEN
         INFO = -3
-      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN
+      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN
         INFO = -4
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
       ELSE IF( LDT.LT.NB ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-        WORK(1) = NB*N
+*
+      IF( INFO.EQ.0 )  THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLATSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The QR Decomposition
 *
-       IF ((MB.LE.N).OR.(MB.GE.M)) THEN
-         CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO)
-         RETURN
-       END IF
-       KK = MOD((M-N),(MB-N))
-       II=M-KK+1
+      IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN
+        CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO )
+        RETURN
+      END IF
+      KK = MOD((M-N),(MB-N))
+      II = M-KK+1
 *
-*      Compute the QR factorization of the first block A(1:MB,1:N)
+*     Compute the QR factorization of the first block A(1:MB,1:N)
 *
-       CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
 *
-       CTR = 1
-       DO I = MB+1, II-MB+N ,  (MB-N)
+      CTR = 1
+      DO I = MB+1, II-MB+N, (MB-N)
 *
-*      Compute the QR factorization of the current block A(I:I+MB-N,1:N)
+*       Compute the QR factorization of the current block A(I:I+MB-N,1:N)
 *
-         CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
-     $                 T(1, CTR * N + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
+     $                T(1, CTR * N + 1),
+     $                LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
-*      Compute the QR factorization of the last block A(II:M,1:N)
+*     Compute the QR factorization of the last block A(II:M,1:N)
 *
-       IF (II.LE.M) THEN
-         CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
-     $                 T(1, CTR * N + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+      IF( II.LE.M ) THEN
+        CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
+     $                T(1, CTR * N + 1), LDT,
+     $                WORK, INFO )
+      END IF
 *
-      work( 1 ) = N*NB
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLATSQR
diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f
index a5e4638d6..2ae44fc81 100644
--- a/lapack-netlib/SRC/ssyevd.f
+++ b/lapack-netlib/SRC/ssyevd.f
@@ -96,8 +96,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array,
-*>                                         dimension (LWORK)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -251,7 +250,7 @@
      $                  N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) )
             LIOPT = LIWMIN
          END IF
-         WORK( 1 ) = SROUNDUP_LWORK(LOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LOPT )
          IWORK( 1 ) = LIOPT
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -335,7 +334,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LOPT )
       IWORK( 1 ) = LIOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f
index 47e4d7cbf..870facd60 100644
--- a/lapack-netlib/SRC/ssyevr.f
+++ b/lapack-netlib/SRC/ssyevr.f
@@ -271,7 +271,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,26*N).
+*>          The dimension of the array WORK.
+*>          If N <= 1, LWORK >= 1, else LWORK >= 26*N.
 *>          For optimal efficiency, LWORK >= (NB+6)*N,
 *>          where NB is the max of the blocksize for SSYTRD and SORMTR
 *>          returned by ILAENV.
@@ -292,7 +293,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK and
@@ -392,8 +394,13 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) )
 *
-      LWMIN = MAX( 1, 26*N )
-      LIWMIN = MAX( 1, 10*N )
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = 26*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN
@@ -428,7 +435,7 @@
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
          LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -677,7 +684,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f
index a2d6a6231..471e25977 100644
--- a/lapack-netlib/SRC/ssyevr_2stage.f
+++ b/lapack-netlib/SRC/ssyevr_2stage.f
@@ -278,6 +278,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 5*N
@@ -300,13 +301,14 @@
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
-*>          On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
 *> \endverbatim
 *>
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the IWORK array,
@@ -445,8 +447,14 @@
       IB     = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
       LHTRD  = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
       LWTRD  = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
-      LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
-      LIWMIN = MAX( 1, 10*N )
+*
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
@@ -485,7 +493,7 @@
 *         NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
 *         NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
 *         LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+         WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
          IWORK( 1 ) = LIWMIN
       END IF
 *
@@ -505,7 +513,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 26
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = A( 1, 1 )
@@ -733,7 +741,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+      WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f
index 2204aa39b..aaed6dad5 100644
--- a/lapack-netlib/SRC/ssyevx.f
+++ b/lapack-netlib/SRC/ssyevx.f
@@ -338,14 +338,14 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWKMIN = 1
-            WORK( 1 ) = SROUNDUP_LWORK(LWKMIN)
+            LWKOPT = 1
          ELSE
             LWKMIN = 8*N
             NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
             NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
             LWKOPT = MAX( LWKMIN, ( NB + 3 )*N )
-            WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -17
@@ -542,7 +542,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f
index e43d4de7f..711a275e1 100644
--- a/lapack-netlib/SRC/ssysv_aa.f
+++ b/lapack-netlib/SRC/ssysv_aa.f
@@ -177,12 +177,13 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS
+      INTEGER            LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
+      EXTERNAL           LSAME
       REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRS_AA, SSYTRF_AA
@@ -196,6 +197,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N, 3*N-2 )
       IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -206,18 +208,18 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
-         LWKOPT_SYTRF = INT( WORK(1) )
+         LWKOPT_SYTRF = INT( WORK( 1 ) )
          CALL SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
      $                   -1, INFO )
-         LWKOPT_SYTRS = INT( WORK(1) )
-         LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         LWKOPT_SYTRS = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -239,7 +241,7 @@
 *
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f
index 3d88e068e..fb068b3bf 100644
--- a/lapack-netlib/SRC/ssysv_aa_2stage.f
+++ b/lapack-netlib/SRC/ssysv_aa_2stage.f
@@ -100,14 +100,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is REAL array, dimension (LTB)
+*>          TB is REAL array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -147,14 +147,15 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL workspace of size LWORK
+*>          WORK is REAL workspace of size (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -204,12 +205,13 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            LWKOPT
+      INTEGER            LWKMIN, LWKOPT
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
+      EXTERNAL           LSAME
       REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE,
@@ -226,6 +228,7 @@
       UPPER = LSAME( UPLO, 'U' )
       WQUERY = ( LWORK.EQ.-1 )
       TQUERY = ( LTB.EQ.-1 )
+      LWKMIN = MAX( 1, N )
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -234,18 +237,19 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
-      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
-      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN
          INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
-         LWKOPT = INT( WORK(1) )
+         LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -255,7 +259,6 @@
          RETURN
       END IF
 *
-*
 *     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
@@ -269,7 +272,7 @@
 *
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f
index 0d72217eb..06a6413f1 100644
--- a/lapack-netlib/SRC/ssysvx.f
+++ b/lapack-netlib/SRC/ssysvx.f
@@ -305,7 +305,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, NOFACT
-      INTEGER            LWKOPT, NB
+      INTEGER            LWKMIN, LWKOPT, NB
       REAL               ANORM
 *     ..
 *     .. External Functions ..
@@ -327,6 +327,7 @@
       INFO = 0
       NOFACT = LSAME( FACT, 'N' )
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 3*N )
       IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN
          INFO = -1
       ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) )
@@ -344,12 +345,12 @@
          INFO = -11
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = MAX( 1, 3*N )
+         LWKOPT = LWKMIN
          IF( NOFACT ) THEN
             NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f
index 5d70ae0d4..5b401c3d0 100644
--- a/lapack-netlib/SRC/ssytrd_2stage.f
+++ b/lapack-netlib/SRC/ssytrd_2stage.f
@@ -4,23 +4,23 @@
 *
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download SSYTRD_2STAGE + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
+*> Download SSYTRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_2stage.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*       SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
 *                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
 *       IMPLICIT NONE
@@ -34,7 +34,7 @@
 *       REAL               A( LDA, * ), TAU( * ),
 *                          HOUS2( * ), WORK( * )
 *       ..
-*  
+*
 *
 *> \par Purpose:
 *  =============
@@ -52,11 +52,11 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Housholder representation,
 *>                  in particular for the second stage (Band to
 *>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate Q1 Q2 or to apply Q1 Q2,
 *>                  then LHOUS2 is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -86,7 +86,7 @@
 *>          triangular part of A is not referenced.
 *>          On exit, if UPLO = 'U', the band superdiagonal
 *>          of A are overwritten by the corresponding elements of the
-*>          internal band-diagonal matrix AB, and the elements above 
+*>          internal band-diagonal matrix AB, and the elements above
 *>          the KD superdiagonal, with the array TAU, represent the orthogonal
 *>          matrix Q1 as a product of elementary reflectors; if UPLO
 *>          = 'L', the diagonal and band subdiagonal of A are over-
@@ -117,13 +117,13 @@
 *> \param[out] TAU
 *> \verbatim
 *>          TAU is REAL array, dimension (N-KD)
-*>          The scalar factors of the elementary reflectors of 
+*>          The scalar factors of the elementary reflectors of
 *>          the first stage (see Further Details).
 *> \endverbatim
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is REAL array, dimension (LHOUS2)
+*>          HOUS2 is REAL array, dimension (MAX(1,LHOUS2))
 *>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
@@ -132,6 +132,8 @@
 *> \verbatim
 *>          LHOUS2 is INTEGER
 *>          The dimension of the array HOUS2.
+*>          LHOUS2 >= 1.
+*>
 *>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
@@ -149,17 +151,19 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>          LWORK = MAX(1, dimension) where
 *>          dimension   = max(stage1,stage2) + (KD+1)*N
-*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                        + max(2*KD*KD, KD*NTHREADS) 
-*>                        + (KD+1)*N 
+*>                      = N*KD + N*max(KD+1,FACTOPTNB)
+*>                        + max(2*KD*KD, KD*NTHREADS)
+*>                        + (KD+1)*N
 *>          where KD is the blocking size of the reduction,
 *>          FACTOPTNB is the blocking used by the QR or LQ
 *>          algorithm, usually FACTOPTNB=128 is a good choice
@@ -177,12 +181,12 @@
 *  Authors:
 *  ========
 *
-*> \author Univ. of Tennessee 
-*> \author Univ. of California Berkeley 
-*> \author Univ. of Colorado Denver 
-*> \author NAG Ltd. 
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +206,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -210,16 +214,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
      $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
       IMPLICIT NONE
@@ -265,10 +269,13 @@
 *
       KD     = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 )
       IB     = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.LSAME( VECT, 'N' ) ) THEN
          INFO = -1
@@ -309,14 +316,14 @@
       LWRK  = LWORK-LDAB*N
       ABPOS = 1
       WPOS  = ABPOS + LDAB*N
-      CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
      $                   TAU, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
          RETURN
       END IF
-      CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
      $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SSYTRD_SB2ST', -INFO )
@@ -324,8 +331,7 @@
       END IF
 *
 *
-      HOUS2( 1 ) = LHMIN
-      WORK( 1 )  = LWMIN
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of SSYTRD_2STAGE
diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f
index 4efc43630..3996e07bb 100644
--- a/lapack-netlib/SRC/ssytrd_sy2sb.f
+++ b/lapack-netlib/SRC/ssytrd_sy2sb.f
@@ -124,7 +124,7 @@
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (LWORK)
-*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          On exit, if INFO = 0, or if LWORK = -1,
 *>          WORK(1) returns the size of LWORK.
 *> \endverbatim
 *>
@@ -132,7 +132,9 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK which should be calculated
-*>          by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          by a workspace query.
+*>          If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY)
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -294,8 +296,12 @@
       INFO   = 0
       UPPER  = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
-      
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -314,7 +320,7 @@
          CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
          RETURN
       END IF
 *
@@ -507,7 +513,7 @@
 
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SSYTRD_SY2SB
diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f
index a788fbcf0..55f3a4f0f 100644
--- a/lapack-netlib/SRC/ssytrf.f
+++ b/lapack-netlib/SRC/ssytrf.f
@@ -234,7 +234,7 @@
 *
          NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -353,7 +353,8 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SSYTRF
diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f
index d6408a978..af32fb064 100644
--- a/lapack-netlib/SRC/ssytrf_aa.f
+++ b/lapack-netlib/SRC/ssytrf_aa.f
@@ -101,8 +101,10 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= MAX(1,2*N). For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -128,7 +130,7 @@
 *> \ingroup hetrf_aa
 *
 *  =====================================================================
-      SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -142,19 +144,19 @@
 *     ..
 *     .. Array Arguments ..
       INTEGER            IPIV( * )
-      REAL   A( LDA, * ), WORK( * )
+      REAL               A( LDA, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *     .. Parameters ..
-      REAL   ZERO, ONE
+      REAL               ZERO, ONE
       PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            J, LWKOPT
+      INTEGER            J, LWKMIN, LWKOPT
       INTEGER            NB, MJ, NJ, K1, K2, J1, J2, J3, JB
-      REAL   ALPHA
+      REAL               ALPHA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -180,19 +182,26 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = (NB+1)*N
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -204,11 +213,11 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
           RETURN
       ENDIF
       IPIV( 1 ) = 1
-      IF ( N.EQ.1 ) THEN
+      IF( N.EQ.1 ) THEN
          RETURN
       END IF
 *
@@ -458,7 +467,8 @@
       END IF
 *
    20 CONTINUE
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SSYTRF_AA
diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f
index abe6564c5..6b5cdee1b 100644
--- a/lapack-netlib/SRC/ssytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f
@@ -94,7 +94,7 @@
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -121,14 +121,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL workspace of size LWORK
+*>          WORK is REAL workspace of size (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -212,9 +212,9 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -6
-      ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN
          INFO = -10
       END IF
 *
@@ -228,10 +228,10 @@
       NB = ILAENV( 1, 'SSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 )
       IF( INFO.EQ.0 ) THEN
          IF( TQUERY ) THEN
-            TB( 1 ) = (3*NB+1)*N
+            TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) )
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = SROUNDUP_LWORK(N*NB)
+            WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) )
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
@@ -240,7 +240,7 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
          RETURN
       ENDIF
 *
diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f
index 72830543c..89ecf38fd 100644
--- a/lapack-netlib/SRC/ssytrf_rk.f
+++ b/lapack-netlib/SRC/ssytrf_rk.f
@@ -177,14 +177,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension ( MAX(1,LWORK) ).
+*>          WORK is REAL array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned
 *>          by ILAENV.
 *>
@@ -312,7 +312,7 @@
 *
          NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -488,7 +488,7 @@
 *
       END IF
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SSYTRF_RK
diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f
index 339a229e7..7c2cbbc57 100644
--- a/lapack-netlib/SRC/ssytrf_rook.f
+++ b/lapack-netlib/SRC/ssytrf_rook.f
@@ -118,7 +118,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -260,7 +260,7 @@
 *
          NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -383,7 +383,8 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SSYTRF_ROOK
diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f
index 49f6cad65..fd1c53473 100644
--- a/lapack-netlib/SRC/ssytri2.f
+++ b/lapack-netlib/SRC/ssytri2.f
@@ -88,16 +88,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (N+NB+1)*(NB+3)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          WORK is size >= (N+NB+1)*(NB+3)
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>           calculates:
+*>          calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
 *>              - and no error message related to LWORK is issued by XERBLA.
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetri2
 *
 *  =====================================================================
       SUBROUTINE SSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -147,7 +147,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSYTRI, SSYTRI2X, XERBLA
@@ -159,9 +160,13 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+*
 *     Get blocksize
+*
       NBMAX = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
-      IF ( NBMAX .GE. N ) THEN
+      IF( N.EQ.0 ) THEN
+         MINSIZE = 1
+      ELSE IF( NBMAX.GE.N ) THEN
          MINSIZE = N
       ELSE
          MINSIZE = (N+NBMAX+1)*(NBMAX+3)
@@ -173,28 +178,29 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
-*
-*     Quick return if possible
-*
 *
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SSYTRI2', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK(1)=MINSIZE
+         WORK( 1 ) = SROUNDUP_LWORK( MINSIZE )
          RETURN
       END IF
+*
+*     Quick return if possible
+*
       IF( N.EQ.0 )
      $   RETURN
-
-      IF( NBMAX .GE. N ) THEN
+*
+      IF( NBMAX.GE.N ) THEN
          CALL SSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL SSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
 *
 *     End of SSYTRI2
diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f
index bca01105d..f0152a149 100644
--- a/lapack-netlib/SRC/ssytri_3.f
+++ b/lapack-netlib/SRC/ssytri_3.f
@@ -119,16 +119,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (N+NB+1)*(NB+3).
+*>          WORK is REAL array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= (N+NB+1)*(NB+3).
+*>          The length of WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>
-*>          If LDWORK = -1, then a workspace query is assumed;
+*>          If LWORK = -1, then a workspace query is assumed;
 *>          the routine only calculates the optimal size of the optimal
 *>          size of the WORK array, returns this value as the first
 *>          entry of the WORK array, and no error message related to
@@ -209,8 +210,13 @@
 *
 *     Determine the block size
 *
-      NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) )
-      LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      IF( N.EQ.0 ) THEN
+         LWKOPT = 1
+      ELSE
+         NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) )
+         LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
@@ -218,7 +224,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
          INFO = -8
       END IF
 *
@@ -226,7 +232,6 @@
          CALL XERBLA( 'SSYTRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -237,7 +242,7 @@
 *
       CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
 *
-      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f
index 12fca0c71..265cf0c1d 100644
--- a/lapack-netlib/SRC/ssytrs_aa.f
+++ b/lapack-netlib/SRC/ssytrs_aa.f
@@ -105,7 +105,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -141,7 +147,7 @@
 *     ..
 *     .. Array Arguments ..
       INTEGER            IPIV( * )
-      REAL   A( LDA, * ), B( LDB, * ), WORK( * )
+      REAL               A( LDA, * ), B( LDB, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
@@ -151,24 +157,31 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
+      EXTERNAL           LSAME
       REAL               SROUNDUP_LWORK
-      EXTERNAL           LSAME, SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGTSV, SSWAP, SLACPY, STRSM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
 *     ..
 *     .. Executable Statements ..
 *
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -179,21 +192,20 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SSYTRS_AA', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         LWKOPT = (3*N-2)
-         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
+         WORK( 1 ) = SROUNDUP_LWORK( LWKMIN )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 .OR. NRHS.EQ.0 )
+      IF( MIN( N, NRHS ).EQ.0 )
      $   RETURN
 *
       IF( UPPER ) THEN

From 45ef0d7361f05d96828534264f0536a492b14180 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 20:16:33 +0100
Subject: [PATCH 504/718] Handle corner cases of LWORK (Reference-LAPACK PR
 942)

---
 lapack-netlib/SRC/zgebrd.f           |  26 ++++--
 lapack-netlib/SRC/zgehrd.f           |  23 +++--
 lapack-netlib/SRC/zgelq.f            |   4 +-
 lapack-netlib/SRC/zgelqf.f           |  20 ++--
 lapack-netlib/SRC/zgemlq.f           |  28 ++++--
 lapack-netlib/SRC/zgemqr.f           |  30 ++++--
 lapack-netlib/SRC/zgeqlf.f           |  10 +-
 lapack-netlib/SRC/zgeqp3rk.f         |   3 +-
 lapack-netlib/SRC/zgeqr.f            |  20 ++--
 lapack-netlib/SRC/zgeqrfp.f          |  24 +++--
 lapack-netlib/SRC/zgesvj.f           |  83 ++++++++++-------
 lapack-netlib/SRC/zgetri.f           |   4 +-
 lapack-netlib/SRC/zgetsls.f          |  12 ++-
 lapack-netlib/SRC/zgetsqrhrt.f       |  15 +--
 lapack-netlib/SRC/zgges3.f           |  34 ++++---
 lapack-netlib/SRC/zggev3.f           |  24 +++--
 lapack-netlib/SRC/zgghd3.f           |  15 ++-
 lapack-netlib/SRC/zggqrf.f           |   4 +-
 lapack-netlib/SRC/zggrqf.f           |   4 +-
 lapack-netlib/SRC/zggsvd3.f          |   4 +-
 lapack-netlib/SRC/zggsvp3.f          |   4 +-
 lapack-netlib/SRC/zheevd.f           |   5 +-
 lapack-netlib/SRC/zheevr.f           |  29 ++++--
 lapack-netlib/SRC/zheevr_2stage.f    |  42 +++++----
 lapack-netlib/SRC/zhesv_aa.f         |  15 +--
 lapack-netlib/SRC/zhesv_aa_2stage.f  |  23 +++--
 lapack-netlib/SRC/zhesvx.f           |  13 +--
 lapack-netlib/SRC/zhetrd_2stage.f    |  87 +++++++++--------
 lapack-netlib/SRC/zhetrd_hb2st.F     | 134 ++++++++++++++-------------
 lapack-netlib/SRC/zhetrd_he2hb.f     |  18 ++--
 lapack-netlib/SRC/zhetrf.f           |   7 +-
 lapack-netlib/SRC/zhetrf_aa.f        |  27 ++++--
 lapack-netlib/SRC/zhetrf_aa_2stage.f |  28 +++---
 lapack-netlib/SRC/zhetrf_rk.f        |   8 +-
 lapack-netlib/SRC/zhetrf_rook.f      |   4 +-
 lapack-netlib/SRC/zhetri2.f          |  27 +++---
 lapack-netlib/SRC/zhetrs_aa.f        |  27 ++++--
 lapack-netlib/SRC/zlamswlq.f         |  68 ++++++++------
 lapack-netlib/SRC/zlamtsqr.f         |  76 ++++++++-------
 lapack-netlib/SRC/zlaswlq.f          |  91 ++++++++++--------
 lapack-netlib/SRC/zlatrs3.f          |  22 ++++-
 lapack-netlib/SRC/zlatsqr.f          |  95 +++++++++++--------
 42 files changed, 733 insertions(+), 504 deletions(-)

diff --git a/lapack-netlib/SRC/zgebrd.f b/lapack-netlib/SRC/zgebrd.f
index f1791c6a4..c1a6169a7 100644
--- a/lapack-netlib/SRC/zgebrd.f
+++ b/lapack-netlib/SRC/zgebrd.f
@@ -122,7 +122,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,M,N).
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise.
 *>          For optimum performance LWORK >= (M+N)*NB, where NB
 *>          is the optimal blocksize.
 *>
@@ -147,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gebrd
 *
 *> \par Further Details:
 *  =====================
@@ -223,8 +224,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB,
-     $                   NBMIN, NX, WS
+      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT,
+     $                   MINMN, NB, NBMIN, NX, WS
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, ZGEBD2, ZGEMM, ZLABRD
@@ -241,9 +242,17 @@
 *     Test the input parameters
 *
       INFO = 0
-      NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) )
-      LWKOPT = ( M+N )*NB
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = MAX( M, N )
+         NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) )
+         LWKOPT = ( M+N )*NB
+      END IF
       WORK( 1 ) = DBLE( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -251,7 +260,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.LT.0 ) THEN
@@ -263,7 +272,6 @@
 *
 *     Quick return if possible
 *
-      MINMN = MIN( M, N )
       IF( MINMN.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -282,7 +290,7 @@
 *        Determine when to switch from blocked to unblocked code.
 *
          IF( NX.LT.MINMN ) THEN
-            WS = ( M+N )*NB
+            WS = LWKOPT
             IF( LWORK.LT.WS ) THEN
 *
 *              Not enough work space for the optimal NB, consider using
diff --git a/lapack-netlib/SRC/zgehrd.f b/lapack-netlib/SRC/zgehrd.f
index e18493cf9..0f4424ded 100644
--- a/lapack-netlib/SRC/zgehrd.f
+++ b/lapack-netlib/SRC/zgehrd.f
@@ -89,7 +89,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (LWORK)
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gehrd
 *
 *> \par Further Details:
 *  =====================
@@ -173,7 +173,7 @@
       INTEGER            IHI, ILO, INFO, LDA, LWORK, N
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), TAU( * ), WORK( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
@@ -182,7 +182,7 @@
       INTEGER            NBMAX, LDT, TSIZE
       PARAMETER          ( NBMAX = 64, LDT = NBMAX+1,
      $                     TSIZE = LDT*NBMAX )
-      COMPLEX*16        ZERO, ONE
+      COMPLEX*16         ZERO, ONE
       PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ),
      $                     ONE = ( 1.0D+0, 0.0D+0 ) )
 *     ..
@@ -190,7 +190,7 @@
       LOGICAL            LQUERY
       INTEGER            I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB,
      $                   NBMIN, NH, NX
-      COMPLEX*16        EI
+      COMPLEX*16         EI
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ZAXPY, ZGEHD2, ZGEMM, ZLAHR2, ZLARFB, ZTRMM,
@@ -221,12 +221,18 @@
          INFO = -8
       END IF
 *
+      NH = IHI - ILO + 1
       IF( INFO.EQ.0 ) THEN
 *
 *        Compute the workspace requirements
 *
-         NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, -1 ) )
-         LWKOPT = N*NB + TSIZE
+         IF( NH.LE.1 ) THEN
+            LWKOPT = 1
+         ELSE
+            NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI,
+     $                              -1 ) )
+            LWKOPT = N*NB + TSIZE
+         END IF
          WORK( 1 ) = LWKOPT
       ENDIF
 *
@@ -248,7 +254,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -268,7 +273,7 @@
 *
 *           Determine if workspace is large enough for blocked code
 *
-            IF( LWORK.LT.N*NB+TSIZE ) THEN
+            IF( LWORK.LT.LWKOPT ) THEN
 *
 *              Not enough workspace to use optimal NB:  determine the
 *              minimum value of NB, and reduce NB or force use of
diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f
index de7c9a378..86610e801 100644
--- a/lapack-netlib/SRC/zgelq.f
+++ b/lapack-netlib/SRC/zgelq.f
@@ -98,7 +98,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -166,6 +166,8 @@
 *>  the LQ factorization.
 *> \endverbatim
 *>
+*> \ingroup gelq
+*>
 *  =====================================================================
       SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f
index 6c295eece..e988ea818 100644
--- a/lapack-netlib/SRC/zgelqf.f
+++ b/lapack-netlib/SRC/zgelqf.f
@@ -93,7 +93,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,M).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise.
 *>          For optimum performance LWORK >= M*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -118,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gelqf
 *
 *> \par Further Details:
 *  =====================
@@ -174,9 +175,8 @@
 *     Test the input arguments
 *
       INFO = 0
+      K = MIN( M, N )
       NB = ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 )
-      LWKOPT = M*NB
-      WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -184,19 +184,25 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
-         INFO = -7
+      ELSE IF( .NOT.LQUERY ) THEN
+         IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
+     $      INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZGELQF', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
+         IF( K.EQ.0 ) THEN
+            LWKOPT = 1
+         ELSE
+            LWKOPT = M*NB
+         END IF
+         WORK( 1 ) = LWKOPT
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f
index 41cd1c059..11489087a 100644
--- a/lapack-netlib/SRC/zgemlq.f
+++ b/lapack-netlib/SRC/zgemlq.f
@@ -109,16 +109,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -142,7 +143,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -158,11 +159,13 @@
 *>  block sizes MB and NB returned by ILAENV, ZGELQ will use either
 *>  ZLASWLQ (if the matrix is wide-and-short) or ZGELQT to compute
 *>  the LQ factorization.
-*>  This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to 
+*>  This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in ZLAMSWLQ or ZGEMLQT.
 *> \endverbatim
 *>
+*> \ingroup gemlq
+*>
 *  =====================================================================
       SUBROUTINE ZGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -184,7 +187,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -200,7 +203,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -215,6 +218,13 @@
         LW = M * MB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN
@@ -243,7 +253,7 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN
         INFO = -13
       END IF
 *
@@ -260,7 +270,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f
index c83eaff2f..d14d74fe2 100644
--- a/lapack-netlib/SRC/zgemqr.f
+++ b/lapack-netlib/SRC/zgemqr.f
@@ -111,16 +111,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +145,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -166,6 +167,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup gemqr
+*>
 *  =====================================================================
       SUBROUTINE ZGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -187,7 +190,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -203,7 +206,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -218,6 +221,13 @@
         LW = MB * NB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, MB - K ).EQ.0 ) THEN
@@ -246,12 +256,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -263,7 +273,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -276,7 +286,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = LWMIN
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/zgeqlf.f b/lapack-netlib/SRC/zgeqlf.f
index 94721540c..a27612c64 100644
--- a/lapack-netlib/SRC/zgeqlf.f
+++ b/lapack-netlib/SRC/zgeqlf.f
@@ -88,7 +88,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -113,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup geqlf
 *
 *> \par Further Details:
 *  =====================
@@ -188,8 +189,9 @@
          END IF
          WORK( 1 ) = LWKOPT
 *
-         IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
-            INFO = -7
+         IF( .NOT.LQUERY ) THEN
+            IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) )
+     $         INFO = -7
          END IF
       END IF
 *
diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f
index 247a3c379..01dcce0de 100644
--- a/lapack-netlib/SRC/zgeqp3rk.f
+++ b/lapack-netlib/SRC/zgeqp3rk.f
@@ -428,7 +428,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*.          LWORK >= N+NRHS-1
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= N+NRHS-1, otherwise.
 *>          For optimal performance LWORK >= NB*( N+NRHS+1 ),
 *>          where NB is the optimal block size for ZGEQP3RK returned
 *>          by ILAENV. Minimal block size MINNB=2.
diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f
index 20a80d083..7df9c2403 100644
--- a/lapack-netlib/SRC/zgeqr.f
+++ b/lapack-netlib/SRC/zgeqr.f
@@ -99,7 +99,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -168,6 +168,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup geqr
+*>
 *  =====================================================================
       SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -188,7 +190,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -244,8 +246,10 @@
 *
 *     Determine if the workspace size satisfies minimal size
 *
+      LWMIN = MAX( 1, N )
+      LWREQ = MAX( 1, N*NB )
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N )
+      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ )
      $    .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN
@@ -253,7 +257,7 @@
           NB = 1
           MB = M
         END IF
-        IF( LWORK.LT.NB*N ) THEN
+        IF( LWORK.LT.LWREQ ) THEN
           LMINWS = .TRUE.
           NB = 1
         END IF
@@ -268,7 +272,7 @@
       ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -282,9 +286,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = MAX( 1, N )
+          WORK( 1 ) = LWMIN
         ELSE
-          WORK( 1 ) = MAX( 1, NB*N )
+          WORK( 1 ) = LWREQ
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -309,7 +313,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, NB*N )
+      WORK( 1 ) = LWREQ
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f
index 73bcde667..3562de36e 100644
--- a/lapack-netlib/SRC/zgeqrfp.f
+++ b/lapack-netlib/SRC/zgeqrfp.f
@@ -97,7 +97,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -122,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup geqrfp
 *
 *> \par Further Details:
 *  =====================
@@ -162,8 +163,8 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB,
-     $                   NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, ZGEQR2P, ZLARFB, ZLARFT
@@ -181,8 +182,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
       WORK( 1 ) = LWKOPT
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -190,7 +199,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -202,7 +211,6 @@
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -210,7 +218,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f
index 6cd2335f2..2be45d826 100644
--- a/lapack-netlib/SRC/zgesvj.f
+++ b/lapack-netlib/SRC/zgesvj.f
@@ -200,23 +200,25 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V, LDV >= 1.
-*>          If JOBV = 'V', then LDV >= max(1,N).
-*>          If JOBV = 'A', then LDV >= max(1,MV) .
+*>          If JOBV = 'V', then LDV >= MAX(1,N).
+*>          If JOBV = 'A', then LDV >= MAX(1,MV) .
 *> \endverbatim
 *>
 *> \param[in,out] CWORK
 *> \verbatim
-*>          CWORK is COMPLEX*16 array, dimension (max(1,LWORK))
+*>          CWORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          Used as workspace.
-*>          If on entry LWORK = -1, then a workspace query is assumed and
-*>          no computation is done; CWORK(1) is set to the minial (and optimal)
-*>          length of CWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER.
-*>          Length of CWORK, LWORK >= M+N.
+*>          Length of CWORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; CWORK(1) is set to the minial (and optimal)
+*>          length of CWORK.
 *> \endverbatim
 *>
 *> \param[in,out] RWORK
@@ -247,15 +249,17 @@
 *>          RWORK(6) = the largest absolute value over all sines of the
 *>                    Jacobi rotation angles in the last sweep. It can be
 *>                    useful for a post festum analysis.
-*>         If on entry LRWORK = -1, then a workspace query is assumed and
-*>         no computation is done; RWORK(1) is set to the minial (and optimal)
-*>         length of RWORK.
 *> \endverbatim
 *>
 *> \param[in] LRWORK
 *> \verbatim
 *>         LRWORK is INTEGER
-*>         Length of RWORK, LRWORK >= MAX(6,N).
+*>         Length of RWORK.
+*>         LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise.
+*>
+*>         If on entry LRWORK = -1, then a workspace query is assumed and
+*>         no computation is done; RWORK(1) is set to the minial (and optimal)
+*>         length of RWORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -276,7 +280,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -367,23 +371,25 @@
 *
 *     .. Local Parameters ..
       DOUBLE PRECISION   ZERO,         HALF,         ONE
-      PARAMETER  ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0)
-      COMPLEX*16      CZERO,                  CONE
-      PARAMETER  ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) )
-      INTEGER      NSWEEP
-      PARAMETER  ( NSWEEP = 30 )
+      PARAMETER        ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0)
+      COMPLEX*16         CZERO,                  CONE
+      PARAMETER        ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) )
+      INTEGER            NSWEEP
+      PARAMETER        ( NSWEEP = 30 )
 *     ..
 *     .. Local Scalars ..
-      COMPLEX*16 AAPQ, OMPQ
-      DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
-     $       BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
-     $       MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
-     $       SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
-      INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
-     $        ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
-     $        N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND
-      LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
-     $        RSVEC, UCTOL, UPPER
+      COMPLEX*16         AAPQ, OMPQ
+      DOUBLE PRECISION   AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
+     $                   BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
+     $                   MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
+     $                   SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN,
+     $                   TOL
+      INTEGER            BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
+     $                   ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
+     $                   N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP,
+     $                   SWBAND, MINMN, LWMIN, LRWMIN
+      LOGICAL            APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE,
+     $                   ROTOK, RSVEC, UCTOL, UPPER
 *     ..
 *     ..
 *     .. Intrinsic Functions ..
@@ -422,7 +428,16 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
-      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+      ELSE
+         LWMIN  = M+N
+         LRWMIN = MAX( 6, N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
@@ -442,9 +457,9 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( ( LWORK.LT.( M+N ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
-      ELSE IF( ( LRWORK.LT.MAX( N, 6 ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -15
       ELSE
          INFO = 0
@@ -454,15 +469,15 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZGESVJ', -INFO )
          RETURN
-      ELSE IF ( LQUERY ) THEN
-         CWORK(1) = M + N
-         RWORK(1) = MAX( N, 6 )
+      ELSE IF( LQUERY ) THEN
+         CWORK( 1 ) = LWMIN
+         RWORK( 1 ) = LRWMIN
          RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN
+      IF( MINMN.EQ.0 ) RETURN
 *
 *     Set numerical parameters
 *     The stopping criterion for Jacobi rotations is
diff --git a/lapack-netlib/SRC/zgetri.f b/lapack-netlib/SRC/zgetri.f
index 41782841c..f3806a77c 100644
--- a/lapack-netlib/SRC/zgetri.f
+++ b/lapack-netlib/SRC/zgetri.f
@@ -107,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup getri
 *
 *  =====================================================================
       SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -152,7 +152,7 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 )
-      LWKOPT = N*NB
+      LWKOPT = MAX( 1, N*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f
index 17c6d5146..26311c611 100644
--- a/lapack-netlib/SRC/zgetsls.f
+++ b/lapack-netlib/SRC/zgetsls.f
@@ -127,7 +127,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed.
 *>          If LWORK = -1, the routine calculates optimal size of WORK for the
 *>          optimal performance and returns this value in WORK(1).
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEsolve
+*> \ingroup getsls
 *
 *  =====================================================================
       SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
@@ -192,7 +192,7 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       DOUBLE PRECISION   DLAMCH, ZLANGE
-      EXTERNAL           LSAME, DLABAD, DLAMCH, ZLANGE
+      EXTERNAL           LSAME, DLAMCH, ZLANGE
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ZGEQR, ZGEMQR, ZLASCL, ZLASET,
@@ -229,7 +229,10 @@
 *
 *     Determine the optimum and minimum LWORK
 *
-       IF( M.GE.N ) THEN
+       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
+         WSIZEO = 1
+         WSIZEM = 1
+       ELSE IF( M.GE.N ) THEN
          CALL ZGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 )
          TSZO = INT( TQ( 1 ) )
          LWO  = INT( WORKQ( 1 ) )
@@ -297,7 +300,6 @@
 *
        SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
        BIGNUM = ONE / SMLNUM
-       CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f
index 5f0167937..e7ce993aa 100644
--- a/lapack-netlib/SRC/zgetsqrhrt.f
+++ b/lapack-netlib/SRC/zgetsqrhrt.f
@@ -131,13 +131,15 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
-*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          If MIN(M,N) = 0, LWORK >= 1, else
+*>          LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
-*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ).
+*>
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
@@ -160,7 +162,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup comlpex16OTHERcomputational
+*> \ingroup getsqrhrt
 *
 *> \par Contributors:
 *  ==================
@@ -212,7 +214,7 @@
 *     Test the input arguments
 *
       INFO = 0
-      LQUERY  = LWORK.EQ.-1
+      LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
       ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
@@ -225,7 +227,7 @@
          INFO = -5
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -7
-      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
          INFO = -9
       ELSE
 *
@@ -263,8 +265,9 @@
             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
             LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+            LWORKOPT = MAX( 1, LWORKOPT )
 *
-            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+            IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN
                INFO = -11
             END IF
 *
diff --git a/lapack-netlib/SRC/zgges3.f b/lapack-netlib/SRC/zgges3.f
index 8b3e44f88..8235c2543 100644
--- a/lapack-netlib/SRC/zgges3.f
+++ b/lapack-netlib/SRC/zgges3.f
@@ -215,7 +215,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= MAX(1,2*N)
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -260,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEeigen
+*> \ingroup gges3
 *
 *  =====================================================================
       SUBROUTINE ZGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B,
@@ -300,7 +301,8 @@
       LOGICAL            CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL,
      $                   LQUERY, WANTST
       INTEGER            I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT,
-     $                   ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT
+     $                   ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT,
+     $                   LWKMIN
       DOUBLE PRECISION   ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL,
      $                   PVSR, SMLNUM
 *     ..
@@ -309,9 +311,8 @@
       DOUBLE PRECISION   DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3,
-     $                   ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR,
-     $                   ZUNMQR
+      EXTERNAL           XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0,
+     $                   ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, ZUNMQR
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -353,6 +354,8 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
+*
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -369,7 +372,7 @@
          INFO = -14
       ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN
          INFO = -16
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
@@ -377,28 +380,32 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 1,  N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN,  N + INT( WORK( 1 ) ) )
          CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
-         LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          IF( ILVSL ) THEN
             CALL ZUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR )
             LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
          END IF
          CALL ZGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL,
      $                LDVSL, VSR, LDVSR, WORK, -1, IERR )
-         LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          CALL ZLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB,
      $                ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1,
      $                RWORK, 0, IERR )
-         LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          IF( WANTST ) THEN
             CALL ZTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB,
      $                   ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM,
      $                   PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR )
-            LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = DCMPLX( LWKOPT )
          END IF
-         WORK( 1 ) = DCMPLX( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -420,7 +427,6 @@
       EPS = DLAMCH( 'P' )
       SMLNUM = DLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f
index 2d6c74582..0cc073470 100644
--- a/lapack-netlib/SRC/zggev3.f
+++ b/lapack-netlib/SRC/zggev3.f
@@ -174,7 +174,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= MAX(1,2*N).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -208,7 +209,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEeigen
+*> \ingroup ggev3
 *
 *  =====================================================================
       SUBROUTINE ZGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA,
@@ -243,7 +244,7 @@
       CHARACTER          CHTEMP
       INTEGER            ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO,
      $                   IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR,
-     $                   LWKOPT
+     $                   LWKMIN, LWKOPT
       DOUBLE PRECISION   ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS,
      $                   SMLNUM, TEMP
       COMPLEX*16         X
@@ -252,9 +253,8 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3,
-     $                   ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR,
-     $                   ZUNMQR
+      EXTERNAL           XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0,
+     $                   ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, ZUNMQR
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -301,6 +301,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -315,7 +316,7 @@
          INFO = -11
       ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -15
       END IF
 *
@@ -323,7 +324,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 1,  N+INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) )
          CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
          LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
@@ -348,7 +349,11 @@
      $                   RWORK, 0, IERR )
             LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
          END IF
-         WORK( 1 ) = DCMPLX( LWKOPT )
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = DCMPLX( LWKOPT )
+         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -368,7 +373,6 @@
       EPS = DLAMCH( 'E' )*DLAMCH( 'B' )
       SMLNUM = DLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/zgghd3.f b/lapack-netlib/SRC/zgghd3.f
index b29cdc70a..08343688d 100644
--- a/lapack-netlib/SRC/zgghd3.f
+++ b/lapack-netlib/SRC/zgghd3.f
@@ -176,14 +176,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (LWORK)
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in]  LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= 1.
+*>          The length of the array WORK. LWORK >= 1.
 *>          For optimum performance LWORK >= 6*N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -208,7 +208,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup gghd3
 *
 *> \par Further Details:
 *  =====================
@@ -275,7 +275,12 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'ZGGHD3', ' ', N, ILO, IHI, -1 )
-      LWKOPT = MAX( 6*N*NB, 1 )
+      NH = IHI - ILO + 1
+      IF( NH.LE.1 ) THEN
+         LWKOPT = 1
+      ELSE
+         LWKOPT = 6*N*NB
+      END IF
       WORK( 1 ) = DCMPLX( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
@@ -325,7 +330,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = CONE
          RETURN
@@ -883,6 +887,7 @@
       IF ( JCOL.LT.IHI )
      $   CALL ZGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
+*
       WORK( 1 ) = DCMPLX( LWKOPT )
 *
       RETURN
diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f
index 0388b0874..d8636d663 100644
--- a/lapack-netlib/SRC/zggqrf.f
+++ b/lapack-netlib/SRC/zggqrf.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup ggqrf
 *
 *> \par Further Details:
 *  =====================
@@ -250,7 +250,7 @@
       NB2 = ILAENV( 1, 'ZGERQF', ' ', N, P, -1, -1 )
       NB3 = ILAENV( 1, 'ZUNMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f
index be912c772..69c14af24 100644
--- a/lapack-netlib/SRC/zggrqf.f
+++ b/lapack-netlib/SRC/zggrqf.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup ggrqf
 *
 *> \par Further Details:
 *  =====================
@@ -249,7 +249,7 @@
       NB2 = ILAENV( 1, 'ZGEQRF', ' ', P, N, -1, -1 )
       NB3 = ILAENV( 1, 'ZUNMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/zggsvd3.f b/lapack-netlib/SRC/zggsvd3.f
index 71257a7c0..40624f5be 100644
--- a/lapack-netlib/SRC/zggsvd3.f
+++ b/lapack-netlib/SRC/zggsvd3.f
@@ -277,7 +277,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -332,7 +332,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEsing
+*> \ingroup ggsvd3
 *
 *> \par Contributors:
 *  ==================
diff --git a/lapack-netlib/SRC/zggsvp3.f b/lapack-netlib/SRC/zggsvp3.f
index f39ccdad3..7b465aaee 100644
--- a/lapack-netlib/SRC/zggsvp3.f
+++ b/lapack-netlib/SRC/zggsvp3.f
@@ -233,7 +233,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -256,7 +256,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup ggsvp3
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f
index ba52f9e72..8e86b9e88 100644
--- a/lapack-netlib/SRC/zheevd.f
+++ b/lapack-netlib/SRC/zheevd.f
@@ -116,8 +116,7 @@
 *>
 *> \param[out] RWORK
 *> \verbatim
-*>          RWORK is DOUBLE PRECISION array,
-*>                                         dimension (LRWORK)
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
 *>          On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
 *> \endverbatim
 *>
@@ -180,7 +179,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEeigen
+*> \ingroup heevd
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f
index 1452e04a3..fe6e1a85f 100644
--- a/lapack-netlib/SRC/zheevr.f
+++ b/lapack-netlib/SRC/zheevr.f
@@ -272,7 +272,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,2*N).
+*>          The length of the array WORK.
+*>          If N <= 1, LWORK >= 1, else LWORK >= 2*N.
 *>          For optimal efficiency, LWORK >= (NB+1)*N,
 *>          where NB is the max of the blocksize for ZHETRD and for
 *>          ZUNMTR as returned by ILAENV.
@@ -294,7 +295,8 @@
 *> \param[in] LRWORK
 *> \verbatim
 *>          LRWORK is INTEGER
-*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>          The length of the array RWORK.
+*>          If N <= 1, LRWORK >= 1, else LRWORK >= 24*N.
 *>
 *>          If LRWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -313,7 +315,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -338,7 +341,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEeigen
+*> \ingroup heevr
 *
 *> \par Contributors:
 *  ==================
@@ -417,9 +420,15 @@
       LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR.
      $         ( LIWORK.EQ.-1 ) )
 *
-      LRWMIN = MAX( 1, 24*N )
-      LIWMIN = MAX( 1, 10*N )
-      LWMIN = MAX( 1, 2*N )
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = 2*N
+         LRWMIN = 24*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN
@@ -454,7 +463,7 @@
          NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 )
          NB = MAX( NB, ILAENV( 1, 'ZUNMTR', UPLO, N, -1, -1, -1 ) )
          LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 )  = LWKOPT
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -483,7 +492,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 2
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = DBLE( A( 1, 1 ) )
@@ -710,7 +719,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 )  = LWKOPT
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
 *
diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f
index 5c576e633..b1cc7175f 100644
--- a/lapack-netlib/SRC/zheevr_2stage.f
+++ b/lapack-netlib/SRC/zheevr_2stage.f
@@ -265,7 +265,7 @@
 *>          indicating the nonzero elements in Z. The i-th eigenvector
 *>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
 *>          ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal
-*>          matrix). The support of the eigenvectors of A is typically 
+*>          matrix). The support of the eigenvectors of A is typically
 *>          1:N because of the unitary transformations applied by ZUNMTR.
 *>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
 *> \endverbatim
@@ -279,12 +279,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -310,7 +311,8 @@
 *> \param[in] LRWORK
 *> \verbatim
 *>          LRWORK is INTEGER
-*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>          The length of the array RWORK.
+*>          If N <= 1, LRWORK >= 1, else LRWORK >= 24*N.
 *>
 *>          If LRWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -329,7 +331,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -354,7 +357,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEeigen
+*> \ingroup heevr_2stage
 *
 *> \par Contributors:
 *  ==================
@@ -382,7 +385,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -390,11 +393,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -472,9 +475,16 @@
       IB     = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
       LHTRD  = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
       LWTRD  = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
-      LWMIN  = N + LHTRD + LWTRD
-      LRWMIN = MAX( 1, 24*N )
-      LIWMIN = MAX( 1, 10*N )
+*
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = N + LHTRD + LWTRD
+         LRWMIN = 24*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
@@ -535,7 +545,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 2
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = DBLE( A( 1, 1 ) )
@@ -643,9 +653,9 @@
 *
 *     Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
 *
-      CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), 
+      CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ),
      $                    RWORK( INDRE ), WORK( INDTAU ),
-     $                    WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWK ), LLWORK, IINFO )
 *
 *     If all eigenvalues are desired
diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f
index df8498c7a..b3d4b3725 100644
--- a/lapack-netlib/SRC/zhesv_aa.f
+++ b/lapack-netlib/SRC/zhesv_aa.f
@@ -128,7 +128,7 @@
 *>          LWORK is INTEGER
 *>          The length of WORK.  LWORK >= MAX(1,2*N,3*N-2), and for best 
 *>          performance LWORK >= max(1,N*NB), where NB is the optimal
-*>          blocksize for ZHETRF.
+*>          blocksize for ZHETRF_AA.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -177,7 +177,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS
+      INTEGER            LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -196,6 +196,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N, 3*N-2 )
       IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -206,17 +207,17 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
-         LWKOPT_HETRF = INT( WORK(1) )
+         LWKOPT_HETRF = INT( WORK( 1 ) )
          CALL ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
      $                   -1, INFO )
-         LWKOPT_HETRS = INT( WORK(1) )
-         LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS )
+         LWKOPT_HETRS = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f
index 79c01c546..c503b5554 100644
--- a/lapack-netlib/SRC/zhesv_aa_2stage.f
+++ b/lapack-netlib/SRC/zhesv_aa_2stage.f
@@ -100,14 +100,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX*16 array, dimension (LTB)
+*>          TB is COMPLEX*16 array, dimension (MAX(1,LTB)).
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -147,14 +147,15 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 workspace of size LWORK
+*>          WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)).
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -178,7 +179,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEsolve
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -208,7 +209,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            LWKOPT
+      INTEGER            LWKOPT, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -229,6 +230,7 @@
       UPPER = LSAME( UPLO, 'U' )
       WQUERY = ( LWORK.EQ.-1 )
       TQUERY = ( LTB.EQ.-1 )
+      LWKMIN = MAX( 1, N )
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -237,18 +239,19 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
-      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
-      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN
          INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
-         LWKOPT = INT( WORK(1) )
+         LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) )
+         WORK( 1 ) = LWKOPT
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/zhesvx.f b/lapack-netlib/SRC/zhesvx.f
index 485c81df6..64aa16674 100644
--- a/lapack-netlib/SRC/zhesvx.f
+++ b/lapack-netlib/SRC/zhesvx.f
@@ -234,8 +234,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= max(1,2*N), and for best
-*>          performance, when FACT = 'N', LWORK >= max(1,2*N,N*NB), where
+*>          The length of WORK.  LWORK >= MAX(1,2*N), and for best
+*>          performance, when FACT = 'N', LWORK >= MAX(1,2*N,N*NB), where
 *>          NB is the optimal blocksize for ZHETRF.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -276,7 +276,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE ZHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -307,7 +307,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, NOFACT
-      INTEGER            LWKOPT, NB
+      INTEGER            LWKOPT, LWKMIN, NB
       DOUBLE PRECISION   ANORM
 *     ..
 *     .. External Functions ..
@@ -329,6 +329,7 @@
       INFO = 0
       NOFACT = LSAME( FACT, 'N' )
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
       IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN
          INFO = -1
       ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) )
@@ -346,12 +347,12 @@
          INFO = -11
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = MAX( 1, 2*N )
+         LWKOPT = LWKMIN
          IF( NOFACT ) THEN
             NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f
index b9d2f0eb1..ab444894b 100644
--- a/lapack-netlib/SRC/zhetrd_2stage.f
+++ b/lapack-netlib/SRC/zhetrd_2stage.f
@@ -4,23 +4,23 @@
 *
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download ZHETRD_2STAGE + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
+*> Download ZHETRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd_2stage.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*       SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
 *                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
 *       IMPLICIT NONE
@@ -34,7 +34,7 @@
 *       COMPLEX*16         A( LDA, * ), TAU( * ),
 *                          HOUS2( * ), WORK( * )
 *       ..
-*  
+*
 *
 *> \par Purpose:
 *  =============
@@ -52,11 +52,11 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Housholder representation,
 *>                  in particular for the second stage (Band to
 *>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate Q1 Q2 or to apply Q1 Q2,
 *>                  then LHOUS2 is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -86,7 +86,7 @@
 *>          triangular part of A is not referenced.
 *>          On exit, if UPLO = 'U', the band superdiagonal
 *>          of A are overwritten by the corresponding elements of the
-*>          internal band-diagonal matrix AB, and the elements above 
+*>          internal band-diagonal matrix AB, and the elements above
 *>          the KD superdiagonal, with the array TAU, represent the unitary
 *>          matrix Q1 as a product of elementary reflectors; if UPLO
 *>          = 'L', the diagonal and band subdiagonal of A are over-
@@ -117,13 +117,13 @@
 *> \param[out] TAU
 *> \verbatim
 *>          TAU is COMPLEX*16 array, dimension (N-KD)
-*>          The scalar factors of the elementary reflectors of 
+*>          The scalar factors of the elementary reflectors of
 *>          the first stage (see Further Details).
 *> \endverbatim
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is COMPLEX*16 array, dimension (LHOUS2)
+*>          HOUS2 is COMPLEX*16 array, dimension (MAX(1,LHOUS2))
 *>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
@@ -132,6 +132,8 @@
 *> \verbatim
 *>          LHOUS2 is INTEGER
 *>          The dimension of the array HOUS2.
+*>          LHOUS2 >= 1.
+*>
 *>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
@@ -143,23 +145,26 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (LWORK)
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>          LWORK = MAX(1, dimension) where
 *>          dimension   = max(stage1,stage2) + (KD+1)*N
-*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                        + max(2*KD*KD, KD*NTHREADS) 
-*>                        + (KD+1)*N 
+*>                      = N*KD + N*max(KD+1,FACTOPTNB)
+*>                        + max(2*KD*KD, KD*NTHREADS)
+*>                        + (KD+1)*N
 *>          where KD is the blocking size of the reduction,
 *>          FACTOPTNB is the blocking used by the QR or LQ
 *>          algorithm, usually FACTOPTNB=128 is a good choice
@@ -177,12 +182,12 @@
 *  Authors:
 *  ========
 *
-*> \author Univ. of Tennessee 
-*> \author Univ. of California Berkeley 
-*> \author Univ. of Colorado Denver 
-*> \author NAG Ltd. 
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +207,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -210,16 +215,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
      $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
       IMPLICIT NONE
@@ -265,10 +270,13 @@
 *
       KD     = ILAENV2STAGE( 1, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 )
       IB     = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.LSAME( VECT, 'N' ) ) THEN
          INFO = -1
@@ -309,14 +317,14 @@
       LWRK  = LWORK-LDAB*N
       ABPOS = 1
       WPOS  = ABPOS + LDAB*N
-      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
      $                   TAU, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZHETRD_HE2HB', -INFO )
          RETURN
       END IF
-      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
      $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZHETRD_HB2ST', -INFO )
@@ -324,7 +332,6 @@
       END IF
 *
 *
-      HOUS2( 1 ) = LHMIN
       WORK( 1 )  = LWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F
index 1d39ac942..247497ab6 100644
--- a/lapack-netlib/SRC/zhetrd_hb2st.F
+++ b/lapack-netlib/SRC/zhetrd_hb2st.F
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
 *                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *       #if defined(_OPENMP)
@@ -53,12 +53,12 @@
 *> \param[in] STAGE1
 *> \verbatim
 *>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
 *>                  from dense to band using the zhetrd_he2hb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone. 
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the zhetrd_he2hb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other term this routine is called as standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the zhetrd_he2hb
 *>                  routine has been called to produce AB (e.g., AB is
 *>                  the output of zhetrd_he2hb.
 *> \endverbatim
@@ -66,10 +66,10 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Housholder representation,
 *>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
 *>                  then LHOUS is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -132,34 +132,39 @@
 *>
 *> \param[out] HOUS
 *> \verbatim
-*>          HOUS is COMPLEX*16 array, dimension LHOUS, that
-*>          store the Householder representation.
+*>          HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS))
+*>          Stores the Householder representation.
 *> \endverbatim
 *>
 *> \param[in] LHOUS
 *> \verbatim
 *>          LHOUS is INTEGER
-*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array HOUS.
+*>          If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS array, returns
 *>          this value as the first entry of the HOUS array, and no error
 *>          message related to LHOUS is issued by XERBLA.
 *>          LHOUS = MAX(1, dimension) where
 *>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'     
+*>          not available now if VECT='H'
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension LWORK.
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)).
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -188,7 +193,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup hetrd_hb2st
 *
 *> \par Further Details:
 *  =====================
@@ -208,7 +213,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -216,16 +221,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+      SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
      $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *
@@ -259,11 +264,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   NBTILES, TTYPE, TID, NTHREADS,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SIZEV, SIZETAU, LDV, LHMIN, LWMIN
       DOUBLE PRECISION   ABSTMP
@@ -277,7 +282,7 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
+      INTEGER            ILAENV2STAGE
       EXTERNAL           LSAME, ILAENV2STAGE
 *     ..
 *     .. Executable Statements ..
@@ -285,7 +290,6 @@
 *     Determine the minimal workspace size required.
 *     Test the input parameters
 *
-      DEBUG   = 0
       INFO    = 0
       AFTERS1 = LSAME( STAGE1, 'Y' )
       WANTQ   = LSAME( VECT, 'V' )
@@ -294,9 +298,14 @@
 *
 *     Determine the block size, the workspace size and the hous size.
 *
-      IB     = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      IB       = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
          INFO = -1
@@ -358,7 +367,7 @@
           ABDPOS   = KD + 1
           ABOFDPOS = KD
       ELSE
-          APOS     = INDA 
+          APOS     = INDA
           AWPOS    = INDA + KD + 1
           DPOS     = APOS
           OFDPOS   = DPOS + 1
@@ -366,11 +375,11 @@
           ABOFDPOS = 2
 
       ENDIF
-*      
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     complex because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     complex because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
 *     in a parallel environment it might need two cores for D and E
 *
       IF( KD.EQ.0 ) THEN
@@ -385,17 +394,17 @@
           WORK( 1 ) = 1
           RETURN
       END IF
-*      
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already Tridiagonal. We have to make diagonal
 *     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is  
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the hermetian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
 *     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-*      
+*     or not this might complicate the story.
+*
       IF( KD.EQ.1 ) THEN
           DO 50 I = 1, N
               D( I ) = DBLE( AB( ABDPOS, I ) )
@@ -444,7 +453,7 @@ C                 END IF
           RETURN
       END IF
 *
-*     Main code start here. 
+*     Main code start here.
 *     Reduce the hermitian band of A to a tridiagonal matrix.
 *
       THGRSIZ   = N
@@ -453,7 +462,7 @@ C                 END IF
       NBTILES   = CEILING( REAL(N)/REAL(KD) )
       STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
       THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
-*      
+*
       CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
       CALL ZLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
 *
@@ -462,7 +471,7 @@ C                 END IF
 *
 #if defined(_OPENMP)
 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
-!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
 !$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
 !$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
 !$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
@@ -471,7 +480,7 @@ C                 END IF
 #endif
 *
 *     main bulge chasing loop
-*      
+*
       DO 100 THGRID = 1, THGRNB
           STT  = (THGRID-1)*THGRSIZ+1
           THED = MIN( (STT + THGRSIZ -1), (N-1))
@@ -482,7 +491,7 @@ C                 END IF
                   ST = STT
                   DO 130 SWEEPID = ST, ED
                       DO 140 K = 1, GRSIZ
-                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
      $                           + (M-1)*GRSIZ + K
                           IF ( MYID.EQ.1 ) THEN
                               TTYPE = 1
@@ -508,17 +517,17 @@ C                 END IF
                           ENDIF
 *
 *                         Call the kernel
-*                             
+*
 #if defined(_OPENMP) &&  _OPENMP >= 201307
 
-                          IF( TTYPE.NE.1 ) THEN      
+                          IF( TTYPE.NE.1 ) THEN
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(in:WORK(MYID-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
@@ -526,20 +535,20 @@ C                 END IF
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
                           ENDIF
 #else
-                          CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                          CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
-     $                         WORK ( INDA ), LDA, 
+     $                         WORK ( INDA ), LDA,
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                         WORK( INDW ) )
-#endif 
+#endif
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1
                               EXIT
@@ -554,14 +563,14 @@ C                 END IF
 !$OMP END MASTER
 !$OMP END PARALLEL
 #endif
-*      
+*
 *     Copy the diagonal from A to D. Note that D is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
       DO 150 I = 1, N
           D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) )
   150 CONTINUE
-*      
+*
 *     Copy the off diagonal from A to E. Note that E is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
@@ -575,11 +584,10 @@ C                 END IF
   170     CONTINUE
       ENDIF
 *
-      HOUS( 1 ) = LHMIN
       WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZHETRD_HB2ST
 *
       END
-      
+
diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f
index e1b2e1794..3e3bfa374 100644
--- a/lapack-netlib/SRC/zhetrd_he2hb.f
+++ b/lapack-netlib/SRC/zhetrd_he2hb.f
@@ -123,8 +123,8 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (LWORK)
-*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, or if LWORK = -1,
 *>          WORK(1) returns the size of LWORK.
 *> \endverbatim
 *>
@@ -132,7 +132,9 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK which should be calculated
-*>          by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          by a workspace query.
+*>          If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY).
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -158,7 +160,7 @@
 *> \author Univ. of Colorado Denver 
 *> \author NAG Ltd. 
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrd_he2hb
 *
 *> \par Further Details:
 *  =====================
@@ -293,8 +295,12 @@
       INFO   = 0
       UPPER  = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
-      
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/zhetrf.f b/lapack-netlib/SRC/zhetrf.f
index 78d4f71b8..433887108 100644
--- a/lapack-netlib/SRC/zhetrf.f
+++ b/lapack-netlib/SRC/zhetrf.f
@@ -107,7 +107,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK. LWORK >= 1. For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *> \endverbatim
 *>
@@ -130,7 +130,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -227,7 +227,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -346,6 +346,7 @@
       END IF
 *
    40 CONTINUE
+*
       WORK( 1 ) = LWKOPT
       RETURN
 *
diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f
index 537c16e8c..381c87d51 100644
--- a/lapack-netlib/SRC/zhetrf_aa.f
+++ b/lapack-netlib/SRC/zhetrf_aa.f
@@ -101,8 +101,10 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= MAX(1,2*N). For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N >= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -125,10 +127,10 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
-      SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -152,7 +154,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL      LQUERY, UPPER
-      INTEGER      J, LWKOPT
+      INTEGER      J, LWKMIN, LWKOPT
       INTEGER      NB, MJ, NJ, K1, K2, J1, J2, J3, JB
       COMPLEX*16   ALPHA
 *     ..
@@ -178,18 +180,25 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = (NB+1)*N
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -202,11 +211,11 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
           RETURN
       ENDIF
       IPIV( 1 ) = 1
-      IF ( N.EQ.1 ) THEN
+      IF( N.EQ.1 ) THEN
          A( 1, 1 ) = DBLE( A( 1, 1 ) )
          RETURN
       END IF
diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f
index 477602b5e..bab13a99d 100644
--- a/lapack-netlib/SRC/zhetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f
@@ -87,14 +87,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX*16 array, dimension (LTB)
+*>          TB is COMPLEX*16 array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -121,14 +121,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 workspace of size LWORK
+*>          WORK is COMPLEX*16 workspace of size (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16SYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -182,7 +182,7 @@
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
       INTEGER            I, J, K, I1, I2, TD
-      INTEGER            LDTB, NB, KB, JB, NT, IINFO
+      INTEGER            LWKOPT, LDTB, NB, KB, JB, NT, IINFO
       COMPLEX*16         PIV
 *     ..
 *     .. External Functions ..
@@ -212,9 +212,9 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -6
-      ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN
          INFO = -10
       END IF
 *
@@ -228,10 +228,10 @@
       NB = ILAENV( 1, 'ZHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 )
       IF( INFO.EQ.0 ) THEN
          IF( TQUERY ) THEN
-            TB( 1 ) = (3*NB+1)*N
+            TB( 1 ) = MAX( 1, (3*NB+1)*N )
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = MAX( 1, N*NB )
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
@@ -240,7 +240,7 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
          RETURN
       ENDIF
 *
@@ -392,7 +392,7 @@
                CALL ZGETRF( N-(J+1)*NB, NB, 
      $                      WORK, N,
      $                      IPIV( (J+1)*NB+1 ), IINFO )
-c               IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN
+c               IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN
 c                  INFO = IINFO+(J+1)*NB
 c               END IF
 *
@@ -587,7 +587,7 @@ c               END IF
                CALL ZGETRF( N-(J+1)*NB, NB, 
      $                      A( (J+1)*NB+1, J*NB+1 ), LDA,
      $                      IPIV( (J+1)*NB+1 ), IINFO )
-c               IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN
+c               IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN
 c                  INFO = IINFO+(J+1)*NB
 c               END IF
 *         
diff --git a/lapack-netlib/SRC/zhetrf_rk.f b/lapack-netlib/SRC/zhetrf_rk.f
index 73dd9f9d0..7c505fa4d 100644
--- a/lapack-netlib/SRC/zhetrf_rk.f
+++ b/lapack-netlib/SRC/zhetrf_rk.f
@@ -177,14 +177,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension ( MAX(1,LWORK) ).
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned
 *>          by ILAENV.
 *>
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -310,7 +310,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'ZHETRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/zhetrf_rook.f b/lapack-netlib/SRC/zhetrf_rook.f
index e9de47248..a56349092 100644
--- a/lapack-netlib/SRC/zhetrf_rook.f
+++ b/lapack-netlib/SRC/zhetrf_rook.f
@@ -122,7 +122,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -150,7 +150,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f
index 384745c3a..1d932b866 100644
--- a/lapack-netlib/SRC/zhetri2.f
+++ b/lapack-netlib/SRC/zhetri2.f
@@ -88,16 +88,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (N+NB+1)*(NB+3)
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)).
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          WORK is size >= (N+NB+1)*(NB+3)
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>           calculates:
+*>          calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
 *>              - and no error message related to LWORK is issued by XERBLA.
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetri2
 *
 *  =====================================================================
       SUBROUTINE ZHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -159,9 +159,13 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+*
 *     Get blocksize
+*
       NBMAX = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 )
-      IF ( NBMAX .GE. N ) THEN
+      IF( N.EQ.0 ) THEN
+         MINSIZE = 1
+      ELSE IF( NBMAX.GE.N ) THEN
          MINSIZE = N
       ELSE
          MINSIZE = (N+NBMAX+1)*(NBMAX+3)
@@ -173,28 +177,29 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
-*
-*     Quick return if possible
-*
 *
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZHETRI2', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK(1)=MINSIZE
+         WORK( 1 ) = MINSIZE
          RETURN
       END IF
+*
+*     Quick return if possible
+*
       IF( N.EQ.0 )
      $   RETURN
 
-      IF( NBMAX .GE. N ) THEN
+      IF( NBMAX.GE.N ) THEN
          CALL ZHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL ZHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
 *
 *     End of ZHETRI2
diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f
index 06ac1fd28..b7a1f7f07 100644
--- a/lapack-netlib/SRC/zhetrs_aa.f
+++ b/lapack-netlib/SRC/zhetrs_aa.f
@@ -106,7 +106,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -124,7 +130,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -152,7 +158,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -162,13 +168,19 @@
       EXTERNAL           ZGTSV, ZSWAP, ZTRSM, ZLACGV, ZLACPY, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
 *     ..
 *     .. Executable Statements ..
 *
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -179,21 +191,20 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZHETRS_AA', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = LWKMIN
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 .OR. NRHS.EQ.0 )
+      IF( MIN( N, NRHS ).EQ.0 )
      $   RETURN
 *
       IF( UPPER ) THEN
diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f
index 4abefa434..59a0a5558 100644
--- a/lapack-netlib/SRC/zlamswlq.f
+++ b/lapack-netlib/SRC/zlamswlq.f
@@ -127,17 +127,20 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If SIDE = 'L', LWORK >= max(1,NB) * MB;
-*>          if SIDE = 'R', LWORK >= max(1,M) * MB.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,NB*MB).
+*>          If SIDE = 'R', LWORK >= max(1,M*MB).
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -189,92 +192,103 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamswlq
+*>
 *  =====================================================================
       SUBROUTINE ZLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $    LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), WORK( * ), C(LDC, * ),
-     $      T( LDT, * )
+      COMPLEX*16         A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. External Subroutines ..
-      EXTERNAL    ZTPMLQT, ZGEMLQT, XERBLA
+      EXTERNAL           ZTPMLQT, ZGEMLQT, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * MB
       ELSE
         LW = M * MB
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( K.LT.0 ) THEN
         INFO = -5
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
         INFO = -4
-      ELSE IF( K.LT.MB .OR. MB.LT.1) THEN
+      ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN
         INFO = -6
       ELSE IF( LDA.LT.MAX( 1, K ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, MB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
+      END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLAMSWLQ', -INFO )
-        WORK(1) = LW
         RETURN
-      ELSE IF (LQUERY) THEN
-        WORK(1) = LW
+      ELSE IF( LQUERY ) THEN
         RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
       IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN
         CALL ZGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
       END IF
 *
@@ -403,7 +417,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZLAMSWLQ
diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f
index 5030cb75f..03770c06e 100644
--- a/lapack-netlib/SRC/zlamtsqr.f
+++ b/lapack-netlib/SRC/zlamtsqr.f
@@ -128,22 +128,24 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*NB).
+*>          If SIDE = 'R', LWORK >= max(1,MB*NB).
 *>
-*>          If SIDE = 'L', LWORK >= max(1,N)*NB;
-*>          if SIDE = 'R', LWORK >= max(1,MB)*NB.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -191,46 +193,50 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamtsqr
+*>
 *  =====================================================================
       SUBROUTINE ZLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $        LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), WORK( * ), C(LDC, * ),
-     $                T( LDT, * )
+      COMPLEX*16         A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR, Q
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, Q, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. External Subroutines ..
-      EXTERNAL   ZGEMQRT, ZTPMQRT, XERBLA
+      EXTERNAL           ZGEMQRT, ZTPMQRT, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * NB
         Q = M
       ELSE
@@ -238,11 +244,17 @@
         Q = N
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
@@ -253,38 +265,38 @@
         INFO = -7
       ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, NB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0)  THEN
-          WORK(1) = LW
+      IF( INFO.EQ.0 )  THEN
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLAMTSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
 *
       IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
         CALL ZGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
       IF(LEFT.AND.NOTRAN) THEN
 *
@@ -410,7 +422,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZLAMTSQR
diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f
index be4c48539..735207132 100644
--- a/lapack-netlib/SRC/zlaswlq.f
+++ b/lapack-netlib/SRC/zlaswlq.f
@@ -96,22 +96,23 @@
 *>          The leading dimension of the array T.  LDT >= MB.
 *> \endverbatim
 *>
-*>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= MB*M.
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -159,33 +160,37 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup laswlq
+*>
 *  =====================================================================
       SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
-     $                  INFO)
+     $                    INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
+      INTEGER            INFO, LDA, M, N, MB, NB, LWORK, LDT
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), WORK( * ), T( LDT, *)
+      COMPLEX*16         A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           ZGELQT, ZTPLQT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -196,12 +201,19 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = M*MB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
       ELSE IF( N.LT.0 .OR. N.LT.M ) THEN
         INFO = -2
-      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN
+      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN
         INFO = -3
       ELSE IF( NB.LE.0 ) THEN
         INFO = -4
@@ -209,60 +221,61 @@
         INFO = -6
       ELSE IF( LDT.LT.MB ) THEN
         INFO = -8
-      ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-      WORK(1) = MB*M
+*
+      IF( INFO.EQ.0 )  THEN
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLASWLQ', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The LQ Decomposition
 *
-       IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN
-        CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO)
+      IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN
+        CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
-       KK = MOD((N-M),(NB-M))
-       II=N-KK+1
+      KK = MOD((N-M),(NB-M))
+      II = N-KK+1
 *
-*      Compute the LQ factorization of the first block A(1:M,1:NB)
+*     Compute the LQ factorization of the first block A(1:M,1:NB)
 *
-       CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
-       CTR = 1
+      CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = NB+1, II-NB+M , (NB-M)
+      DO I = NB+1, II-NB+M, (NB-M)
 *
-*      Compute the QR factorization of the current block A(1:M,I:I+NB-M)
+*       Compute the QR factorization of the current block A(1:M,I:I+NB-M)
 *
-         CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
-     $                  LDA, T(1, CTR * M + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
+     $                LDA, T(1, CTR * M + 1),
+     $                LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
 *     Compute the QR factorization of the last block A(1:M,II:N)
 *
-       IF (II.LE.N) THEN
+      IF( II.LE.N ) THEN
         CALL ZTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ),
-     $                  LDA, T(1, CTR * M + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+     $                LDA, T(1, CTR * M + 1), LDT,
+     $                WORK, INFO )
+      END IF
 *
-      WORK( 1 ) = M * MB
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZLASWLQ
diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f
index 231a17274..27eac839b 100644
--- a/lapack-netlib/SRC/zlatrs3.f
+++ b/lapack-netlib/SRC/zlatrs3.f
@@ -158,7 +158,11 @@
 *> \endverbatim
 *>
 *> \param[in] LWORK
+*> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
@@ -166,6 +170,7 @@
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -182,7 +187,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup latrs3
 *> \par Further Details:
 *  =====================
 *  \verbatim
@@ -257,7 +262,7 @@
       LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
       INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
      $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
-     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
+     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
       DOUBLE PRECISION   ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
      $                   SCAMIN, SMLNUM, TMAX
 *     ..
@@ -296,15 +301,24 @@
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
+*
       LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
       LDS = NBA
+*
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
+*
       LANRM = NBA * NBA
       AWRK = LSCALE
-      WORK( 1 ) = LSCALE + LANRM
+*
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = LSCALE + LANRM
+      END IF
+      WORK( 1 ) = LWMIN
 *
 *     Test the input parameters.
 *
@@ -326,7 +340,7 @@
          INFO = -8
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -10
-      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
+      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN
          INFO = -14
       END IF
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f
index 8c938aebc..24d00f28a 100644
--- a/lapack-netlib/SRC/zlatsqr.f
+++ b/lapack-netlib/SRC/zlatsqr.f
@@ -101,15 +101,18 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= NB*N.
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -161,33 +164,37 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup latsqr
+*>
 *  =====================================================================
       SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
-     $                    LWORK, INFO)
+     $                    LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
+      INTEGER            INFO, LDA, M, N, MB, NB, LDT, LWORK
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), WORK( * ), T(LDT, *)
+      COMPLEX*16         A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, LWMIN, MINMN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
-      EXTERNAL    ZGEQRT, ZTPQRT, XERBLA
+      EXTERNAL           ZGEQRT, ZTPQRT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -198,6 +205,13 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = N*NB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -205,64 +219,65 @@
         INFO = -2
       ELSE IF( MB.LT.1 ) THEN
         INFO = -3
-      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN
+      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN
         INFO = -4
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
       ELSE IF( LDT.LT.NB ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-        WORK(1) = NB*N
+*
+      IF( INFO.EQ.0 )  THEN
+        WORK( 1 ) = LWMIN
       END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLATSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The QR Decomposition
 *
-       IF ((MB.LE.N).OR.(MB.GE.M)) THEN
-         CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO)
-         RETURN
-       END IF
-       KK = MOD((M-N),(MB-N))
-       II=M-KK+1
+      IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN
+        CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO )
+        RETURN
+      END IF
+      KK = MOD((M-N),(MB-N))
+      II = M-KK+1
 *
-*      Compute the QR factorization of the first block A(1:MB,1:N)
+*     Compute the QR factorization of the first block A(1:MB,1:N)
 *
-       CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
-       CTR = 1
+      CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = MB+1, II-MB+N ,  (MB-N)
+      DO I = MB+1, II-MB+N, (MB-N)
 *
-*      Compute the QR factorization of the current block A(I:I+MB-N,1:N)
+*       Compute the QR factorization of the current block A(I:I+MB-N,1:N)
 *
-         CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
-     $                 T(1, CTR * N + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
+     $                T(1, CTR * N + 1),
+     $                LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
-*      Compute the QR factorization of the last block A(II:M,1:N)
+*     Compute the QR factorization of the last block A(II:M,1:N)
 *
-       IF (II.LE.M) THEN
-         CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
-     $                 T(1,CTR * N + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+      IF( II.LE.M ) THEN
+        CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
+     $                T(1,CTR * N + 1), LDT,
+     $                WORK, INFO )
+      END IF
 *
-      work( 1 ) = N*NB
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZLATSQR

From 201f22f49ae87f83b8970a257bc580e33cb94f30 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 23:27:38 +0100
Subject: [PATCH 505/718] Fix issues related to ?GEDMD (Reference-LAPACK PR
 959)

---
 lapack-netlib/SRC/cgedmd.f90 | 872 +++++++++++++++++++--------------
 lapack-netlib/SRC/dgedmd.f90 | 920 ++++++++++++++++++++---------------
 lapack-netlib/SRC/sgedmd.f90 | 916 +++++++++++++++++++---------------
 lapack-netlib/SRC/zgedmd.f90 | 848 +++++++++++++++++++-------------
 4 files changed, 2084 insertions(+), 1472 deletions(-)

diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90
index 499489270..1413130ec 100644
--- a/lapack-netlib/SRC/cgedmd.f90
+++ b/lapack-netlib/SRC/cgedmd.f90
@@ -1,22 +1,526 @@
+!> \brief \b CGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices.
+!
+!  =========== DOCUMENTATION ===========
+!
+!  Definition:
+!  ===========
+!
+!     SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+!                        M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+!                        K, EIGS, Z, LDZ, RES, B,    LDB,   &
+!                        W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+!                        RWORK, LRWORK, IWORK, LIWORK, INFO )
+!.....
+!     USE                   iso_fortran_env
+!     IMPLICIT NONE
+!     INTEGER, PARAMETER :: WP = real32
+!
+!.....
+!     Scalar arguments
+!     CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+!     INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+!                                NRNK, LDZ, LDB, LDW,  LDS, &
+!                                LIWORK, LRWORK, LZWORK
+!     INTEGER,       INTENT(OUT)  :: K, INFO
+!     REAL(KIND=WP), INTENT(IN)   ::    TOL
+!     Array arguments
+!     COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+!     COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+!                                        W(LDW,*), S(LDS,*)
+!     COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+!     COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+!     REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+!     REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+!     INTEGER,          INTENT(OUT)   :: IWORK(*)
+!
+!............................................................
+!>    \par Purpose:
+!     =============
+!>    \verbatim
+!>    CGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!>    a pair of data snapshot matrices. For the input matrices
+!>    X and Y such that Y = A*X with an unaccessible matrix
+!>    A, CGEDMD computes a certain number of Ritz pairs of A using
+!>    the standard Rayleigh-Ritz extraction from a subspace of
+!>    range(X) that is determined using the leading left singular
+!>    vectors of X. Optionally, CGEDMD returns the residuals
+!>    of the computed Ritz pairs, the information needed for
+!>    a refinement of the Ritz vectors, or the eigenvectors of
+!>    the Exact DMD.
+!>    For further details see the references listed
+!>    below. For more details of the implementation see [3].
+!>    \endverbatim
+!............................................................
+!>    \par References:
+!     ================
+!>    \verbatim
+!>    [1] P. Schmid: Dynamic mode decomposition of numerical
+!>        and experimental data,
+!>        Journal of Fluid Mechanics 656, 5-28, 2010.
+!>    [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!>        decompositions: analysis and enhancements,
+!>        SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!>    [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!>        Mode Decomposition I. Technical report. AIMDyn Inc.
+!>        and LAPACK Working Note 298.
+!>    [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!>        Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!>        Theory and Applications, Journal of Computational
+!>        Dynamics 1(2), 391 -421, 2014.
+!>    \endverbatim
+!......................................................................
+!>    \par Developed and supported by:
+!     ================================
+!>    \verbatim
+!>    Developed and coded by Zlatko Drmac, Faculty of Science,
+!>    University of Zagreb;  drmac@math.hr
+!>    In cooperation with
+!>    AIMdyn Inc., Santa Barbara, CA.
+!>    and supported by
+!>    - DARPA SBIR project "Koopman Operator-Based Forecasting
+!>    for Nonstationary Processes from Near-Term, Limited
+!>    Observational Data" Contract No: W31P4Q-21-C-0007
+!>    - DARPA PAI project "Physics-Informed Machine Learning
+!>    Methodologies" Contract No: HR0011-18-9-0033
+!>    - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!>    Framework for Space-Time Analysis of Process Dynamics"
+!>    Contract No: HR0011-16-C-0116
+!>    Any opinions, findings and conclusions or recommendations
+!>    expressed in this material are those of the author and
+!>    do not necessarily reflect the views of the DARPA SBIR
+!>    Program Office
+!>    \endverbatim
+!......................................................................
+!>    \par Distribution Statement A:
+!     ==============================
+!>    \verbatim
+!>    Approved for Public Release, Distribution Unlimited.
+!>    Cleared by DARPA on September 29, 2022
+!>    \endverbatim
+!......................................................................
+!     Arguments
+!     =========
+!
+!>    \param[in] JOBS
+!>    \verbatim
+!>    JOBS (input) CHARACTER*1
+!>    Determines whether the initial data snapshots are scaled
+!>    by a diagonal matrix.
+!>    'S' :: The data snapshots matrices X and Y are multiplied
+!>           with a diagonal matrix D so that X*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'C' :: The snapshots are scaled as with the 'S' option.
+!>           If it is found that an i-th column of X is zero
+!>           vector and the corresponding i-th column of Y is
+!>           non-zero, then the i-th column of Y is set to
+!>           zero and a warning flag is raised.
+!>    'Y' :: The data snapshots matrices X and Y are multiplied
+!>           by a diagonal matrix D so that Y*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'N' :: No data scaling.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBZ
+!>    \verbatim
+!>    JOBZ (input) CHARACTER*1
+!>    Determines whether the eigenvectors (Koopman modes) will
+!>    be computed.
+!>    'V' :: The eigenvectors (Koopman modes) will be computed
+!>           and returned in the matrix Z.
+!>           See the description of Z.
+!>    'F' :: The eigenvectors (Koopman modes) will be returned
+!>           in factored form as the product X(:,1:K)*W, where X
+!>           contains a POD basis (leading left singular vectors
+!>           of the data matrix X) and W contains the eigenvectors
+!>           of the corresponding Rayleigh quotient.
+!>           See the descriptions of K, X, W, Z.
+!>    'N' :: The eigenvectors are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBR
+!>    \verbatim
+!>    JOBR (input) CHARACTER*1
+!>    Determines whether to compute the residuals.
+!>    'R' :: The residuals for the computed eigenpairs will be
+!>           computed and stored in the array RES.
+!>           See the description of RES.
+!>           For this option to be legal, JOBZ must be 'V'.
+!>    'N' :: The residuals are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBF
+!>    \verbatim
+!>    JOBF (input) CHARACTER*1
+!>    Specifies whether to store information needed for post-
+!>    processing (e.g. computing refined Ritz vectors)
+!>    'R' :: The matrix needed for the refinement of the Ritz
+!>           vectors is computed and stored in the array B.
+!>           See the description of B.
+!>    'E' :: The unscaled eigenvectors of the Exact DMD are
+!>           computed and returned in the array B. See the
+!>           description of B.
+!>    'N' :: No eigenvector refinement data is computed.
+!>    \endverbatim
+!.....
+!>    \param[in] WHTSVD
+!>    \verbatim
+!>    WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!>    Allows for a selection of the SVD algorithm from the
+!>    LAPACK library.
+!>    1 :: CGESVD (the QR SVD algorithm)
+!>    2 :: CGESDD (the Divide and Conquer algorithm; if enough
+!>         workspace available, this is the fastest option)
+!>    3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4
+!>         are the most accurate options)
+!>    4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3
+!>         are the most accurate options)
+!>    For the four methods above, a significant difference in
+!>    the accuracy of small singular values is possible if
+!>    the snapshots vary in norm so that X is severely
+!>    ill-conditioned. If small (smaller than EPS*||X||)
+!>    singular values are of interest and JOBS=='N',  then
+!>    the options (3, 4) give the most accurate results, where
+!>    the option 4 is slightly better and with stronger
+!>    theoretical background.
+!>    If JOBS=='S', i.e. the columns of X will be normalized,
+!>    then all methods give nearly equally accurate results.
+!>    \endverbatim
+!.....
+!>    \param[in] M
+!>    \verbatim
+!>    M (input) INTEGER, M>= 0
+!>    The state space dimension (the row dimension of X, Y).
+!>    \endverbatim
+!.....
+!>    \param[in] N
+!>    \verbatim
+!>    N (input) INTEGER, 0 <= N <= M
+!>    The number of data snapshot pairs
+!>    (the number of columns of X and Y).
+!>    \endverbatim
+!.....
+!>    \param[in,out] X
+!>    \verbatim
+!>    X (input/output) COMPLEX(KIND=WP) M-by-N array
+!>    > On entry, X contains the data snapshot matrix X. It is
+!>    assumed that the column norms of X are in the range of
+!>    the normalized floating point numbers.
+!>    < On exit, the leading K columns of X contain a POD basis,
+!>    i.e. the leading K left singular vectors of the input
+!>    data matrix X, U(:,1:K). All N columns of X contain all
+!>    left singular vectors of the input matrix X.
+!>    See the descriptions of K, Z and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDX
+!>    \verbatim
+!>    LDX (input) INTEGER, LDX >= M
+!>    The leading dimension of the array X.
+!>    \endverbatim
+!.....
+!>    \param[in,out] Y
+!>    \verbatim
+!>    Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array
+!>    > On entry, Y contains the data snapshot matrix Y
+!>    < On exit,
+!>    If JOBR == 'R', the leading K columns of Y  contain
+!>    the residual vectors for the computed Ritz pairs.
+!>    See the description of RES.
+!>    If JOBR == 'N', Y contains the original input data,
+!>                    scaled according to the value of JOBS.
+!>    \endverbatim
+!.....
+!>    \param[in] LDY
+!>    \verbatim
+!>    LDY (input) INTEGER , LDY >= M
+!>    The leading dimension of the array Y.
+!>    \endverbatim
+!.....
+!>    \param[in] NRNK
+!>    \verbatim
+!>    NRNK (input) INTEGER
+!>    Determines the mode how to compute the numerical rank,
+!>    i.e. how to truncate small singular values of the input
+!>    matrix X. On input, if
+!>    NRNK = -1 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(1)
+!>                 This option is recommended.
+!>    NRNK = -2 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(i-1)
+!>                 This option is included for R&D purposes.
+!>                 It requires highly accurate SVD, which
+!>                 may not be feasible.
+!>    The numerical rank can be enforced by using positive
+!>    value of NRNK as follows:
+!>    0 < NRNK <= N :: at most NRNK largest singular values
+!>    will be used. If the number of the computed nonzero
+!>    singular values is less than NRNK, then only those
+!>    nonzero values will be used and the actually used
+!>    dimension is less than NRNK. The actual number of
+!>    the nonzero singular values is returned in the variable
+!>    K. See the descriptions of TOL and  K.
+!>    \endverbatim
+!.....
+!>    \param[in] TOL
+!>    \verbatim
+!>    TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!>    The tolerance for truncating small singular values.
+!>    See the description of NRNK.
+!>    \endverbatim
+!.....
+!>    \param[out] K
+!>    \verbatim
+!>    K (output) INTEGER,  0 <= K <= N
+!>    The dimension of the POD basis for the data snapshot
+!>    matrix X and the number of the computed Ritz pairs.
+!>    The value of K is determined according to the rule set
+!>    by the parameters NRNK and TOL.
+!>    See the descriptions of NRNK and TOL.
+!>    \endverbatim
+!.....
+!>    \param[out] EIGS
+!>    \verbatim
+!>    EIGS (output) COMPLEX(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of EIGS contain
+!>    the computed eigenvalues (Ritz values).
+!>    See the descriptions of K, and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] Z
+!>    \verbatim
+!>    Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array
+!>    If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i)
+!>    is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!>    If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!>    the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
+!>    is an eigenvector corresponding to EIGS(i). The columns
+!>    of W(1:k,1:K) are the computed eigenvectors of the
+!>    K-by-K Rayleigh quotient.
+!>    See the descriptions of EIGS, X and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDZ
+!>    \verbatim
+!>    LDZ (input) INTEGER , LDZ >= M
+!>    The leading dimension of the array Z.
+!>    \endverbatim
+!.....
+!>    \param[out] RES
+!>    \verbatim
+!>    RES (output) REAL(KIND=WP) N-by-1 array
+!>    RES(1:K) contains the residuals for the K computed
+!>    Ritz pairs,
+!>    RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
+!>    See the description of EIGS and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] B
+!>    \verbatim
+!>    B (output) COMPLEX(KIND=WP)  M-by-N array.
+!>    IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!>    be used for computing the refined vectors; see further
+!>    details in the provided references.
+!>    If JOBF == 'E', B(1:M,1:K) contains
+!>    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!>    Exact DMD, up to scaling by the inverse eigenvalues.
+!>    If JOBF =='N', then B is not referenced.
+!>    See the descriptions of X, W, K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDB
+!>    \verbatim
+!>    LDB (input) INTEGER, LDB >= M
+!>    The leading dimension of the array B.
+!>    \endverbatim
+!.....
+!>    \param[out] W
+!>    \verbatim
+!>    W (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!>    On exit, W(1:K,1:K) contains the K computed
+!>    eigenvectors of the matrix Rayleigh quotient.
+!>    The Ritz vectors (returned in Z) are the
+!>    product of X (containing a POD basis for the input
+!>    matrix X) and W. See the descriptions of K, S, X and Z.
+!>    W is also used as a workspace to temporarily store the
+!>    right singular vectors of X.
+!>    \endverbatim
+!.....
+!>    \param[in] LDW
+!>    \verbatim
+!>    LDW (input) INTEGER, LDW >= N
+!>    The leading dimension of the array W.
+!>    \endverbatim
+!.....
+!>    \param[out] S
+!>    \verbatim
+!>    S (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!>    The array S(1:K,1:K) is used for the matrix Rayleigh
+!>    quotient. This content is overwritten during
+!>    the eigenvalue decomposition by CGEEV.
+!>    See the description of K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDS
+!>    \verbatim
+!>    LDS (input) INTEGER, LDS >= N
+!>    The leading dimension of the array S.
+!>    \endverbatim
+!.....
+!>    \param[out] ZWORK
+!>    \verbatim
+!>    ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+!>    ZWORK is used as complex workspace in the complex SVD, as
+!>    specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing
+!>    the eigenvalues of a Rayleigh quotient.
+!>    If the call to CGEDMD is only workspace query, then
+!>    ZWORK(1) contains the minimal complex workspace length and
+!>    ZWORK(2) is the optimal complex workspace length.
+!>    Hence, the length of work is at least 2.
+!>    See the description of LZWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LZWORK
+!>    \verbatim
+!>    LZWORK (input) INTEGER
+!>    The minimal length of the workspace vector ZWORK.
+!>    LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV),
+!>    where LZWORK_CGEEV = MAX( 1, 2*N )  and the minimal
+!>    LZWORK_SVD is calculated as follows
+!>    If WHTSVD == 1 :: CGESVD ::
+!>       LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+!>    If WHTSVD == 2 :: CGESDD ::
+!>       LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+!>    If WHTSVD == 3 :: CGESVDQ ::
+!>       LZWORK_SVD = obtainable by a query
+!>    If WHTSVD == 4 :: CGEJSV ::
+!>       LZWORK_SVD = obtainable by a query
+!>    If on entry LZWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths and returns them in
+!>    LZWORK(1) and LZWORK(2), respectively.
+!>    \endverbatim
+!.....
+!>    \param[out] RWORK
+!>    \verbatim
+!>    RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
+!>    On exit, RWORK(1:N) contains the singular values of
+!>    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!>    If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain
+!>    scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
+!>    and Y to avoid overflow in the SVD of X.
+!>    This may be of interest if the scaling option is off
+!>    and as many as possible smallest eigenvalues are
+!>    desired to the highest feasible accuracy.
+!>    If the call to CGEDMD is only workspace query, then
+!>    RWORK(1) contains the minimal workspace length.
+!>    See the description of LRWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LRWORK
+!>    \verbatim
+!>    LRWORK (input) INTEGER
+!>    The minimal length of the workspace vector RWORK.
+!>    LRWORK is calculated as follows:
+!>    LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where
+!>    LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace
+!>    for the SVD subroutine determined by the input parameter
+!>    WHTSVD.
+!>    If WHTSVD == 1 :: CGESVD ::
+!>       LRWORK_SVD = 5*MIN(M,N)
+!>    If WHTSVD == 2 :: CGESDD ::
+!>       LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
+!>       2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
+!>    If WHTSVD == 3 :: CGESVDQ ::
+!>       LRWORK_SVD = obtainable by a query
+!>    If WHTSVD == 4 :: CGEJSV ::
+!>       LRWORK_SVD = obtainable by a query
+!>    If on entry LRWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    real workspace length and returns it in RWORK(1).
+!>    \endverbatim
+!.....
+!>    \param[out] IWORK
+!>    \verbatim
+!>    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!>    Workspace that is required only if WHTSVD equals
+!>    2 , 3 or 4. (See the description of WHTSVD).
+!>    If on entry LWORK =-1 or LIWORK=-1, then the
+!>    minimal length of IWORK is computed and returned in
+!>    IWORK(1). See the description of LIWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LIWORK
+!>    \verbatim
+!>    LIWORK (input) INTEGER
+!>    The minimal length of the workspace vector IWORK.
+!>    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!>    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!>    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!>    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!>    If on entry LIWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for  ZWORK, RWORK and
+!>    IWORK. See the descriptions of ZWORK, RWORK and IWORK.
+!>    \endverbatim
+!.....
+!>    \param[out] INFO
+!>    \verbatim
+!>    INFO (output) INTEGER
+!>    -i < 0 :: On entry, the i-th argument had an
+!>              illegal value
+!>       = 0 :: Successful return.
+!>       = 1 :: Void input. Quick exit (M=0 or N=0).
+!>       = 2 :: The SVD computation of X did not converge.
+!>              Suggestion: Check the input data and/or
+!>              repeat with different WHTSVD.
+!>       = 3 :: The computation of the eigenvalues did not
+!>              converge.
+!>       = 4 :: If data scaling was requested on input and
+!>              the procedure found inconsistency in the data
+!>              such that for some column index i,
+!>              X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!>              to zero if JOBS=='C'. The computation proceeds
+!>              with original or modified data and warning
+!>              flag is set with INFO=4.
+!>    \endverbatim
+!
+!  Authors:
+!  ========
+!
+!> \author Zlatko Drmac
+!
+!> \ingroup gedmd
+!
+!.............................................................
+!.............................................................
       SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
                          M, N, X, LDX, Y, LDY, NRNK, TOL,   &
                          K, EIGS, Z, LDZ, RES, B,    LDB,   &
                          W, LDW,  S, LDS, ZWORK,  LZWORK,   &
                          RWORK, LRWORK, IWORK, LIWORK, INFO )
-!   March 2023
+!
+!  -- LAPACK driver routine                                           --
+!
+!  -- LAPACK is a software package provided by University of          --
+!  -- Tennessee, University of California Berkeley, University of     --
+!  -- Colorado Denver and NAG Ltd..                                   --
+!
 !.....
       USE                   iso_fortran_env
       IMPLICIT NONE
       INTEGER, PARAMETER :: WP = real32
-!.....
+!
 !     Scalar arguments
+!     ~~~~~~~~~~~~~~~~
       CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
       INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
                                  NRNK, LDZ, LDB, LDW,  LDS, &
                                  LIWORK, LRWORK, LZWORK
       INTEGER,       INTENT(OUT)  :: K, INFO
       REAL(KIND=WP), INTENT(IN)   ::    TOL
+!
 !     Array arguments
+!     ~~~~~~~~~~~~~~~
       COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
       COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
                                          W(LDW,*), S(LDS,*)
@@ -25,364 +529,14 @@
       REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
       REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
       INTEGER,          INTENT(OUT)   :: IWORK(*)
-!............................................................
-!     Purpose
-!     =======
-!     CGEDMD computes the Dynamic Mode Decomposition (DMD) for
-!     a pair of data snapshot matrices. For the input matrices
-!     X and Y such that Y = A*X with an unaccessible matrix
-!     A, CGEDMD computes a certain number of Ritz pairs of A using
-!     the standard Rayleigh-Ritz extraction from a subspace of
-!     range(X) that is determined using the leading left singular
-!     vectors of X. Optionally, CGEDMD returns the residuals
-!     of the computed Ritz pairs, the information needed for
-!     a refinement of the Ritz vectors, or the eigenvectors of
-!     the Exact DMD.
-!     For further details see the references listed
-!     below. For more details of the implementation see [3].
-!
-!     References
-!     ==========
-!     [1] P. Schmid: Dynamic mode decomposition of numerical
-!         and experimental data,
-!         Journal of Fluid Mechanics 656, 5-28, 2010.
-!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
-!         decompositions: analysis and enhancements,
-!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
-!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
-!         Mode Decomposition I. Technical report. AIMDyn Inc.
-!         and LAPACK Working Note 298.
-!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
-!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
-!         Theory and Applications, Journal of Computational
-!         Dynamics 1(2), 391 -421, 2014.
 !
-!......................................................................
-!     Developed and supported by:
-!     ===========================
-!     Developed and coded by Zlatko Drmac, Faculty of Science,
-!     University of Zagreb;  drmac@math.hr
-!     In cooperation with
-!     AIMdyn Inc., Santa Barbara, CA.
-!     and supported by
-!     - DARPA SBIR project "Koopman Operator-Based Forecasting
-!     for Nonstationary Processes from Near-Term, Limited
-!     Observational Data" Contract No: W31P4Q-21-C-0007
-!     - DARPA PAI project "Physics-Informed Machine Learning
-!     Methodologies" Contract No: HR0011-18-9-0033
-!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
-!     Framework for Space-Time Analysis of Process Dynamics"
-!     Contract No: HR0011-16-C-0116
-!     Any opinions, findings and conclusions or recommendations
-!     expressed in this material are those of the author and
-!     do not necessarily reflect the views of the DARPA SBIR
-!     Program Office
-!============================================================
-!     Distribution Statement A:
-!     Approved for Public Release, Distribution Unlimited.
-!     Cleared by DARPA on September 29, 2022
-!============================================================
-!......................................................................
-!     Arguments
-!     =========
-!     JOBS (input) CHARACTER*1
-!     Determines whether the initial data snapshots are scaled
-!     by a diagonal matrix.
-!     'S' :: The data snapshots matrices X and Y are multiplied
-!            with a diagonal matrix D so that X*D has unit
-!            nonzero columns (in the Euclidean 2-norm)
-!     'C' :: The snapshots are scaled as with the 'S' option.
-!            If it is found that an i-th column of X is zero
-!            vector and the corresponding i-th column of Y is
-!            non-zero, then the i-th column of Y is set to
-!            zero and a warning flag is raised.
-!     'Y' :: The data snapshots matrices X and Y are multiplied
-!            by a diagonal matrix D so that Y*D has unit
-!            nonzero columns (in the Euclidean 2-norm)
-!     'N' :: No data scaling.
-!.....
-!     JOBZ (input) CHARACTER*1
-!     Determines whether the eigenvectors (Koopman modes) will
-!     be computed.
-!     'V' :: The eigenvectors (Koopman modes) will be computed
-!            and returned in the matrix Z.
-!            See the description of Z.
-!     'F' :: The eigenvectors (Koopman modes) will be returned
-!            in factored form as the product X(:,1:K)*W, where X
-!            contains a POD basis (leading left singular vectors
-!            of the data matrix X) and W contains the eigenvectors
-!            of the corresponding Rayleigh quotient.
-!            See the descriptions of K, X, W, Z.
-!     'N' :: The eigenvectors are not computed.
-!.....
-!     JOBR (input) CHARACTER*1
-!     Determines whether to compute the residuals.
-!     'R' :: The residuals for the computed eigenpairs will be
-!            computed and stored in the array RES.
-!            See the description of RES.
-!            For this option to be legal, JOBZ must be 'V'.
-!     'N' :: The residuals are not computed.
-!.....
-!     JOBF (input) CHARACTER*1
-!     Specifies whether to store information needed for post-
-!     processing (e.g. computing refined Ritz vectors)
-!     'R' :: The matrix needed for the refinement of the Ritz
-!            vectors is computed and stored in the array B.
-!            See the description of B.
-!     'E' :: The unscaled eigenvectors of the Exact DMD are
-!            computed and returned in the array B. See the
-!            description of B.
-!     'N' :: No eigenvector refinement data is computed.
-!.....
-!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
-!     Allows for a selection of the SVD algorithm from the
-!     LAPACK library.
-!     1 :: CGESVD (the QR SVD algorithm)
-!     2 :: CGESDD (the Divide and Conquer algorithm; if enough
-!          workspace available, this is the fastest option)
-!     3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4
-!          are the most accurate options)
-!     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3
-!          are the most accurate options)
-!     For the four methods above, a significant difference in
-!     the accuracy of small singular values is possible if
-!     the snapshots vary in norm so that X is severely
-!     ill-conditioned. If small (smaller than EPS*||X||)
-!     singular values are of interest and JOBS=='N',  then
-!     the options (3, 4) give the most accurate results, where
-!     the option 4 is slightly better and with stronger
-!     theoretical background.
-!     If JOBS=='S', i.e. the columns of X will be normalized,
-!     then all methods give nearly equally accurate results.
-!.....
-!     M (input) INTEGER, M>= 0
-!     The state space dimension (the row dimension of X, Y).
-!.....
-!     N (input) INTEGER, 0 <= N <= M
-!     The number of data snapshot pairs
-!     (the number of columns of X and Y).
-!.....
-!     X (input/output) COMPLEX(KIND=WP) M-by-N array
-!   > On entry, X contains the data snapshot matrix X. It is
-!     assumed that the column norms of X are in the range of
-!     the normalized floating point numbers.
-!   < On exit, the leading K columns of X contain a POD basis,
-!     i.e. the leading K left singular vectors of the input
-!     data matrix X, U(:,1:K). All N columns of X contain all
-!     left singular vectors of the input matrix X.
-!     See the descriptions of K, Z and W.
-!.....
-!     LDX (input) INTEGER, LDX >= M
-!     The leading dimension of the array X.
-!.....
-!     Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array
-!   > On entry, Y contains the data snapshot matrix Y
-!   < On exit,
-!     If JOBR == 'R', the leading K columns of Y  contain
-!     the residual vectors for the computed Ritz pairs.
-!     See the description of RES.
-!     If JOBR == 'N', Y contains the original input data,
-!                     scaled according to the value of JOBS.
-!.....
-!     LDY (input) INTEGER , LDY >= M
-!     The leading dimension of the array Y.
-!.....
-!     NRNK (input) INTEGER
-!     Determines the mode how to compute the numerical rank,
-!     i.e. how to truncate small singular values of the input
-!     matrix X. On input, if
-!     NRNK = -1 :: i-th singular value sigma(i) is truncated
-!                  if sigma(i) <= TOL*sigma(1)
-!                  This option is recommended.
-!     NRNK = -2 :: i-th singular value sigma(i) is truncated
-!                  if sigma(i) <= TOL*sigma(i-1)
-!                  This option is included for R&D purposes.
-!                  It requires highly accurate SVD, which
-!                  may not be feasible.
-!     The numerical rank can be enforced by using positive
-!     value of NRNK as follows:
-!     0 < NRNK <= N :: at most NRNK largest singular values
-!     will be used. If the number of the computed nonzero
-!     singular values is less than NRNK, then only those
-!     nonzero values will be used and the actually used
-!     dimension is less than NRNK. The actual number of
-!     the nonzero singular values is returned in the variable
-!     K. See the descriptions of TOL and  K.
-!.....
-!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
-!     The tolerance for truncating small singular values.
-!     See the description of NRNK.
-!.....
-!     K (output) INTEGER,  0 <= K <= N
-!     The dimension of the POD basis for the data snapshot
-!     matrix X and the number of the computed Ritz pairs.
-!     The value of K is determined according to the rule set
-!     by the parameters NRNK and TOL.
-!     See the descriptions of NRNK and TOL.
-!.....
-!     EIGS (output) COMPLEX(KIND=WP) N-by-1 array
-!     The leading K (K<=N) entries of EIGS contain
-!     the computed eigenvalues (Ritz values).
-!     See the descriptions of K, and Z.
-!.....
-!     Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array
-!     If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i)
-!     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
-!     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
-!     the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
-!     is an eigenvector corresponding to EIGS(i). The columns
-!     of W(1:k,1:K) are the computed eigenvectors of the
-!     K-by-K Rayleigh quotient.
-!     See the descriptions of EIGS, X and W.
-!.....
-!     LDZ (input) INTEGER , LDZ >= M
-!     The leading dimension of the array Z.
-!.....
-!     RES (output) REAL(KIND=WP) N-by-1 array
-!     RES(1:K) contains the residuals for the K computed
-!     Ritz pairs,
-!     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
-!     See the description of EIGS and Z.
-!.....
-!     B (output) COMPLEX(KIND=WP)  M-by-N array.
-!     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
-!     be used for computing the refined vectors; see further
-!     details in the provided references.
-!     If JOBF == 'E', B(1:M,1:K) contains
-!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
-!     Exact DMD, up to scaling by the inverse eigenvalues.
-!     If JOBF =='N', then B is not referenced.
-!     See the descriptions of X, W, K.
-!.....
-!     LDB (input) INTEGER, LDB >= M
-!     The leading dimension of the array B.
-!.....
-!     W (workspace/output) COMPLEX(KIND=WP) N-by-N array
-!     On exit, W(1:K,1:K) contains the K computed
-!     eigenvectors of the matrix Rayleigh quotient.
-!     The Ritz vectors (returned in Z) are the
-!     product of X (containing a POD basis for the input
-!     matrix X) and W. See the descriptions of K, S, X and Z.
-!     W is also used as a workspace to temporarily store the
-!     right singular vectors of X.
-!.....
-!     LDW (input) INTEGER, LDW >= N
-!     The leading dimension of the array W.
-!.....
-!     S (workspace/output) COMPLEX(KIND=WP) N-by-N array
-!     The array S(1:K,1:K) is used for the matrix Rayleigh
-!     quotient. This content is overwritten during
-!     the eigenvalue decomposition by CGEEV.
-!     See the description of K.
-!.....
-!     LDS (input) INTEGER, LDS >= N
-!     The leading dimension of the array S.
-!.....
-!     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
-!     ZWORK is used as complex workspace in the complex SVD, as
-!     specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing
-!     the eigenvalues of a Rayleigh quotient.
-!     If the call to CGEDMD is only workspace query, then
-!     ZWORK(1) contains the minimal complex workspace length and
-!     ZWORK(2) is the optimal complex workspace length.
-!     Hence, the length of work is at least 2.
-!     See the description of LZWORK.
-!.....
-!     LZWORK (input) INTEGER
-!     The minimal length of the workspace vector ZWORK.
-!     LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV),
-!     where LZWORK_CGEEV = MAX( 1, 2*N )  and the minimal
-!     LZWORK_SVD is calculated as follows
-!     If WHTSVD == 1 :: CGESVD ::
-!        LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
-!     If WHTSVD == 2 :: CGESDD ::
-!        LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
-!     If WHTSVD == 3 :: CGESVDQ ::
-!        LZWORK_SVD = obtainable by a query
-!     If WHTSVD == 4 :: CGEJSV ::
-!        LZWORK_SVD = obtainable by a query
-!     If on entry LZWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     and the optimal workspace lengths and returns them in
-!     LZWORK(1) and LZWORK(2), respectively.
-!.....
-!     RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
-!     On exit, RWORK(1:N) contains the singular values of
-!     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
-!     If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain
-!     scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
-!     and Y to avoid overflow in the SVD of X.
-!     This may be of interest if the scaling option is off
-!     and as many as possible smallest eigenvalues are
-!     desired to the highest feasible accuracy.
-!     If the call to CGEDMD is only workspace query, then
-!     RWORK(1) contains the minimal workspace length.
-!     See the description of LRWORK.
-!.....
-!     LRWORK (input) INTEGER
-!     The minimal length of the workspace vector RWORK.
-!     LRWORK is calculated as follows:
-!     LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where
-!     LRWORK_CGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace
-!     for the SVD subroutine determined by the input parameter
-!     WHTSVD.
-!     If WHTSVD == 1 :: CGESVD ::
-!        LRWORK_SVD = 5*MIN(M,N)
-!     If WHTSVD == 2 :: CGESDD ::
-!        LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
-!        2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
-!     If WHTSVD == 3 :: CGESVDQ ::
-!        LRWORK_SVD = obtainable by a query
-!     If WHTSVD == 4 :: CGEJSV ::
-!        LRWORK_SVD = obtainable by a query
-!     If on entry LRWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     real workspace length and returns it in RWORK(1).
-!.....
-!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
-!     Workspace that is required only if WHTSVD equals
-!     2 , 3 or 4. (See the description of WHTSVD).
-!     If on entry LWORK =-1 or LIWORK=-1, then the
-!     minimal length of IWORK is computed and returned in
-!     IWORK(1). See the description of LIWORK.
-!.....
-!     LIWORK (input) INTEGER
-!     The minimal length of the workspace vector IWORK.
-!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
-!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
-!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
-!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
-!     If on entry LIWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     and the optimal workspace lengths for  ZWORK, RWORK and
-!     IWORK. See the descriptions of ZWORK, RWORK and IWORK.
-!.....
-!     INFO (output) INTEGER
-!     -i < 0 :: On entry, the i-th argument had an
-!               illegal value
-!        = 0 :: Successful return.
-!        = 1 :: Void input. Quick exit (M=0 or N=0).
-!        = 2 :: The SVD computation of X did not converge.
-!               Suggestion: Check the input data and/or
-!               repeat with different WHTSVD.
-!        = 3 :: The computation of the eigenvalues did not
-!               converge.
-!        = 4 :: If data scaling was requested on input and
-!               the procedure found inconsistency in the data
-!               such that for some column index i,
-!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
-!               to zero if JOBS=='C'. The computation proceeds
-!               with original or modified data and warning
-!               flag is set with INFO=4.
-!.............................................................
-!.............................................................
 !     Parameters
 !     ~~~~~~~~~~
       REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
       REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
       COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
       COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
-
+!
 !     Local scalars
 !     ~~~~~~~~~~~~~
       REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,   &
@@ -400,7 +554,7 @@
 !     Local arrays
 !     ~~~~~~~~~~~~
       REAL(KIND=WP) :: RDUMMY(2)
-
+!
 !     External functions (BLAS and LAPACK)
 !     ~~~~~~~~~~~~~~~~~
       REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2
@@ -408,13 +562,13 @@
       INTEGER                               ICAMAX
       LOGICAL       SISNAN, LSAME
       EXTERNAL      SISNAN, LSAME
-
+!
 !     External subroutines (BLAS and LAPACK)
 !     ~~~~~~~~~~~~~~~~~~~~
       EXTERNAL      CAXPY,  CGEMM,  CSSCAL
       EXTERNAL      CGEEV,  CGEJSV, CGESDD, CGESVD, CGESVDQ, &
                     CLACPY, CLASCL, CLASSQ, XERBLA
-
+!
 !     Intrinsic functions
 !     ~~~~~~~~~~~~~~~~~~~
       INTRINSIC     FLOAT, INT, MAX, SQRT
@@ -607,7 +761,8 @@
           K = 0
           DO i = 1, N
             !WORK(i) = SCNRM2( M, X(1,i), 1 )
-            SCALE  = ZERO
+            SSUM  = ONE
+            SCALE = ZERO
             CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM )
             IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
                 K    =  0
@@ -680,7 +835,8 @@
           ! carefully computed using CLASSQ.
           DO i = 1, N
             !RWORK(i) = SCNRM2( M, Y(1,i), 1 )
-            SCALE  = ZERO
+            SSUM  = ONE
+            SCALE = ZERO
             CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM )
             IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
                 K    =  0
diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90
index 20424808f..15df48fe9 100644
--- a/lapack-netlib/SRC/dgedmd.f90
+++ b/lapack-netlib/SRC/dgedmd.f90
@@ -1,424 +1,574 @@
-      SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
-                         M, N, X, LDX, Y, LDY, NRNK, TOL,  &
-                         K, REIG,  IMEIG,   Z, LDZ,  RES,  &
-                         B, LDB, W,  LDW,   S, LDS,        &
-                         WORK, LWORK, IWORK, LIWORK, INFO )
-! March 2023
+!> \brief \b DGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices.
+!
+!  =========== DOCUMENTATION ===========
+!
+!  Definition:
+!  ===========
+!
+!     SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+!                        M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+!                        K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+!                        B, LDB, W,  LDW,   S, LDS,        &
+!                        WORK, LWORK, IWORK, LIWORK, INFO )
+!
 !.....
-      USE                   iso_fortran_env
-      IMPLICIT NONE
-      INTEGER, PARAMETER :: WP = real64
+!     USE                   iso_fortran_env
+!     IMPLICIT NONE
+!     INTEGER, PARAMETER :: WP = real64
 !.....
 !     Scalar arguments
-      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
-      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
-                                 NRNK, LDZ, LDB, LDW,  LDS, &
-                                 LWORK,  LIWORK
-      INTEGER,   INTENT(OUT)  :: K, INFO
-      REAL(KIND=WP), INTENT(IN)  :: TOL
+!     CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+!     INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+!                                NRNK, LDZ, LDB, LDW,  LDS, &
+!                                LWORK,  LIWORK
+!     INTEGER,   INTENT(OUT)  :: K, INFO
+!     REAL(KIND=WP), INTENT(IN)  :: TOL
 !     Array arguments
-      REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
-      REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
-                                      W(LDW,*), S(LDS,*)
-      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
-                                      RES(*)
-      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
-      INTEGER,       INTENT(OUT)   :: IWORK(*)
-!............................................................
-!     Purpose
-!     =======
-!     DGEDMD computes the Dynamic Mode Decomposition (DMD) for
-!     a pair of data snapshot matrices. For the input matrices
-!     X and Y such that Y = A*X with an unaccessible matrix
-!     A, DGEDMD computes a certain number of Ritz pairs of A using
-!     the standard Rayleigh-Ritz extraction from a subspace of
-!     range(X) that is determined using the leading left singular
-!     vectors of X. Optionally, DGEDMD returns the residuals
-!     of the computed Ritz pairs, the information needed for
-!     a refinement of the Ritz vectors, or the eigenvectors of
-!     the Exact DMD.
-!     For further details see the references listed
-!     below. For more details of the implementation see [3].
-!
-!     References
-!     ==========
-!     [1] P. Schmid: Dynamic mode decomposition of numerical
-!         and experimental data,
-!         Journal of Fluid Mechanics 656, 5-28, 2010.
-!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
-!         decompositions: analysis and enhancements,
-!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
-!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
-!         Mode Decomposition I. Technical report. AIMDyn Inc.
-!         and LAPACK Working Note 298.
-!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
-!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
-!         Theory and Applications, Journal of Computational
-!         Dynamics 1(2), 391 -421, 2014.
+!     REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+!     REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+!                                     W(LDW,*), S(LDS,*)
+!     REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+!                                     RES(*)
+!     REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+!     INTEGER,       INTENT(OUT)   :: IWORK(*)
 !
-!......................................................................
-!     Developed and supported by:
-!     ===========================
-!     Developed and coded by Zlatko Drmac, Faculty of Science,
-!     University of Zagreb;  drmac@math.hr
-!     In cooperation with
-!     AIMdyn Inc., Santa Barbara, CA.
-!     and supported by
-!     - DARPA SBIR project "Koopman Operator-Based Forecasting
-!     for Nonstationary Processes from Near-Term, Limited
-!     Observational Data" Contract No: W31P4Q-21-C-0007
-!     - DARPA PAI project "Physics-Informed Machine Learning
-!     Methodologies" Contract No: HR0011-18-9-0033
-!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
-!     Framework for Space-Time Analysis of Process Dynamics"
-!     Contract No: HR0011-16-C-0116
-!     Any opinions, findings and conclusions or recommendations
-!     expressed in this material are those of the author and
-!     do not necessarily reflect the views of the DARPA SBIR
-!     Program Office
-!============================================================
-!     Distribution Statement A:
-!     Approved for Public Release, Distribution Unlimited.
-!     Cleared by DARPA on September 29, 2022
-!============================================================
 !............................................................
+!>    \par Purpose:
+!     =============
+!>    \verbatim
+!>    DGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!>    a pair of data snapshot matrices. For the input matrices
+!>    X and Y such that Y = A*X with an unaccessible matrix
+!>    A, DGEDMD computes a certain number of Ritz pairs of A using
+!>    the standard Rayleigh-Ritz extraction from a subspace of
+!>    range(X) that is determined using the leading left singular
+!>    vectors of X. Optionally, DGEDMD returns the residuals
+!>    of the computed Ritz pairs, the information needed for
+!>    a refinement of the Ritz vectors, or the eigenvectors of
+!>    the Exact DMD.
+!>    For further details see the references listed
+!>    below. For more details of the implementation see [3].
+!>    \endverbatim
+!............................................................
+!>    \par References:
+!     ================
+!>    \verbatim
+!>    [1] P. Schmid: Dynamic mode decomposition of numerical
+!>        and experimental data,
+!>        Journal of Fluid Mechanics 656, 5-28, 2010.
+!>    [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!>        decompositions: analysis and enhancements,
+!>        SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!>    [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!>        Mode Decomposition I. Technical report. AIMDyn Inc.
+!>        and LAPACK Working Note 298.
+!>    [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!>        Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!>        Theory and Applications, Journal of Computational
+!>        Dynamics 1(2), 391 -421, 2014.
+!>    \endverbatim
+!......................................................................
+!>    \par Developed and supported by:
+!     ================================
+!>    \verbatim
+!>    Developed and coded by Zlatko Drmac, Faculty of Science,
+!>    University of Zagreb;  drmac@math.hr
+!>    In cooperation with
+!>    AIMdyn Inc., Santa Barbara, CA.
+!>    and supported by
+!>    - DARPA SBIR project "Koopman Operator-Based Forecasting
+!>    for Nonstationary Processes from Near-Term, Limited
+!>    Observational Data" Contract No: W31P4Q-21-C-0007
+!>    - DARPA PAI project "Physics-Informed Machine Learning
+!>    Methodologies" Contract No: HR0011-18-9-0033
+!>    - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!>    Framework for Space-Time Analysis of Process Dynamics"
+!>    Contract No: HR0011-16-C-0116
+!>    Any opinions, findings and conclusions or recommendations
+!>    expressed in this material are those of the author and
+!>    do not necessarily reflect the views of the DARPA SBIR
+!>    Program Office
+!>    \endverbatim
+!......................................................................
+!>    \par Distribution Statement A:
+!     ==============================
+!>    \verbatim
+!>    Approved for Public Release, Distribution Unlimited.
+!>    Cleared by DARPA on September 29, 2022
+!>    \endverbatim
+!......................................................................
 !     Arguments
 !     =========
-!     JOBS (input) CHARACTER*1
-!     Determines whether the initial data snapshots are scaled
-!     by a diagonal matrix.
-!     'S' :: The data snapshots matrices X and Y are multiplied
-!            with a diagonal matrix D so that X*D has unit
-!            nonzero columns (in the Euclidean 2-norm)
-!     'C' :: The snapshots are scaled as with the 'S' option.
-!            If it is found that an i-th column of X is zero
-!            vector and the corresponding i-th column of Y is
-!            non-zero, then the i-th column of Y is set to
-!            zero and a warning flag is raised.
-!     'Y' :: The data snapshots matrices X and Y are multiplied
-!            by a diagonal matrix D so that Y*D has unit
-!            nonzero columns (in the Euclidean 2-norm)
-!     'N' :: No data scaling.
+!
+!>    \param[in] JOBS
+!>    \verbatim
+!>    JOBS (input) is CHARACTER*1
+!>    Determines whether the initial data snapshots are scaled
+!>    by a diagonal matrix.
+!>    'S' :: The data snapshots matrices X and Y are multiplied
+!>           with a diagonal matrix D so that X*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'C' :: The snapshots are scaled as with the 'S' option.
+!>           If it is found that an i-th column of X is zero
+!>           vector and the corresponding i-th column of Y is
+!>           non-zero, then the i-th column of Y is set to
+!>           zero and a warning flag is raised.
+!>    'Y' :: The data snapshots matrices X and Y are multiplied
+!>           by a diagonal matrix D so that Y*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'N' :: No data scaling.
+!>    \endverbatim
 !.....
-!     JOBZ (input) CHARACTER*1
-!     Determines whether the eigenvectors (Koopman modes) will
-!     be computed.
-!     'V' :: The eigenvectors (Koopman modes) will be computed
-!            and returned in the matrix Z.
-!            See the description of Z.
-!     'F' :: The eigenvectors (Koopman modes) will be returned
-!            in factored form as the product X(:,1:K)*W, where X
-!            contains a POD basis (leading left singular vectors
-!            of the data matrix X) and W contains the eigenvectors
-!            of the corresponding Rayleigh quotient.
-!            See the descriptions of K, X, W, Z.
-!     'N' :: The eigenvectors are not computed.
+!>    \param[in] JOBZ
+!>    \verbatim
+!>    JOBZ (input) CHARACTER*1
+!>    Determines whether the eigenvectors (Koopman modes) will
+!>    be computed.
+!>    'V' :: The eigenvectors (Koopman modes) will be computed
+!>           and returned in the matrix Z.
+!>           See the description of Z.
+!>    'F' :: The eigenvectors (Koopman modes) will be returned
+!>           in factored form as the product X(:,1:K)*W, where X
+!>           contains a POD basis (leading left singular vectors
+!>           of the data matrix X) and W contains the eigenvectors
+!>           of the corresponding Rayleigh quotient.
+!>           See the descriptions of K, X, W, Z.
+!>    'N' :: The eigenvectors are not computed.
+!>    \endverbatim
 !.....
-!     JOBR (input) CHARACTER*1
-!     Determines whether to compute the residuals.
-!     'R' :: The residuals for the computed eigenpairs will be
-!            computed and stored in the array RES.
-!            See the description of RES.
-!            For this option to be legal, JOBZ must be 'V'.
-!     'N' :: The residuals are not computed.
+!>    \param[in] JOBR
+!>    \verbatim
+!>    JOBR (input) CHARACTER*1
+!>    Determines whether to compute the residuals.
+!>    'R' :: The residuals for the computed eigenpairs will be
+!>           computed and stored in the array RES.
+!>           See the description of RES.
+!>           For this option to be legal, JOBZ must be 'V'.
+!>    'N' :: The residuals are not computed.
+!>    \endverbatim
 !.....
-!     JOBF (input) CHARACTER*1
-!     Specifies whether to store information needed for post-
-!     processing (e.g. computing refined Ritz vectors)
-!     'R' :: The matrix needed for the refinement of the Ritz
-!            vectors is computed and stored in the array B.
-!            See the description of B.
-!     'E' :: The unscaled eigenvectors of the Exact DMD are
-!            computed and returned in the array B. See the
-!            description of B.
-!     'N' :: No eigenvector refinement data is computed.
+!>    \param[in] JOBF
+!>    \verbatim
+!>    JOBF (input) CHARACTER*1
+!>    Specifies whether to store information needed for post-
+!>    processing (e.g. computing refined Ritz vectors)
+!>    'R' :: The matrix needed for the refinement of the Ritz
+!>           vectors is computed and stored in the array B.
+!>           See the description of B.
+!>    'E' :: The unscaled eigenvectors of the Exact DMD are
+!>           computed and returned in the array B. See the
+!>           description of B.
+!>    'N' :: No eigenvector refinement data is computed.
+!>    \endverbatim
 !.....
-!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
-!     Allows for a selection of the SVD algorithm from the
-!     LAPACK library.
-!     1 :: DGESVD (the QR SVD algorithm)
-!     2 :: DGESDD (the Divide and Conquer algorithm; if enough
-!          workspace available, this is the fastest option)
-!     3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4
-!          are the most accurate options)
-!     4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3
-!          are the most accurate options)
-!     For the four methods above, a significant difference in
-!     the accuracy of small singular values is possible if
-!     the snapshots vary in norm so that X is severely
-!     ill-conditioned. If small (smaller than EPS*||X||)
-!     singular values are of interest and JOBS=='N',  then
-!     the options (3, 4) give the most accurate results, where
-!     the option 4 is slightly better and with stronger
-!     theoretical background.
-!     If JOBS=='S', i.e. the columns of X will be normalized,
-!     then all methods give nearly equally accurate results.
+!>    \param[in] WHTSVD
+!>    \verbatim
+!>    WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!>    Allows for a selection of the SVD algorithm from the
+!>    LAPACK library.
+!>    1 :: DGESVD (the QR SVD algorithm)
+!>    2 :: DGESDD (the Divide and Conquer algorithm; if enough
+!>         workspace available, this is the fastest option)
+!>    3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4
+!>         are the most accurate options)
+!>    4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3
+!>         are the most accurate options)
+!>    For the four methods above, a significant difference in
+!>    the accuracy of small singular values is possible if
+!>    the snapshots vary in norm so that X is severely
+!>    ill-conditioned. If small (smaller than EPS*||X||)
+!>    singular values are of interest and JOBS=='N',  then
+!>    the options (3, 4) give the most accurate results, where
+!>    the option 4 is slightly better and with stronger
+!>    theoretical background.
+!>    If JOBS=='S', i.e. the columns of X will be normalized,
+!>    then all methods give nearly equally accurate results.
+!>    \endverbatim
 !.....
-!     M (input) INTEGER, M>= 0
-!     The state space dimension (the row dimension of X, Y).
+!>    \param[in] M
+!>    \verbatim
+!>    M (input) INTEGER, M>= 0
+!>    The state space dimension (the row dimension of X, Y).
+!>    \endverbatim
 !.....
-!     N (input) INTEGER, 0 <= N <= M
-!     The number of data snapshot pairs
-!     (the number of columns of X and Y).
+!>    \param[in] N
+!>    \verbatim
+!>    N (input) INTEGER, 0 <= N <= M
+!>    The number of data snapshot pairs
+!>    (the number of columns of X and Y).
+!>    \endverbatim
 !.....
-!     X (input/output) REAL(KIND=WP) M-by-N array
-!     > On entry, X contains the data snapshot matrix X. It is
-!     assumed that the column norms of X are in the range of
-!     the normalized floating point numbers.
-!     < On exit, the leading K columns of X contain a POD basis,
-!     i.e. the leading K left singular vectors of the input
-!     data matrix X, U(:,1:K). All N columns of X contain all
-!     left singular vectors of the input matrix X.
-!     See the descriptions of K, Z and W.
+!>    \param[in,out] X
+!>    \verbatim
+!>    X (input/output) REAL(KIND=WP) M-by-N array
+!>    > On entry, X contains the data snapshot matrix X. It is
+!>    assumed that the column norms of X are in the range of
+!>    the normalized floating point numbers.
+!>    < On exit, the leading K columns of X contain a POD basis,
+!>    i.e. the leading K left singular vectors of the input
+!>    data matrix X, U(:,1:K). All N columns of X contain all
+!>    left singular vectors of the input matrix X.
+!>    See the descriptions of K, Z and W.
+!>    \endverbatim
 !.....
-!     LDX (input) INTEGER, LDX >= M
-!     The leading dimension of the array X.
+!>    \param[in] LDX
+!>    \verbatim
+!>    LDX (input) INTEGER, LDX >= M
+!>    The leading dimension of the array X.
+!>    \endverbatim
 !.....
-!     Y (input/workspace/output) REAL(KIND=WP) M-by-N array
-!     > On entry, Y contains the data snapshot matrix Y
-!     < On exit,
-!     If JOBR == 'R', the leading K columns of Y  contain
-!     the residual vectors for the computed Ritz pairs.
-!     See the description of RES.
-!     If JOBR == 'N', Y contains the original input data,
-!                     scaled according to the value of JOBS.
+!>    \param[in,out] Y
+!>    \verbatim
+!>    Y (input/workspace/output) REAL(KIND=WP) M-by-N array
+!>    > On entry, Y contains the data snapshot matrix Y
+!>    < On exit,
+!>    If JOBR == 'R', the leading K columns of Y  contain
+!>    the residual vectors for the computed Ritz pairs.
+!>    See the description of RES.
+!>    If JOBR == 'N', Y contains the original input data,
+!>                    scaled according to the value of JOBS.
+!>    \endverbatim
 !.....
-!     LDY (input) INTEGER , LDY >= M
-!     The leading dimension of the array Y.
+!>    \param[in] LDY
+!>    \verbatim
+!>    LDY (input) INTEGER , LDY >= M
+!>    The leading dimension of the array Y.
+!>    \endverbatim
 !.....
-!     NRNK (input) INTEGER
-!     Determines the mode how to compute the numerical rank,
-!     i.e. how to truncate small singular values of the input
-!     matrix X. On input, if
-!     NRNK = -1 :: i-th singular value sigma(i) is truncated
-!                  if sigma(i) <= TOL*sigma(1).
-!                  This option is recommended.
-!     NRNK = -2 :: i-th singular value sigma(i) is truncated
-!                  if sigma(i) <= TOL*sigma(i-1)
-!                  This option is included for R&D purposes.
-!                  It requires highly accurate SVD, which
-!                  may not be feasible.
-!
-!     The numerical rank can be enforced by using positive
-!     value of NRNK as follows:
-!     0 < NRNK <= N :: at most NRNK largest singular values
-!     will be used. If the number of the computed nonzero
-!     singular values is less than NRNK, then only those
-!     nonzero values will be used and the actually used
-!     dimension is less than NRNK. The actual number of
-!     the nonzero singular values is returned in the variable
-!     K. See the descriptions of TOL and  K.
+!>    \param[in] NRNK
+!>    \verbatim
+!>    NRNK (input) INTEGER
+!>    Determines the mode how to compute the numerical rank,
+!>    i.e. how to truncate small singular values of the input
+!>    matrix X. On input, if
+!>    NRNK = -1 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(1).
+!>                 This option is recommended.
+!>    NRNK = -2 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(i-1)
+!>                 This option is included for R&D purposes.
+!>                 It requires highly accurate SVD, which
+!>                 may not be feasible.
+!>
+!>    The numerical rank can be enforced by using positive
+!>    value of NRNK as follows:
+!>    0 < NRNK <= N :: at most NRNK largest singular values
+!>    will be used. If the number of the computed nonzero
+!>    singular values is less than NRNK, then only those
+!>    nonzero values will be used and the actually used
+!>    dimension is less than NRNK. The actual number of
+!>    the nonzero singular values is returned in the variable
+!>    K. See the descriptions of TOL and  K.
+!>    \endverbatim
 !.....
-!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
-!     The tolerance for truncating small singular values.
-!     See the description of NRNK.
+!>    \param[in] TOL
+!>    \verbatim
+!>    TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!>    The tolerance for truncating small singular values.
+!>    See the description of NRNK.
+!>    \endverbatim
 !.....
-!     K (output) INTEGER,  0 <= K <= N
-!     The dimension of the POD basis for the data snapshot
-!     matrix X and the number of the computed Ritz pairs.
-!     The value of K is determined according to the rule set
-!     by the parameters NRNK and TOL.
-!     See the descriptions of NRNK and TOL.
+!>    \param[out] K
+!>    \verbatim
+!>    K (output) INTEGER,  0 <= K <= N
+!>    The dimension of the POD basis for the data snapshot
+!>    matrix X and the number of the computed Ritz pairs.
+!>    The value of K is determined according to the rule set
+!>    by the parameters NRNK and TOL.
+!>    See the descriptions of NRNK and TOL.
+!>    \endverbatim
 !.....
-!     REIG (output) REAL(KIND=WP) N-by-1 array
-!     The leading K (K<=N) entries of REIG contain
-!     the real parts of the computed eigenvalues
-!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
-!     See the descriptions of K, IMEIG, and Z.
+!>    \param[out] REIG
+!>    \verbatim
+!>    REIG (output) REAL(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of REIG contain
+!>    the real parts of the computed eigenvalues
+!>    REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!>    See the descriptions of K, IMEIG, and Z.
+!>    \endverbatim
 !.....
-!     IMEIG (output) REAL(KIND=WP) N-by-1 array
-!     The leading K (K<=N) entries of IMEIG contain
-!     the imaginary parts of the computed eigenvalues
-!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
-!     The eigenvalues are determined as follows:
-!     If IMEIG(i) == 0, then the corresponding eigenvalue is
-!     real, LAMBDA(i) = REIG(i).
-!     If IMEIG(i)>0, then the corresponding complex
-!     conjugate pair of eigenvalues reads
-!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i)
-!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i)
-!     That is, complex conjugate pairs have consecutive
-!     indices (i,i+1), with the positive imaginary part
-!     listed first.
-!     See the descriptions of K, REIG, and Z.
+!>    \param[out] IMEIG
+!>    \verbatim
+!>    IMEIG (output) REAL(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of IMEIG contain
+!>    the imaginary parts of the computed eigenvalues
+!>    REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!>    The eigenvalues are determined as follows:
+!>    If IMEIG(i) == 0, then the corresponding eigenvalue is
+!>    real, LAMBDA(i) = REIG(i).
+!>    If IMEIG(i)>0, then the corresponding complex
+!>    conjugate pair of eigenvalues reads
+!>    LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i)
+!>    LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i)
+!>    That is, complex conjugate pairs have consecutive
+!>    indices (i,i+1), with the positive imaginary part
+!>    listed first.
+!>    See the descriptions of K, REIG, and Z.
+!>    \endverbatim
 !.....
-!     Z (workspace/output) REAL(KIND=WP)  M-by-N array
-!     If JOBZ =='V' then
-!        Z contains real Ritz vectors as follows:
-!        If IMEIG(i)=0, then Z(:,i) is an eigenvector of
-!        the i-th Ritz value; ||Z(:,i)||_2=1.
-!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
-!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
-!        the Ritz values extracted from this subspace are
-!        REIG(i) + sqrt(-1)*IMEIG(i) and
-!        REIG(i) - sqrt(-1)*IMEIG(i).
-!        The corresponding eigenvectors are
-!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
-!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
-!        || Z(:,i:i+1)||_F = 1.
-!     If JOBZ == 'F', then the above descriptions hold for
-!     the columns of X(:,1:K)*W(1:K,1:K), where the columns
-!     of W(1:k,1:K) are the computed eigenvectors of the
-!     K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
-!     are similarly structured: If IMEIG(i) == 0 then
-!     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
-!     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
-!          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
-!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
-!     See the descriptions of REIG, IMEIG, X and W.
+!>    \param[out] Z
+!>    \verbatim
+!>    Z (workspace/output) REAL(KIND=WP)  M-by-N array
+!>    If JOBZ =='V' then
+!>       Z contains real Ritz vectors as follows:
+!>       If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!>       the i-th Ritz value; ||Z(:,i)||_2=1.
+!>       If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!>       [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!>       the Ritz values extracted from this subspace are
+!>       REIG(i) + sqrt(-1)*IMEIG(i) and
+!>       REIG(i) - sqrt(-1)*IMEIG(i).
+!>       The corresponding eigenvectors are
+!>       Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!>       Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!>       || Z(:,i:i+1)||_F = 1.
+!>    If JOBZ == 'F', then the above descriptions hold for
+!>    the columns of X(:,1:K)*W(1:K,1:K), where the columns
+!>    of W(1:k,1:K) are the computed eigenvectors of the
+!>    K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
+!>    are similarly structured: If IMEIG(i) == 0 then
+!>    X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
+!>    then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
+!>         X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
+!>    are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!>    See the descriptions of REIG, IMEIG, X and W.
+!>    \endverbatim
 !.....
-!     LDZ (input) INTEGER , LDZ >= M
-!     The leading dimension of the array Z.
+!>    \param[in] LDZ
+!>    \verbatim
+!>    LDZ (input) INTEGER , LDZ >= M
+!>    The leading dimension of the array Z.
+!>    \endverbatim
 !.....
-!     RES (output) REAL(KIND=WP) N-by-1 array
-!     RES(1:K) contains the residuals for the K computed
-!     Ritz pairs.
-!     If LAMBDA(i) is real, then
-!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
-!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
-!     then
-!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
-!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
-!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
-!     It holds that
-!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
-!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
-!     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
-!           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
-!     See the description of REIG, IMEIG and Z.
+!>    \param[out] RES
+!>    \verbatim
+!>    RES (output) REAL(KIND=WP) N-by-1 array
+!>    RES(1:K) contains the residuals for the K computed
+!>    Ritz pairs.
+!>    If LAMBDA(i) is real, then
+!>       RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
+!>    If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!>    then
+!>    RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!>    where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!>              [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!>    It holds that
+!>    RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!>    RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!>    where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!>          ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!>    See the description of REIG, IMEIG and Z.
+!>    \endverbatim
 !.....
-!     B (output) REAL(KIND=WP)  M-by-N array.
-!     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
-!     be used for computing the refined vectors; see further
-!     details in the provided references.
-!     If JOBF == 'E', B(1:M,1;K) contains
-!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
-!     Exact DMD, up to scaling by the inverse eigenvalues.
-!     If JOBF =='N', then B is not referenced.
-!     See the descriptions of X, W, K.
+!>    \param[out] B
+!>    \verbatim
+!>    B (output) REAL(KIND=WP)  M-by-N array.
+!>    IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!>    be used for computing the refined vectors; see further
+!>    details in the provided references.
+!>    If JOBF == 'E', B(1:M,1;K) contains
+!>    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!>    Exact DMD, up to scaling by the inverse eigenvalues.
+!>    If JOBF =='N', then B is not referenced.
+!>    See the descriptions of X, W, K.
+!>    \endverbatim
 !.....
-!     LDB (input) INTEGER, LDB >= M
-!     The leading dimension of the array B.
+!>    \param[in] LDB
+!>    \verbatim
+!>    LDB (input) INTEGER, LDB >= M
+!>    The leading dimension of the array B.
+!>    \endverbatim
 !.....
-!     W (workspace/output) REAL(KIND=WP) N-by-N array
-!     On exit, W(1:K,1:K) contains the K computed
-!     eigenvectors of the matrix Rayleigh quotient (real and
-!     imaginary parts for each complex conjugate pair of the
-!     eigenvalues). The Ritz vectors (returned in Z) are the
-!     product of X (containing a POD basis for the input
-!     matrix X) and W. See the descriptions of K, S, X and Z.
-!     W is also used as a workspace to temporarily store the
-!     right singular vectors of X.
+!>    \param[out] W
+!>    \verbatim
+!>    W (workspace/output) REAL(KIND=WP) N-by-N array
+!>    On exit, W(1:K,1:K) contains the K computed
+!>    eigenvectors of the matrix Rayleigh quotient (real and
+!>    imaginary parts for each complex conjugate pair of the
+!>    eigenvalues). The Ritz vectors (returned in Z) are the
+!>    product of X (containing a POD basis for the input
+!>    matrix X) and W. See the descriptions of K, S, X and Z.
+!>    W is also used as a workspace to temporarily store the
+!>    right singular vectors of X.
+!>    \endverbatim
 !.....
-!     LDW (input) INTEGER, LDW >= N
-!     The leading dimension of the array W.
+!>    \param[in] LDW
+!>    \verbatim
+!>    LDW (input) INTEGER, LDW >= N
+!>    The leading dimension of the array W.
+!>    \endverbatim
 !.....
-!     S (workspace/output) REAL(KIND=WP) N-by-N array
-!     The array S(1:K,1:K) is used for the matrix Rayleigh
-!     quotient. This content is overwritten during
-!     the eigenvalue decomposition by DGEEV.
-!     See the description of K.
+!>    \param[out] S
+!>    \verbatim
+!>    S (workspace/output) REAL(KIND=WP) N-by-N array
+!>    The array S(1:K,1:K) is used for the matrix Rayleigh
+!>    quotient. This content is overwritten during
+!>    the eigenvalue decomposition by DGEEV.
+!>    See the description of K.
+!>    \endverbatim
 !.....
-!     LDS (input) INTEGER, LDS >= N
-!     The leading dimension of the array S.
+!>    \param[in] LDS
+!>    \verbatim
+!>    LDS (input) INTEGER, LDS >= N
+!>    The leading dimension of the array S.
+!>    \endverbatim
 !.....
-!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
-!     On exit, WORK(1:N) contains the singular values of
-!     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
-!     If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
-!     scaling factor WORK(N+2)/WORK(N+1) used to scale X
-!     and Y to avoid overflow in the SVD of X.
-!     This may be of interest if the scaling option is off
-!     and as many as possible smallest eigenvalues are
-!     desired to the highest feasible accuracy.
-!     If the call to DGEDMD is only workspace query, then
-!     WORK(1) contains the minimal workspace length and
-!     WORK(2) is the optimal workspace length. Hence, the
-!     leng of work is at least 2.
-!     See the description of LWORK.
+!>    \param[out] WORK
+!>    \verbatim
+!>    WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!>    On exit, WORK(1:N) contains the singular values of
+!>    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!>    If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!>    scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!>    and Y to avoid overflow in the SVD of X.
+!>    This may be of interest if the scaling option is off
+!>    and as many as possible smallest eigenvalues are
+!>    desired to the highest feasible accuracy.
+!>    If the call to DGEDMD is only workspace query, then
+!>    WORK(1) contains the minimal workspace length and
+!>    WORK(2) is the optimal workspace length. Hence, the
+!>    leng of work is at least 2.
+!>    See the description of LWORK.
+!>    \endverbatim
 !.....
-!     LWORK (input) INTEGER
-!     The minimal length of the workspace vector WORK.
-!     LWORK is calculated as follows:
-!     If WHTSVD == 1 ::
-!        If JOBZ == 'V', then
-!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
-!        If JOBZ == 'N'  then
-!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
-!        Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
-!        workspace length of DGESVD.
-!     If WHTSVD == 2 ::
-!        If JOBZ == 'V', then
-!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
-!        If JOBZ == 'N', then
-!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
-!        Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
-!        minimal workspace length of DGESDD.
-!     If WHTSVD == 3 ::
-!        If JOBZ == 'V', then
-!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
-!        If JOBZ == 'N', then
-!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
-!        Here LWORK_SVD = N+M+MAX(3*N+1,
-!                        MAX(1,3*N+M,5*N),MAX(1,N))
-!        is the minimal workspace length of DGESVDQ.
-!     If WHTSVD == 4 ::
-!        If JOBZ == 'V', then
-!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
-!        If JOBZ == 'N', then
-!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
-!        Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
-!        minimal workspace length of DGEJSV.
-!     The above expressions are not simplified in order to
-!     make the usage of WORK more transparent, and for
-!     easier checking. In any case, LWORK >= 2.
-!     If on entry LWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     and the optimal workspace lengths for both WORK and
-!     IWORK. See the descriptions of WORK and IWORK.
+!>    \param[in] LWORK
+!>    \verbatim
+!>    LWORK (input) INTEGER
+!>    The minimal length of the workspace vector WORK.
+!>    LWORK is calculated as follows:
+!>    If WHTSVD == 1 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
+!>       If JOBZ == 'N'  then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
+!>       Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
+!>       workspace length of DGESVD.
+!>    If WHTSVD == 2 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
+!>       Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
+!>       minimal workspace length of DGESDD.
+!>    If WHTSVD == 3 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!>       Here LWORK_SVD = N+M+MAX(3*N+1,
+!>                       MAX(1,3*N+M,5*N),MAX(1,N))
+!>       is the minimal workspace length of DGESVDQ.
+!>    If WHTSVD == 4 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!>       Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
+!>       minimal workspace length of DGEJSV.
+!>    The above expressions are not simplified in order to
+!>    make the usage of WORK more transparent, and for
+!>    easier checking. In any case, LWORK >= 2.
+!>    If on entry LWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for both WORK and
+!>    IWORK. See the descriptions of WORK and IWORK.
+!>    \endverbatim
 !.....
-!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
-!     Workspace that is required only if WHTSVD equals
-!     2 , 3 or 4. (See the description of WHTSVD).
-!     If on entry LWORK =-1 or LIWORK=-1, then the
-!     minimal length of IWORK is computed and returned in
-!     IWORK(1). See the description of LIWORK.
+!>    \param[out] IWORK
+!>    \verbatim
+!>    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!>    Workspace that is required only if WHTSVD equals
+!>    2 , 3 or 4. (See the description of WHTSVD).
+!>    If on entry LWORK =-1 or LIWORK=-1, then the
+!>    minimal length of IWORK is computed and returned in
+!>    IWORK(1). See the description of LIWORK.
+!>    \endverbatim
 !.....
-!     LIWORK (input) INTEGER
-!     The minimal length of the workspace vector IWORK.
-!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
-!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
-!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
-!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
-!     If on entry LIWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     and the optimal workspace lengths for both WORK and
-!     IWORK. See the descriptions of WORK and IWORK.
+!>    \param[in] LIWORK
+!>    \verbatim
+!>    LIWORK (input) INTEGER
+!>    The minimal length of the workspace vector IWORK.
+!>    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!>    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!>    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!>    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!>    If on entry LIWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for both WORK and
+!>    IWORK. See the descriptions of WORK and IWORK.
+!>    \endverbatim
 !.....
-!     INFO (output) INTEGER
-!     -i < 0 :: On entry, the i-th argument had an
-!               illegal value
-!        = 0 :: Successful return.
-!        = 1 :: Void input. Quick exit (M=0 or N=0).
-!        = 2 :: The SVD computation of X did not converge.
-!               Suggestion: Check the input data and/or
-!               repeat with different WHTSVD.
-!        = 3 :: The computation of the eigenvalues did not
-!               converge.
-!        = 4 :: If data scaling was requested on input and
-!               the procedure found inconsistency in the data
-!               such that for some column index i,
-!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
-!               to zero if JOBS=='C'. The computation proceeds
-!               with original or modified data and warning
-!               flag is set with INFO=4.
+!>    \param[out] INFO
+!>    \verbatim
+!>    INFO (output) INTEGER
+!>    -i < 0 :: On entry, the i-th argument had an
+!>              illegal value
+!>       = 0 :: Successful return.
+!>       = 1 :: Void input. Quick exit (M=0 or N=0).
+!>       = 2 :: The SVD computation of X did not converge.
+!>              Suggestion: Check the input data and/or
+!>              repeat with different WHTSVD.
+!>       = 3 :: The computation of the eigenvalues did not
+!>              converge.
+!>       = 4 :: If data scaling was requested on input and
+!>              the procedure found inconsistency in the data
+!>              such that for some column index i,
+!>              X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!>              to zero if JOBS=='C'. The computation proceeds
+!>              with original or modified data and warning
+!>              flag is set with INFO=4.
+!>    \endverbatim
+!
+!  Authors:
+!  ========
+!
+!> \author Zlatko Drmac
+!
+!> \ingroup gedmd
+!
 !.............................................................
 !.............................................................
+      SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+                         K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+                         B, LDB, W,  LDW,   S, LDS,        &
+                         WORK, LWORK, IWORK, LIWORK, INFO )
+!
+!  -- LAPACK driver routine                                           --
+!
+!  -- LAPACK is a software package provided by University of          --
+!  -- Tennessee, University of California Berkeley, University of     --
+!  -- Colorado Denver and NAG Ltd..                                   --
+!
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64
+!
+!     Scalar arguments
+!     ~~~~~~~~~~~~~~~~
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LWORK,  LIWORK
+      INTEGER,   INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN)  :: TOL
+!
+!     Array arguments
+!     ~~~~~~~~~~~~~~~
+      REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                      W(LDW,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!
 !     Parameters
 !     ~~~~~~~~~~
       REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
       REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
-
+!
 !     Local scalars
 !     ~~~~~~~~~~~~~
       REAL(KIND=WP) :: OFL,    ROOTSC, SCALE,  SMALL,  &
@@ -432,10 +582,11 @@
                        WNTEX,  WNTREF, WNTRES, WNTVEC
       CHARACTER     :: JOBZL,  T_OR_N
       CHARACTER     :: JSVOPT
-
+!
 !     Local arrays
 !     ~~~~~~~~~~~~
       REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2)
+!
 !     External functions (BLAS and LAPACK)
 !     ~~~~~~~~~~~~~~~~~
       REAL(KIND=WP) DLANGE, DLAMCH, DNRM2
@@ -443,13 +594,13 @@
       INTEGER       IDAMAX
       LOGICAL       DISNAN, LSAME
       EXTERNAL      DISNAN, LSAME
-
+!
 !     External subroutines (BLAS and LAPACK)
 !     ~~~~~~~~~~~~~~~~~~~~
       EXTERNAL      DAXPY,  DGEMM,  DSCAL
       EXTERNAL      DGEEV,  DGEJSV, DGESDD, DGESVD, DGESVDQ, &
                     DLACPY, DLASCL, DLASSQ, XERBLA
-
+!
 !     Intrinsic functions
 !     ~~~~~~~~~~~~~~~~~~~
       INTRINSIC     DBLE, INT, MAX, SQRT
@@ -632,7 +783,8 @@
           K = 0
           DO i = 1, N
             !WORK(i) = DNRM2( M, X(1,i), 1 )
-            SCALE  = ZERO
+            SSUM  = ONE
+            SCALE = ZERO
             CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM )
             IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
                 K    =  0
@@ -705,7 +857,8 @@
           ! carefully computed using DLASSQ.
           DO i = 1, N
             !WORK(i) = DNRM2( M, Y(1,i), 1 )
-            SCALE  = ZERO
+            SSUM  = ONE
+            SCALE = ZERO
             CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM )
             IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
                 K    =  0
@@ -1051,4 +1204,3 @@
       RETURN
 !     ......
       END SUBROUTINE DGEDMD
-
diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90
index 49cb11527..4860e8898 100644
--- a/lapack-netlib/SRC/sgedmd.f90
+++ b/lapack-netlib/SRC/sgedmd.f90
@@ -1,423 +1,573 @@
-      SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
-                         M, N, X, LDX, Y, LDY, NRNK, TOL,  &
-                         K, REIG,  IMEIG,   Z, LDZ,  RES,  &
-                         B, LDB, W,  LDW,   S, LDS,        &
-                         WORK, LWORK, IWORK, LIWORK, INFO )
-! March 2023
+!> \brief \b SGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices.
+!
+!  =========== DOCUMENTATION ===========
+!
+!  Definition:
+!  ===========
+!
+!     SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+!                        M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+!                        K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+!                        B, LDB, W,  LDW,   S, LDS,        &
+!                        WORK, LWORK, IWORK, LIWORK, INFO )
 !.....
-      USE                   iso_fortran_env
-      IMPLICIT NONE
-      INTEGER, PARAMETER :: WP = real32
+!     USE                   iso_fortran_env
+!     IMPLICIT NONE
+!     INTEGER, PARAMETER :: WP = real32
 !.....
 !     Scalar arguments
-      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
-      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
-                                 NRNK, LDZ, LDB, LDW,  LDS, &
-                                 LWORK,  LIWORK
-      INTEGER,   INTENT(OUT)  :: K, INFO
-      REAL(KIND=WP), INTENT(IN) ::  TOL
+!     CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+!     INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+!                                NRNK, LDZ, LDB, LDW,  LDS, &
+!                                LWORK,  LIWORK
+!     INTEGER,   INTENT(OUT)  :: K, INFO
+!     REAL(KIND=WP), INTENT(IN) ::  TOL
 !     Array arguments
-      REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
-      REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
-                                      W(LDW,*), S(LDS,*)
-      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
-                                      RES(*)
-      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
-      INTEGER,       INTENT(OUT)   :: IWORK(*)
-!............................................................
-!     Purpose
-!     =======
-!     SGEDMD computes the Dynamic Mode Decomposition (DMD) for
-!     a pair of data snapshot matrices. For the input matrices
-!     X and Y such that Y = A*X with an unaccessible matrix
-!     A, SGEDMD computes a certain number of Ritz pairs of A using
-!     the standard Rayleigh-Ritz extraction from a subspace of
-!     range(X) that is determined using the leading left singular
-!     vectors of X. Optionally, SGEDMD returns the residuals
-!     of the computed Ritz pairs, the information needed for
-!     a refinement of the Ritz vectors, or the eigenvectors of
-!     the Exact DMD.
-!     For further details see the references listed
-!     below. For more details of the implementation see [3].
-!
-!     References
-!     ==========
-!     [1] P. Schmid: Dynamic mode decomposition of numerical
-!         and experimental data,
-!         Journal of Fluid Mechanics 656, 5-28, 2010.
-!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
-!         decompositions: analysis and enhancements,
-!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
-!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
-!         Mode Decomposition I. Technical report. AIMDyn Inc.
-!         and LAPACK Working Note 298.
-!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
-!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
-!         Theory and Applications, Journal of Computational
-!         Dynamics 1(2), 391 -421, 2014.
+!     REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+!     REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+!                                     W(LDW,*), S(LDS,*)
+!     REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+!                                     RES(*)
+!     REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+!     INTEGER,       INTENT(OUT)   :: IWORK(*)
 !
+!............................................................
+!>    \par Purpose:
+!     =============
+!>    \verbatim
+!>    SGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!>    a pair of data snapshot matrices. For the input matrices
+!>    X and Y such that Y = A*X with an unaccessible matrix
+!>    A, SGEDMD computes a certain number of Ritz pairs of A using
+!>    the standard Rayleigh-Ritz extraction from a subspace of
+!>    range(X) that is determined using the leading left singular
+!>    vectors of X. Optionally, SGEDMD returns the residuals
+!>    of the computed Ritz pairs, the information needed for
+!>    a refinement of the Ritz vectors, or the eigenvectors of
+!>    the Exact DMD.
+!>    For further details see the references listed
+!>    below. For more details of the implementation see [3].
+!>    \endverbatim
+!............................................................
+!>    \par References:
+!     ================
+!>    \verbatim
+!>    [1] P. Schmid: Dynamic mode decomposition of numerical
+!>        and experimental data,
+!>        Journal of Fluid Mechanics 656, 5-28, 2010.
+!>    [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!>        decompositions: analysis and enhancements,
+!>        SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!>    [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!>        Mode Decomposition I. Technical report. AIMDyn Inc.
+!>        and LAPACK Working Note 298.
+!>    [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!>        Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!>        Theory and Applications, Journal of Computational
+!>        Dynamics 1(2), 391 -421, 2014.
+!>    \endverbatim
 !......................................................................
-!     Developed and supported by:
-!     ===========================
-!     Developed and coded by Zlatko Drmac, Faculty of Science,
-!     University of Zagreb;  drmac@math.hr
-!     In cooperation with
-!     AIMdyn Inc., Santa Barbara, CA.
-!     and supported by
-!     - DARPA SBIR project "Koopman Operator-Based Forecasting
-!     for Nonstationary Processes from Near-Term, Limited
-!     Observational Data" Contract No: W31P4Q-21-C-0007
-!     - DARPA PAI project "Physics-Informed Machine Learning
-!     Methodologies" Contract No: HR0011-18-9-0033
-!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
-!     Framework for Space-Time Analysis of Process Dynamics"
-!     Contract No: HR0011-16-C-0116
-!     Any opinions, findings and conclusions or recommendations
-!     expressed in this material are those of the author and
-!     do not necessarily reflect the views of the DARPA SBIR
-!     Program Office
-!============================================================
-!     Distribution Statement A:
-!     Approved for Public Release, Distribution Unlimited.
-!     Cleared by DARPA on September 29, 2022
-!============================================================
+!>    \par Developed and supported by:
+!     ================================
+!>    \verbatim
+!>    Developed and coded by Zlatko Drmac, Faculty of Science,
+!>    University of Zagreb;  drmac@math.hr
+!>    In cooperation with
+!>    AIMdyn Inc., Santa Barbara, CA.
+!>    and supported by
+!>    - DARPA SBIR project "Koopman Operator-Based Forecasting
+!>    for Nonstationary Processes from Near-Term, Limited
+!>    Observational Data" Contract No: W31P4Q-21-C-0007
+!>    - DARPA PAI project "Physics-Informed Machine Learning
+!>    Methodologies" Contract No: HR0011-18-9-0033
+!>    - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!>    Framework for Space-Time Analysis of Process Dynamics"
+!>    Contract No: HR0011-16-C-0116
+!>    Any opinions, findings and conclusions or recommendations
+!>    expressed in this material are those of the author and
+!>    do not necessarily reflect the views of the DARPA SBIR
+!>    Program Office
+!>    \endverbatim
 !......................................................................
+!>    \par Distribution Statement A:
+!     ==============================
+!>    \verbatim
+!>    Distribution Statement A:
+!>    Approved for Public Release, Distribution Unlimited.
+!>    Cleared by DARPA on September 29, 2022
+!>    \endverbatim
+!============================================================
 !     Arguments
 !     =========
-!     JOBS (input) CHARACTER*1
-!     Determines whether the initial data snapshots are scaled
-!     by a diagonal matrix.
-!     'S' :: The data snapshots matrices X and Y are multiplied
-!            with a diagonal matrix D so that X*D has unit
-!            nonzero columns (in the Euclidean 2-norm)
-!     'C' :: The snapshots are scaled as with the 'S' option.
-!            If it is found that an i-th column of X is zero
-!            vector and the corresponding i-th column of Y is
-!            non-zero, then the i-th column of Y is set to
-!            zero and a warning flag is raised.
-!     'Y' :: The data snapshots matrices X and Y are multiplied
-!            by a diagonal matrix D so that Y*D has unit
-!            nonzero columns (in the Euclidean 2-norm)
-!     'N' :: No data scaling.
+!
+!>    \param[in] JOBS
+!>    \verbatim
+!>    JOBS (input) CHARACTER*1
+!>    Determines whether the initial data snapshots are scaled
+!>    by a diagonal matrix.
+!>    'S' :: The data snapshots matrices X and Y are multiplied
+!>           with a diagonal matrix D so that X*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'C' :: The snapshots are scaled as with the 'S' option.
+!>           If it is found that an i-th column of X is zero
+!>           vector and the corresponding i-th column of Y is
+!>           non-zero, then the i-th column of Y is set to
+!>           zero and a warning flag is raised.
+!>    'Y' :: The data snapshots matrices X and Y are multiplied
+!>           by a diagonal matrix D so that Y*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'N' :: No data scaling.
+!>    \endverbatim
 !.....
-!     JOBZ (input) CHARACTER*1
-!     Determines whether the eigenvectors (Koopman modes) will
-!     be computed.
-!     'V' :: The eigenvectors (Koopman modes) will be computed
-!            and returned in the matrix Z.
-!            See the description of Z.
-!     'F' :: The eigenvectors (Koopman modes) will be returned
-!            in factored form as the product X(:,1:K)*W, where X
-!            contains a POD basis (leading left singular vectors
-!            of the data matrix X) and W contains the eigenvectors
-!            of the corresponding Rayleigh quotient.
-!            See the descriptions of K, X, W, Z.
-!     'N' :: The eigenvectors are not computed.
+!>    \param[in] JOBZ
+!>    \verbatim
+!>    JOBZ (input) CHARACTER*1
+!>    Determines whether the eigenvectors (Koopman modes) will
+!>    be computed.
+!>    'V' :: The eigenvectors (Koopman modes) will be computed
+!>           and returned in the matrix Z.
+!>           See the description of Z.
+!>    'F' :: The eigenvectors (Koopman modes) will be returned
+!>           in factored form as the product X(:,1:K)*W, where X
+!>           contains a POD basis (leading left singular vectors
+!>           of the data matrix X) and W contains the eigenvectors
+!>           of the corresponding Rayleigh quotient.
+!>           See the descriptions of K, X, W, Z.
+!>    'N' :: The eigenvectors are not computed.
+!>    \endverbatim
 !.....
-!     JOBR (input) CHARACTER*1
-!     Determines whether to compute the residuals.
-!     'R' :: The residuals for the computed eigenpairs will be
-!            computed and stored in the array RES.
-!            See the description of RES.
-!            For this option to be legal, JOBZ must be 'V'.
-!     'N' :: The residuals are not computed.
+!>    \param[in] JOBR
+!>    \verbatim
+!>    JOBR (input) CHARACTER*1
+!>    Determines whether to compute the residuals.
+!>    'R' :: The residuals for the computed eigenpairs will be
+!>           computed and stored in the array RES.
+!>           See the description of RES.
+!>           For this option to be legal, JOBZ must be 'V'.
+!>    'N' :: The residuals are not computed.
+!>    \endverbatim
 !.....
-!     JOBF (input) CHARACTER*1
-!     Specifies whether to store information needed for post-
-!     processing (e.g. computing refined Ritz vectors)
-!     'R' :: The matrix needed for the refinement of the Ritz
-!            vectors is computed and stored in the array B.
-!            See the description of B.
-!     'E' :: The unscaled eigenvectors of the Exact DMD are
-!            computed and returned in the array B. See the
-!            description of B.
-!     'N' :: No eigenvector refinement data is computed.
+!>    \param[in] JOBF
+!>    \verbatim
+!>    JOBF (input) CHARACTER*1
+!>    Specifies whether to store information needed for post-
+!>    processing (e.g. computing refined Ritz vectors)
+!>    'R' :: The matrix needed for the refinement of the Ritz
+!>           vectors is computed and stored in the array B.
+!>           See the description of B.
+!>    'E' :: The unscaled eigenvectors of the Exact DMD are
+!>           computed and returned in the array B. See the
+!>           description of B.
+!>    'N' :: No eigenvector refinement data is computed.
+!>    \endverbatim
 !.....
-!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
-!     Allows for a selection of the SVD algorithm from the
-!     LAPACK library.
-!     1 :: SGESVD (the QR SVD algorithm)
-!     2 :: SGESDD (the Divide and Conquer algorithm; if enough
-!          workspace available, this is the fastest option)
-!     3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4
-!          are the most accurate options)
-!     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3
-!          are the most accurate options)
-!     For the four methods above, a significant difference in
-!     the accuracy of small singular values is possible if
-!     the snapshots vary in norm so that X is severely
-!     ill-conditioned. If small (smaller than EPS*||X||)
-!     singular values are of interest and JOBS=='N',  then
-!     the options (3, 4) give the most accurate results, where
-!     the option 4 is slightly better and with stronger
-!     theoretical background.
-!     If JOBS=='S', i.e. the columns of X will be normalized,
-!     then all methods give nearly equally accurate results.
+!>    \param[in] WHTSVD
+!>    \verbatim
+!>    WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!>    Allows for a selection of the SVD algorithm from the
+!>    LAPACK library.
+!>    1 :: SGESVD (the QR SVD algorithm)
+!>    2 :: SGESDD (the Divide and Conquer algorithm; if enough
+!>         workspace available, this is the fastest option)
+!>    3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4
+!>         are the most accurate options)
+!>    4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3
+!>         are the most accurate options)
+!>    For the four methods above, a significant difference in
+!>    the accuracy of small singular values is possible if
+!>    the snapshots vary in norm so that X is severely
+!>    ill-conditioned. If small (smaller than EPS*||X||)
+!>    singular values are of interest and JOBS=='N',  then
+!>    the options (3, 4) give the most accurate results, where
+!>    the option 4 is slightly better and with stronger
+!>    theoretical background.
+!>    If JOBS=='S', i.e. the columns of X will be normalized,
+!>    then all methods give nearly equally accurate results.
+!>    \endverbatim
 !.....
-!     M (input) INTEGER, M>= 0
-!     The state space dimension (the row dimension of X, Y).
+!>    \param[in] M
+!>    \verbatim
+!>    M (input) INTEGER, M>= 0
+!>    The state space dimension (the row dimension of X, Y).
+!>    \endverbatim
 !.....
-!     N (input) INTEGER, 0 <= N <= M
-!     The number of data snapshot pairs
-!     (the number of columns of X and Y).
+!>    \param[in] N
+!>    \verbatim
+!>    N (input) INTEGER, 0 <= N <= M
+!>    The number of data snapshot pairs
+!>    (the number of columns of X and Y).
+!>    \endverbatim
 !.....
-!     X (input/output) REAL(KIND=WP) M-by-N array
-!     > On entry, X contains the data snapshot matrix X. It is
-!     assumed that the column norms of X are in the range of
-!     the normalized floating point numbers.
-!     < On exit, the leading K columns of X contain a POD basis,
-!     i.e. the leading K left singular vectors of the input
-!     data matrix X, U(:,1:K). All N columns of X contain all
-!     left singular vectors of the input matrix X.
-!     See the descriptions of K, Z and W.
+!>    \param[in,out] X
+!>    \verbatim
+!>    X (input/output) REAL(KIND=WP) M-by-N array
+!>    > On entry, X contains the data snapshot matrix X. It is
+!>    assumed that the column norms of X are in the range of
+!>    the normalized floating point numbers.
+!>    < On exit, the leading K columns of X contain a POD basis,
+!>    i.e. the leading K left singular vectors of the input
+!>    data matrix X, U(:,1:K). All N columns of X contain all
+!>    left singular vectors of the input matrix X.
+!>    See the descriptions of K, Z and W.
+!>    \endverbatim
 !.....
-!     LDX (input) INTEGER, LDX >= M
-!     The leading dimension of the array X.
+!>    \param[in] LDX
+!>    \verbatim
+!>    LDX (input) INTEGER, LDX >= M
+!>    The leading dimension of the array X.
+!>    \endverbatim
 !.....
-!     Y (input/workspace/output) REAL(KIND=WP) M-by-N array
-!     > On entry, Y contains the data snapshot matrix Y
-!     < On exit,
-!     If JOBR == 'R', the leading K columns of Y  contain
-!     the residual vectors for the computed Ritz pairs.
-!     See the description of RES.
-!     If JOBR == 'N', Y contains the original input data,
-!                     scaled according to the value of JOBS.
+!>    \param[in,out] Y
+!>    \verbatim
+!>    Y (input/workspace/output) REAL(KIND=WP) M-by-N array
+!>    > On entry, Y contains the data snapshot matrix Y
+!>    < On exit,
+!>    If JOBR == 'R', the leading K columns of Y  contain
+!>    the residual vectors for the computed Ritz pairs.
+!>    See the description of RES.
+!>    If JOBR == 'N', Y contains the original input data,
+!>                    scaled according to the value of JOBS.
+!>    \endverbatim
 !.....
-!     LDY (input) INTEGER , LDY >= M
-!     The leading dimension of the array Y.
+!>    \param[in] LDY
+!>    \verbatim
+!>    LDY (input) INTEGER , LDY >= M
+!>    The leading dimension of the array Y.
+!>    \endverbatim
 !.....
-!     NRNK (input) INTEGER
-!     Determines the mode how to compute the numerical rank,
-!     i.e. how to truncate small singular values of the input
-!     matrix X. On input, if
-!     NRNK = -1 :: i-th singular value sigma(i) is truncated
-!                  if sigma(i) <= TOL*sigma(1)
-!                  This option is recommended.
-!     NRNK = -2 :: i-th singular value sigma(i) is truncated
-!                  if sigma(i) <= TOL*sigma(i-1)
-!                  This option is included for R&D purposes.
-!                  It requires highly accurate SVD, which
-!                  may not be feasible.
-!     The numerical rank can be enforced by using positive
-!     value of NRNK as follows:
-!     0 < NRNK <= N :: at most NRNK largest singular values
-!     will be used. If the number of the computed nonzero
-!     singular values is less than NRNK, then only those
-!     nonzero values will be used and the actually used
-!     dimension is less than NRNK. The actual number of
-!     the nonzero singular values is returned in the variable
-!     K. See the descriptions of TOL and  K.
+!>    \param[in] NRNK
+!>    \verbatim
+!>    NRNK (input) INTEGER
+!>    Determines the mode how to compute the numerical rank,
+!>    i.e. how to truncate small singular values of the input
+!>    matrix X. On input, if
+!>    NRNK = -1 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(1)
+!>                 This option is recommended.
+!>    NRNK = -2 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(i-1)
+!>                 This option is included for R&D purposes.
+!>                 It requires highly accurate SVD, which
+!>                 may not be feasible.
+!>    The numerical rank can be enforced by using positive
+!>    value of NRNK as follows:
+!>    0 < NRNK <= N :: at most NRNK largest singular values
+!>    will be used. If the number of the computed nonzero
+!>    singular values is less than NRNK, then only those
+!>    nonzero values will be used and the actually used
+!>    dimension is less than NRNK. The actual number of
+!>    the nonzero singular values is returned in the variable
+!>    K. See the descriptions of TOL and  K.
+!>    \endverbatim
 !.....
-!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
-!     The tolerance for truncating small singular values.
-!     See the description of NRNK.
+!>    \param[in] TOL
+!>    \verbatim
+!>    TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!>    The tolerance for truncating small singular values.
+!>    See the description of NRNK.
+!>    \endverbatim
 !.....
-!     K (output) INTEGER,  0 <= K <= N
-!     The dimension of the POD basis for the data snapshot
-!     matrix X and the number of the computed Ritz pairs.
-!     The value of K is determined according to the rule set
-!     by the parameters NRNK and TOL.
-!     See the descriptions of NRNK and TOL.
+!>    \param[out] K
+!>    \verbatim
+!>    K (output) INTEGER,  0 <= K <= N
+!>    The dimension of the POD basis for the data snapshot
+!>    matrix X and the number of the computed Ritz pairs.
+!>    The value of K is determined according to the rule set
+!>    by the parameters NRNK and TOL.
+!>    See the descriptions of NRNK and TOL.
+!>    \endverbatim
 !.....
-!     REIG (output) REAL(KIND=WP) N-by-1 array
-!     The leading K (K<=N) entries of REIG contain
-!     the real parts of the computed eigenvalues
-!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
-!     See the descriptions of K, IMEIG, and Z.
+!>    \param[out] REIG
+!>    \verbatim
+!>    REIG (output) REAL(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of REIG contain
+!>    the real parts of the computed eigenvalues
+!>    REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!>    See the descriptions of K, IMEIG, and Z.
+!>    \endverbatim
 !.....
-!     IMEIG (output) REAL(KIND=WP) N-by-1 array
-!     The leading K (K<=N) entries of IMEIG contain
-!     the imaginary parts of the computed eigenvalues
-!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
-!     The eigenvalues are determined as follows:
-!     If IMEIG(i) == 0, then the corresponding eigenvalue is
-!     real, LAMBDA(i) = REIG(i).
-!     If IMEIG(i)>0, then the corresponding complex
-!     conjugate pair of eigenvalues reads
-!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i)
-!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i)
-!     That is, complex conjugate pairs have consecutive
-!     indices (i,i+1), with the positive imaginary part
-!     listed first.
-!     See the descriptions of K, REIG, and Z.
+!>    \param[out] IMEIG
+!>    \verbatim
+!>    IMEIG (output) REAL(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of IMEIG contain
+!>    the imaginary parts of the computed eigenvalues
+!>    REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!>    The eigenvalues are determined as follows:
+!>    If IMEIG(i) == 0, then the corresponding eigenvalue is
+!>    real, LAMBDA(i) = REIG(i).
+!>    If IMEIG(i)>0, then the corresponding complex
+!>    conjugate pair of eigenvalues reads
+!>    LAMBDA(i)   = REIG(i) + sqrt(-1)*IMAG(i)
+!>    LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMAG(i)
+!>    That is, complex conjugate pairs have consecutive
+!>    indices (i,i+1), with the positive imaginary part
+!>    listed first.
+!>    See the descriptions of K, REIG, and Z.
+!>    \endverbatim
 !.....
-!     Z (workspace/output) REAL(KIND=WP)  M-by-N array
-!     If JOBZ =='V' then
-!        Z contains real Ritz vectors as follows:
-!        If IMEIG(i)=0, then Z(:,i) is an eigenvector of
-!        the i-th Ritz value; ||Z(:,i)||_2=1.
-!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
-!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
-!        the Ritz values extracted from this subspace are
-!        REIG(i) + sqrt(-1)*IMEIG(i) and
-!        REIG(i) - sqrt(-1)*IMEIG(i).
-!        The corresponding eigenvectors are
-!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
-!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
-!        || Z(:,i:i+1)||_F = 1.
-!     If JOBZ == 'F', then the above descriptions hold for
-!     the columns of X(:,1:K)*W(1:K,1:K), where the columns
-!     of W(1:k,1:K) are the computed eigenvectors of the
-!     K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
-!     are similarly structured: If IMEIG(i) == 0 then
-!     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
-!     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
-!          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
-!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
-!     See the descriptions of REIG, IMEIG, X and W.
+!>    \param[out] Z
+!>    \verbatim
+!>    Z (workspace/output) REAL(KIND=WP)  M-by-N array
+!>    If JOBZ =='V' then
+!>       Z contains real Ritz vectors as follows:
+!>       If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!>       the i-th Ritz value; ||Z(:,i)||_2=1.
+!>       If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!>       [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!>       the Ritz values extracted from this subspace are
+!>       REIG(i) + sqrt(-1)*IMEIG(i) and
+!>       REIG(i) - sqrt(-1)*IMEIG(i).
+!>       The corresponding eigenvectors are
+!>       Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!>       Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!>       || Z(:,i:i+1)||_F = 1.
+!>    If JOBZ == 'F', then the above descriptions hold for
+!>    the columns of X(:,1:K)*W(1:K,1:K), where the columns
+!>    of W(1:k,1:K) are the computed eigenvectors of the
+!>    K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
+!>    are similarly structured: If IMEIG(i) == 0 then
+!>    X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
+!>    then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
+!>         X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
+!>    are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!>    See the descriptions of REIG, IMEIG, X and W.
+!>    \endverbatim
 !.....
-!     LDZ (input) INTEGER , LDZ >= M
-!     The leading dimension of the array Z.
+!>    \param[in] LDZ
+!>    \verbatim
+!>    LDZ (input) INTEGER , LDZ >= M
+!>    The leading dimension of the array Z.
+!>    \endverbatim
 !.....
-!     RES (output) REAL(KIND=WP) N-by-1 array
-!     RES(1:K) contains the residuals for the K computed
-!     Ritz pairs.
-!     If LAMBDA(i) is real, then
-!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
-!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
-!     then
-!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
-!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
-!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
-!     It holds that
-!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
-!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
-!     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
-!           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
-!     See the description of REIG, IMEIG and Z.
+!>    \param[out] RES
+!>    \verbatim
+!>    RES (output) REAL(KIND=WP) N-by-1 array
+!>    RES(1:K) contains the residuals for the K computed
+!>    Ritz pairs.
+!>    If LAMBDA(i) is real, then
+!>       RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
+!>    If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!>    then
+!>    RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!>    where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!>              [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!>    It holds that
+!>    RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!>    RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!>    where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!>          ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!>    See the description of REIG, IMEIG and Z.
+!>    \endverbatim
 !.....
-!     B (output) REAL(KIND=WP)  M-by-N array.
-!     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
-!     be used for computing the refined vectors; see further
-!     details in the provided references.
-!     If JOBF == 'E', B(1:M,1;K) contains
-!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
-!     Exact DMD, up to scaling by the inverse eigenvalues.
-!     If JOBF =='N', then B is not referenced.
-!     See the descriptions of X, W, K.
+!>    \param[out] B
+!>    \verbatim
+!>    B (output) REAL(KIND=WP)  M-by-N array.
+!>    IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!>    be used for computing the refined vectors; see further
+!>    details in the provided references.
+!>    If JOBF == 'E', B(1:M,1;K) contains
+!>    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!>    Exact DMD, up to scaling by the inverse eigenvalues.
+!>    If JOBF =='N', then B is not referenced.
+!>    See the descriptions of X, W, K.
+!>    \endverbatim
 !.....
-!     LDB (input) INTEGER, LDB >= M
-!     The leading dimension of the array B.
+!>    \param[in] LDB
+!>    \verbatim
+!>    LDB (input) INTEGER, LDB >= M
+!>    The leading dimension of the array B.
+!>    \endverbatim
 !.....
-!     W (workspace/output) REAL(KIND=WP) N-by-N array
-!     On exit, W(1:K,1:K) contains the K computed
-!     eigenvectors of the matrix Rayleigh quotient (real and
-!     imaginary parts for each complex conjugate pair of the
-!     eigenvalues). The Ritz vectors (returned in Z) are the
-!     product of X (containing a POD basis for the input
-!     matrix X) and W. See the descriptions of K, S, X and Z.
-!     W is also used as a workspace to temporarily store the
-!     left singular vectors of X.
+!>    \param[out] W
+!>    \verbatim
+!>    W (workspace/output) REAL(KIND=WP) N-by-N array
+!>    On exit, W(1:K,1:K) contains the K computed
+!>    eigenvectors of the matrix Rayleigh quotient (real and
+!>    imaginary parts for each complex conjugate pair of the
+!>    eigenvalues). The Ritz vectors (returned in Z) are the
+!>    product of X (containing a POD basis for the input
+!>    matrix X) and W. See the descriptions of K, S, X and Z.
+!>    W is also used as a workspace to temporarily store the
+!>    left singular vectors of X.
+!>    \endverbatim
 !.....
-!     LDW (input) INTEGER, LDW >= N
-!     The leading dimension of the array W.
+!>    \param[in] LDW
+!>    \verbatim
+!>    LDW (input) INTEGER, LDW >= N
+!>    The leading dimension of the array W.
+!>    \endverbatim
 !.....
-!     S (workspace/output) REAL(KIND=WP) N-by-N array
-!     The array S(1:K,1:K) is used for the matrix Rayleigh
-!     quotient. This content is overwritten during
-!     the eigenvalue decomposition by SGEEV.
-!     See the description of K.
+!>    \param[out] S
+!>    \verbatim
+!>    S (workspace/output) REAL(KIND=WP) N-by-N array
+!>    The array S(1:K,1:K) is used for the matrix Rayleigh
+!>    quotient. This content is overwritten during
+!>    the eigenvalue decomposition by SGEEV.
+!>    See the description of K.
+!>    \endverbatim
 !.....
-!     LDS (input) INTEGER, LDS >= N
-!     The leading dimension of the array S.
+!>    \param[in] LDS
+!>    \verbatim
+!>    LDS (input) INTEGER, LDS >= N
+!>    The leading dimension of the array S.
+!>    \endverbatim
 !.....
-!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
-!     On exit, WORK(1:N) contains the singular values of
-!     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
-!     If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
-!     scaling factor WORK(N+2)/WORK(N+1) used to scale X
-!     and Y to avoid overflow in the SVD of X.
-!     This may be of interest if the scaling option is off
-!     and as many as possible smallest eigenvalues are
-!     desired to the highest feasible accuracy.
-!     If the call to SGEDMD is only workspace query, then
-!     WORK(1) contains the minimal workspace length and
-!     WORK(2) is the optimal workspace length. Hence, the
-!     length of work is at least 2.
-!     See the description of LWORK.
+!>    \param[out] WORK
+!>    \verbatim
+!>    WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!>    On exit, WORK(1:N) contains the singular values of
+!>    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!>    If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!>    scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!>    and Y to avoid overflow in the SVD of X.
+!>    This may be of interest if the scaling option is off
+!>    and as many as possible smallest eigenvalues are
+!>    desired to the highest feasible accuracy.
+!>    If the call to SGEDMD is only workspace query, then
+!>    WORK(1) contains the minimal workspace length and
+!>    WORK(2) is the optimal workspace length. Hence, the
+!>    length of work is at least 2.
+!>    See the description of LWORK.
+!>    \endverbatim
 !.....
-!     LWORK (input) INTEGER
-!     The minimal length of the workspace vector WORK.
-!     LWORK is calculated as follows:
-!     If WHTSVD == 1 ::
-!        If JOBZ == 'V', then
-!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
-!        If JOBZ == 'N'  then
-!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
-!        Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
-!        workspace length of SGESVD.
-!     If WHTSVD == 2 ::
-!        If JOBZ == 'V', then
-!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
-!        If JOBZ == 'N', then
-!        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
-!        Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
-!        minimal workspace length of SGESDD.
-!     If WHTSVD == 3 ::
-!        If JOBZ == 'V', then
-!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
-!        If JOBZ == 'N', then
-!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
-!        Here LWORK_SVD = N+M+MAX(3*N+1,
-!                        MAX(1,3*N+M,5*N),MAX(1,N))
-!        is the minimal workspace length of SGESVDQ.
-!     If WHTSVD == 4 ::
-!        If JOBZ == 'V', then
-!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
-!        If JOBZ == 'N', then
-!        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
-!        Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
-!        minimal workspace length of SGEJSV.
-!     The above expressions are not simplified in order to
-!     make the usage of WORK more transparent, and for
-!     easier checking. In any case, LWORK >= 2.
-!     If on entry LWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     and the optimal workspace lengths for both WORK and
-!     IWORK. See the descriptions of WORK and IWORK.
+!>    \param[in] LWORK
+!>    \verbatim
+!>    LWORK (input) INTEGER
+!>    The minimal length of the workspace vector WORK.
+!>    LWORK is calculated as follows:
+!>    If WHTSVD == 1 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
+!>       If JOBZ == 'N'  then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
+!>       Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
+!>       workspace length of SGESVD.
+!>    If WHTSVD == 2 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
+!>       Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
+!>       minimal workspace length of SGESDD.
+!>    If WHTSVD == 3 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!>       Here LWORK_SVD = N+M+MAX(3*N+1,
+!>                       MAX(1,3*N+M,5*N),MAX(1,N))
+!>       is the minimal workspace length of SGESVDQ.
+!>    If WHTSVD == 4 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!>       Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
+!>       minimal workspace length of SGEJSV.
+!>    The above expressions are not simplified in order to
+!>    make the usage of WORK more transparent, and for
+!>    easier checking. In any case, LWORK >= 2.
+!>    If on entry LWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for both WORK and
+!>    IWORK. See the descriptions of WORK and IWORK.
+!>    \endverbatim
 !.....
-!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
-!     Workspace that is required only if WHTSVD equals
-!     2 , 3 or 4. (See the description of WHTSVD).
-!     If on entry LWORK =-1 or LIWORK=-1, then the
-!     minimal length of IWORK is computed and returned in
-!     IWORK(1). See the description of LIWORK.
+!>    \param[out] IWORK
+!>    \verbatim
+!>    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!>    Workspace that is required only if WHTSVD equals
+!>    2 , 3 or 4. (See the description of WHTSVD).
+!>    If on entry LWORK =-1 or LIWORK=-1, then the
+!>    minimal length of IWORK is computed and returned in
+!>    IWORK(1). See the description of LIWORK.
+!>    \endverbatim
 !.....
-!     LIWORK (input) INTEGER
-!     The minimal length of the workspace vector IWORK.
-!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
-!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
-!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
-!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
-!     If on entry LIWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     and the optimal workspace lengths for both WORK and
-!     IWORK. See the descriptions of WORK and IWORK.
+!>    \param[in] LIWORK
+!>    \verbatim
+!>    LIWORK (input) INTEGER
+!>    The minimal length of the workspace vector IWORK.
+!>    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!>    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!>    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!>    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!>    If on entry LIWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for both WORK and
+!>    IWORK. See the descriptions of WORK and IWORK.
+!>    \endverbatim
 !.....
-!     INFO (output) INTEGER
-!     -i < 0 :: On entry, the i-th argument had an
-!               illegal value
-!        = 0 :: Successful return.
-!        = 1 :: Void input. Quick exit (M=0 or N=0).
-!        = 2 :: The SVD computation of X did not converge.
-!               Suggestion: Check the input data and/or
-!               repeat with different WHTSVD.
-!        = 3 :: The computation of the eigenvalues did not
-!               converge.
-!        = 4 :: If data scaling was requested on input and
-!               the procedure found inconsistency in the data
-!               such that for some column index i,
-!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
-!               to zero if JOBS=='C'. The computation proceeds
-!               with original or modified data and warning
-!               flag is set with INFO=4.
+!>    \param[out] INFO
+!>    \verbatim
+!>    INFO (output) INTEGER
+!>    -i < 0 :: On entry, the i-th argument had an
+!>              illegal value
+!>       = 0 :: Successful return.
+!>       = 1 :: Void input. Quick exit (M=0 or N=0).
+!>       = 2 :: The SVD computation of X did not converge.
+!>              Suggestion: Check the input data and/or
+!>              repeat with different WHTSVD.
+!>       = 3 :: The computation of the eigenvalues did not
+!>              converge.
+!>       = 4 :: If data scaling was requested on input and
+!>              the procedure found inconsistency in the data
+!>              such that for some column index i,
+!>              X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!>              to zero if JOBS=='C'. The computation proceeds
+!>              with original or modified data and warning
+!>              flag is set with INFO=4.
+!>    \endverbatim
+!
+!  Authors:
+!  ========
+!
+!> \author Zlatko Drmac
+!
+!> \ingroup gedmd
+!
 !.............................................................
 !.............................................................
+      SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+                         K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+                         B, LDB, W,  LDW,   S, LDS,        &
+                         WORK, LWORK, IWORK, LIWORK, INFO )
+!
+!  -- LAPACK driver routine                                           --
+!
+!  -- LAPACK is a software package provided by University of          --
+!  -- Tennessee, University of California Berkeley, University of     --
+!  -- Colorado Denver and NAG Ltd..                                   --
+!
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32
+!
+!     Scalar arguments
+!     ~~~~~~~~~~~~~~~~
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LWORK,  LIWORK
+      INTEGER,   INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN) ::  TOL
+!
+!     Array arguments
+!     ~~~~~~~~~~~~~~~
+      REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                      W(LDW,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!
 !     Parameters
 !     ~~~~~~~~~~
       REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
       REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
-
+!
 !     Local scalars
 !     ~~~~~~~~~~~~~
       REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,   &
@@ -431,11 +581,11 @@
                         WNTEX, WNTREF, WNTRES, WNTVEC
       CHARACTER     ::  JOBZL, T_OR_N
       CHARACTER     ::  JSVOPT
-
+!
 !     Local arrays
 !     ~~~~~~~~~~~~
       REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2)
-
+!
 !     External functions (BLAS and LAPACK)
 !     ~~~~~~~~~~~~~~~~~
       REAL(KIND=WP) SLANGE, SLAMCH, SNRM2
@@ -443,13 +593,13 @@
       INTEGER       ISAMAX
       LOGICAL       SISNAN, LSAME
       EXTERNAL      SISNAN, LSAME
-
+!
 !     External subroutines (BLAS and LAPACK)
 !     ~~~~~~~~~~~~~~~~~~~~
       EXTERNAL      SAXPY,  SGEMM,  SSCAL
       EXTERNAL      SGEEV,  SGEJSV, SGESDD, SGESVD, SGESVDQ, &
                     SLACPY, SLASCL, SLASSQ, XERBLA
-
+!
 !     Intrinsic functions
 !     ~~~~~~~~~~~~~~~~~~~
       INTRINSIC     INT, FLOAT, MAX, SQRT
@@ -632,7 +782,8 @@
           K = 0
           DO i = 1, N
             !WORK(i) = DNRM2( M, X(1,i), 1 )
-            SCALE  = ZERO
+            SSUM  = ONE
+            SCALE = ZERO
             CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM )
             IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
                 K    =  0
@@ -705,7 +856,8 @@
           ! carefully computed using SLASSQ.
           DO i = 1, N
             !WORK(i) = DNRM2( M, Y(1,i), 1 )
-            SCALE  = ZERO
+            SSUM  = ONE
+            SCALE = ZERO
             CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM )
             IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
                 K    =  0
diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90
index 090641ad8..5045cb166 100644
--- a/lapack-netlib/SRC/zgedmd.f90
+++ b/lapack-netlib/SRC/zgedmd.f90
@@ -1,389 +1,539 @@
-      SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
-                         M, N, X, LDX, Y, LDY, NRNK, TOL,   &
-                         K, EIGS, Z, LDZ, RES, B,    LDB,   &
-                         W, LDW,  S, LDS, ZWORK,  LZWORK,   &
-                         RWORK, LRWORK, IWORK, LIWORK, INFO )
-! March 2023
-!.....
-      USE                   iso_fortran_env
-      IMPLICIT NONE
-      INTEGER, PARAMETER :: WP = real64
-
-!.....
-!     Scalar arguments
-      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
-      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
-                                 NRNK, LDZ, LDB, LDW,  LDS, &
-                                 LIWORK, LRWORK, LZWORK
-      INTEGER,       INTENT(OUT)  :: K, INFO
-      REAL(KIND=WP), INTENT(IN)   ::    TOL
-!     Array arguments
-      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
-      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
-                                         W(LDW,*), S(LDS,*)
-      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
-      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
-      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
-      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
-      INTEGER,          INTENT(OUT)   :: IWORK(*)
-!............................................................
-!     Purpose
-!     =======
-!     ZGEDMD computes the Dynamic Mode Decomposition (DMD) for
-!     a pair of data snapshot matrices. For the input matrices
-!     X and Y such that Y = A*X with an unaccessible matrix
-!     A, ZGEDMD computes a certain number of Ritz pairs of A using
-!     the standard Rayleigh-Ritz extraction from a subspace of
-!     range(X) that is determined using the leading left singular
-!     vectors of X. Optionally, ZGEDMD returns the residuals
-!     of the computed Ritz pairs, the information needed for
-!     a refinement of the Ritz vectors, or the eigenvectors of
-!     the Exact DMD.
-!     For further details see the references listed
-!     below. For more details of the implementation see [3].
-!
-!     References
-!     ==========
-!     [1] P. Schmid: Dynamic mode decomposition of numerical
-!         and experimental data,
-!         Journal of Fluid Mechanics 656, 5-28, 2010.
-!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
-!         decompositions: analysis and enhancements,
-!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
-!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
-!         Mode Decomposition I. Technical report. AIMDyn Inc.
-!         and LAPACK Working Note 298.
-!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
-!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
-!         Theory and Applications, Journal of Computational
-!         Dynamics 1(2), 391 -421, 2014.
+!> \brief \b ZGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices.
+!
+!  =========== DOCUMENTATION ===========
+!
+!  Definition:
+!  ===========
 !
+!      SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+!                         M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+!                         K, EIGS, Z, LDZ, RES, B,    LDB,   &
+!                         W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+!                         RWORK, LRWORK, IWORK, LIWORK, INFO )
+!......
+!      USE                   iso_fortran_env
+!      IMPLICIT NONE
+!      INTEGER, PARAMETER :: WP = real64
+! 
+!......
+!      Scalar arguments
+!      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+!      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+!                                 NRNK, LDZ, LDB, LDW,  LDS, &
+!                                 LIWORK, LRWORK, LZWORK
+!      INTEGER,       INTENT(OUT)  :: K, INFO
+!      REAL(KIND=WP), INTENT(IN)   ::    TOL
+!      Array arguments
+!      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+!      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+!                                         W(LDW,*), S(LDS,*)
+!      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+!      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+!      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+!      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+!      INTEGER,          INTENT(OUT)   :: IWORK(*)
+!
+!............................................................
+!>    \par Purpose:
+!     =============
+!>    \verbatim
+!>    ZGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!>    a pair of data snapshot matrices. For the input matrices
+!>    X and Y such that Y = A*X with an unaccessible matrix
+!>    A, ZGEDMD computes a certain number of Ritz pairs of A using
+!>    the standard Rayleigh-Ritz extraction from a subspace of
+!>    range(X) that is determined using the leading left singular
+!>    vectors of X. Optionally, ZGEDMD returns the residuals
+!>    of the computed Ritz pairs, the information needed for
+!>    a refinement of the Ritz vectors, or the eigenvectors of
+!>    the Exact DMD.
+!>    For further details see the references listed
+!>    below. For more details of the implementation see [3].
+!>    \endverbatim
+!............................................................
+!>    \par References:
+!     ================
+!>    \verbatim
+!>    [1] P. Schmid: Dynamic mode decomposition of numerical
+!>        and experimental data,
+!>        Journal of Fluid Mechanics 656, 5-28, 2010.
+!>    [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!>        decompositions: analysis and enhancements,
+!>        SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!>    [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!>        Mode Decomposition I. Technical report. AIMDyn Inc.
+!>        and LAPACK Working Note 298.
+!>    [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!>        Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!>        Theory and Applications, Journal of Computational
+!>        Dynamics 1(2), 391 -421, 2014.
+!>    \endverbatim
 !......................................................................
-!     Developed and supported by:
-!     ===========================
-!     Developed and coded by Zlatko Drmac, Faculty of Science,
-!     University of Zagreb;  drmac@math.hr
-!     In cooperation with
-!     AIMdyn Inc., Santa Barbara, CA.
-!     and supported by
-!     - DARPA SBIR project "Koopman Operator-Based Forecasting
-!     for Nonstationary Processes from Near-Term, Limited
-!     Observational Data" Contract No: W31P4Q-21-C-0007
-!     - DARPA PAI project "Physics-Informed Machine Learning
-!     Methodologies" Contract No: HR0011-18-9-0033
-!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
-!     Framework for Space-Time Analysis of Process Dynamics"
-!     Contract No: HR0011-16-C-0116
-!     Any opinions, findings and conclusions or recommendations
-!     expressed in this material are those of the author and
-!     do not necessarily reflect the views of the DARPA SBIR
-!     Program Office
-!============================================================
-!     Distribution Statement A:
-!     Approved for Public Release, Distribution Unlimited.
-!     Cleared by DARPA on September 29, 2022
-!============================================================
+!>    \par Developed and supported by:
+!     ================================
+!>    \verbatim
+!>    Developed and coded by Zlatko Drmac, Faculty of Science,
+!>    University of Zagreb;  drmac@math.hr
+!>    In cooperation with
+!>    AIMdyn Inc., Santa Barbara, CA.
+!>    and supported by
+!>    - DARPA SBIR project "Koopman Operator-Based Forecasting
+!>    for Nonstationary Processes from Near-Term, Limited
+!>    Observational Data" Contract No: W31P4Q-21-C-0007
+!>    - DARPA PAI project "Physics-Informed Machine Learning
+!>    Methodologies" Contract No: HR0011-18-9-0033
+!>    - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!>    Framework for Space-Time Analysis of Process Dynamics"
+!>    Contract No: HR0011-16-C-0116
+!>    Any opinions, findings and conclusions or recommendations
+!>    expressed in this material are those of the author and
+!>    do not necessarily reflect the views of the DARPA SBIR
+!>    Program Office
+!>    \endverbatim
+!......................................................................
+!>    \par Distribution Statement A:
+!     ==============================
+!>    \verbatim
+!>    Approved for Public Release, Distribution Unlimited.
+!>    Cleared by DARPA on September 29, 2022
+!>    \endverbatim
 !............................................................
 !     Arguments
 !     =========
-!     JOBS (input) CHARACTER*1
-!     Determines whether the initial data snapshots are scaled
-!     by a diagonal matrix.
-!     'S' :: The data snapshots matrices X and Y are multiplied
-!            with a diagonal matrix D so that X*D has unit
-!            nonzero columns (in the Euclidean 2-norm)
-!     'C' :: The snapshots are scaled as with the 'S' option.
-!            If it is found that an i-th column of X is zero
-!            vector and the corresponding i-th column of Y is
-!            non-zero, then the i-th column of Y is set to
-!            zero and a warning flag is raised.
-!     'Y' :: The data snapshots matrices X and Y are multiplied
-!            by a diagonal matrix D so that Y*D has unit
-!            nonzero columns (in the Euclidean 2-norm)
-!     'N' :: No data scaling.
+!
+!>    \param[in] JOBS
+!>    \verbatim
+!>    JOBS (input) CHARACTER*1
+!>    Determines whether the initial data snapshots are scaled
+!>    by a diagonal matrix.
+!>    'S' :: The data snapshots matrices X and Y are multiplied
+!>           with a diagonal matrix D so that X*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'C' :: The snapshots are scaled as with the 'S' option.
+!>           If it is found that an i-th column of X is zero
+!>           vector and the corresponding i-th column of Y is
+!>           non-zero, then the i-th column of Y is set to
+!>           zero and a warning flag is raised.
+!>    'Y' :: The data snapshots matrices X and Y are multiplied
+!>           by a diagonal matrix D so that Y*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'N' :: No data scaling.
+!>    \endverbatim
 !.....
-!     JOBZ (input) CHARACTER*1
-!     Determines whether the eigenvectors (Koopman modes) will
-!     be computed.
-!     'V' :: The eigenvectors (Koopman modes) will be computed
-!            and returned in the matrix Z.
-!            See the description of Z.
-!     'F' :: The eigenvectors (Koopman modes) will be returned
-!            in factored form as the product X(:,1:K)*W, where X
-!            contains a POD basis (leading left singular vectors
-!            of the data matrix X) and W contains the eigenvectors
-!            of the corresponding Rayleigh quotient.
-!            See the descriptions of K, X, W, Z.
-!     'N' :: The eigenvectors are not computed.
+!>    \param[in] JOBZ
+!>    \verbatim
+!>    JOBZ (input) CHARACTER*1
+!>    Determines whether the eigenvectors (Koopman modes) will
+!>    be computed.
+!>    'V' :: The eigenvectors (Koopman modes) will be computed
+!>           and returned in the matrix Z.
+!>           See the description of Z.
+!>    'F' :: The eigenvectors (Koopman modes) will be returned
+!>           in factored form as the product X(:,1:K)*W, where X
+!>           contains a POD basis (leading left singular vectors
+!>           of the data matrix X) and W contains the eigenvectors
+!>           of the corresponding Rayleigh quotient.
+!>           See the descriptions of K, X, W, Z.
+!>    'N' :: The eigenvectors are not computed.
+!>    \endverbatim
 !.....
-!     JOBR (input) CHARACTER*1
-!     Determines whether to compute the residuals.
-!     'R' :: The residuals for the computed eigenpairs will be
-!            computed and stored in the array RES.
-!            See the description of RES.
-!            For this option to be legal, JOBZ must be 'V'.
-!     'N' :: The residuals are not computed.
+!>    \param[in] JOBR
+!>    \verbatim
+!>    JOBR (input) CHARACTER*1
+!>    Determines whether to compute the residuals.
+!>    'R' :: The residuals for the computed eigenpairs will be
+!>           computed and stored in the array RES.
+!>           See the description of RES.
+!>           For this option to be legal, JOBZ must be 'V'.
+!>    'N' :: The residuals are not computed.
+!>    \endverbatim
 !.....
-!     JOBF (input) CHARACTER*1
-!     Specifies whether to store information needed for post-
-!     processing (e.g. computing refined Ritz vectors)
-!     'R' :: The matrix needed for the refinement of the Ritz
-!            vectors is computed and stored in the array B.
-!            See the description of B.
-!     'E' :: The unscaled eigenvectors of the Exact DMD are
-!            computed and returned in the array B. See the
-!            description of B.
-!     'N' :: No eigenvector refinement data is computed.
+!>    \param[in] JOBF
+!>    \verbatim
+!>    JOBF (input) CHARACTER*1
+!>    Specifies whether to store information needed for post-
+!>    processing (e.g. computing refined Ritz vectors)
+!>    'R' :: The matrix needed for the refinement of the Ritz
+!>           vectors is computed and stored in the array B.
+!>           See the description of B.
+!>    'E' :: The unscaled eigenvectors of the Exact DMD are
+!>           computed and returned in the array B. See the
+!>           description of B.
+!>    'N' :: No eigenvector refinement data is computed.
+!>    \endverbatim
 !.....
-!     WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
-!     Allows for a selection of the SVD algorithm from the
-!     LAPACK library.
-!     1 :: ZGESVD (the QR SVD algorithm)
-!     2 :: ZGESDD (the Divide and Conquer algorithm; if enough
-!          workspace available, this is the fastest option)
-!     3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4
-!          are the most accurate options)
-!     4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3
-!          are the most accurate options)
-!     For the four methods above, a significant difference in
-!     the accuracy of small singular values is possible if
-!     the snapshots vary in norm so that X is severely
-!     ill-conditioned. If small (smaller than EPS*||X||)
-!     singular values are of interest and JOBS=='N',  then
-!     the options (3, 4) give the most accurate results, where
-!     the option 4 is slightly better and with stronger
-!     theoretical background.
-!     If JOBS=='S', i.e. the columns of X will be normalized,
-!     then all methods give nearly equally accurate results.
+!>    \param[in] WHTSVD
+!>    \verbatim
+!>    WHTSVD (input) INTEGER, WHSTVD in { 1, 2, 3, 4 }
+!>    Allows for a selection of the SVD algorithm from the
+!>    LAPACK library.
+!>    1 :: ZGESVD (the QR SVD algorithm)
+!>    2 :: ZGESDD (the Divide and Conquer algorithm; if enough
+!>         workspace available, this is the fastest option)
+!>    3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4
+!>         are the most accurate options)
+!>    4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3
+!>         are the most accurate options)
+!>    For the four methods above, a significant difference in
+!>    the accuracy of small singular values is possible if
+!>    the snapshots vary in norm so that X is severely
+!>    ill-conditioned. If small (smaller than EPS*||X||)
+!>    singular values are of interest and JOBS=='N',  then
+!>    the options (3, 4) give the most accurate results, where
+!>    the option 4 is slightly better and with stronger
+!>    theoretical background.
+!>    If JOBS=='S', i.e. the columns of X will be normalized,
+!>    then all methods give nearly equally accurate results.
+!>    \endverbatim
 !.....
-!     M (input) INTEGER, M>= 0
-!     The state space dimension (the row dimension of X, Y).
+!>    \param[in] M
+!>    \verbatim
+!>    M (input) INTEGER, M>= 0
+!>    The state space dimension (the row dimension of X, Y).
+!>    \endverbatim
 !.....
-!     N (input) INTEGER, 0 <= N <= M
-!     The number of data snapshot pairs
-!     (the number of columns of X and Y).
+!>    \param[in] N
+!>    \verbatim
+!>    N (input) INTEGER, 0 <= N <= M
+!>    The number of data snapshot pairs
+!>    (the number of columns of X and Y).
+!>    \endverbatim
 !.....
-!     X (input/output) COMPLEX(KIND=WP) M-by-N array
-!   > On entry, X contains the data snapshot matrix X. It is
-!     assumed that the column norms of X are in the range of
-!     the normalized floating point numbers.
-!   < On exit, the leading K columns of X contain a POD basis,
-!     i.e. the leading K left singular vectors of the input
-!     data matrix X, U(:,1:K). All N columns of X contain all
-!     left singular vectors of the input matrix X.
-!     See the descriptions of K, Z and W.
+!>    \param[in] LDX
+!>    \verbatim
+!>    X (input/output) COMPLEX(KIND=WP) M-by-N array
+!>    > On entry, X contains the data snapshot matrix X. It is
+!>    assumed that the column norms of X are in the range of
+!>    the normalized floating point numbers.
+!>    < On exit, the leading K columns of X contain a POD basis,
+!>    i.e. the leading K left singular vectors of the input
+!>    data matrix X, U(:,1:K). All N columns of X contain all
+!>    left singular vectors of the input matrix X.
+!>    See the descriptions of K, Z and W.
 !.....
-!     LDX (input) INTEGER, LDX >= M
-!     The leading dimension of the array X.
+!>    LDX (input) INTEGER, LDX >= M
+!>    The leading dimension of the array X.
+!>    \endverbatim
 !.....
-!     Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array
-!   > On entry, Y contains the data snapshot matrix Y
-!   < On exit,
-!     If JOBR == 'R', the leading K columns of Y  contain
-!     the residual vectors for the computed Ritz pairs.
-!     See the description of RES.
-!     If JOBR == 'N', Y contains the original input data,
-!                     scaled according to the value of JOBS.
+!>    \param[in,out] Y
+!>    \verbatim
+!>    Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array
+!>    > On entry, Y contains the data snapshot matrix Y
+!>    < On exit,
+!>    If JOBR == 'R', the leading K columns of Y  contain
+!>    the residual vectors for the computed Ritz pairs.
+!>    See the description of RES.
+!>    If JOBR == 'N', Y contains the original input data,
+!>                    scaled according to the value of JOBS.
+!>    \endverbatim
 !.....
-!     LDY (input) INTEGER , LDY >= M
-!     The leading dimension of the array Y.
+!>    \param[in] LDY
+!>    \verbatim
+!>    LDY (input) INTEGER , LDY >= M
+!>    The leading dimension of the array Y.
+!>    \endverbatim
 !.....
-!     NRNK (input) INTEGER
-!     Determines the mode how to compute the numerical rank,
-!     i.e. how to truncate small singular values of the input
-!     matrix X. On input, if
-!     NRNK = -1 :: i-th singular value sigma(i) is truncated
-!                  if sigma(i) <= TOL*sigma(1)
-!                  This option is recommended.
-!     NRNK = -2 :: i-th singular value sigma(i) is truncated
-!                  if sigma(i) <= TOL*sigma(i-1)
-!                  This option is included for R&D purposes.
-!                  It requires highly accurate SVD, which
-!                  may not be feasible.
-!     The numerical rank can be enforced by using positive
-!     value of NRNK as follows:
-!     0 < NRNK <= N :: at most NRNK largest singular values
-!     will be used. If the number of the computed nonzero
-!     singular values is less than NRNK, then only those
-!     nonzero values will be used and the actually used
-!     dimension is less than NRNK. The actual number of
-!     the nonzero singular values is returned in the variable
-!     K. See the descriptions of TOL and  K.
+!>    \param[in] NRNK
+!>    \verbatim
+!>    NRNK (input) INTEGER
+!>    Determines the mode how to compute the numerical rank,
+!>    i.e. how to truncate small singular values of the input
+!>    matrix X. On input, if
+!>    NRNK = -1 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(1)
+!>                 This option is recommended.
+!>    NRNK = -2 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(i-1)
+!>                 This option is included for R&D purposes.
+!>                 It requires highly accurate SVD, which
+!>                 may not be feasible.
+!>    The numerical rank can be enforced by using positive
+!>    value of NRNK as follows:
+!>    0 < NRNK <= N :: at most NRNK largest singular values
+!>    will be used. If the number of the computed nonzero
+!>    singular values is less than NRNK, then only those
+!>    nonzero values will be used and the actually used
+!>    dimension is less than NRNK. The actual number of
+!>    the nonzero singular values is returned in the variable
+!>    K. See the descriptions of TOL and  K.
+!>    \endverbatim
 !.....
-!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
-!     The tolerance for truncating small singular values.
-!     See the description of NRNK.
+!>    \param[in] TOL
+!>    \verbatim
+!>    TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!>    The tolerance for truncating small singular values.
+!>    See the description of NRNK.
+!>    \endverbatim
 !.....
-!     K (output) INTEGER,  0 <= K <= N
-!     The dimension of the POD basis for the data snapshot
-!     matrix X and the number of the computed Ritz pairs.
-!     The value of K is determined according to the rule set
-!     by the parameters NRNK and TOL.
-!     See the descriptions of NRNK and TOL.
+!>    \param[out] K
+!>    \verbatim
+!>    K (output) INTEGER,  0 <= K <= N
+!>    The dimension of the POD basis for the data snapshot
+!>    matrix X and the number of the computed Ritz pairs.
+!>    The value of K is determined according to the rule set
+!>    by the parameters NRNK and TOL.
+!>    See the descriptions of NRNK and TOL.
+!>    \endverbatim
 !.....
-!     EIGS (output) COMPLEX(KIND=WP) N-by-1 array
-!     The leading K (K<=N) entries of EIGS contain
-!     the computed eigenvalues (Ritz values).
-!     See the descriptions of K, and Z.
+!>    \param[out] EIGS
+!>    \verbatim
+!>    EIGS (output) COMPLEX(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of EIGS contain
+!>    the computed eigenvalues (Ritz values).
+!>    See the descriptions of K, and Z.
+!>    \endverbatim
 !.....
-!     Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array
-!     If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i)
-!     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
-!     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
-!     the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
-!     is an eigenvector corresponding to EIGS(i). The columns
-!     of W(1:k,1:K) are the computed eigenvectors of the
-!     K-by-K Rayleigh quotient.
-!     See the descriptions of EIGS, X and W.
+!>    \param[out] Z
+!>    \verbatim
+!>    Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array
+!>    If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i)
+!>    is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!>    If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!>    the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
+!>    is an eigenvector corresponding to EIGS(i). The columns
+!>    of W(1:k,1:K) are the computed eigenvectors of the
+!>    K-by-K Rayleigh quotient.
+!>    See the descriptions of EIGS, X and W.
+!>    \endverbatim
 !.....
-!     LDZ (input) INTEGER , LDZ >= M
-!     The leading dimension of the array Z.
+!>    \param[in] LDZ
+!>    \verbatim
+!>    LDZ (input) INTEGER , LDZ >= M
+!>    The leading dimension of the array Z.
+!>    \endverbatim
 !.....
-!     RES (output) REAL(KIND=WP) N-by-1 array
-!     RES(1:K) contains the residuals for the K computed
-!     Ritz pairs,
-!     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
-!     See the description of EIGS and Z.
+!>    \param[out] RES
+!>    \verbatim
+!>    RES (output) REAL(KIND=WP) N-by-1 array
+!>    RES(1:K) contains the residuals for the K computed
+!>    Ritz pairs,
+!>    RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
+!>    See the description of EIGS and Z.
+!>    \endverbatim
 !.....
-!     B (output) COMPLEX(KIND=WP)  M-by-N array.
-!     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
-!     be used for computing the refined vectors; see further
-!     details in the provided references.
-!     If JOBF == 'E', B(1:M,1:K) contains
-!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
-!     Exact DMD, up to scaling by the inverse eigenvalues.
-!     If JOBF =='N', then B is not referenced.
-!     See the descriptions of X, W, K.
+!>    \param[out] B
+!>    \verbatim
+!>    B (output) COMPLEX(KIND=WP)  M-by-N array.
+!>    IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!>    be used for computing the refined vectors; see further
+!>    details in the provided references.
+!>    If JOBF == 'E', B(1:M,1:K) contains
+!>    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!>    Exact DMD, up to scaling by the inverse eigenvalues.
+!>    If JOBF =='N', then B is not referenced.
+!>    See the descriptions of X, W, K.
+!>    \endverbatim
 !.....
-!     LDB (input) INTEGER, LDB >= M
-!     The leading dimension of the array B.
+!>    \param[in] LDB
+!>    \verbatim
+!>    LDB (input) INTEGER, LDB >= M
+!>    The leading dimension of the array B.
+!>    \endverbatim
 !.....
-!     W (workspace/output) COMPLEX(KIND=WP) N-by-N array
-!     On exit, W(1:K,1:K) contains the K computed
-!     eigenvectors of the matrix Rayleigh quotient.
-!     The Ritz vectors (returned in Z) are the
-!     product of X (containing a POD basis for the input
-!     matrix X) and W. See the descriptions of K, S, X and Z.
-!     W is also used as a workspace to temporarily store the
-!     right singular vectors of X.
+!>    \param[out] W
+!>    \verbatim
+!>    W (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!>    On exit, W(1:K,1:K) contains the K computed
+!>    eigenvectors of the matrix Rayleigh quotient.
+!>    The Ritz vectors (returned in Z) are the
+!>    product of X (containing a POD basis for the input
+!>    matrix X) and W. See the descriptions of K, S, X and Z.
+!>    W is also used as a workspace to temporarily store the
+!>    right singular vectors of X.
+!>    \endverbatim
 !.....
-!     LDW (input) INTEGER, LDW >= N
-!     The leading dimension of the array W.
+!>    \param[in] LDW
+!>    \verbatim
+!>    LDW (input) INTEGER, LDW >= N
+!>    The leading dimension of the array W.
+!>    \endverbatim
 !.....
-!     S (workspace/output) COMPLEX(KIND=WP) N-by-N array
-!     The array S(1:K,1:K) is used for the matrix Rayleigh
-!     quotient. This content is overwritten during
-!     the eigenvalue decomposition by ZGEEV.
-!     See the description of K.
+!>    \param[out] S
+!>    \verbatim
+!>    S (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!>    The array S(1:K,1:K) is used for the matrix Rayleigh
+!>    quotient. This content is overwritten during
+!>    the eigenvalue decomposition by ZGEEV.
+!>    See the description of K.
+!>    \endverbatim
 !.....
-!     LDS (input) INTEGER, LDS >= N
-!     The leading dimension of the array S.
+!>    \param[in] LDS
+!>    \verbatim
+!>    LDS (input) INTEGER, LDS >= N
+!>    The leading dimension of the array S.
+!>    \endverbatim
 !.....
-!     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
-!     ZWORK is used as complex workspace in the complex SVD, as
-!     specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing
-!     the eigenvalues of a Rayleigh quotient.
-!     If the call to ZGEDMD is only workspace query, then
-!     ZWORK(1) contains the minimal complex workspace length and
-!     ZWORK(2) is the optimal complex workspace length.
-!     Hence, the length of work is at least 2.
-!     See the description of LZWORK.
+!>    \param[out] ZWORK
+!>    \verbatim
+!>    ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+!>    ZWORK is used as complex workspace in the complex SVD, as
+!>    specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing
+!>    the eigenvalues of a Rayleigh quotient.
+!>    If the call to ZGEDMD is only workspace query, then
+!>    ZWORK(1) contains the minimal complex workspace length and
+!>    ZWORK(2) is the optimal complex workspace length.
+!>    Hence, the length of work is at least 2.
+!>    See the description of LZWORK.
+!>    \endverbatim
 !.....
-!     LZWORK (input) INTEGER
-!     The minimal length of the workspace vector ZWORK.
-!     LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV),
-!     where LZWORK_ZGEEV = MAX( 1, 2*N )  and the minimal
-!     LZWORK_SVD is calculated as follows
-!     If WHTSVD == 1 :: ZGESVD ::
-!        LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
-!     If WHTSVD == 2 :: ZGESDD ::
-!        LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
-!     If WHTSVD == 3 :: ZGESVDQ ::
-!        LZWORK_SVD = obtainable by a query
-!     If WHTSVD == 4 :: ZGEJSV ::
-!        LZWORK_SVD = obtainable by a query
-!     If on entry LZWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     and the optimal workspace lengths and returns them in
-!     LZWORK(1) and LZWORK(2), respectively.
+!>    \param[in] LZWORK
+!>    \verbatim
+!>    LZWORK (input) INTEGER
+!>    The minimal length of the workspace vector ZWORK.
+!>    LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV),
+!>    where LZWORK_ZGEEV = MAX( 1, 2*N )  and the minimal
+!>    LZWORK_SVD is calculated as follows
+!>    If WHTSVD == 1 :: ZGESVD ::
+!>       LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+!>    If WHTSVD == 2 :: ZGESDD ::
+!>       LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+!>    If WHTSVD == 3 :: ZGESVDQ ::
+!>       LZWORK_SVD = obtainable by a query
+!>    If WHTSVD == 4 :: ZGEJSV ::
+!>       LZWORK_SVD = obtainable by a query
+!>    If on entry LZWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths and returns them in
+!>    LZWORK(1) and LZWORK(2), respectively.
+!>    \endverbatim
 !.....
-!     RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
-!     On exit, RWORK(1:N) contains the singular values of
-!     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
-!     If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain
-!     scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
-!     and Y to avoid overflow in the SVD of X.
-!     This may be of interest if the scaling option is off
-!     and as many as possible smallest eigenvalues are
-!     desired to the highest feasible accuracy.
-!     If the call to ZGEDMD is only workspace query, then
-!     RWORK(1) contains the minimal workspace length.
-!     See the description of LRWORK.
+!>    \param[out] RWORK
+!>    \verbatim
+!>    RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
+!>    On exit, RWORK(1:N) contains the singular values of
+!>    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!>    If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain
+!>    scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
+!>    and Y to avoid overflow in the SVD of X.
+!>    This may be of interest if the scaling option is off
+!>    and as many as possible smallest eigenvalues are
+!>    desired to the highest feasible accuracy.
+!>    If the call to ZGEDMD is only workspace query, then
+!>    RWORK(1) contains the minimal workspace length.
+!>    See the description of LRWORK.
+!>    \endverbatim
 !.....
-!     LRWORK (input) INTEGER
-!     The minimal length of the workspace vector RWORK.
-!     LRWORK is calculated as follows:
-!     LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where
-!     LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace
-!     for the SVD subroutine determined by the input parameter
-!     WHTSVD.
-!     If WHTSVD == 1 :: ZGESVD ::
-!        LRWORK_SVD = 5*MIN(M,N)
-!     If WHTSVD == 2 :: ZGESDD ::
-!        LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
-!        2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
-!     If WHTSVD == 3 :: ZGESVDQ ::
-!        LRWORK_SVD = obtainable by a query
-!     If WHTSVD == 4 :: ZGEJSV ::
-!        LRWORK_SVD = obtainable by a query
-!     If on entry LRWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     real workspace length and returns it in RWORK(1).
+!>    \param[in] LRWORK
+!>    \verbatim
+!>    LRWORK (input) INTEGER
+!>    The minimal length of the workspace vector RWORK.
+!>    LRWORK is calculated as follows:
+!>    LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where
+!>    LRWORK_ZGEEV = MAX(1,2*N) and RWORK_SVD is the real workspace
+!>    for the SVD subroutine determined by the input parameter
+!>    WHTSVD.
+!>    If WHTSVD == 1 :: ZGESVD ::
+!>       LRWORK_SVD = 5*MIN(M,N)
+!>    If WHTSVD == 2 :: ZGESDD ::
+!>       LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
+!>       2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
+!>    If WHTSVD == 3 :: ZGESVDQ ::
+!>       LRWORK_SVD = obtainable by a query
+!>    If WHTSVD == 4 :: ZGEJSV ::
+!>       LRWORK_SVD = obtainable by a query
+!>    If on entry LRWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    real workspace length and returns it in RWORK(1).
+!>    \endverbatim
 !.....
-!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
-!     Workspace that is required only if WHTSVD equals
-!     2 , 3 or 4. (See the description of WHTSVD).
-!     If on entry LWORK =-1 or LIWORK=-1, then the
-!     minimal length of IWORK is computed and returned in
-!     IWORK(1). See the description of LIWORK.
+!>    \param[out] IWORK
+!>    \verbatim
+!>    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!>    Workspace that is required only if WHTSVD equals
+!>    2 , 3 or 4. (See the description of WHTSVD).
+!>    If on entry LWORK =-1 or LIWORK=-1, then the
+!>    minimal length of IWORK is computed and returned in
+!>    IWORK(1). See the description of LIWORK.
+!>    \endverbatim
 !.....
-!     LIWORK (input) INTEGER
-!     The minimal length of the workspace vector IWORK.
-!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
-!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
-!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
-!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
-!     If on entry LIWORK = -1, then a workspace query is
-!     assumed and the procedure only computes the minimal
-!     and the optimal workspace lengths for  ZWORK, RWORK and
-!     IWORK. See the descriptions of ZWORK, RWORK and IWORK.
+!>    \param[in] LIWORK
+!>    \verbatim
+!>    LIWORK (input) INTEGER
+!>    The minimal length of the workspace vector IWORK.
+!>    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!>    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!>    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!>    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!>    If on entry LIWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for  ZWORK, RWORK and
+!>    IWORK. See the descriptions of ZWORK, RWORK and IWORK.
+!>    \endverbatim
 !.....
-!     INFO (output) INTEGER
-!     -i < 0 :: On entry, the i-th argument had an
-!               illegal value
-!        = 0 :: Successful return.
-!        = 1 :: Void input. Quick exit (M=0 or N=0).
-!        = 2 :: The SVD computation of X did not converge.
-!               Suggestion: Check the input data and/or
-!               repeat with different WHTSVD.
-!        = 3 :: The computation of the eigenvalues did not
-!               converge.
-!        = 4 :: If data scaling was requested on input and
-!               the procedure found inconsistency in the data
-!               such that for some column index i,
-!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
-!               to zero if JOBS=='C'. The computation proceeds
-!               with original or modified data and warning
-!               flag is set with INFO=4.
+!>    \param[out] INFO
+!>    \verbatim
+!>    INFO (output) INTEGER
+!>    -i < 0 :: On entry, the i-th argument had an
+!>              illegal value
+!>       = 0 :: Successful return.
+!>       = 1 :: Void input. Quick exit (M=0 or N=0).
+!>       = 2 :: The SVD computation of X did not converge.
+!>              Suggestion: Check the input data and/or
+!>              repeat with different WHTSVD.
+!>       = 3 :: The computation of the eigenvalues did not
+!>              converge.
+!>       = 4 :: If data scaling was requested on input and
+!>              the procedure found inconsistency in the data
+!>              such that for some column index i,
+!>              X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!>              to zero if JOBS=='C'. The computation proceeds
+!>              with original or modified data and warning
+!>              flag is set with INFO=4.
+!>    \endverbatim
+!
+!  Authors:
+!  ========
+!
+!> \author Zlatko Drmac
+!
+!> \ingroup gedmd
+!
 !.............................................................
 !.............................................................
+      SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+                         K, EIGS, Z, LDZ, RES, B,    LDB,   &
+                         W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+                         RWORK, LRWORK, IWORK, LIWORK, INFO )
+!
+!  -- LAPACK driver routine                                           --
+!
+!  -- LAPACK is a software package provided by University of          --
+!  -- Tennessee, University of California Berkeley, University of     --
+!  -- Colorado Denver and NAG Ltd..                                   --
+!
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64
+!
+!     Scalar arguments
+!     ~~~~~~~~~~~~~~~~
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LIWORK, LRWORK, LZWORK
+      INTEGER,       INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN)   ::    TOL
+!
+!     Array arguments
+!     ~~~~~~~~~~~~~~~
+      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                         W(LDW,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+      INTEGER,          INTENT(OUT)   :: IWORK(*)
+!
 !     Parameters
 !     ~~~~~~~~~~
       REAL(KIND=WP),    PARAMETER ::  ONE = 1.0_WP
       REAL(KIND=WP),    PARAMETER :: ZERO = 0.0_WP
       COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
       COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
-
+!
 !     Local scalars
 !     ~~~~~~~~~~~~~
       REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,    &
@@ -401,7 +551,7 @@
 !     Local arrays
 !     ~~~~~~~~~~~~
       REAL(KIND=WP) :: RDUMMY(2)
-
+!
 !     External functions (BLAS and LAPACK)
 !     ~~~~~~~~~~~~~~~~~
       REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2
@@ -409,13 +559,13 @@
       INTEGER                               IZAMAX
       LOGICAL       DISNAN, LSAME
       EXTERNAL      DISNAN, LSAME
-
+!
 !     External subroutines (BLAS and LAPACK)
 !     ~~~~~~~~~~~~~~~~~~~~
       EXTERNAL      ZAXPY,  ZGEMM,  ZDSCAL
       EXTERNAL      ZGEEV,  ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, &
                     ZLACPY, ZLASCL, ZLASSQ, XERBLA
-
+!
 !     Intrinsic functions
 !     ~~~~~~~~~~~~~~~~~~~
       INTRINSIC     DBLE, INT, MAX, SQRT
@@ -608,7 +758,8 @@
           K = 0
           DO i = 1, N
             !WORK(i) = DZNRM2( M, X(1,i), 1 )
-            SCALE  = ZERO
+            SSUM  = ONE
+            SCALE = ZERO
             CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM )
             IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
                 K    =  0
@@ -681,7 +832,8 @@
           ! carefully computed using ZLASSQ.
           DO i = 1, N
             !RWORK(i) = DZNRM2( M, Y(1,i), 1 )
-            SCALE  = ZERO
+            SSUM  = ONE
+            SCALE = ZERO
             CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM )
             IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
                 K    =  0

From 283713e4c541d9e77b4d3922bdc40245a8354a89 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 23:32:45 +0100
Subject: [PATCH 506/718] Add tests for ?GEDMD (Reference-LAPACK PR 959)

---
 lapack-netlib/TESTING/CMakeLists.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt
index b4e2223f7..d4e6f970d 100644
--- a/lapack-netlib/TESTING/CMakeLists.txt
+++ b/lapack-netlib/TESTING/CMakeLists.txt
@@ -54,6 +54,9 @@ add_lapack_test(sgqr.out gqr.in xeigtsts)
 add_lapack_test(sgsv.out gsv.in xeigtsts)
 add_lapack_test(scsd.out csd.in xeigtsts)
 add_lapack_test(slse.out lse.in xeigtsts)
+#
+# ======== SINGLE DMD EIG TESTS ===========================
+add_lapack_test(sdmd.out sdmd.in xdmdeigtsts)
 endif()
 
 if(BUILD_DOUBLE)
@@ -85,6 +88,9 @@ add_lapack_test(dgqr.out gqr.in xeigtstd)
 add_lapack_test(dgsv.out gsv.in xeigtstd)
 add_lapack_test(dcsd.out csd.in xeigtstd)
 add_lapack_test(dlse.out lse.in xeigtstd)
+#
+# ======== DOUBLE DMD EIG TESTS ===========================
+add_lapack_test(ddmd.out ddmd.in xdmdeigtstd)
 endif()
 
 if(BUILD_COMPLEX)
@@ -114,6 +120,9 @@ add_lapack_test(cgqr.out gqr.in xeigtstc)
 add_lapack_test(cgsv.out gsv.in xeigtstc)
 add_lapack_test(ccsd.out csd.in xeigtstc)
 add_lapack_test(clse.out lse.in xeigtstc)
+#
+# ======== COMPLEX DMD EIG TESTS ===========================
+add_lapack_test(cdmd.out cdmd.in xdmdeigtstc)
 endif()
 
 if(BUILD_COMPLEX16)
@@ -145,6 +154,9 @@ add_lapack_test(zgqr.out gqr.in xeigtstz)
 add_lapack_test(zgsv.out gsv.in xeigtstz)
 add_lapack_test(zcsd.out csd.in xeigtstz)
 add_lapack_test(zlse.out lse.in xeigtstz)
+#
+# ======== COMPLEX16 DMD EIG TESTS ===========================
+add_lapack_test(zdmd.out zdmd.in xdmdeigtstz)
 endif()
 
 

From c6fa9210278da4c505aec2fab8244ad009b1f944 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Dec 2023 23:39:53 +0100
Subject: [PATCH 507/718] Add tests for ?GEDMD (Reference-LAPACK PR 959)

---
 lapack-netlib/TESTING/EIG/CMakeLists.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt
index d252c7fa9..e7236677a 100644
--- a/lapack-netlib/TESTING/EIG/CMakeLists.txt
+++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt
@@ -42,6 +42,8 @@ set(SEIGTST schkee.F
    sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f
    sstt22.f ssyl01.f ssyt21.f ssyt22.f)
 
+set(SDMDEIGTST schkdmd.f90)
+
 set(CEIGTST cchkee.F
    cbdt01.f cbdt02.f cbdt03.f cbdt05.f
    cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f
@@ -59,6 +61,8 @@ set(CEIGTST cchkee.F
    csgt01.f cslect.f csyl01.f
    cstt21.f cstt22.f cunt01.f cunt03.f)
 
+set(CDMDEIGTST cchkdmd.f90)
+
 set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f
    dsvdch.f dsvdct.f dsxt1.f)
 
@@ -79,6 +83,8 @@ set(DEIGTST dchkee.F
    dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f
    dstt22.f dsyl01.f dsyt21.f dsyt22.f)
 
+set(DDMDEIGTST dchkdmd.f90)
+
 set(ZEIGTST zchkee.F
    zbdt01.f zbdt02.f zbdt03.f zbdt05.f
    zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f
@@ -96,6 +102,8 @@ set(ZEIGTST zchkee.F
    zsgt01.f zslect.f zsyl01.f
    zstt21.f zstt22.f zunt01.f zunt03.f)
 
+set(ZDMDEIGTST zchkdmd.f90)
+
 macro(add_eig_executable name)
   add_executable(${name} ${ARGN})
   target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
@@ -104,16 +112,20 @@ endmacro()
 
 if(BUILD_SINGLE)
 add_eig_executable(xeigtsts ${SEIGTST} ${SCIGTST} ${AEIGTST})
+add_eig_executable(xdmdeigtsts ${SDMDEIGTST})
 endif()
 
 if(BUILD_COMPLEX)
 add_eig_executable(xeigtstc ${CEIGTST} ${SCIGTST} ${AEIGTST})
+add_eig_executable(xdmdeigtstc ${CDMDEIGTST})
 endif()
 
 if(BUILD_DOUBLE)
 add_eig_executable(xeigtstd ${DEIGTST} ${DZIGTST} ${AEIGTST})
+add_eig_executable(xdmdeigtstd ${DDMDEIGTST})
 endif()
 
 if(BUILD_COMPLEX16)
 add_eig_executable(xeigtstz ${ZEIGTST} ${DZIGTST} ${AEIGTST})
+add_eig_executable(xdmdeigtstz ${ZDMDEIGTST})
 endif()

From 0baf462dbca147a82db9412b521459b60e417d53 Mon Sep 17 00:00:00 2001
From: Wu Xiaotian <wuxiaotian@zhcn.cc>
Date: Wed, 20 Dec 2023 10:34:47 +0800
Subject: [PATCH 508/718] Fix: build failed on LoongArch

According to the documentation at https://github.com/loongson/la-abi-specs/blob/release/lapcs.adoc#the-base-abi-variants, valid -mabi parameters are lp64s, lp64f, lp64d, ilp32s, ilp32f and ilp32d.
---
 cmake/cc.cmake | 14 ++++++++++++--
 cmake/fc.cmake | 14 ++++++++++++--
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 00952e810..2da941afb 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
 
     if (LOONGARCH64)
       if (BINARY64)
-        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+	CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+        if(COMPILER_SUPPORT_LP64D_ABI)
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
+	else()
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+	endif ()
       else ()
-        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+	CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+	if(COMPILER_SUPPORT_ILP32D_ABI)
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
+	else()
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+	endif ()
       endif ()
       set(BINARY_DEFINED 1)
     endif ()
diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index c496f6368..5c30be843 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
     endif ()
     if (LOONGARCH64)
       if (BINARY64)
-        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+	CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+        if(COMPILER_SUPPORT_LP64D_ABI)
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
+	else()
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+	endif ()
       else ()
-        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+	CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+	if(COMPILER_SUPPORT_ILP32D_ABI)
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
+	else()
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+	endif ()
       endif ()
     endif ()
     if (RISCV64)

From d2f4f1b28aa4c00a4bc4bc1b512f162b349cae4c Mon Sep 17 00:00:00 2001
From: Xiaotian Wu <wuxiaotian@loongson.cn>
Date: Wed, 20 Dec 2023 14:13:04 +0800
Subject: [PATCH 509/718] CI: update toolchains for LoongArch64

---
 .github/workflows/loongarch64.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
index 4a9bf98b6..42393eb0c 100644
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@@ -16,13 +16,13 @@ jobs:
         include:
           - target: LOONGSONGENERIC
             triple:  loongarch64-unknown-linux-gnu
-            opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
           - target: LOONGSON3R5
             triple: loongarch64-unknown-linux-gnu
-            opts: NO_SHARED=1 TARGET=LOONGSON3R5
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
           - target: LOONGSON2K1000
             triple: loongarch64-unknown-linux-gnu
-            opts: NO_SHARED=1 TARGET=LOONGSON2K1000
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
           - target: DYNAMIC_ARCH
             triple: loongarch64-unknown-linux-gnu
             opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
@@ -40,8 +40,8 @@ jobs:
 
       - name: Download and install loongarch64-toolchain
         run: |
-          wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
-          tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
+          wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+          tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt
 
       - name: Set env
         run: |

From 1106460bb3f5b6d17a77acafaf1fa7ba6051acbd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 25 Dec 2023 12:29:56 +0100
Subject: [PATCH 510/718] remove redundant targets from the default ARM64
 DYNAMIC_ARCH list

---
 cmake/arch.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index ebdc5a833..eb974456b 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -44,7 +44,7 @@ endif ()
 
 if (DYNAMIC_ARCH)
   if (ARM64)
-	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
+	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
     if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
           set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
     endif ()

From e7a895e7140f538d14ea5b9354d33840b76e44d1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 25 Dec 2023 12:36:05 +0100
Subject: [PATCH 511/718] Add Apple M as NeoverseN1

---
 driver/others/dynamic_arm64.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index b5fb8161d..803e0b5eb 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -247,6 +247,10 @@ static gotoblas_t *get_coretype(void) {
   int implementer, variant, part, arch, revision, midr_el1;
   char coremsg[128];
 
+#if defined (OS_DARWIN)
+  return &gotoblas_NEOVERSEN1;
+#endif
+	
 #if (!defined OS_LINUX && !defined OS_ANDROID)
   return NULL;
 #else
@@ -352,6 +356,9 @@ static gotoblas_t *get_coretype(void) {
           return &gotoblas_FALKOR;
       }
       break;
+    case 0x61: // Apple
+	return &gotoblas_NEOVERSEN1;
+      break;
     default:
       snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
       openblas_warning(1, coremsg);

From c6996a80e950535306e281f7f2da54f387805d4e Mon Sep 17 00:00:00 2001
From: Shiyou Yin <yinshiyou-hf@loongson.cn>
Date: Fri, 8 Dec 2023 16:06:17 +0800
Subject: [PATCH 512/718] loongarch64: Refine amax,amin,max,min optimization.

---
 common_loongarch64.h                     |  24 +++
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |  16 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5    |  16 +-
 kernel/loongarch64/amax_lasx.S           | 232 +++++++++++++++++++++++
 kernel/loongarch64/amax_lsx.S            | 231 ++++++++++++++++++++++
 kernel/loongarch64/amin_lasx.S           | 232 +++++++++++++++++++++++
 kernel/loongarch64/amin_lsx.S            | 232 +++++++++++++++++++++++
 kernel/loongarch64/damax_lasx.S          | 183 ------------------
 kernel/loongarch64/damax_lsx.S           | 145 --------------
 kernel/loongarch64/damin_lasx.S          | 178 -----------------
 kernel/loongarch64/damin_lsx.S           | 145 --------------
 kernel/loongarch64/dmax_lasx.S           | 175 -----------------
 kernel/loongarch64/dmax_lsx.S            | 141 --------------
 kernel/loongarch64/dmin_lasx.S           | 175 -----------------
 kernel/loongarch64/dmin_lsx.S            | 143 --------------
 kernel/loongarch64/max_lasx.S            | 229 ++++++++++++++++++++++
 kernel/loongarch64/max_lsx.S             | 228 ++++++++++++++++++++++
 kernel/loongarch64/min_lasx.S            | 229 ++++++++++++++++++++++
 kernel/loongarch64/min_lsx.S             | 228 ++++++++++++++++++++++
 kernel/loongarch64/samax_lasx.S          | 208 --------------------
 kernel/loongarch64/samax_lsx.S           | 177 -----------------
 kernel/loongarch64/samin_lasx.S          | 208 --------------------
 kernel/loongarch64/samin_lsx.S           | 177 -----------------
 kernel/loongarch64/smax_lasx.S           | 205 --------------------
 kernel/loongarch64/smax_lsx.S            | 171 -----------------
 kernel/loongarch64/smin_lasx.S           | 205 --------------------
 kernel/loongarch64/smin_lsx.S            | 174 -----------------
 27 files changed, 1881 insertions(+), 2826 deletions(-)
 create mode 100644 kernel/loongarch64/amax_lasx.S
 create mode 100644 kernel/loongarch64/amax_lsx.S
 create mode 100644 kernel/loongarch64/amin_lasx.S
 create mode 100644 kernel/loongarch64/amin_lsx.S
 delete mode 100644 kernel/loongarch64/damax_lasx.S
 delete mode 100644 kernel/loongarch64/damax_lsx.S
 delete mode 100644 kernel/loongarch64/damin_lasx.S
 delete mode 100644 kernel/loongarch64/damin_lsx.S
 delete mode 100644 kernel/loongarch64/dmax_lasx.S
 delete mode 100644 kernel/loongarch64/dmax_lsx.S
 delete mode 100644 kernel/loongarch64/dmin_lasx.S
 delete mode 100644 kernel/loongarch64/dmin_lsx.S
 create mode 100644 kernel/loongarch64/max_lasx.S
 create mode 100644 kernel/loongarch64/max_lsx.S
 create mode 100644 kernel/loongarch64/min_lasx.S
 create mode 100644 kernel/loongarch64/min_lsx.S
 delete mode 100644 kernel/loongarch64/samax_lasx.S
 delete mode 100644 kernel/loongarch64/samax_lsx.S
 delete mode 100644 kernel/loongarch64/samin_lasx.S
 delete mode 100644 kernel/loongarch64/samin_lsx.S
 delete mode 100644 kernel/loongarch64/smax_lasx.S
 delete mode 100644 kernel/loongarch64/smax_lsx.S
 delete mode 100644 kernel/loongarch64/smin_lasx.S
 delete mode 100644 kernel/loongarch64/smin_lsx.S

diff --git a/common_loongarch64.h b/common_loongarch64.h
index 4963b2f07..72e900f77 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -120,6 +120,10 @@ static inline int WhereAmI(void){
 #define CMOVT   fsel
 #define MTC     movgr2fr.d
 #define FABS    fabs.d
+#define FMIN    fmin.d
+#define FMINA   fmina.d
+#define FMAX    fmax.d
+#define FMAXA   fmaxa.d
 #define CMPEQ   fcmp.ceq.d
 #define CMPLE   fcmp.cle.d
 #define CMPLT   fcmp.clt.d
@@ -128,10 +132,18 @@ static inline int WhereAmI(void){
 #define XVFSUB  xvfsub.d
 #define XVFADD  xvfadd.d
 #define XVFMADD xvfmadd.d
+#define XVFMIN  xvfmin.d
+#define XVFMINA xvfmina.d
+#define XVFMAX  xvfmax.d
+#define XVFMAXA xvfmaxa.d
 
 #define VFSUB  vfsub.d
 #define VFADD  vfadd.d
 #define VFMADD vfmadd.d
+#define VFMIN  vfmin.d
+#define VFMINA vfmina.d
+#define VFMAX  vfmax.d
+#define VFMAXA vfmaxa.d
 
 #else
 
@@ -148,6 +160,10 @@ static inline int WhereAmI(void){
 #define CMOVT   fsel
 #define MTC     movgr2fr.w
 #define FABS    fabs.s
+#define FMIN    fmin.s
+#define FMINA   fmina.s
+#define FMAX    fmax.s
+#define FMAXA   fmaxa.s
 #define CMPEQ   fcmp.ceq.s
 #define CMPLE   fcmp.cle.s
 #define CMPLT   fcmp.clt.s
@@ -156,10 +172,18 @@ static inline int WhereAmI(void){
 #define XVFSUB  xvfsub.s
 #define XVFADD  xvfadd.s
 #define XVFMADD xvfmadd.s
+#define XVFMIN  xvfmin.s
+#define XVFMINA xvfmina.s
+#define XVFMAX  xvfmax.s
+#define XVFMAXA xvfmaxa.s
 
 #define VFSUB  vfsub.s
 #define VFADD  vfadd.s
 #define VFMADD vfmadd.s
+#define VFMIN  vfmin.s
+#define VFMINA vfmina.s
+#define VFMAX  vfmax.s
+#define VFMAXA vfmaxa.s
 
 #endif /* defined(DOUBLE) */
 
diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 1e4fa7a9d..802dd1c9b 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -7,17 +7,17 @@ DDOTKERNEL  = dot_lsx.S
 SSCALKERNEL  = sscal_lsx.S
 DSCALKERNEL  = dscal_lsx.S
 
-SAMAXKERNEL =  samax_lsx.S
-DAMAXKERNEL =  damax_lsx.S
+SAMAXKERNEL =  amax_lsx.S
+DAMAXKERNEL =  amax_lsx.S
 
-SAMINKERNEL =  samin_lsx.S
-DAMINKERNEL =  damin_lsx.S
+SAMINKERNEL =  amin_lsx.S
+DAMINKERNEL =  amin_lsx.S
 
-SMAXKERNEL  =  smax_lsx.S
-DMAXKERNEL  =  dmax_lsx.S
+SMAXKERNEL  =  max_lsx.S
+DMAXKERNEL  =  max_lsx.S
 
-SMINKERNEL  =  smin_lsx.S
-DMINKERNEL  =  dmin_lsx.S
+SMINKERNEL  =  min_lsx.S
+DMINKERNEL  =  min_lsx.S
 
 ISMAXKERNEL =  ismax_lsx.S
 IDMAXKERNEL =  idmax_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index f00abcb32..3253489d9 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -7,17 +7,17 @@ DDOTKERNEL  = dot_lasx.S
 SSCALKERNEL  = sscal_lasx.S
 DSCALKERNEL  = dscal_lasx.S
 
-SAMAXKERNEL =  samax_lasx.S
-DAMAXKERNEL =  damax_lasx.S
+SAMAXKERNEL =  amax_lasx.S
+DAMAXKERNEL =  amax_lasx.S
 
-SAMINKERNEL =  samin_lasx.S
-DAMINKERNEL =  damin_lasx.S
+SAMINKERNEL =  amin_lasx.S
+DAMINKERNEL =  amin_lasx.S
 
-SMAXKERNEL  =  smax_lasx.S
-DMAXKERNEL =   dmax_lasx.S
+SMAXKERNEL  =  max_lsx.S
+DMAXKERNEL =   max_lsx.S
 
-SMINKERNEL =   smin_lasx.S
-DMINKERNEL =   dmin_lasx.S
+SMINKERNEL =   min_lsx.S
+DMINKERNEL =   min_lsx.S
 
 ISMAXKERNEL =  ismax_lasx.S
 IDMAXKERNEL =  idmax_lasx.S
diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S
new file mode 100644
index 000000000..e964d4ddb
--- /dev/null
+++ b/kernel/loongarch64/amax_lasx.S
@@ -0,0 +1,232 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+
+#define I     $r12
+#define TEMP  $r13
+
+#define VM0 $xr0
+#define VM1 $xr1
+#define VM2 $xr2
+#define VX0 $xr3
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0
+#else
+    xvldrepl.w VM0, X, 0
+#endif
+    XVFSUB VM0, VM0, VM0
+    bne INCX, TEMP, .L20
+
+    srai.d I, N, 4
+    bge $r0, I, .L11
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM0, VM1
+    XVFMAXA   VM0, VM0, VM2
+#else
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAXA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1
+    XVFMAXA   VM0, VM0, VM1
+    .align 3
+
+.L11:
+    andi  I, N, 0x0f
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* 0 < N < 16 */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    FABS $f0, $f0
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    xvfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAXA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1
+    XVFMAXA   VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* 0 < N < 8 */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    FABS $f0, $f0
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S
new file mode 100644
index 000000000..fb3b77a0e
--- /dev/null
+++ b/kernel/loongarch64/amax_lsx.S
@@ -0,0 +1,231 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+
+#define I     $r12
+#define TEMP  $r13
+
+#define VM0 $vr0
+#define VM1 $vr1
+#define VM2 $vr2
+#define VX0 $vr3
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0
+#else
+    vldrepl.w VM0, X, 0
+#endif
+    VFSUB VM0, VM0, VM0
+    bne INCX, TEMP, .L20
+
+    srai.d I, N, 3
+    bge $r0, I, .L11
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM0, VM1
+    VFMAXA   VM0, VM0, VM2
+#else
+    vld VX0, X, 0
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMAXA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11:
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    FABS $f0, $f0
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM1, VM1, VM2
+    vfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmaxa.s VM1, VX0, VX1
+    vfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMAXA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    FABS $f0, $f0
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S
new file mode 100644
index 000000000..0a4359002
--- /dev/null
+++ b/kernel/loongarch64/amin_lasx.S
@@ -0,0 +1,232 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+
+#define I     $r12
+#define TEMP  $r13
+
+#define VM0 $xr0
+#define VM1 $xr1
+#define VM2 $xr2
+#define VX0 $xr3
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0
+#else
+    xvldrepl.w VM0, X, 0
+#endif
+    XVFSUB VM0, VM0, VM0
+    bne INCX, TEMP, .L20
+
+    srai.d I, N, 4
+    bge $r0, I, .L11
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM0, VM1
+    XVFMINA   VM0, VM0, VM2
+#else
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMINA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1
+    XVFMINA   VM0, VM0, VM1
+    .align 3
+
+.L11:
+    andi  I, N, 0x0f
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* 0 < N < 16 */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMINA $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    FABS $f0, $f0
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    xvfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMINA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1
+    XVFMINA   VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* 0 < N < 8 */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMINA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    FABS $f0, $f0
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S
new file mode 100644
index 000000000..644caf43c
--- /dev/null
+++ b/kernel/loongarch64/amin_lsx.S
@@ -0,0 +1,232 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+
+#define I     $r12
+#define TEMP  $r13
+
+#define VM0 $vr0
+#define VM1 $vr1
+#define VM2 $vr2
+#define VX0 $vr3
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0
+#else
+    vldrepl.w VM0, X, 0
+#endif
+    VFSUB VM0, VM0, VM0
+    bne INCX, TEMP, .L20
+
+    srai.d I, N, 3
+    bge $r0, I, .L11
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM0, VM1
+    VFMINA   VM0, VM0, VM2
+#else
+    vld VX0, X, 0
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMINA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11:
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMINA $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    FABS $f0, $f0
+    SUB $f0, $f0, $f0
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM1, VM1, VM2
+    vfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmaxa.s VM1, VX0, VX1
+    vfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMINA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMINA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    FABS $f0, $f0
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/damax_lasx.S b/kernel/loongarch64/damax_lasx.S
deleted file mode 100644
index c44ce4995..000000000
--- a/kernel/loongarch64/damax_lasx.S
+++ /dev/null
@@ -1,183 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $xr8
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define x5 $xr13
-#define x6 $xr14
-#define x7 $xr15
-#define x8 $xr16
-#define VX0 $xr20
-#define VX1 $xr21
-#define VM0 $xr22
-#define VM1 $xr23
-#define VM2 $xr18
-#define VM3 $xr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    addi.d I, I, -1
-    xvfmaxa.d VM1, VX1, VX0
-    addi.d X, X, 8 * SIZE
-    xvfmaxa.d VM0, VM0, VM1
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmaxa.d VM1, x1, x2
-    xvfmaxa.d VM2, x3, x4
-    xvfmaxa.d VM0, VM1, VM2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    xvld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    xvldx VX1, X, I
-    xvfmaxa.d m0, VX0, VX1 //patial repeat read
-    xvpickve.d x1, m0, 0
-    xvpickve.d x2, m0, 1
-    xvpickve.d x3, m0, 2
-    xvpickve.d x4, m0, 3
-    xvfmaxa.d VM1, x1, x2
-    xvfmaxa.d m0, x3, x4
-    xvfmaxa.d m0, m0, VM1
-    xvfmaxa.d VM0, m0, VM0
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmaxa.d VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the maxa value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
-    xvinsgr2vr.d VM0, t3, 2
-    xvinsgr2vr.d VM0, t4, 3
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    addi.d I, I, -1
-    xvfmaxa.d VM1, VX1, VX0
-    xvfmaxa.d VM0, VM1, VM0
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmaxa.d VM1, x1, x2
-    xvfmaxa.d VM2, x3, x4
-    xvfmaxa.d VM0, VM1, VM2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmaxa.d VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/damax_lsx.S b/kernel/loongarch64/damax_lsx.S
deleted file mode 100644
index 3e5a10525..000000000
--- a/kernel/loongarch64/damax_lsx.S
+++ /dev/null
@@ -1,145 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define x1 $vr9
-#define x2 $vr10
-#define VX0 $vr20
-#define VX1 $vr21
-#define VM0 $vr22
-#define VM1 $vr23
-#define VM2 $vr18
-#define VM3 $vr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    addi.d I, I, -1
-    vfmaxa.d VM1, VX1, VX0
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vfmaxa.d VM2, VX1, VX0
-    vfmaxa.d VM3, VM1, VM2
-    addi.d X, X, 8 * SIZE
-    vfmaxa.d VM0, VM0, VM3
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    vfmaxa.d VM0, x1, x2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L14
-    .align 3
-
-.L13:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmaxa.d VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L13
-    .align 3
-
-.L14:
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the maxa value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    vfmaxa.d VM1, VX0, VX1
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    vfmaxa.d VM2, VX0, VX1
-    vfmaxa.d VM3, VM1, VM2
-    vfmaxa.d VM0, VM0, VM3
-    addi.d I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    vfmaxa.d VM0, x1, x2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmaxa.d VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/damin_lasx.S b/kernel/loongarch64/damin_lasx.S
deleted file mode 100644
index 9d96dd997..000000000
--- a/kernel/loongarch64/damin_lasx.S
+++ /dev/null
@@ -1,178 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $xr8
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define VX0 $xr20
-#define VX1 $xr21
-#define VM0 $xr22
-#define VM1 $xr23
-#define VM2 $xr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    addi.d I, I, -1
-    xvld VX1, X, 4 * SIZE
-    xvfmina.d VM1, VX1, VX0
-    addi.d X, X, 8 * SIZE
-    xvfmina.d VM0, VM0, VM1
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmina.d VM1, x1, x2
-    xvfmina.d VM2, x3, x4
-    xvfmina.d VM0, VM1, VM2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    xvld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    xvldx VX1, X, I
-    xvfmina.d m0, VX0, VX1 //patial repeat read
-    xvpickve.d x1, m0, 0
-    xvpickve.d x2, m0, 1
-    xvpickve.d x3, m0, 2
-    xvpickve.d x4, m0, 3
-    xvfmina.d VM1, x1, x2
-    xvfmina.d m0, x3, x4
-    xvfmina.d m0, m0, VM1
-    xvfmina.d VM0, m0, VM0
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmina.d VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the mina value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
-    xvinsgr2vr.d VM0, t3, 2
-    xvinsgr2vr.d VM0, t4, 3
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    addi.d I, I, -1
-    xvfmina.d VM1, VX1, VX0
-    xvfmina.d VM0, VM1, VM0
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmina.d VM1, x1, x2
-    xvfmina.d VM2, x3, x4
-    xvfmina.d VM0, VM1, VM2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmina.d VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/damin_lsx.S b/kernel/loongarch64/damin_lsx.S
deleted file mode 100644
index c62fc2731..000000000
--- a/kernel/loongarch64/damin_lsx.S
+++ /dev/null
@@ -1,145 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define x1 $vr9
-#define x2 $vr10
-#define VX0 $vr20
-#define VX1 $vr21
-#define VM0 $vr22
-#define VM1 $vr23
-#define VM2 $vr18
-#define VM3 $vr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    addi.d I, I, -1
-    vfmina.d VM1, VX1, VX0
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vfmina.d VM2, VX1, VX0
-    vfmina.d VM3, VM1, VM2
-    addi.d X, X, 8 * SIZE
-    vfmina.d VM0, VM0, VM3
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    vfmina.d VM0, x1, x2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L14
-    .align 3
-
-.L13:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmina.d VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L13
-    .align 3
-
-.L14:
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the mina value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    vfmina.d VM1, VX0, VX1
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    addi.d I, I, -1
-    vfmina.d VM2, VX0, VX1
-    vfmina.d VM3, VM1, VM2
-    vfmina.d VM0, VM0, VM3
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    vfmina.d VM0, x1, x2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    vld x1, X, 0
-    vfmina.d VM0, VM0, x1
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fabs.d $f22, $f22
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/dmax_lasx.S b/kernel/loongarch64/dmax_lasx.S
deleted file mode 100644
index 46366d2ec..000000000
--- a/kernel/loongarch64/dmax_lasx.S
+++ /dev/null
@@ -1,175 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $xr8
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define VX0 $xr20
-#define VX1 $xr21
-#define VM0 $xr22
-#define VM1 $xr23
-#define VM2 $xr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    addi.d I, I, -1
-    xvfmax.d VM1, VX1, VX0
-    addi.d X, X, 8 * SIZE
-    xvfmax.d VM0, VM0, VM1
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmax.d VM1, x1, x2
-    xvfmax.d VM2, x3, x4
-    xvfmax.d VM0, VM1, VM2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    xvld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    xvldx VX1, X, I
-    xvfmax.d m0, VX0, VX1 //patial repeat read
-    xvpickve.d x1, m0, 0
-    xvpickve.d x2, m0, 1
-    xvpickve.d x3, m0, 2
-    xvpickve.d x4, m0, 3
-    xvfmax.d VM1, x1, x2
-    xvfmax.d m0, x3, x4
-    xvfmax.d m0, m0, VM1
-    xvfmax.d VM0, m0, VM0
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmax.d VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the max value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
-    xvinsgr2vr.d VM0, t3, 2
-    xvinsgr2vr.d VM0, t4, 3
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    addi.d I, I, -1
-    xvfmax.d VM1, VX1, VX0
-    xvfmax.d VM0, VM1, VM0
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmax.d VM1, x1, x2
-    xvfmax.d VM2, x3, x4
-    xvfmax.d VM0, VM1, VM2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmax.d VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/dmax_lsx.S b/kernel/loongarch64/dmax_lsx.S
deleted file mode 100644
index 99e9b2f30..000000000
--- a/kernel/loongarch64/dmax_lsx.S
+++ /dev/null
@@ -1,141 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define x1 $vr9
-#define x2 $vr10
-#define VX0 $vr20
-#define VX1 $vr21
-#define VM0 $vr22
-#define VM1 $vr23
-#define VM2 $vr19
-#define VM3 $vr18
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    addi.d I, I, -1
-    vfmax.d VM1, VX1, VX0
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vfmax.d VM2, VX1, VX0
-    vfmax.d VM3, VM1, VM2
-    addi.d X, X, 8 * SIZE
-    vfmax.d VM0, VM0, VM3
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    vreplvei.d x2, VM0, 1
-    vfmax.d VM0, VM0, x2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L14
-    .align 3
-
-.L13:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmax.d VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L13
-    .align 3
-
-.L14:
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the max value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    vfmax.d VM1, VX0, VX1
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    addi.d I, I, -1
-    vfmax.d VM2, VX0, VX1
-    vfmax.d VM3, VM1, VM2
-    vfmax.d VM0, VM0, VM3
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.d x2, VM0, 1
-    vfmax.d VM0, VM0, x2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmax.d VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/dmin_lasx.S b/kernel/loongarch64/dmin_lasx.S
deleted file mode 100644
index e76056565..000000000
--- a/kernel/loongarch64/dmin_lasx.S
+++ /dev/null
@@ -1,175 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $xr8
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define VX0 $xr20
-#define VX1 $xr21
-#define VM0 $xr22
-#define VM1 $xr23
-#define VM2 $xr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    addi.d I, I, -1
-    xvfmin.d VM1, VX1, VX0
-    addi.d X, X, 8 * SIZE
-    xvfmin.d VM0, VM0, VM1
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmin.d VM1, x1, x2
-    xvfmin.d VM2, x3, x4
-    xvfmin.d VM0, VM1, VM2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    xvld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    xvldx VX1, X, I
-    xvfmin.d m0, VX0, VX1 //patial repeat read
-    xvpickve.d x1, m0, 0
-    xvpickve.d x2, m0, 1
-    xvpickve.d x3, m0, 2
-    xvpickve.d x4, m0, 3
-    xvfmin.d VM1, x1, x2
-    xvfmin.d m0, x3, x4
-    xvfmin.d m0, m0, VM1
-    xvfmin.d VM0, m0, VM0
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    xvld x1, X, 0
-    xvfmin.d VM0, VM0, x1
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the min value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
-    xvinsgr2vr.d VM0, t3, 2
-    xvinsgr2vr.d VM0, t4, 3
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    addi.d I, I, -1
-    xvfmin.d VM1, VX1, VX0
-    xvfmin.d VM0, VM1, VM0
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmin.d VM1, x1, x2
-    xvfmin.d VM2, x3, x4
-    xvfmin.d VM0, VM1, VM2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    xvld x1, X, 0
-    xvfmin.d VM0, VM0, x1
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/dmin_lsx.S b/kernel/loongarch64/dmin_lsx.S
deleted file mode 100644
index f076067a3..000000000
--- a/kernel/loongarch64/dmin_lsx.S
+++ /dev/null
@@ -1,143 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define x1 $vr9
-#define x2 $vr10
-#define VX0 $vr20
-#define VX1 $vr21
-#define VM0 $vr22
-#define VM1 $vr23
-#define VM2 $vr18
-#define VM3 $vr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    addi.d I, I, -1
-    vfmin.d VM1, VX1, VX0
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vfmin.d VM2, VX1, VX0
-    vfmin.d VM3, VM1, VM2
-    addi.d X, X, 8 * SIZE
-    vfmin.d VM0, VM0, VM3
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    vfmin.d VM0, x1, x2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L14
-    .align 3
-
-.L13:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmin.d VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L13
-    .align 3
-
-.L14:
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the min value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    vfmin.d VM1, VX0, VX1
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    addi.d I, I, -1
-    vfmin.d VM2, VX0, VX1
-    vfmin.d VM3, VM1, VM2
-    vfmin.d VM0, VM0, VM3
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    vfmin.d VM0, x1, x2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmin.d VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.d $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/max_lasx.S b/kernel/loongarch64/max_lasx.S
new file mode 100644
index 000000000..3215ae394
--- /dev/null
+++ b/kernel/loongarch64/max_lasx.S
@@ -0,0 +1,229 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+
+#define I     $r12
+#define TEMP  $r13
+
+#define VM0 $xr0
+#define VM1 $xr1
+#define VM2 $xr2
+#define VX0 $xr3
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0
+#else
+    xvldrepl.w VM0, X, 0
+#endif
+    bne INCX, TEMP, .L20
+
+    srai.d I, N, 4
+    bge $r0, I, .L11
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128
+    XVFMAX    VM1, VX0, VX1
+    XVFMAX    VM2, VX2, VX3
+    XVFMAX    VM0, VM0, VM1
+    XVFMAX    VM0, VM0, VM2
+#else
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64
+    XVFMAX    VM1, VX0, VX1
+    XVFMAX    VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAX    VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAX    VM1, VX0, VX1
+    XVFMAX    VM2, VX2, VX3
+    XVFMAX    VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1
+    XVFMAX    VM0, VM0, VM1
+    .align 3
+
+.L11:
+    andi  I, N, 0x0f
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* 0 < N < 16 */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAX  $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    xvfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAX    VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAX    VM1, VX0, VX1
+    XVFMAX    VM2, VX2, VX3
+    XVFMAX    VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1
+    XVFMAX    VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* 0 < N < 8 */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAX  $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/max_lsx.S b/kernel/loongarch64/max_lsx.S
new file mode 100644
index 000000000..a2100875d
--- /dev/null
+++ b/kernel/loongarch64/max_lsx.S
@@ -0,0 +1,228 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+
+#define I     $r12
+#define TEMP  $r13
+
+#define VM0 $vr0
+#define VM1 $vr1
+#define VM2 $vr2
+#define VX0 $vr3
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0
+#else
+    vldrepl.w VM0, X, 0
+#endif
+    bne INCX, TEMP, .L20
+
+    srai.d I, N, 3
+    bge $r0, I, .L11
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64
+    VFMAX    VM1, VX0, VX1
+    VFMAX    VM2, VX2, VX3
+    VFMAX    VM0, VM0, VM1
+    VFMAX    VM0, VM0, VM2
+#else
+    vld VX0, X, 0
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32
+    VFMAX    VM1, VX0, VX1
+    VFMAX    VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMAX    VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAX    VM1, VX0, VX1
+    VFMAX    VM2, VX2, VX3
+    VFMAX    VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11:
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAX  $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM1, VM1, VM2
+    vfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmaxa.s VM1, VX0, VX1
+    vfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMAX    VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAX    VM1, VX0, VX1
+    VFMAX    VM2, VX2, VX3
+    VFMAX    VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAX  $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/min_lasx.S b/kernel/loongarch64/min_lasx.S
new file mode 100644
index 000000000..890c8882d
--- /dev/null
+++ b/kernel/loongarch64/min_lasx.S
@@ -0,0 +1,229 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+
+#define I     $r12
+#define TEMP  $r13
+
+#define VM0 $xr0
+#define VM1 $xr1
+#define VM2 $xr2
+#define VX0 $xr3
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0
+#else
+    xvldrepl.w VM0, X, 0
+#endif
+    bne INCX, TEMP, .L20
+
+    srai.d I, N, 4
+    bge $r0, I, .L11
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128
+    XVFMIN    VM1, VX0, VX1
+    XVFMIN    VM2, VX2, VX3
+    XVFMIN    VM0, VM0, VM1
+    XVFMIN    VM0, VM0, VM2
+#else
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64
+    XVFMIN    VM1, VX0, VX1
+    XVFMIN    VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMIN    VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMIN    VM1, VX0, VX1
+    XVFMIN    VM2, VX2, VX3
+    XVFMIN    VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1
+    XVFMIN    VM0, VM0, VM1
+    .align 3
+
+.L11:
+    andi  I, N, 0x0f
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* 0 < N < 16 */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMIN  $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    xvfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMIN    VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMIN    VM1, VX0, VX1
+    XVFMIN    VM2, VX2, VX3
+    XVFMIN    VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1
+    XVFMIN    VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* 0 < N < 8 */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMIN  $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/min_lsx.S b/kernel/loongarch64/min_lsx.S
new file mode 100644
index 000000000..1bc32a0ed
--- /dev/null
+++ b/kernel/loongarch64/min_lsx.S
@@ -0,0 +1,228 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+
+#define I     $r12
+#define TEMP  $r13
+
+#define VM0 $vr0
+#define VM1 $vr1
+#define VM2 $vr2
+#define VX0 $vr3
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0
+#else
+    vldrepl.w VM0, X, 0
+#endif
+    bne INCX, TEMP, .L20
+
+    srai.d I, N, 3
+    bge $r0, I, .L11
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64
+    VFMIN    VM1, VX0, VX1
+    VFMIN    VM2, VX2, VX3
+    VFMIN    VM0, VM0, VM1
+    VFMIN    VM0, VM0, VM2
+#else
+    vld VX0, X, 0
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32
+    VFMIN    VM1, VX0, VX1
+    VFMIN    VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMIN    VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMIN    VM1, VX0, VX1
+    VFMIN    VM2, VX2, VX3
+    VFMIN    VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11:
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMIN  $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM1, VM1, VM2
+    vfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmaxa.s VM1, VX0, VX1
+    vfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMIN    VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMIN    VM1, VX0, VX1
+    VFMIN    VM2, VX2, VX3
+    VFMIN    VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMIN  $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/samax_lasx.S b/kernel/loongarch64/samax_lasx.S
deleted file mode 100644
index a04e4a90b..000000000
--- a/kernel/loongarch64/samax_lasx.S
+++ /dev/null
@@ -1,208 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $xr8
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define x5 $xr13
-#define x6 $xr14
-#define x7 $xr15
-#define x8 $xr16
-#define VX0 $xr20
-#define VX1 $xr21
-#define VM0 $xr22
-#define VM1 $xr23
-#define VM2 $xr19
-#define VM3 $xr18
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    addi.d I, I, -1
-    xvfmaxa.s VM0, VM0, VX0
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvpickve.w x5, VM0, 4
-    xvpickve.w x6, VM0, 5
-    xvpickve.w x7, VM0, 6
-    xvpickve.w x8, VM0, 7
-    xvfmaxa.s VM3, x1, x2
-    xvfmaxa.s VM2, x3, x4
-    xvfmaxa.s VM1, x5, x6
-    xvfmaxa.s VM0, x7, x8
-    xvfmaxa.s VM2, VM2, VM3
-    xvfmaxa.s VM0, VM0, VM1
-    xvfmaxa.s VM0, VM0, VM2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    xvld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    xvldx VX1, X, I
-    xvfmaxa.s m0, VX0, VX1 //patial repeat read
-    xvpickve.w x1, m0, 0
-    xvpickve.w x2, m0, 1
-    xvpickve.w x3, m0, 2
-    xvpickve.w x4, m0, 3
-    xvfmaxa.s m0, x1, x2
-    xvfmaxa.s VM1, x3, x4
-    xvfmaxa.s m0, m0, VM1
-    xvfmaxa.s VM0, m0, VM0
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmaxa.s VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the maxa value
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t2, 1
-    xvinsgr2vr.w VM0, t3, 2
-    xvinsgr2vr.w VM0, t4, 3
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t1, 4
-    xvinsgr2vr.w VM0, t2, 5
-    xvinsgr2vr.w VM0, t3, 6
-    xvinsgr2vr.w VM0, t4, 7
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    addi.d I, I, -1
-    xvfmaxa.s VM0, VM0, VX0
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvpickve.w x5, VM0, 4
-    xvpickve.w x6, VM0, 5
-    xvpickve.w x7, VM0, 6
-    xvpickve.w x8, VM0, 7
-    xvfmaxa.s VM3, x1, x2
-    xvfmaxa.s VM2, x3, x4
-    xvfmaxa.s VM1, x5, x6
-    xvfmaxa.s VM0, x7, x8
-    xvfmaxa.s VM2, VM2, VM3
-    xvfmaxa.s VM0, VM0, VM1
-    xvfmaxa.s VM0, VM0, VM2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmaxa.s VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/samax_lsx.S b/kernel/loongarch64/samax_lsx.S
deleted file mode 100644
index 5c20dd96a..000000000
--- a/kernel/loongarch64/samax_lsx.S
+++ /dev/null
@@ -1,177 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $vr8
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr20
-#define VX1 $vr21
-#define VM0 $vr22
-#define VM1 $vr23
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
-    vfmaxa.s VM1, VX0, VX1
-    addi.d X, X, 8 * SIZE
-    vfmaxa.s VM0, VM0, VM1
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmaxa.s VM1, x1, x2
-    vfmaxa.s VM0, x3, x4
-    vfmaxa.s VM0, VM0, VM1
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    vld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    vldx VX1, X, I
-    vfmaxa.s m0, VX0, VX1 //patial repeat read
-    vreplvei.w x1, m0, 0
-    vreplvei.w x2, m0, 1
-    vreplvei.w x3, m0, 2
-    vreplvei.w x4, m0, 3
-    vfmaxa.s m0, x1, x2
-    vfmaxa.s VM1, x3, x4
-    vfmaxa.s m0, m0, VM1
-    vfmaxa.s VM0, m0, VM0
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmaxa.s VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t2, 1
-    vinsgr2vr.w VM0, t3, 2
-    vinsgr2vr.w VM0, t4, 3
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    addi.d I, I, -1
-    vfmaxa.s VM1, VX0, VX1
-    vfmaxa.s VM0, VM0, VM1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmaxa.s VM1, x1, x2
-    vfmaxa.s VM0, x3, x4
-    vfmaxa.s VM0, VM0, VM1
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmaxa.s VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/samin_lasx.S b/kernel/loongarch64/samin_lasx.S
deleted file mode 100644
index 8e96968fa..000000000
--- a/kernel/loongarch64/samin_lasx.S
+++ /dev/null
@@ -1,208 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $xr8
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define x5 $xr13
-#define x6 $xr14
-#define x7 $xr15
-#define x8 $xr16
-#define VX0 $xr20
-#define VX1 $xr21
-#define VM0 $xr22
-#define VM1 $xr23
-#define VM2 $xr18
-#define VM3 $xr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    addi.d I, I, -1
-    xvfmina.s VM0, VM0, VX0
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvpickve.w x5, VM0, 4
-    xvpickve.w x6, VM0, 5
-    xvpickve.w x7, VM0, 6
-    xvpickve.w x8, VM0, 7
-    xvfmina.s VM3, x1, x2
-    xvfmina.s VM2, x3, x4
-    xvfmina.s VM1, x5, x6
-    xvfmina.s VM0, x7, x8
-    xvfmina.s VM2, VM2, VM3
-    xvfmina.s VM0, VM0, VM1
-    xvfmina.s VM0, VM0, VM2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    xvld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    xvldx VX1, X, I
-    xvfmina.s m0, VX0, VX1 //patial repeat read
-    xvpickve.w x1, m0, 0
-    xvpickve.w x2, m0, 1
-    xvpickve.w x3, m0, 2
-    xvpickve.w x4, m0, 3
-    xvfmina.s m0, x1, x2
-    xvfmina.s VM1, x3, x4
-    xvfmina.s m0, m0, VM1
-    xvfmina.s VM0, m0, VM0
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmina.s VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the mina value
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t2, 1
-    xvinsgr2vr.w VM0, t3, 2
-    xvinsgr2vr.w VM0, t4, 3
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t1, 4
-    xvinsgr2vr.w VM0, t2, 5
-    xvinsgr2vr.w VM0, t3, 6
-    xvinsgr2vr.w VM0, t4, 7
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    addi.d I, I, -1
-    xvfmina.s VM0, VM0, VX0
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvpickve.w x5, VM0, 4
-    xvpickve.w x6, VM0, 5
-    xvpickve.w x7, VM0, 6
-    xvpickve.w x8, VM0, 7
-    xvfmina.s VM3, x1, x2
-    xvfmina.s VM2, x3, x4
-    xvfmina.s VM1, x5, x6
-    xvfmina.s VM0, x7, x8
-    xvfmina.s VM2, VM2, VM3
-    xvfmina.s VM0, VM0, VM1
-    xvfmina.s VM0, VM0, VM2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmina.s VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/samin_lsx.S b/kernel/loongarch64/samin_lsx.S
deleted file mode 100644
index e222fbcd1..000000000
--- a/kernel/loongarch64/samin_lsx.S
+++ /dev/null
@@ -1,177 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $vr8
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr20
-#define VX1 $vr21
-#define VM0 $vr22
-#define VM1 $vr23
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
-    vfmina.s VM1, VX0, VX1
-    addi.d X, X, 8 * SIZE
-    vfmina.s VM0, VM0, VM1
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmina.s VM1, x1, x2
-    vfmina.s VM0, x3, x4
-    vfmina.s VM0, VM0, VM1
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    vld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    vldx VX1, X, I
-    vfmina.s m0, VX0, VX1 //patial repeat read
-    vreplvei.w x1, m0, 0
-    vreplvei.w x2, m0, 1
-    vreplvei.w x3, m0, 2
-    vreplvei.w x4, m0, 3
-    vfmina.s m0, x1, x2
-    vfmina.s VM1, x3, x4
-    vfmina.s m0, m0, VM1
-    vfmina.s VM0, m0, VM0
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmina.s VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t2, 1
-    vinsgr2vr.w VM0, t3, 2
-    vinsgr2vr.w VM0, t4, 3
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    addi.d I, I, -1
-    vfmina.s VM1, VX0, VX1
-    vfmina.s VM0, VM0, VM1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmina.s VM1, x1, x2
-    vfmina.s VM0, x3, x4
-    vfmina.s VM0, VM0, VM1
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmina.s VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fabs.s $f22, $f22
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/smax_lasx.S b/kernel/loongarch64/smax_lasx.S
deleted file mode 100644
index 8cceb9928..000000000
--- a/kernel/loongarch64/smax_lasx.S
+++ /dev/null
@@ -1,205 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $xr8
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define x5 $xr13
-#define x6 $xr14
-#define x7 $xr15
-#define x8 $xr16
-#define VX0 $xr20
-#define VX1 $xr21
-#define VM0 $xr22
-#define VM1 $xr23
-#define VM2 $xr18
-#define VM3 $xr17
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    addi.d I, I, -1
-    xvfmax.s VM0, VM0, VX0
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvpickve.w x5, VM0, 4
-    xvpickve.w x6, VM0, 5
-    xvpickve.w x7, VM0, 6
-    xvpickve.w x8, VM0, 7
-    xvfmax.s VM3, x1, x2
-    xvfmax.s VM2, x3, x4
-    xvfmax.s VM1, x5, x6
-    xvfmax.s VM0, x7, x8
-    xvfmax.s VM2, VM2, VM3
-    xvfmax.s VM0, VM0, VM1
-    xvfmax.s VM0, VM0, VM2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    xvld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    xvldx VX1, X, I
-    xvfmax.s m0, VX0, VX1 //patial repeat read
-    xvpickve.w x1, m0, 0
-    xvpickve.w x2, m0, 1
-    xvpickve.w x3, m0, 2
-    xvpickve.w x4, m0, 3
-    xvfmax.s m0, x1, x2
-    xvfmax.s VM1, x3, x4
-    xvfmax.s m0, m0, VM1
-    xvfmax.s VM0, m0, VM0
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmax.s VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the max value
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t2, 1
-    xvinsgr2vr.w VM0, t3, 2
-    xvinsgr2vr.w VM0, t4, 3
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t1, 4
-    xvinsgr2vr.w VM0, t2, 5
-    xvinsgr2vr.w VM0, t3, 6
-    xvinsgr2vr.w VM0, t4, 7
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    addi.d I, I, -1
-    xvfmax.s VM0, VM0, VX0
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvpickve.w x5, VM0, 4
-    xvpickve.w x6, VM0, 5
-    xvpickve.w x7, VM0, 6
-    xvpickve.w x8, VM0, 7
-    xvfmax.s VM3, x1, x2
-    xvfmax.s VM2, x3, x4
-    xvfmax.s VM1, x5, x6
-    xvfmax.s VM0, x7, x8
-    xvfmax.s VM2, VM2, VM3
-    xvfmax.s VM0, VM0, VM1
-    xvfmax.s VM0, VM0, VM2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmax.s VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/smax_lsx.S b/kernel/loongarch64/smax_lsx.S
deleted file mode 100644
index 71e6863b8..000000000
--- a/kernel/loongarch64/smax_lsx.S
+++ /dev/null
@@ -1,171 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N     $r4
-#define X     $r5
-#define INCX  $r6
-#define I     $r12
-#define J     $r13
-#define t1    $r14
-#define t2    $r18
-#define t3    $r15
-#define t4    $r17
-#define TEMP  $r16
-#define m0    $vr8
-#define x1    $vr9
-#define x2    $vr10
-#define x3    $vr11
-#define x4    $vr12
-#define VX0   $vr20
-#define VX1   $vr21
-#define VM0   $vr22
-#define VM1   $vr23
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
-    vfmax.s VM1, VX0, VX1
-    addi.d X, X, 8 * SIZE
-    vfmax.s VM0, VM0, VM1
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmax.s VM1, x2, x3
-    vfmax.s VM0, x4, VM0
-    vfmax.s VM0, VM0, VM1
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    vld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    vldx VX1, X, I
-    vfmax.s m0, VX0, VX1 //patial repeat read
-    vreplvei.w x2, m0, 1
-    vreplvei.w x3, m0, 2
-    vreplvei.w x4, m0, 3
-    vfmax.s x1, x2, x3
-    vfmax.s VM1, x4, m0
-    vfmax.s m0, x1, VM1
-    vfmax.s VM0, m0, VM0
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmax.s VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t2, 1
-    vinsgr2vr.w VM0, t3, 2
-    vinsgr2vr.w VM0, t4, 3
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    addi.d I, I, -1
-    vfmax.s VM1, VX0, VX1
-    vfmax.s VM0, VM0, VM1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmax.s VM1, x2, x3
-    vfmax.s VM0, x4, VM0
-    vfmax.s VM0, VM0, VM1
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmax.s VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/smin_lasx.S b/kernel/loongarch64/smin_lasx.S
deleted file mode 100644
index f6bbbfb8d..000000000
--- a/kernel/loongarch64/smin_lasx.S
+++ /dev/null
@@ -1,205 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $xr8
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define x5 $xr13
-#define x6 $xr14
-#define x7 $xr15
-#define x8 $xr16
-#define VX0 $xr20
-#define VX1 $xr21
-#define VM0 $xr22
-#define VM1 $xr23
-#define VM2 $xr18
-#define VM3 $xr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    addi.d I, I, -1
-    xvfmin.s VM0, VM0, VX0
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvpickve.w x5, VM0, 4
-    xvpickve.w x6, VM0, 5
-    xvpickve.w x7, VM0, 6
-    xvpickve.w x8, VM0, 7
-    xvfmin.s VM3, x1, x2
-    xvfmin.s VM2, x3, x4
-    xvfmin.s VM1, x5, x6
-    xvfmin.s VM0, x7, x8
-    xvfmin.s VM2, VM2, VM3
-    xvfmin.s VM0, VM0, VM1
-    xvfmin.s VM0, VM0, VM2
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    xvld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    xvldx VX1, X, I
-    xvfmin.s m0, VX0, VX1 //patial repeat read
-    xvpickve.w x1, m0, 0
-    xvpickve.w x2, m0, 1
-    xvpickve.w x3, m0, 2
-    xvpickve.w x4, m0, 3
-    xvfmin.s m0, x1, x2
-    xvfmin.s VM1, x3, x4
-    xvfmin.s m0, m0, VM1
-    xvfmin.s VM0, m0, VM0
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmin.s VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X // initialize the min value
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t2, 1
-    xvinsgr2vr.w VM0, t3, 2
-    xvinsgr2vr.w VM0, t4, 3
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.w VM0, t1, 4
-    xvinsgr2vr.w VM0, t2, 5
-    xvinsgr2vr.w VM0, t3, 6
-    xvinsgr2vr.w VM0, t4, 7
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    addi.d I, I, -1
-    xvfmin.s VM0, VM0, VX0
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvpickve.w x5, VM0, 4
-    xvpickve.w x6, VM0, 5
-    xvpickve.w x7, VM0, 6
-    xvpickve.w x8, VM0, 7
-    xvfmin.s VM3, x1, x2
-    xvfmin.s VM2, x3, x4
-    xvfmin.s VM1, x5, x6
-    xvfmin.s VM0, x7, x8
-    xvfmin.s VM2, VM2, VM3
-    xvfmin.s VM0, VM0, VM1
-    xvfmin.s VM0, VM0, VM2
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    xvld x1, X, 0
-    addi.d I, I, -1
-    xvfmin.s VM0, VM0, x1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/smin_lsx.S b/kernel/loongarch64/smin_lsx.S
deleted file mode 100644
index 1cc3699b0..000000000
--- a/kernel/loongarch64/smin_lsx.S
+++ /dev/null
@@ -1,174 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define J $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define TEMP $r16
-#define m0 $vr8
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr20
-#define VX1 $vr21
-#define VM0 $vr22
-#define VM1 $vr23
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    srai.d I, N, 3
-    bge $r0, I, .L12
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
-    vfmin.s VM1, VX0, VX1
-    addi.d X, X, 8 * SIZE
-    vfmin.s VM0, VM0, VM1
-    blt $r0, I, .L10
-    .align 3
-
-.L11:
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmin.s VM1, x1, x2
-    vfmin.s VM0, x3, x4
-    vfmin.s VM0, VM0, VM1
-    .align 3
-
-.L12: //INCX==1 and N<8
-    andi I, N, 7
-    li.d J, 4
-    bge J, I, .L13 // 4<N<8
-    vld VX0, X, 0
-    slli.d J, J, 1 // 8
-    sub.d I, J, I
-    slli.d I, I, BASE_SHIFT
-    vldx VX1, X, I
-    vfmin.s m0, VX0, VX1 //patial repeat read
-    vreplvei.w x1, m0, 0
-    vreplvei.w x2, m0, 1
-    vreplvei.w x3, m0, 2
-    vreplvei.w x4, m0, 3
-    vfmin.s m0, x1, x2
-    vfmin.s VM1, x3, x4
-    vfmin.s m0, m0, VM1
-    vfmin.s VM0, m0, VM0
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L13: //INCX==1 and 0<=N<=4
-    bge $r0, I, .L15
-    .align 3
-
-.L14:
-    vld x1, X, 0
-    addi.d I, I, -1
-    vfmin.s VM0, VM0, x1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    .align 3
-
-.L15:
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L23
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t2, 1
-    vinsgr2vr.w VM0, t3, 2
-    vinsgr2vr.w VM0, t4, 3
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vfmin.s VM1, VX0, VX1
-    addi.d I, I, -1
-    vfmin.s VM0, VM0, VM1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmin.s VM1, x1, x2
-    vfmin.s VM0, x3, x4
-    vfmin.s VM0, VM0, VM1
-    .align 3
-
-.L23: //INCX!=1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    vld x1, X, 0
-    vfmin.s VM0, VM0, x1
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.s $f0, $f22
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE

From 36c12c4971da3a1f305ca54a90bd957eaea94c6b Mon Sep 17 00:00:00 2001
From: Shiyou Yin <yinshiyou-hf@loongson.cn>
Date: Wed, 27 Dec 2023 11:30:17 +0800
Subject: [PATCH 513/718] loongarch64: Refine copy,swap,nrm2,sum optimization.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |  12 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5    |  12 +-
 kernel/loongarch64/copy_lasx.S           | 306 ++++++++++++++++
 kernel/loongarch64/copy_lsx.S            | 316 +++++++++++++++++
 kernel/loongarch64/dcopy_lasx.S          | 224 ------------
 kernel/loongarch64/dcopy_lsx.S           | 232 ------------
 kernel/loongarch64/dnrm2_lasx.S          | 124 ++++---
 kernel/loongarch64/dnrm2_lsx.S           |  84 +++--
 kernel/loongarch64/dsum_lasx.S           | 125 -------
 kernel/loongarch64/dsum_lsx.S            | 123 -------
 kernel/loongarch64/dswap_lasx.S          | 301 ----------------
 kernel/loongarch64/dswap_lsx.S           | 317 -----------------
 kernel/loongarch64/scopy_lasx.S          | 216 ------------
 kernel/loongarch64/scopy_lsx.S           | 220 ------------
 kernel/loongarch64/snrm2_lasx.S          | 116 +++---
 kernel/loongarch64/snrm2_lsx.S           | 141 ++++----
 kernel/loongarch64/ssum_lasx.S           | 140 --------
 kernel/loongarch64/ssum_lsx.S            | 125 -------
 kernel/loongarch64/sswap_lasx.S          | 286 ---------------
 kernel/loongarch64/sswap_lsx.S           | 294 ----------------
 kernel/loongarch64/sum_lasx.S            | 225 ++++++++++++
 kernel/loongarch64/sum_lsx.S             | 204 +++++++++++
 kernel/loongarch64/swap_lasx.S           | 401 +++++++++++++++++++++
 kernel/loongarch64/swap_lsx.S            | 431 +++++++++++++++++++++++
 24 files changed, 2159 insertions(+), 2816 deletions(-)
 create mode 100644 kernel/loongarch64/copy_lasx.S
 create mode 100644 kernel/loongarch64/copy_lsx.S
 delete mode 100644 kernel/loongarch64/dcopy_lasx.S
 delete mode 100644 kernel/loongarch64/dcopy_lsx.S
 delete mode 100644 kernel/loongarch64/dsum_lasx.S
 delete mode 100644 kernel/loongarch64/dsum_lsx.S
 delete mode 100644 kernel/loongarch64/dswap_lasx.S
 delete mode 100644 kernel/loongarch64/dswap_lsx.S
 delete mode 100644 kernel/loongarch64/scopy_lasx.S
 delete mode 100644 kernel/loongarch64/scopy_lsx.S
 delete mode 100644 kernel/loongarch64/ssum_lasx.S
 delete mode 100644 kernel/loongarch64/ssum_lsx.S
 delete mode 100644 kernel/loongarch64/sswap_lasx.S
 delete mode 100644 kernel/loongarch64/sswap_lsx.S
 create mode 100644 kernel/loongarch64/sum_lasx.S
 create mode 100644 kernel/loongarch64/sum_lsx.S
 create mode 100644 kernel/loongarch64/swap_lasx.S
 create mode 100644 kernel/loongarch64/swap_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 802dd1c9b..cb230b348 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lsx.S
 ISAMINKERNEL = isamin_lsx.S
 IDAMINKERNEL = idamin_lsx.S
 
-SCOPYKERNEL =  scopy_lsx.S
-DCOPYKERNEL =  dcopy_lsx.S
+SCOPYKERNEL =  copy_lsx.S
+DCOPYKERNEL =  copy_lsx.S
 
-SSWAPKERNEL =  sswap_lsx.S
-DSWAPKERNEL =  dswap_lsx.S
+SSWAPKERNEL =  swap_lsx.S
+DSWAPKERNEL =  swap_lsx.S
 
 SAXPYKERNEL =  saxpy_lsx.S
 DAXPYKERNEL =  daxpy_lsx.S
@@ -43,8 +43,8 @@ DAXPYKERNEL =  daxpy_lsx.S
 SAXPBYKERNEL = saxpby_lsx.S
 DAXPBYKERNEL = daxpby_lsx.S
 
-SSUMKERNEL  =  ssum_lsx.S
-DSUMKERNEL  =  dsum_lsx.S
+SSUMKERNEL  =  sum_lsx.S
+DSUMKERNEL  =  sum_lsx.S
 
 SASUMKERNEL =  sasum_lsx.S
 DASUMKERNEL =  dasum_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 3253489d9..ba59c4566 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -31,11 +31,11 @@ IDAMAXKERNEL = idamax_lasx.S
 ISAMINKERNEL = isamin_lasx.S
 IDAMINKERNEL = idamin_lasx.S
 
-SCOPYKERNEL =  scopy_lasx.S
-DCOPYKERNEL =  dcopy_lasx.S
+SCOPYKERNEL =  copy_lasx.S
+DCOPYKERNEL =  copy_lasx.S
 
-SSWAPKERNEL =  sswap_lasx.S
-DSWAPKERNEL =  dswap_lasx.S
+SSWAPKERNEL =  swap_lasx.S
+DSWAPKERNEL =  swap_lasx.S
 
 SAXPYKERNEL =  saxpy_lasx.S
 DAXPYKERNEL =  daxpy_lasx.S
@@ -43,8 +43,8 @@ DAXPYKERNEL =  daxpy_lasx.S
 SAXPBYKERNEL = saxpby_lasx.S
 DAXPBYKERNEL = daxpby_lasx.S
 
-SSUMKERNEL  =  ssum_lasx.S
-DSUMKERNEL  =  dsum_lasx.S
+SSUMKERNEL  =  sum_lasx.S
+DSUMKERNEL  =  sum_lasx.S
 
 SASUMKERNEL =  sasum_lasx.S
 DASUMKERNEL =  dasum_lasx.S
diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S
new file mode 100644
index 000000000..31f91cec1
--- /dev/null
+++ b/kernel/loongarch64/copy_lasx.S
@@ -0,0 +1,306 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $xr12
+#define VX1    $xr13
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+/* INCX==1 and INCY==1 */
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0
+    addi.d  I, I, -1
+    xvst VX0, Y, 0
+#ifdef DOUBLE
+    xvld VX0, X, 32
+    xvst VX0, Y, 32
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    ST    $f12, Y, 0
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+/* INCX==1 and INCY!=1 */
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 3
+    add.d Y, Y, INCY
+#else
+    xvld VX0, X, 0
+    xvstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 4
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 6
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 7
+    add.d Y, Y, INCY
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    ST    $f12, Y, 0
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1 */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvst VX0, Y, 0
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvst VX1, Y, 32
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvst VX0, Y, 0
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    ST    $f12, Y, 0
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    LD    a1, X, 0
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    ST    a3, X, 0
+    add.d Y, Y, INCY
+    ST    a4, X, 0
+    add.d Y, Y, INCY
+    LD    a1, X, 0
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    ST    a3, X, 0
+    add.d Y, Y, INCY
+    ST    a4, X, 0
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    ST    $f12, Y, 0
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S
new file mode 100644
index 000000000..bb10f3565
--- /dev/null
+++ b/kernel/loongarch64/copy_lsx.S
@@ -0,0 +1,316 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $vr12
+#define VX1    $vr13
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+/* INCX==1 and INCY==1 */
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0
+    vld VX1, X, 16
+    addi.d  I, I, -1
+    vst VX0, Y, 0
+    vst VX1, Y, 16
+#ifdef DOUBLE
+    vld VX0, X, 32
+    vld VX1, X, 48
+    vst VX0, Y, 32
+    vst VX1, Y, 48
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    ST    $f12, Y, 0
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+/* INCX==1 and INCY!=1 */
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vld VX0, X, 32
+    vld VX1, X, 48
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+#else
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 3
+    add.d Y, Y, INCY
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    ST    $f12, Y, 0
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1 */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 0
+    vst VX1, Y, 16
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 32
+    vst VX1, Y, 48
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vst VX0, Y, 0
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX1, Y, 16
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    ST    $f12, Y, 0
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    LD    a1, X, 0
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    ST    a3, X, 0
+    add.d Y, Y, INCY
+    ST    a4, X, 0
+    add.d Y, Y, INCY
+    LD    a1, X, 0
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    ST    a3, X, 0
+    add.d Y, Y, INCY
+    ST    a4, X, 0
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    ST    $f12, Y, 0
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dcopy_lasx.S b/kernel/loongarch64/dcopy_lasx.S
deleted file mode 100644
index 9d7da4a80..000000000
--- a/kernel/loongarch64/dcopy_lasx.S
+++ /dev/null
@@ -1,224 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define Y      $r7
-#define INCY   $r8
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r14
-#define t2     $r15
-#define t3     $r16
-#define t4     $r19
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define VX0    $xr12
-#define VX1    $xr13
-
-    PROLOGUE
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L112
-    .align 3
-
-.L111:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    xvst VX0, Y, 0 * SIZE
-    xvst VX1, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L111
-    .align 3
-
-.L112:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L113:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    fst.d $f12, Y, 0 * SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L113
-    b .L999
-    .align 3
-
-.L12:
-    bge $r0, I, .L122
-    .align 3
-
-.L121:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    xvstelm.d VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    xvstelm.d VX0, Y, 0, 1
-    add.d Y, Y, INCY
-    xvstelm.d VX0, Y, 0, 2
-    add.d Y, Y, INCY
-    xvstelm.d VX0, Y, 0, 3
-    add.d Y, Y, INCY
-    xvstelm.d VX1, Y, 0, 0
-    add.d Y, Y, INCY
-    xvstelm.d VX1, Y, 0, 1
-    add.d Y, Y, INCY
-    xvstelm.d VX1, Y, 0, 2
-    add.d Y, Y, INCY
-    xvstelm.d VX1, Y, 0, 3
-    add.d Y, Y, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    fst.d $f12, Y, 0 * SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    xvst VX0, Y, 0 * SIZE
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    xvst VX1, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    add.d  X, X, INCX
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bgez INCX, .L220
-    .align 3
-
-.L220:
-    bge $r0, I, .L223
-    .align 3
-
-.L222:
-    fld.d a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a3, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a4, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a3, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a4, X, 0 * SIZE
-    add.d Y, Y, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/dcopy_lsx.S b/kernel/loongarch64/dcopy_lsx.S
deleted file mode 100644
index 161655bbd..000000000
--- a/kernel/loongarch64/dcopy_lsx.S
+++ /dev/null
@@ -1,232 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define Y      $r7
-#define INCY   $r8
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r14
-#define t2     $r15
-#define t3     $r16
-#define t4     $r19
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define VX0    $vr12
-#define VX1    $vr13
-
-    PROLOGUE
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L112
-    .align 3
-
-.L111:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    vst VX0, Y, 0 * SIZE
-    vst VX1, Y, 2 * SIZE
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    addi.d  I, I, -1
-    vst VX0, Y, 4 * SIZE
-    vst VX1, Y, 6 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    .align 3
-
-.L112:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L113:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    fst.d $f12, Y, 0 * SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L113
-    b .L999
-    .align 3
-
-.L12:
-    bge $r0, I, .L122
-    .align 3
-
-.L121:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    vstelm.d VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    vstelm.d VX0, Y, 0, 1
-    add.d Y, Y, INCY
-    vstelm.d VX1, Y, 0, 0
-    add.d Y, Y, INCY
-    vstelm.d VX1, Y, 0, 1
-    add.d Y, Y, INCY
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vstelm.d VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    vstelm.d VX0, Y, 0, 1
-    add.d Y, Y, INCY
-    vstelm.d VX1, Y, 0, 0
-    add.d Y, Y, INCY
-    vstelm.d VX1, Y, 0, 1
-    add.d Y, Y, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    fst.d $f12, Y, 0 * SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    vst VX0, Y, 0 * SIZE
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    vst VX1, Y, 2 * SIZE
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    vst VX0, Y, 4 * SIZE
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    vst VX1, Y, 6 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    add.d  X, X, INCX
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bgez INCX, .L220
-    .align 3
-
-.L220:
-    bge $r0, I, .L223
-    .align 3
-
-.L222:
-    fld.d a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a3, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a4, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a3, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d a4, X, 0 * SIZE
-    add.d Y, Y, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S
index 2a9c3cf7b..5a6f7cf1e 100644
--- a/kernel/loongarch64/dnrm2_lasx.S
+++ b/kernel/loongarch64/dnrm2_lasx.S
@@ -1,3 +1,35 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
 #define ASSEMBLER
 
 #include "common.h"
@@ -12,6 +44,8 @@
 #define t2     $r13
 #define t3     $r14
 #define t4     $r15
+
+/* Don't change following FR unless you know the effects. */
 #define VX0    $xr15
 #define VX1    $xr16
 #define VM0    $xr17
@@ -35,6 +69,7 @@
 
     xvxor.v   res1, res1, res1
     xvxor.v   res2, res2, res2
+    xvxor.v VM0, VM0, VM0
     bge $r0, N, .L999
     beq $r0, INCX, .L999
     move  XX, X
@@ -46,12 +81,11 @@
     slli.d INCX, INCX, BASE_SHIFT
     srai.d I, N, 3
     bne INCX, TEMP, .L20
-    xvld VM0, X, 0
     bge $r0,  I, .L97
     .align 3
 
 .L10:
-    xvld VX0, X, 0 * SIZE
+    xvld VX0, X, 0
     xvld VX1, X, 4 * SIZE
     xvfmaxa.d VM1, VX1, VX0
     xvfmaxa.d VM0, VM0, VM1
@@ -62,40 +96,32 @@
     .align 3
 
 .L20: // INCX!=1
-    move TEMP, X // initialize the maxa value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
     bge $r0, I, .L97
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
     .align 3
 
 .L21:
-    ld.d t1, X, 0 * SIZE
+    ld.d t1, X, 0
     add.d X, X, INCX
     xvinsgr2vr.d VX0, t1, 0
-    ld.d t2, X, 0 * SIZE
+    ld.d t2, X, 0
     add.d X, X, INCX
     xvinsgr2vr.d VX0, t2, 1
-    ld.d t3, X, 0 * SIZE
+    ld.d t3, X, 0
     add.d X, X, INCX
     xvinsgr2vr.d VX0, t3, 2
-    ld.d t4, X, 0 * SIZE
+    ld.d t4, X, 0
     add.d X, X, INCX
     xvinsgr2vr.d VX0, t4, 3
-    ld.d t1, X, 0 * SIZE
+    ld.d t1, X, 0
     add.d X, X, INCX
     xvinsgr2vr.d VX1, t1, 0
-    ld.d t2, X, 0 * SIZE
+    ld.d t2, X, 0
     add.d X, X, INCX
     xvinsgr2vr.d VX1, t2, 1
-    ld.d t3, X, 0 * SIZE
+    ld.d t3, X, 0
     add.d X, X, INCX
     xvinsgr2vr.d VX1, t3, 2
-    ld.d t4, X, 0 * SIZE
+    ld.d t4, X, 0
     add.d X, X, INCX
     xvinsgr2vr.d VX1, t4, 3
     xvfmaxa.d VM1, VX0, VX1
@@ -109,9 +135,9 @@
     xvpickve.d VX0, VM0, 1
     xvpickve.d VX1, VM0, 2
     xvpickve.d VM3, VM0, 3
-    xvfmaxa.d VM1, VX0, VX1
-    xvfmaxa.d VM2, VM3, VM0
-    xvfmaxa.d VM0, VM1, VM2
+    fmaxa.d $f17, $f17, $f14
+    fmaxa.d $f17, $f17, $f15
+    fmaxa.d $f17, $f17, $f16
     .align 3
 
 .L97:
@@ -149,12 +175,12 @@
     .align 3
 
 .L110:
-    xvld VX0, XX, 0 * SIZE
+    xvld VX0, XX, 0
     xvld VX1, XX, 4 * SIZE
-    xvfmul.d VM0, VX0, VALPHA
-    xvfmul.d VM1, VX1, VALPHA
-    xvfmadd.d res1, VM0, VM0, res1
-    xvfmadd.d res2, VM1, VM1, res2
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
     addi.d XX, XX, 8 * SIZE
     addi.d I, I, -1
     blt $r0, I, .L110
@@ -166,34 +192,34 @@
     bge $r0, I, .L997
 
 .L121:
-    ld.d t1, XX,  0 * SIZE
+    ld.d t1, XX,  0
     add.d XX, XX, INCX
-    ld.d t2, XX, 0 * SIZE
+    ld.d t2, XX, 0
     add.d XX, XX, INCX
-    ld.d t3, XX, 0 * SIZE
+    ld.d t3, XX, 0
     add.d XX, XX, INCX
-    ld.d t4, XX, 0 * SIZE
+    ld.d t4, XX, 0
     add.d XX, XX, INCX
     xvinsgr2vr.d VX0, t1, 0
     xvinsgr2vr.d VX0, t2, 1
     xvinsgr2vr.d VX0, t3, 2
     xvinsgr2vr.d VX0, t4, 3
-    ld.d t1, XX, 0 * SIZE
+    ld.d t1, XX, 0
     add.d XX, XX, INCX
-    ld.d t2, XX, 0 * SIZE
+    ld.d t2, XX, 0
     add.d XX, XX, INCX
-    ld.d t3, XX, 0 * SIZE
+    ld.d t3, XX, 0
     add.d XX, XX, INCX
-    ld.d t4, XX, 0 * SIZE
+    ld.d t4, XX, 0
     add.d XX, XX, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
     xvinsgr2vr.d VX1, t3, 2
     xvinsgr2vr.d VX1, t4, 3
-    xvfmul.d VM0, VX0, VALPHA
-    xvfmul.d VM1, VX1, VALPHA
-    xvfmadd.d res1, VM0, VM0, res1
-    xvfmadd.d res2, VM1, VM1, res2
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
     addi.d  I, I, -1
     blt $r0, I, .L121
     b .L996
@@ -203,10 +229,10 @@
     xvfadd.d res1, res1, res2
     xvpickve.d VX0, res1, 1
     xvpickve.d VX1, res1, 2
-    xvpickve.d VM0, res1, 3
-    xvfadd.d res1, VX0, res1
-    xvfadd.d VX1, VX1, VM0
-    xvfadd.d res1, VX1, res1
+    xvpickve.d VM2, res1, 3
+    fadd.d $f19, $f19, $f15
+    fadd.d $f19, $f19, $f16
+    fadd.d $f19, $f19, $f13
     .align 3
 
 .L997:
@@ -215,19 +241,17 @@
     .align 3
 
 .L998:
-    fld.d $f15, XX, 0 * SIZE
+    fld.d $f15, XX, 0
     addi.d I, I, -1
     fmul.d $f15, $f15, ALPHA
     fmadd.d $f19, $f15, $f15, $f19
     add.d XX, XX , INCX
     blt $r0, I, .L998
-    fsqrt.d $f19, $f19
-    fmul.d $f0, max, $f19
-    jirl    $r0, $r1, 0x0
-    .align 3
 
 .L999:
-    fmov.d $f0, $f19
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
     jirl $r0, $r1, 0x0
+    .align 3
 
     EPILOGUE
diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S
index e4615e18d..fce4260e2 100644
--- a/kernel/loongarch64/dnrm2_lsx.S
+++ b/kernel/loongarch64/dnrm2_lsx.S
@@ -1,3 +1,35 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
 #define ASSEMBLER
 
 #include "common.h"
@@ -12,6 +44,8 @@
 #define t2     $r13
 #define t3     $r14
 #define t4     $r15
+
+/* Don't change following FR unless you know the effects. */
 #define VX0    $vr15
 #define VX1    $vr16
 #define VM0    $vr17
@@ -35,6 +69,7 @@
 
     vxor.v   res1, res1, res1
     vxor.v   res2, res2, res2
+    vxor.v VM0, VM0, VM0
     bge $r0, N, .L999
     beq $r0, INCX, .L999
     move  XX, X
@@ -46,7 +81,7 @@
     slli.d INCX, INCX, BASE_SHIFT
     srai.d I, N, 3
     bne INCX, TEMP, .L20
-    vld VM0, X, 0
+
     bge $r0,  I, .L97
     .align 3
 
@@ -66,15 +101,7 @@
     .align 3
 
 .L20: // INCX!=1
-    move TEMP, X // initialize the maxa value
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
     bge $r0, I, .L97
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
     .align 3
 
 .L21:
@@ -154,16 +181,16 @@
 .L110:
     vld VX0, XX, 0 * SIZE
     vld VX1, XX, 2 * SIZE
-    vfmul.d VM0, VX0, VALPHA
-    vfmul.d VM1, VX1, VALPHA
-    vfmadd.d res1, VM0, VM0, res1
-    vfmadd.d res2, VM1, VM1, res2
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
     vld VX0, XX, 4 * SIZE
     vld VX1, XX, 6 * SIZE
-    vfmul.d VM0, VX0, VALPHA
-    vfmul.d VM1, VX1, VALPHA
-    vfmadd.d res1, VM0, VM0, res1
-    vfmadd.d res2, VM1, VM1, res2
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
     addi.d XX, XX, 8 * SIZE
     addi.d I, I, -1
     blt $r0, I, .L110
@@ -173,6 +200,7 @@
 .L120:
     srai.d I, N, 3
     bge $r0, I, .L997
+    .align 3
 
 .L121:
     ld.d t1, XX,  0 * SIZE
@@ -187,14 +215,14 @@
     vinsgr2vr.d VX0, t2, 1
     vinsgr2vr.d VX1, t3, 0
     vinsgr2vr.d VX1, t4, 1
-    vfmul.d VM0, VX0, VALPHA
+    vfmul.d VM2, VX0, VALPHA
     ld.d t1, XX, 0 * SIZE
     add.d XX, XX, INCX
-    vfmul.d VM1, VX1, VALPHA
+    vfmul.d VM3, VX1, VALPHA
     ld.d t2, XX, 0 * SIZE
     add.d XX, XX, INCX
-    vfmadd.d res1, VM0, VM0, res1
-    vfmadd.d res2, VM1, VM1, res2
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
     ld.d t3, XX, 0 * SIZE
     add.d XX, XX, INCX
     ld.d t4, XX, 0 * SIZE
@@ -203,10 +231,10 @@
     vinsgr2vr.d VX0, t2, 1
     vinsgr2vr.d VX1, t3, 0
     vinsgr2vr.d VX1, t4, 1
-    vfmul.d VM0, VX0, VALPHA
-    vfmul.d VM1, VX1, VALPHA
-    vfmadd.d res1, VM0, VM0, res1
-    vfmadd.d res2, VM1, VM1, res2
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
     addi.d  I, I, -1
     blt $r0, I, .L121
     b .L996
@@ -230,13 +258,11 @@
     fmadd.d $f19, $f15, $f15, $f19
     add.d XX, XX , INCX
     blt $r0, I, .L998
-    fsqrt.d $f19, $f19
-    fmul.d $f0, max, $f19
-    jirl    $r0, $r1, 0x0
     .align 3
 
 .L999:
-    fmov.d $f0, $f19
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
     jirl $r0, $r1, 0x0
 
     EPILOGUE
diff --git a/kernel/loongarch64/dsum_lasx.S b/kernel/loongarch64/dsum_lasx.S
deleted file mode 100644
index 3c51dab60..000000000
--- a/kernel/loongarch64/dsum_lasx.S
+++ /dev/null
@@ -1,125 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r15
-#define t2     $r12
-#define t3     $r13
-#define t4     $r14
-#define VX0    $xr12
-#define VX1    $xr13
-#define VX2    $xr14
-#define VX3    $xr15
-#define res1   $xr16
-#define res2   $xr17
-    PROLOGUE
-    xvxor.v res1, res1, res1
-    xvxor.v res2, res2, res2
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d  TEMP, SIZE
-    slli.d  INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bge $r0, I, .L13
-    .align 3
-
-.L11:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    xvfadd.d res2, VX0, VX1
-    xvfadd.d res1, res1, res2
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    .align 3
-
-.L12:
-    xvpickve.d VX1, res1, 1
-    xvpickve.d VX2, res1, 2
-    xvpickve.d VX3, res1, 3
-    xvfadd.d res1, VX1, res1
-    xvfadd.d res1, VX2, res1
-    xvfadd.d res1, VX3, res1
-    .align 3
-
-.L13:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L14:
-    fld.d $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fadd.d $f16, $f12, $f16
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    b .L999
-    .align 3
-
-.L20:
-    bge $r0, I, .L23
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    xvfadd.d res2, VX0, VX1
-    xvfadd.d res1, res1, res2
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.d VX1, res1, 1
-    xvpickve.d VX2, res1, 2
-    xvpickve.d VX3, res1, 3
-    xvfadd.d res1, VX1, res1
-    xvfadd.d res1, VX2, res1
-    xvfadd.d res1, VX3, res1
-    .align 3
-
-.L23:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    fld.d $f12, X, 0 * SIZE
-    fadd.d $f16, $f12, $f16
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.d $f0, $f16
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/dsum_lsx.S b/kernel/loongarch64/dsum_lsx.S
deleted file mode 100644
index 402d087df..000000000
--- a/kernel/loongarch64/dsum_lsx.S
+++ /dev/null
@@ -1,123 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r15
-#define t2     $r12
-#define t3     $r13
-#define t4     $r14
-#define VX0    $vr12
-#define VX1    $vr13
-#define VX2    $vr14
-#define VX3    $vr15
-#define res1   $vr16
-#define res2   $vr17
-    PROLOGUE
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d  TEMP, SIZE
-    slli.d  INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bge $r0, I, .L13
-    .align 3
-
-.L11:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    vfadd.d res2, VX0, VX1
-    vfadd.d res1, res1, res2
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vfadd.d res2, VX0, VX1
-    vfadd.d res1, res1, res2
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    .align 3
-
-.L12:
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
-    .align 3
-
-.L13:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L14:
-    fld.d $f12, X, 0 * SIZE
-    fadd.d $f16, $f12, $f16
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    b .L999
-    .align 3
-
-.L20:
-    bge $r0, I, .L23
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    add.d X, X, INCX
-    vfadd.d res2, VX0, VX1
-    vfadd.d res1, res1, res2
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t3, 0
-    vinsgr2vr.d VX0, t4, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    add.d X, X, INCX
-    vfadd.d res2, VX0, VX1
-    vfadd.d res1, res1, res2
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
-    .align 3
-
-.L23:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    fld.d $f12, X, 0 * SIZE
-    fadd.d $f16, $f12, $f16
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.d $f0, $f16
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/dswap_lasx.S b/kernel/loongarch64/dswap_lasx.S
deleted file mode 100644
index 221cb7fa2..000000000
--- a/kernel/loongarch64/dswap_lasx.S
+++ /dev/null
@@ -1,301 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r7
-#define INCX   $r8
-#define Y      $r9
-#define INCY   $r10
-
-#define I      $r17
-#define TEMP   $r18
-#define XX     $r5
-#define YY     $r6
-#define t1     $r14
-#define t2     $r15
-#define t3     $r16
-#define t4     $r19
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define b1     $f16
-#define b2     $f17
-#define b3     $f18
-#define b4     $f19
-#define VX0    $xr12
-#define VX1    $xr13
-#define VX2    $xr14
-#define VX3    $xr15
-
-
-    PROLOGUE
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L112
-    .align 3
-
-.L111:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    xvld VX2, Y, 0 * SIZE
-    xvld VX3, Y, 4 * SIZE
-    addi.d  I, I, -1
-    xvst VX2, X, 0 * SIZE
-    xvst VX3, X, 4 * SIZE
-    xvst VX0, Y, 0 * SIZE
-    xvst VX1, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    .align 3
-
-.L112:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L113:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    fst.d $f14, X, 0 * SIZE
-    addi.d  X, X, SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L113
-    b .L999
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L122
-    .align 3
-
-.L121:
-    xvld VX0, X, 0 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    xvstelm.d VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    xvstelm.d VX0, Y, 0, 1
-    add.d Y, Y, INCY
-    ld.d t3, Y, 0 * SIZE
-    xvstelm.d VX0, Y, 0, 2
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    xvstelm.d VX0, Y, 0, 3
-    xvinsgr2vr.d VX2, t1, 0
-    xvinsgr2vr.d VX2, t2, 1
-    xvinsgr2vr.d VX2, t3, 2
-    xvinsgr2vr.d VX2, t4, 3
-    add.d Y, Y, INCY
-    xvst VX2, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    xvstelm.d VX1, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    xvstelm.d VX1, Y, 0, 1
-    add.d Y, Y, INCY
-    ld.d t3, Y, 0 * SIZE
-    xvstelm.d VX1, Y, 0, 2
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    xvstelm.d VX1, Y, 0, 3
-    xvinsgr2vr.d VX3, t1, 0
-    xvinsgr2vr.d VX3, t2, 1
-    xvinsgr2vr.d VX3, t3, 2
-    xvinsgr2vr.d VX3, t4, 3
-    add.d Y, Y, INCY
-    xvst VX3, X, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    fst.d $f14, X, 0 * SIZE
-    addi.d  X, X, SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    xvld VX2, Y, 0 * SIZE
-    ld.d t1, X, 0 * SIZE
-    xvstelm.d VX2, X, 0, 0
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    xvstelm.d VX2, X, 0, 1
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    xvstelm.d VX2, X, 0, 2
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    xvstelm.d VX2, X, 0, 3
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    add.d X, X, INCX
-    xvst VX0, Y, 0 * SIZE
-    xvld VX3, Y, 4 * SIZE
-    ld.d t1, X, 0 * SIZE
-    xvstelm.d VX3, X, 0, 0
-    add.d X, X, INCY
-    ld.d t2, X, 0 * SIZE
-    xvstelm.d VX3, X, 0, 1
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    xvstelm.d VX3, X, 0, 2
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    xvstelm.d VX3, X, 0, 3
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    add.d X, X, INCX
-    xvst VX1, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    fst.d $f14, X, 0 * SIZE
-    add.d  X, X, INCX
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bgez INCX, .L220
-    //addi.d TEMP, N, -1
-    //mul.d TEMP, TEMP, INCX
-    //sub.d X, X, TEMP
-    .align 3
-
-.L220:
-    bge $r0, I, .L223
-    .align 3
-    move XX, X
-
-.L222:
-    fld.d a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d b1, Y, 0 * SIZE
-    fst.d a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d b2, Y, 0 * SIZE
-    fst.d a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d b3, Y, 0 * SIZE
-    fst.d a3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d b4, Y, 0 * SIZE
-    fst.d a4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d b1, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.d b1, Y, 0 * SIZE
-    fst.d a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d b2, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.d b2, Y, 0 * SIZE
-    fst.d a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d b3, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.d b3, Y, 0 * SIZE
-    fst.d a3, Y, 0 * SIZE
-    fld.d a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d b4, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.d b4, Y, 0 * SIZE
-    fst.d a4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d b1, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.d b2, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.d b3, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.d b4, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    fst.d $f14, X, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/dswap_lsx.S b/kernel/loongarch64/dswap_lsx.S
deleted file mode 100644
index 7f7f585e1..000000000
--- a/kernel/loongarch64/dswap_lsx.S
+++ /dev/null
@@ -1,317 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r7
-#define INCX   $r8
-#define Y      $r9
-#define INCY   $r10
-
-#define I      $r17
-#define TEMP   $r18
-#define XX     $r5
-#define YY     $r6
-#define t1     $r14
-#define t2     $r15
-#define t3     $r16
-#define t4     $r19
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define b1     $f16
-#define b2     $f17
-#define b3     $f18
-#define b4     $f19
-#define VX0    $vr12
-#define VX1    $vr13
-#define VX2    $vr14
-#define VX3    $vr15
-
-
-    PROLOGUE
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L112
-    .align 3
-
-.L111:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX3, Y, 2 * SIZE
-    vst VX2, X, 0 * SIZE
-    vst VX3, X, 2 * SIZE
-    vst VX0, Y, 0 * SIZE
-    vst VX1, Y, 2 * SIZE
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vld VX2, Y, 4 * SIZE
-    vld VX3, Y, 6 * SIZE
-    addi.d  I, I, -1
-    vst VX2, X, 4 * SIZE
-    vst VX3, X, 6 * SIZE
-    vst VX0, Y, 4 * SIZE
-    vst VX1, Y, 6 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    .align 3
-
-.L112:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L113:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    fst.d $f14, X, 0 * SIZE
-    addi.d  X, X, SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L113
-    b .L999
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L122
-    .align 3
-
-.L121:
-    vld VX0, X, 0 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    vstelm.d VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vstelm.d VX0, Y, 0, 1
-    vinsgr2vr.d VX2, t1, 0
-    vinsgr2vr.d VX2, t2, 1
-    add.d Y, Y, INCY
-    vst VX2, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    ld.d t3, Y, 0 * SIZE
-    vstelm.d VX1, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vstelm.d VX1, Y, 0, 1
-    vinsgr2vr.d VX3, t3, 0
-    vinsgr2vr.d VX3, t4, 1
-    add.d Y, Y, INCY
-    vst VX3, X, 2 * SIZE
-    vld VX0, X, 4 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    vstelm.d VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vstelm.d VX0, Y, 0, 1
-    vinsgr2vr.d VX2, t1, 0
-    vinsgr2vr.d VX2, t2, 1
-    add.d Y, Y, INCY
-    vst VX2, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    ld.d t3, Y, 0 * SIZE
-    vstelm.d VX1, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vstelm.d VX1, Y, 0, 1
-    vinsgr2vr.d VX3, t3, 0
-    vinsgr2vr.d VX3, t4, 1
-    add.d Y, Y, INCY
-    vst VX3, X, 6 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    fst.d $f14, X, 0 * SIZE
-    addi.d  X, X, SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    vld VX2, Y, 0 * SIZE
-    ld.d t1, X, 0 * SIZE
-    vstelm.d VX2, X, 0, 0
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vstelm.d VX2, X, 0, 1
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    add.d X, X, INCY
-    vst VX0, Y, 0 * SIZE
-    vld VX3, Y, 2 * SIZE
-    ld.d t3, X, 0 * SIZE
-    vstelm.d VX3, X, 0, 0
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vstelm.d VX3, X, 0, 1
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    add.d X, X, INCX
-    vst VX1, Y, 2 * SIZE
-    vld VX2, Y, 4 * SIZE
-    ld.d t1, X, 0 * SIZE
-    vstelm.d VX2, X, 0, 0
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vstelm.d VX2, X, 0, 1
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    add.d X, X, INCY
-    vst VX0, Y, 4 * SIZE
-    vld VX3, Y, 6 * SIZE
-    ld.d t3, X, 0 * SIZE
-    vstelm.d VX3, X, 0, 0
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vstelm.d VX3, X, 0, 1
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    add.d X, X, INCX
-    vst VX1, Y, 6 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    fst.d $f14, X, 0 * SIZE
-    add.d  X, X, INCX
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bgez INCX, .L220
-    //addi.d TEMP, N, -1
-    //mul.d TEMP, TEMP, INCX
-    //sub.d X, X, TEMP
-    .align 3
-
-.L220:
-    bge $r0, I, .L223
-    .align 3
-    move XX, X
-
-.L222:
-    fld.d a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.d b1, Y, 0 * SIZE
-    fst.d a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d b2, Y, 0 * SIZE
-    fst.d a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d b3, Y, 0 * SIZE
-    fst.d a3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d b4, Y, 0 * SIZE
-    fst.d a4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d b1, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.d b1, Y, 0 * SIZE
-    fst.d a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d b2, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.d b2, Y, 0 * SIZE
-    fst.d a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.d a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d b3, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.d b3, Y, 0 * SIZE
-    fst.d a3, Y, 0 * SIZE
-    fld.d a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.d b4, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.d b4, Y, 0 * SIZE
-    fst.d a4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.d b1, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.d b2, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.d b3, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.d b4, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.d $f12, Y, 0 * SIZE
-    fst.d $f14, X, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/scopy_lasx.S b/kernel/loongarch64/scopy_lasx.S
deleted file mode 100644
index 7db1e7cee..000000000
--- a/kernel/loongarch64/scopy_lasx.S
+++ /dev/null
@@ -1,216 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define Y      $r7
-#define INCY   $r8
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r14
-#define t2     $r15
-#define t3     $r16
-#define t4     $r19
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define VX0    $xr12
-#define VX1    $xr13
-
-    PROLOGUE
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L112
-    .align 3
-
-.L111:
-    xvld VX0, X, 0 * SIZE
-    addi.d  I, I, -1
-    xvst VX0, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    .align 3
-
-.L112:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L113:
-    fld.s $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    fst.s $f12, Y, 0 * SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L113
-    b .L999
-    .align 3
-
-.L12:
-    bge $r0, I, .L122
-    .align 3
-
-.L121:
-    xvld VX0, X, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    xvstelm.w VX0, Y, 0, 1
-    add.d Y, Y, INCY
-    xvstelm.w VX0, Y, 0, 2
-    add.d Y, Y, INCY
-    xvstelm.w VX0, Y, 0, 3
-    add.d Y, Y, INCY
-    xvstelm.w VX0, Y, 0, 4
-    add.d Y, Y, INCY
-    xvstelm.w VX0, Y, 0, 5
-    add.d Y, Y, INCY
-    xvstelm.w VX0, Y, 0, 6
-    add.d Y, Y, INCY
-    xvstelm.w VX0, Y, 0, 7
-    add.d Y, Y, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.s $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    fst.s $f12, Y, 0 * SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    xvst VX0, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.s $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    add.d  X, X, INCX
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bge $r0, I, .L223
-    .align 3
-
-.L222:
-    fld.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a3, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a4, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a3, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a4, X, 0 * SIZE
-    add.d Y, Y, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.s $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/scopy_lsx.S b/kernel/loongarch64/scopy_lsx.S
deleted file mode 100644
index 32150d3d6..000000000
--- a/kernel/loongarch64/scopy_lsx.S
+++ /dev/null
@@ -1,220 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define Y      $r7
-#define INCY   $r8
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r14
-#define t2     $r15
-#define t3     $r16
-#define t4     $r19
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define VX0    $vr12
-#define VX1    $vr13
-
-    PROLOGUE
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L112
-    .align 3
-
-.L111:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d  I, I, -1
-    vst VX0, Y, 0 * SIZE
-    vst VX1, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    .align 3
-
-.L112:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L113:
-    fld.s $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    fst.s $f12, Y, 0 * SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L113
-    b .L999
-    .align 3
-
-.L12:
-    bge $r0, I, .L122
-    .align 3
-
-.L121:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vstelm.w VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    vstelm.w VX0, Y, 0, 1
-    add.d Y, Y, INCY
-    vstelm.w VX0, Y, 0, 2
-    add.d Y, Y, INCY
-    vstelm.w VX0, Y, 0, 3
-    add.d Y, Y, INCY
-    vstelm.w VX1, Y, 0, 0
-    add.d Y, Y, INCY
-    vstelm.w VX1, Y, 0, 1
-    add.d Y, Y, INCY
-    vstelm.w VX1, Y, 0, 2
-    add.d Y, Y, INCY
-    vstelm.w VX1, Y, 0, 3
-    add.d Y, Y, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.s $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    fst.s $f12, Y, 0 * SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    vst VX0, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vst VX1, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.s $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    add.d  X, X, INCX
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bge $r0, I, .L223
-    .align 3
-
-.L222:
-    fld.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a3, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a4, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a3, X, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s a4, X, 0 * SIZE
-    add.d Y, Y, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.s $f12, X, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S
index 274908c14..3ae11e897 100644
--- a/kernel/loongarch64/snrm2_lasx.S
+++ b/kernel/loongarch64/snrm2_lasx.S
@@ -1,3 +1,35 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
 #define ASSEMBLER
 
 #include "common.h"
@@ -11,10 +43,13 @@
 #define t2     $r13
 #define t3     $r14
 #define t4     $r15
+
+/* Don't change following FR unless you know the effects. */
 #define VX0    $xr15
 #define VX1    $xr16
 #define VX2    $xr17
 #define VX3    $xr18
+#define VX4    $xr21
 #define res1   $xr19
 #define res2   $xr20
 
@@ -37,14 +72,13 @@
    .align 3
 
 .L10:
-   xvld VX0, X, 0 * SIZE
-   xvld VX1, X, 0 * SIZE
-   xvfcvtl.d.s VX0, VX0
-   xvfcvth.d.s VX1, VX1
-   xvfmadd.d res1, VX0, VX0, res1
-   xvfmadd.d res2, VX1, VX1, res2
+   xvld VX0, X, 0
+   xvfcvtl.d.s VX1, VX0
+   xvfcvth.d.s VX2, VX0
+   xvfmadd.d res1, VX1, VX1, res1
+   xvfmadd.d res2, VX2, VX2, res2
    addi.d I, I, -1
-   addi.d X, X, 8 * SIZE 
+   addi.d X, X, 8 * SIZE
    blt $r0, I, .L10
    .align 3
    b .L996
@@ -54,70 +88,46 @@
    .align 3
 
 .L21:
-   ld.w t1, X, 0 * SIZE
+   ld.w t1, X, 0
    add.d X, X, INCX
-   ld.w t2, X, 0 * SIZE
+   ld.w t2, X, 0
    add.d X, X, INCX
-   ld.w t3, X, 0 * SIZE
+   ld.w t3, X, 0
    add.d X, X, INCX
-   ld.w t4, X, 0 * SIZE
+   ld.w t4, X, 0
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
-   ld.w t1, X, 0 * SIZE
+   ld.w t1, X, 0
    add.d X, X, INCX
-   ld.w t2, X, 0 * SIZE
+   ld.w t2, X, 0
    add.d X, X, INCX
-   ld.w t3, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t4, X, 0 * SIZE
+   ld.w t3, X, 0
    add.d X, X, INCX
+   ld.w t4, X, 0
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
-   ld.w t1, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
-   ld.w t3, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t4, X, 0 * SIZE
-   add.d X, X, INCX
-   xvinsgr2vr.w VX1, t1, 0
-   xvinsgr2vr.w VX1, t2, 1
-   xvinsgr2vr.w VX1, t3, 2
-   xvinsgr2vr.w VX1, t4, 3
-   ld.w t1, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t2, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t3, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t4, X, 0 * SIZE
-   add.d X, X, INCX
-   xvinsgr2vr.w VX1, t1, 4
-   xvinsgr2vr.w VX1, t2, 5
-   xvinsgr2vr.w VX1, t3, 6
-   xvinsgr2vr.w VX1, t4, 7
-   xvfcvtl.d.s VX0, VX0
-   xvfcvth.d.s VX1, VX1
-   xvfmadd.d res1, VX0, VX0, res1
-   xvfmadd.d res2, VX1, VX1, res2
+   xvfcvtl.d.s VX1, VX0
+   xvfcvth.d.s VX2, VX0
+   xvfmadd.d res1, VX1, VX1, res1
+   xvfmadd.d res2, VX2, VX2, res2
    addi.d  I, I, -1
    blt $r0, I, .L21
    b .L996
 
 .L996:
    xvfadd.d res1, res1, res2
-   xvpickve.w VX1, res1, 1
-   xvpickve.w VX2, res1, 2
-   xvpickve.w VX3, res1, 3
-   xvfadd.s res1, VX1, res1
-   xvfadd.s res1, VX2, res1
-   xvfadd.s res1, VX3, res1
+   xvpickve.d VX1, res1, 1
+   xvpickve.d VX2, res1, 2
+   xvpickve.d VX3, res1, 3
+   fadd.d   $f19, $f19, $f16
+   fadd.d   $f19, $f19, $f17
+   fadd.d   $f19, $f19, $f18
    .align 3
 
 .L997:
@@ -126,11 +136,11 @@
    .align 3
 
 .L998:
-   fld.s $f15, X, 0 * SIZE
-   addi.d I, I, -1
+   fld.s $f15, X, 0
+   add.d    X, X, INCX
+   addi.d   I, I, -1
    fcvt.d.s $f15, $f15
-   fmadd.d $f19, $f15, $f15, $f19
-   add.d X, X, INCX
+   fmadd.d  $f19, $f15, $f15, $f19
    blt $r0, I, .L998
    .align 3
 
diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S
index 17d017900..bb492dbf0 100644
--- a/kernel/loongarch64/snrm2_lsx.S
+++ b/kernel/loongarch64/snrm2_lsx.S
@@ -1,3 +1,35 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
 #define ASSEMBLER
 
 #include "common.h"
@@ -15,6 +47,9 @@
 #define VX1    $vr16
 #define VX2    $vr17
 #define VX3    $vr18
+#define VX4    $vr21
+#define VX5    $vr22
+/* Don't change following FR unless you know the effects. */
 #define res1   $vr19
 #define res2   $vr20
 
@@ -24,99 +59,71 @@
    LDINT   N,     0(N)
    LDINT   INCX,  0(INCX)
 #endif
-
    vxor.v res1, res1, res1
    vxor.v res2, res2, res2
-   bge $r0, N, .L999
+   bge $r0,    N, .L999
    beq $r0, INCX, .L999
    li.d  TEMP, SIZE
    slli.d INCX, INCX, BASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
-   bge $r0,  I, .L997
+   bge  $r0,    I, .L997
    .align 3
 
 .L10:
-   vld VX0, X, 0 * SIZE
-   vld VX1, X, 0 * SIZE
-   vfcvtl.d.s VX0, VX0
-   vfcvth.d.s VX1, VX1
-   vfmadd.d res1, VX0, VX0, res1
-   vfmadd.d res2, VX1, VX1, res2
-   vld VX2, X, 4 * SIZE
-   vld VX3, X, 4 * SIZE
-   vfcvtl.d.s VX2, VX2
-   vfcvth.d.s VX3, VX3
-   vfmadd.d res1, VX2, VX2, res1
-   vfmadd.d res2, VX3, VX3, res2
+   vld  VX0, X, 0
+   vld  VX5, X, 4 * SIZE
    addi.d I, I, -1
-   addi.d X, X, 8 * SIZE 
+   addi.d X, X, 8 * SIZE
+   vfcvtl.d.s VX1, VX0
+   vfcvth.d.s VX2, VX0
+   vfcvtl.d.s VX3, VX5
+   vfcvth.d.s VX4, VX5
+   vfmadd.d  res1, VX1, VX1, res1
+   vfmadd.d  res2, VX2, VX2, res2
+   vfmadd.d  res1, VX3, VX3, res1
+   vfmadd.d  res2, VX4, VX4, res2
    blt $r0, I, .L10
    b .L996
    .align 3
-   
 
 .L20:
    bge $r0, I, .L997
    .align 3
 
 .L21:
-   ld.w t1, X, 0 * SIZE
+   ld.w t1, X, 0
    add.d X, X, INCX
-   ld.w t2, X, 0 * SIZE
+   ld.w t2, X, 0
    add.d X, X, INCX
-   ld.w t3, X, 0 * SIZE
+   ld.w t3, X, 0
    add.d X, X, INCX
-   ld.w t4, X, 0 * SIZE
+   ld.w t4, X, 0
    add.d X, X, INCX
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
-   vinsgr2vr.w VX0, t4, 3  
-   ld.w t1, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t2, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t3, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t4, X, 0 * SIZE
+   vinsgr2vr.w VX0, t4, 3
+   vfcvtl.d.s VX1, VX0
+   vfcvth.d.s VX2, VX0
+   vfmadd.d res1, VX1, VX1, res1
+   vfmadd.d res2, VX2, VX2, res2
+   ld.w t1, X, 0
    add.d X, X, INCX
-   vinsgr2vr.w VX1, t1, 0
-   vinsgr2vr.w VX1, t2, 1
-   vinsgr2vr.w VX1, t3, 2
-   vinsgr2vr.w VX1, t4, 3
-   vfcvtl.d.s VX0, VX0
-   vfcvth.d.s VX1, VX1
-   vfmadd.d res1, VX0, VX0, res1
-   vfmadd.d res2, VX1, VX1, res2
-   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 0
    add.d X, X, INCX
-   ld.w t2, X, 0 * SIZE
+   ld.w t3, X, 0
    add.d X, X, INCX
-   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 0
    add.d X, X, INCX
-   ld.w t4, X, 0 * SIZE
-   add.d X, X, INCX
-   vinsgr2vr.w VX2, t1, 0
-   vinsgr2vr.w VX2, t2, 1
-   vinsgr2vr.w VX2, t3, 2
-   vinsgr2vr.w VX2, t4, 3  
-   ld.w t1, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t2, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t3, X, 0 * SIZE
-   add.d X, X, INCX
-   ld.w t4, X, 0 * SIZE
-   add.d X, X, INCX
-   vinsgr2vr.w VX3, t1, 0
-   vinsgr2vr.w VX3, t2, 1
-   vinsgr2vr.w VX3, t3, 2
-   vinsgr2vr.w VX3, t4, 3
-   vfcvtl.d.s VX2, VX2
-   vfcvth.d.s VX3, VX3
-   vfmadd.d res1, VX2, VX2, res1
-   vfmadd.d res2, VX3, VX3, res2
+   vinsgr2vr.w VX0, t1, 0
+   vinsgr2vr.w VX0, t2, 1
+   vinsgr2vr.w VX0, t3, 2
+   vinsgr2vr.w VX0, t4, 3
+   vfcvtl.d.s VX3, VX0
+   vfcvth.d.s VX4, VX0
+   vfmadd.d res1, VX3, VX3, res1
+   vfmadd.d res2, VX4, VX4, res2
    addi.d  I, I, -1
    blt $r0, I, .L21
    b .L996
@@ -124,12 +131,8 @@
 
 .L996:
    vfadd.d res1, res1, res2
-   vreplvei.w VX1, res1, 1
-   vreplvei.w VX2, res1, 2
-   vreplvei.w VX3, res1, 3
-   vfadd.s res1, VX1, res1
-   vfadd.s res1, VX2, res1
-   vfadd.s res1, VX3, res1
+   vreplvei.d VX1, res1, 1
+   vfadd.d res1, VX1, res1
    .align 3
 
 .L997:
@@ -138,7 +141,7 @@
    .align 3
 
 .L998:
-   fld.s $f15, X, 0 * SIZE
+   fld.s $f15, X, 0
    addi.d I, I, -1
    fcvt.d.s $f15, $f15
    fmadd.d $f19, $f15, $f15, $f19
diff --git a/kernel/loongarch64/ssum_lasx.S b/kernel/loongarch64/ssum_lasx.S
deleted file mode 100644
index 7cf57bc77..000000000
--- a/kernel/loongarch64/ssum_lasx.S
+++ /dev/null
@@ -1,140 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r15
-#define t2     $r12
-#define t3     $r13
-#define t4     $r14
-#define VX0    $xr12
-#define VX1    $xr13
-#define VX2    $xr14
-#define VX3    $xr15
-#define res1   $xr16
-#define res2   $xr17
-    PROLOGUE
-    xvxor.v res1, res1, res1
-    xvxor.v res2, res2, res2
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d  TEMP, SIZE
-    slli.d  INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bge $r0, I, .L13
-    .align 3
-
-.L11:
-    xvld VX0, X, 0 * SIZE
-    xvfadd.s res1, VX0, res1
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    .align 3
-
-.L12:
-    xvfadd.s res2, res1, res2
-    xvpickve.w VX1, res1, 1
-    xvpickve.w VX2, res1, 2
-    xvpickve.w VX3, res1, 3
-    xvfadd.s res1, VX1, res1
-    xvfadd.s res1, VX2, res1
-    xvfadd.s res1, VX3, res1
-    xvpickve.w VX0, res2, 4
-    xvpickve.w VX1, res2, 5
-    xvpickve.w VX2, res2, 6
-    xvpickve.w VX3, res2, 7
-    xvfadd.s res1, VX0, res1
-    xvfadd.s res1, VX1, res1
-    xvfadd.s res1, VX2, res1
-    xvfadd.s res1, VX2, res1
-    .align 3
-
-.L13:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L14:
-    fld.s $f12, X, 0 * SIZE
-    fadd.s $f16, $f12, $f16
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    b .L999
-    .align 3
-
-.L20:
-    bge $r0, I, .L23
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    xvfadd.s res1, VX0, res1
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvfadd.s res2, res1, res2
-    xvpickve.w VX1, res1, 1
-    xvpickve.w VX2, res1, 2
-    xvpickve.w VX3, res1, 3
-    xvfadd.s res1, VX1, res1
-    xvfadd.s res1, VX2, res1
-    xvfadd.s res1, VX3, res1
-    xvpickve.w VX0, res2, 4
-    xvpickve.w VX1, res2, 5
-    xvpickve.w VX2, res2, 6
-    xvpickve.w VX3, res2, 7
-    xvfadd.s res1, VX0, res1
-    xvfadd.s res1, VX1, res1
-    xvfadd.s res1, VX2, res1
-    xvfadd.s res1, VX2, res1
-    .align 3
-
-.L23:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    fld.s $f12, X, 0 * SIZE
-    fadd.s $f16, $f12, $f16
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.s $f0, $f16
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ssum_lsx.S b/kernel/loongarch64/ssum_lsx.S
deleted file mode 100644
index de63c69e3..000000000
--- a/kernel/loongarch64/ssum_lsx.S
+++ /dev/null
@@ -1,125 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r15
-#define t2     $r12
-#define t3     $r13
-#define t4     $r14
-#define VX0    $vr12
-#define VX1    $vr13
-#define VX2    $vr14
-#define VX3    $vr15
-#define res1   $vr16
-#define res2   $vr17
-    PROLOGUE
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d  TEMP, SIZE
-    slli.d  INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bge $r0, I, .L13
-    .align 3
-
-.L11:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vfadd.s res2, VX0, VX1
-    vfadd.s res1, res1, res2
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    .align 3
-
-.L12:
-    vreplvei.w VX1, res1, 1
-    vreplvei.w VX2, res1, 2
-    vreplvei.w VX3, res1, 3
-    vfadd.s res1, VX1, res1
-    vfadd.s res1, VX2, res1
-    vfadd.s res1, VX3, res1
-    .align 3
-
-.L13:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L14:
-    fld.s $f12, X, 0 * SIZE
-    fadd.s $f16, $f12, $f16
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    b .L999
-    .align 3
-
-.L20:
-    bge $r0, I, .L23
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vfadd.s res2, VX0, VX1
-    vfadd.s res1, res1, res2
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.w VX1, res1, 1
-    vreplvei.w VX2, res1, 2
-    vreplvei.w VX3, res1, 3
-    vfadd.s res1, VX1, res1
-    vfadd.s res1, VX2, res1
-    vfadd.s res1, VX3, res1
-    .align 3
-
-.L23:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    fld.s $f12, X, 0 * SIZE
-    fadd.s $f16, $f12, $f16
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.s $f0, $f16
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/sswap_lasx.S b/kernel/loongarch64/sswap_lasx.S
deleted file mode 100644
index 7184eff45..000000000
--- a/kernel/loongarch64/sswap_lasx.S
+++ /dev/null
@@ -1,286 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r7
-#define INCX   $r8
-#define Y      $r9
-#define INCY   $r10
-
-#define I      $r17
-#define TEMP   $r18
-#define XX     $r5
-#define YY     $r6
-#define t1     $r14
-#define t2     $r15
-#define t3     $r16
-#define t4     $r19
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define b1     $f16
-#define b2     $f17
-#define b3     $f18
-#define b4     $f19
-#define VX0    $xr12
-#define VX1    $xr13
-#define VX2    $xr14
-#define VX3    $xr15
-
-
-    PROLOGUE
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L112
-    .align 3
-
-.L111:
-    xvld VX0, X, 0 * SIZE
-    xvld VX2, Y, 0 * SIZE
-    addi.d  I, I, -1
-    xvst VX2, X, 0 * SIZE
-    xvst VX0, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    .align 3
-
-.L112:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L113:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    fst.s $f14, X, 0 * SIZE
-    addi.d  X, X, SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L113
-    b .L999
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L122
-    .align 3
-
-.L121:
-    xvld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 1
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 2
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 3
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    ld.w t1, Y, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 4
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 5
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 6
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvstelm.w VX0, Y, 0, 7
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvst VX2, X, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    fst.s $f14, X, 0 * SIZE
-    addi.d  X, X, SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    xvld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    xvstelm.w VX2, X, 0, 0
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    xvstelm.w VX2, X, 0, 1
-    add.d X, X, INCY
-    ld.w t3, X, 0 * SIZE
-    xvstelm.w VX2, X, 0, 2
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvstelm.w VX2, X, 0, 3
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    ld.w t1, X, 0 * SIZE
-    xvstelm.w VX2, X, 0, 4
-    add.d X, X, INCY
-    ld.w t2, X, 0 * SIZE
-    xvstelm.w VX2, X, 0, 5
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    xvstelm.w VX2, X, 0, 6
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvstelm.w VX2, X, 0, 7
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    add.d X, X, INCX
-    xvst VX1, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    fst.s $f14, X, 0 * SIZE
-    add.d  X, X, INCX
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bge $r0, I, .L223
-    .align 3
-    move XX, X
-
-.L222:
-    fld.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s b1, Y, 0 * SIZE
-    fst.s a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s b2, Y, 0 * SIZE
-    fst.s a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s b3, Y, 0 * SIZE
-    fst.s a3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s b4, Y, 0 * SIZE
-    fst.s a4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s b1, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.s b1, Y, 0 * SIZE
-    fst.s a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s b2, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.s b2, Y, 0 * SIZE
-    fst.s a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s b3, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.s b3, Y, 0 * SIZE
-    fst.s a3, Y, 0 * SIZE
-    fld.s a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s b4, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.s b4, Y, 0 * SIZE
-    fst.s a4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s b1, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.s b2, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.s b3, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.s b4, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    fst.s $f14, X, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/sswap_lsx.S b/kernel/loongarch64/sswap_lsx.S
deleted file mode 100644
index 4f19a8024..000000000
--- a/kernel/loongarch64/sswap_lsx.S
+++ /dev/null
@@ -1,294 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r7
-#define INCX   $r8
-#define Y      $r9
-#define INCY   $r10
-
-#define I      $r17
-#define TEMP   $r18
-#define XX     $r5
-#define YY     $r6
-#define t1     $r14
-#define t2     $r15
-#define t3     $r16
-#define t4     $r19
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define b1     $f16
-#define b2     $f17
-#define b3     $f18
-#define b4     $f19
-#define VX0    $vr12
-#define VX1    $vr13
-#define VX2    $vr14
-#define VX3    $vr15
-
-
-    PROLOGUE
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L112
-    .align 3
-
-.L111:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX3, Y, 4 * SIZE
-    addi.d  I, I, -1
-    vst VX2, X, 0 * SIZE
-    vst VX3, X, 4 * SIZE
-    vst VX0, Y, 0 * SIZE
-    vst VX1, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    .align 3
-
-.L112:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L113:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    fst.s $f14, X, 0 * SIZE
-    addi.d  X, X, SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L113
-    b .L999
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L122
-    .align 3
-
-.L121:
-    vld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    vstelm.w VX0, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    vstelm.w VX0, Y, 0, 1
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    vstelm.w VX0, Y, 0, 2
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vstelm.w VX0, Y, 0, 3
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vst VX2, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    vstelm.w VX1, Y, 0, 0
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    vstelm.w VX1, Y, 0, 1
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    vstelm.w VX1, Y, 0, 2
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vstelm.w VX1, Y, 0, 3
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vst VX3, X, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    fst.s $f14, X, 0 * SIZE
-    addi.d  X, X, SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:// INCX!=1 and INCY==1
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    vld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    vstelm.w VX2, X, 0, 0
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    vstelm.w VX2, X, 0, 1
-    add.d X, X, INCY
-    ld.w t3, X, 0 * SIZE
-    vstelm.w VX2, X, 0, 2
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vstelm.w VX2, X, 0, 3
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    vst VX0, Y, 0 * SIZE
-    vld VX3, Y, 4 * SIZE
-    ld.w t1, X, 0 * SIZE
-    vstelm.w VX3, X, 0, 0
-    add.d X, X, INCY
-    ld.w t2, X, 0 * SIZE
-    vstelm.w VX3, X, 0, 1
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    vstelm.w VX3, X, 0, 2
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vstelm.w VX3, X, 0, 3
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    add.d X, X, INCX
-    vst VX1, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    fst.s $f14, X, 0 * SIZE
-    add.d  X, X, INCX
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bge $r0, I, .L223
-    .align 3
-    move XX, X
-
-.L222:
-    fld.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fld.s b1, Y, 0 * SIZE
-    fst.s a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s b2, Y, 0 * SIZE
-    fst.s a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s b3, Y, 0 * SIZE
-    fst.s a3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s b4, Y, 0 * SIZE
-    fst.s a4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s b1, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.s b1, Y, 0 * SIZE
-    fst.s a1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s a2, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s b2, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.s b2, Y, 0 * SIZE
-    fst.s a2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fld.s a3, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s b3, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.s b3, Y, 0 * SIZE
-    fst.s a3, Y, 0 * SIZE
-    fld.s a4, X, 0 * SIZE
-    add.d X, X, INCX
-    fst.s b4, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fld.s b4, Y, 0 * SIZE
-    fst.s a4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    fst.s b1, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.s b2, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.s b3, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    fst.s b4, XX, 0 * SIZE
-    add.d XX, XX, INCX
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fst.s $f12, Y, 0 * SIZE
-    fst.s $f14, X, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S
new file mode 100644
index 000000000..fd6d5adb3
--- /dev/null
+++ b/kernel/loongarch64/sum_lasx.S
@@ -0,0 +1,225 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define res1   $xr16
+#define res2   $xr17
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    xvld VX0, X, 0
+    xvfadd.s res1, res1, VX0
+#ifdef DOUBLE
+    xvld VX1, X, 32
+    xvfadd.s res1, res1, VX1
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD  $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfadd.s res1, VX0, res1
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD  $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S
new file mode 100644
index 000000000..6b2027781
--- /dev/null
+++ b/kernel/loongarch64/sum_lsx.S
@@ -0,0 +1,204 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16
+#define res2   $vr17
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+    vld VX0, X, 0
+    vld VX1, X, 16
+    VFADD res2, VX0, VX1
+    VFADD res1, res1, res2
+#ifdef DOUBLE
+    vld VX0, X, 32
+    vld VX1, X, 48
+    VFADD res2, VX0, VX1
+    VFADD res1, res1, res2
+#endif
+    addi.d  X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d   res1, res1, VX1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD   $f12, X, 0
+    ADD  $f16, $f12, $f16
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD  $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt  $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S
new file mode 100644
index 000000000..4767fffe3
--- /dev/null
+++ b/kernel/loongarch64/swap_lasx.S
@@ -0,0 +1,401 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+/* INCX==1 and INCY==1 */
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0
+    xvld VX2, Y, 0
+    addi.d  I, I, -1
+    xvst VX2, X, 0
+    xvst VX0, Y, 0
+#ifdef DOUBLE
+    xvld VX0, X, 32
+    xvld VX2, Y, 32
+    xvst VX2, X, 32
+    xvst VX0, Y, 32
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+/* INCX==1 and INCY!=1 */
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    ld.d t1, Y, 0
+    xvstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0
+    xvstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0
+    xvstelm.d VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0
+    xvstelm.d VX0, Y, 0, 3
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvst VX2, X, 0
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0
+    xvstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0
+    xvstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0
+    xvstelm.d VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0
+    xvstelm.d VX1, Y, 0, 3
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    xvld VX0, X, 0
+    ld.w t1, Y, 0
+    xvstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0
+    xvstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0
+    xvstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0
+    xvstelm.w VX0, Y, 0, 3
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0
+    xvstelm.w VX0, Y, 0, 4
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0
+    xvstelm.w VX0, Y, 0, 5
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0
+    xvstelm.w VX0, Y, 0, 6
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0
+    xvstelm.w VX0, Y, 0, 7
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvst VX2, X, 0
+    addi.d X, X, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    addi.d  X, X, SIZE
+    add.d   Y, Y, INCY
+    blt   $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    xvld VX2, Y, 0
+    ld.d t1, X, 0
+    xvstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    xvstelm.d VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    xvstelm.d VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    xvstelm.d VX2, X, 0, 3
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvst VX0, Y, 0
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0
+    xvstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    xvstelm.d VX3, X, 0, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    xvstelm.d VX3, X, 0, 2
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    xvstelm.d VX3, X, 0, 3
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvst VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX2, Y, 0
+    ld.w t1, X, 0
+    xvstelm.w VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    xvstelm.w VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    xvstelm.w VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    xvstelm.w VX2, X, 0, 3
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0
+    xvstelm.w VX2, X, 0, 4
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    xvstelm.w VX2, X, 0, 5
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    xvstelm.w VX2, X, 0, 6
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    xvstelm.w VX2, X, 0, 7
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvst VX0, Y, 0
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    add.d  X, X, INCX
+    addi.d Y, Y, SIZE
+    blt  $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    LD    a1, X, 0
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    LD    b1, Y, 0
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    LD    b2, Y, 0
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    LD    b3, Y, 0
+    ST    a3, Y, 0
+    add.d Y, Y, INCY
+    LD    b4, Y, 0
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    LD    a1, X, 0
+    add.d X, X, INCX
+    ST    b1, XX, 0
+    add.d XX, XX, INCX
+    LD    b1, Y, 0
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    LD    a2, X, 0
+    add.d X, X, INCX
+    ST    b2, XX, 0
+    add.d XX, XX, INCX
+    LD    b2, Y, 0
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    LD    a3, X, 0
+    add.d X, X, INCX
+    ST    b3, XX, 0
+    add.d XX, XX, INCX
+    LD    b3, Y, 0
+    ST    a3, Y, 0
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    b4, XX, 0
+    add.d XX, XX, INCX
+    LD    b4, Y, 0
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    ST    b1, XX, 0
+    add.d XX, XX, INCX
+    ST    b2, XX, 0
+    add.d XX, XX, INCX
+    ST    b3, XX, 0
+    add.d XX, XX, INCX
+    ST    b4, XX, 0
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt  $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S
new file mode 100644
index 000000000..736187f93
--- /dev/null
+++ b/kernel/loongarch64/swap_lsx.S
@@ -0,0 +1,431 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+/* INCX==1 and incy==1 */
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, Y, 0
+    vld VX3, Y, 16
+    addi.d  I, I, -1
+    vst VX2, X, 0
+    vst VX3, X, 16
+    vst VX0, Y, 0
+    vst VX1, Y, 16
+#ifdef DOUBLE
+    vld VX0, X, 32
+    vld VX1, X, 48
+    vld VX2, Y, 32
+    vld VX3, Y, 48
+    vst VX2, X, 32
+    vst VX3, X, 48
+    vst VX0, Y, 32
+    vst VX1, Y, 48
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+#ifdef DOUBLE
+    fld.d $f12, X, 0
+    fld.d $f14, Y, 0
+    addi.d I, I, -1
+    fst.d $f12, Y, 0
+    fst.d $f14, X, 0
+#else
+    fld.s $f12, X, 0
+    fld.s $f14, Y, 0
+    addi.d I, I, -1
+    fst.s $f12, Y, 0
+    fst.s $f14, X, 0
+#endif
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+/* INCX==1 and INCY!=1 */
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    vld VX0, X, 0
+    ld.d t1, Y, 0
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0
+    vstelm.d VX0, Y, 0, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 0
+    vld VX1, X, 2 * SIZE
+    ld.d t3, Y, 0
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0
+    vstelm.d VX1, Y, 0, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    ld.d t1, Y, 0
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0
+    vstelm.d VX0, Y, 0, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0
+    vstelm.d VX1, Y, 0, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    vld VX0, X, 0
+    ld.w t1, Y, 0
+    vstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0
+    vstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0
+    vstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0
+    vstelm.w VX0, Y, 0, 3
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vst VX2, X, 0
+
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0
+    vstelm.w VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0
+    vstelm.w VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0
+    vstelm.w VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0
+    vstelm.w VX1, Y, 0, 3
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    addi.d X, X, SIZE
+    add.d  Y, Y, INCY
+    blt  $r0, I, .L123
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1 */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    vld VX2, Y, 0
+    ld.d t1, X, 0
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 0
+    vld VX3, Y, 2 * SIZE
+    ld.d t3, X, 0
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE
+    ld.d t1, X, 0
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst  VX1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    vld VX2, Y, 0
+    ld.w t1, X, 0
+    vstelm.w VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    vstelm.w VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    vstelm.w VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    vstelm.w VX2, X, 0, 3
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vst VX0, Y, 0
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0
+    vstelm.w VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    vstelm.w VX3, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    vstelm.w VX3, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    vstelm.w VX3, X, 0, 3
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vst  VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    ST  $f12, Y, 0 * SIZE
+    ST  $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d Y, Y, SIZE
+    blt  $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    LD    a1, X, 0
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    LD    b1, Y, 0
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    LD    b2, Y, 0
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    LD    b3, Y, 0
+    ST    a3, Y, 0
+    add.d Y, Y, INCY
+    LD    b4, Y, 0
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    LD    a1, X, 0
+    add.d X, X, INCX
+    ST    b1, XX, 0
+    add.d XX, XX, INCX
+    LD    b1, Y, 0
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    LD    a2, X, 0
+    add.d X, X, INCX
+    ST    b2, XX, 0
+    add.d XX, XX, INCX
+    LD    b2, Y, 0
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    LD    a3, X, 0
+    add.d X, X, INCX
+    ST    b3, XX, 0
+    add.d XX, XX, INCX
+    LD    b3, Y, 0
+    ST    a3, Y, 0
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    b4, XX, 0
+    add.d XX, XX, INCX
+    LD    b4, Y, 0
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    ST    b1, XX, 0
+    add.d XX, XX, INCX
+    ST    b2, XX, 0
+    add.d XX, XX, INCX
+    ST    b3, XX, 0
+    add.d XX, XX, INCX
+    ST    b4, XX, 0
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From 154baad454647fdd6d71e2c907285859718da22e Mon Sep 17 00:00:00 2001
From: zhoupeng <zhoupeng@loongson.cn>
Date: Wed, 27 Dec 2023 16:04:33 +0800
Subject: [PATCH 514/718] loongarch64: Refine iamin optimization.

---
 common_loongarch64.h                          |  10 +
 kernel/loongarch64/KERNEL.LOONGSON2K1000      |   4 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5         |   4 +-
 .../{isamin_lasx.S => iamin_lasx.S}           | 270 +++++++----
 kernel/loongarch64/iamin_lsx.S                | 446 ++++++++++++++++++
 kernel/loongarch64/idamin_lasx.S              | 275 -----------
 kernel/loongarch64/idamin_lsx.S               | 228 ---------
 kernel/loongarch64/isamin_lsx.S               | 275 -----------
 8 files changed, 649 insertions(+), 863 deletions(-)
 rename kernel/loongarch64/{isamin_lasx.S => iamin_lasx.S} (54%)
 create mode 100644 kernel/loongarch64/iamin_lsx.S
 delete mode 100644 kernel/loongarch64/idamin_lasx.S
 delete mode 100644 kernel/loongarch64/idamin_lsx.S
 delete mode 100644 kernel/loongarch64/isamin_lsx.S

diff --git a/common_loongarch64.h b/common_loongarch64.h
index 72e900f77..846fc0dbd 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -119,6 +119,7 @@ static inline int WhereAmI(void){
 #define MOV     fmov.d
 #define CMOVT   fsel
 #define MTC     movgr2fr.d
+#define MTG     movfr2gr.d
 #define FABS    fabs.d
 #define FMIN    fmin.d
 #define FMINA   fmina.d
@@ -136,6 +137,8 @@ static inline int WhereAmI(void){
 #define XVFMINA xvfmina.d
 #define XVFMAX  xvfmax.d
 #define XVFMAXA xvfmaxa.d
+#define XVCMPEQ xvfcmp.ceq.d
+#define XVCMPLT xvfcmp.clt.d
 
 #define VFSUB  vfsub.d
 #define VFADD  vfadd.d
@@ -144,6 +147,8 @@ static inline int WhereAmI(void){
 #define VFMINA vfmina.d
 #define VFMAX  vfmax.d
 #define VFMAXA vfmaxa.d
+#define VCMPEQ vfcmp.ceq.d
+#define VCMPLT vfcmp.clt.d
 
 #else
 
@@ -159,6 +164,7 @@ static inline int WhereAmI(void){
 #define MOV     fmov.s
 #define CMOVT   fsel
 #define MTC     movgr2fr.w
+#define MTG     movfr2gr.s
 #define FABS    fabs.s
 #define FMIN    fmin.s
 #define FMINA   fmina.s
@@ -176,6 +182,8 @@ static inline int WhereAmI(void){
 #define XVFMINA xvfmina.s
 #define XVFMAX  xvfmax.s
 #define XVFMAXA xvfmaxa.s
+#define XVCMPEQ xvfcmp.ceq.s
+#define XVCMPLT xvfcmp.clt.s
 
 #define VFSUB  vfsub.s
 #define VFADD  vfadd.s
@@ -184,6 +192,8 @@ static inline int WhereAmI(void){
 #define VFMINA vfmina.s
 #define VFMAX  vfmax.s
 #define VFMAXA vfmaxa.s
+#define VCMPEQ vfcmp.ceq.s
+#define VCMPLT vfcmp.clt.s
 
 #endif /* defined(DOUBLE) */
 
diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index cb230b348..4eae2e4f9 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -28,8 +28,8 @@ IDMINKERNEL =  idmin_lsx.S
 ISAMAXKERNEL = isamax_lsx.S
 IDAMAXKERNEL = idamax_lsx.S
 
-ISAMINKERNEL = isamin_lsx.S
-IDAMINKERNEL = idamin_lsx.S
+ISAMINKERNEL = iamin_lsx.S
+IDAMINKERNEL = iamin_lsx.S
 
 SCOPYKERNEL =  copy_lsx.S
 DCOPYKERNEL =  copy_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index ba59c4566..e7e1b5d5a 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -28,8 +28,8 @@ IDMINKERNEL =  idmin_lasx.S
 ISAMAXKERNEL = isamax_lasx.S
 IDAMAXKERNEL = idamax_lasx.S
 
-ISAMINKERNEL = isamin_lasx.S
-IDAMINKERNEL = idamin_lasx.S
+ISAMINKERNEL = iamin_lasx.S
+IDAMINKERNEL = iamin_lasx.S
 
 SCOPYKERNEL =  copy_lasx.S
 DCOPYKERNEL =  copy_lasx.S
diff --git a/kernel/loongarch64/isamin_lasx.S b/kernel/loongarch64/iamin_lasx.S
similarity index 54%
rename from kernel/loongarch64/isamin_lasx.S
rename to kernel/loongarch64/iamin_lasx.S
index cbdf32530..6ea117907 100644
--- a/kernel/loongarch64/isamin_lasx.S
+++ b/kernel/loongarch64/iamin_lasx.S
@@ -1,3 +1,30 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
 #define ASSEMBLER
 
 #include "common.h"
@@ -39,6 +66,31 @@
     slli.d INCX, INCX, BASE_SHIFT
     bne INCX, TEMP, .L20
     xvld VM0, X, 0
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
     addi.w i0, i0, 1
     srai.d I, N, 3
     bge $r0, I, .L21
@@ -76,25 +128,49 @@
     xvinsgr2vr.w VI0, i0, 6 //7
     addi.w i0, i0, 1
     xvinsgr2vr.w VI0, i0, 7 //8
+#endif
     .align 3
 
 .L10:
     xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    xvfmina.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmina.d VM1, VM0, VM1
+#else
     addi.d I, I, -1
-    xvadd.w VI1, VI1, VINC8
+    xvadd.w VI2, VI1, VINC8
     xvfmina.s VM1, VX0, VM0
-    xvfcmp.ceq.s VT0, VM0, VM1
+#endif
+    XVCMPEQ VT0, VM0, VM1
     addi.d X, X, 8 * SIZE
     xvbitsel.v VM0, VM1, VM0, VT0
-    xvbitsel.v VI0, VI1, VI0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
     blt $r0, I, .L10
     .align 3
 
 .L15:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmina.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, x1, VM1
+#else
     xvxor.v VX0, VX0, VX0
-    xvor.v VX0, VI0, VX0
+    xvor.v  VX0, VI0, VX0
     xvxor.v VX1, VX1, VX1
-    xvor.v VX1, VM0, VX1
+    xvor.v  VX1, VM0, VX1
     xvpickve.w VI1, VI0, 0
     xvpickve.w VI2, VI0, 1
     xvpickve.w VI3, VI0, 2
@@ -105,28 +181,62 @@
     xvpickve.w x4, VM0, 3
     xvfmina.s VM1, x1, x2
     xvfcmp.ceq.s VT0, x1, VM1
+#endif
     xvbitsel.v VINC4, VI2, VI1, VT0
-    xvfmina.s VM0, x3, x4
-    xvfcmp.ceq.s VT0, x3, VM0
+    XVFMINA VM0, x4, x3
+    XVCMPEQ VT0, x3, VM0
     xvbitsel.v VINC8, VI4, VI3, VT0
-    xvfmina.s VM0, VM0, VM1
-    xvfcmp.ceq.s VT0, VM0, VM1
+    XVFMINA VM0, VM0, VM1
+    XVCMPEQ VT0, VM0, VM1
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f9
     bceqz $fcc0, .L26
-    xvfcmp.clt.s VT0, VI1, VI0
+    XVCMPLT VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     b .L26
     .align 3
 
 .L20: // INCX!=1
     move TEMP, X
-    addi.w i0, i0, 1
-    ld.w t1, TEMP, 0 * SIZE
+#ifdef DOUBLE
+    addi.d  i0,   i0, 1
+    ld.d    t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w  i0, i0, 1
+    ld.w    t1, TEMP, 0 * SIZE
     add.d TEMP, TEMP, INCX
     xvinsgr2vr.w VM0, t1, 0
     srai.d I, N, 3
@@ -186,9 +296,43 @@
     xvinsgr2vr.w VI0, i0, 6 //7
     addi.w i0, i0, 1
     xvinsgr2vr.w VI0, i0, 7 //8
+#endif
     .align 3
 
 .L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI2, VI1, VINC4
+    xvfmina.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmina.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+#else
     ld.w t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.w t2, X, 0 * SIZE
@@ -213,73 +357,42 @@
     xvinsgr2vr.w VX0, t2, 5
     xvinsgr2vr.w VX0, t3, 6
     xvinsgr2vr.w VX0, t4, 7
-    xvadd.w VI1, VI1, VINC8
+    xvadd.w VI2, VI1, VINC8
     xvfmina.s VM1, VX0, VM0
     xvfcmp.ceq.s VT0, VM1, VM0
+#endif
     addi.d I, I, -1
     xvbitsel.v VM0, VM1, VM0, VT0
-    xvbitsel.v VI0, VI1, VI0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
     blt $r0, I, .L24
     .align 3
 
-.L25:
-    xvxor.v VX0, VX0, VX0
-    xvor.v VX0, VI0, VX0
-    xvxor.v VX1, VX1, VX1
-    xvor.v VX1, VM0, VX1
-    xvpickve.w VI1, VI0, 0
-    xvpickve.w VI2, VI0, 1
-    xvpickve.w VI3, VI0, 2
-    xvpickve.w VI4, VI0, 3
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvfmina.s VM1, x1, x2
-    xvfcmp.ceq.s VT0, x1, VM1
-    xvbitsel.v VINC4, VI2, VI1, VT0
-    xvfmina.s VM0, x3, x4
-    xvfcmp.ceq.s VT0, x3, VM0
-    xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfmina.s VM0, VM0, VM1
-    xvfcmp.ceq.s VT0, VM0, VM1
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.s VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
 .L26:
-    xvfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f10
     bceqz $fcc0, .L27
-    xvfcmp.clt.s VT0, VI2, VI0
+    XVCMPLT VT0, VI2, VI0
     xvbitsel.v VI0, VI0, VI2, VT0
     .align 3
 
 .L27:
-    xvfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f11
     bceqz $fcc0, .L28
-    xvfcmp.clt.s VT0, VI3, VI0
+    XVCMPLT VT0, VI3, VI0
     xvbitsel.v VI0, VI0, VI3, VT0
     .align 3
 
 .L28:
-    xvfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f12
     bceqz $fcc0, .L29
-    xvfcmp.clt.s VT0, VI4, VI0
+    XVCMPLT VT0, VI4, VI0
     xvbitsel.v VI0, VI0, VI4, VT0
     .align 3
 
 .L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20
+    .align 3
+#else
     fmov.s $f16, $f20
     .align 3
 
@@ -306,35 +419,28 @@
     xvfmina.s VM0, VM0, VM1
     xvfcmp.ceq.s VT0, VM0, VM1
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f9
     bceqz $fcc0, .L262
     xvfcmp.clt.s VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     .align 3
 
 .L262:
-    xvfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f10
     bceqz $fcc0, .L272
     xvfcmp.clt.s VT0, VI2, VI0
     xvbitsel.v VI0, VI0, VI2, VT0
     .align 3
 
 .L272:
-    xvfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f11
     bceqz $fcc0, .L282
     xvfcmp.clt.s VT0, VI3, VI0
     xvbitsel.v VI0, VI0, VI3, VT0
     .align 3
 
 .L282:
-    xvfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f12
     bceqz $fcc0, .L292
     xvfcmp.clt.s VT0, VI4, VI0
     xvbitsel.v VI0, VI0, VI4, VT0
@@ -346,9 +452,11 @@
     xvbitsel.v VI0, VI0, VI1, VT0
     movfr2gr.s i0, $f20
 
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
+#endif
+
+.L21: // N<8
+    andi    I, N, 7
+    bge   $r0, I, .L999
     srai.d i1, N, 3
     slli.d i1, i1, 3
     addi.d i1, i1, 1 //current index
@@ -357,17 +465,17 @@
     .align 3
 
 .L22:
-    fld.s $f9, X, 0
+    LD $f9, X, 0
     addi.d I, I, -1
-    xvfmina.s VM1, x1, VM0
-    xvfcmp.ceq.s VT0, VM0, VM1
-    add.d  X, X, INCX
+    XVFMINA    VM1,  x1, VM0
+    XVCMPEQ    VT0, VM0, VM1
+    add.d        X,   X, INCX
     xvbitsel.v VM0, VM1, VM0, VT0
     xvbitsel.v VI0, VI1, VI0, VT0
     addi.d i1, i1, 1
     movgr2fr.d $f21, i1
     blt $r0, I, .L22
-    movfr2gr.s i0, $f20
+    MTG i0, $f20
     .align 3
 
 .L999:
@@ -375,4 +483,4 @@
     jirl $r0, $r1, 0x0
     .align 3
 
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S
new file mode 100644
index 000000000..ce885fd88
--- /dev/null
+++ b/kernel/loongarch64/iamin_lsx.S
@@ -0,0 +1,446 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#ifdef DOUBLE
+#define VINC2 $vr17
+#define VINC4 $vr18
+#else
+#define VINC4 $vr17
+#define VINC8 $vr18
+#endif
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+#ifdef DOUBLE
+    addi.d  i0, i0, 1
+    srai.d   I, N, 3
+    bge $r0, I, .L21
+
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    srai.d  I, N, 3
+    bge   $r0, I, .L21
+
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, VINC2
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmina.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    addi.d I, I, -1
+    vbitsel.v x2, x4, x2, VT0
+    vfmina.d VM1, VM0, x3
+#else
+    vadd.w VI1, VI1, VINC8
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, VINC4
+    vfmina.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    addi.d I, I, -1
+    vbitsel.v x2, VI2, VI1, VT0
+    vfmina.s VM1, VM0, VM1
+#endif
+    VCMPEQ VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d  x1, VM0, 0
+    vreplvei.d  x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v   VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w  x1, VM0, 0
+    vreplvei.w  x2, VM0, 1
+    vreplvei.w  x3, VM0, 2
+    vreplvei.w  x4, VM0, 3
+    vfmina.s   VM1, x1, x2
+    vfcmp.ceq.s  VT0, VM1, x1
+    vbitsel.v  VINC4, VI2, VI1, VT0
+    vfmina.s     VM0,  x3, x4
+    vfcmp.ceq.s  VT0,  x3, VM0
+    vbitsel.v  VINC8, VI4, VI3, VT0
+    vfmina.s     VM0, VM0, VM1
+    vfcmp.ceq.s  VT0, VM0, VM1
+    vbitsel.v    VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v   VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d    i0,   i0, 1
+    ld.d      t1, TEMP, 0 * SIZE
+    add.d   TEMP, TEMP, INCX
+    vinsgr2vr.d    VM0, t1, 0
+    srai.d     I,    N, 3
+    bge $r0,   I, .L21
+    ld.d      t2, TEMP, 0 * SIZE
+    add.d   TEMP, TEMP, INCX
+
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w    i0,   i0, 1
+    ld.w      t1, TEMP, 0 * SIZE
+    add.d   TEMP, TEMP, INCX
+    vinsgr2vr.w    VM0, t1, 0
+    srai.d     I,    N, 3
+    bge $r0,   I, .L21
+    ld.w      t2, TEMP, 0 * SIZE
+    add.d   TEMP, TEMP, INCX
+
+    vreplvei.d VI1, VI0, 0
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d  t1, X, 0 * SIZE
+    add.d X,  X, INCX
+    ld.d  t2, X, 0 * SIZE
+    add.d X,  X, INCX
+
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmina.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    addi.d I, I, -1
+    vbitsel.v x2, x4, x2, VT0
+    vfmina.d VM1, VM0, x3
+    vbitsel.v VM0, VM1, VM0, VT0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VI0, x2, VI0, VT0
+#else
+    ld.w  t1, X, 0 * SIZE
+    add.d X,  X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X,  X, INCX
+
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC4
+    vfmina.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmina.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d  x1, VM0, 0
+    vreplvei.d  x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+.L26:
+#ifdef DOUBLE
+    vfmina.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+#endif
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD $f9, X, 0
+    addi.d I, I, -1
+    VFMINA VM1, x1, VM0
+    VCMPEQ VT0, VM0, VM1
+    add.d  X, X, INCX
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    MTC $f21, i1
+    blt $r0, I, .L22
+    movfr2gr.s i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/idamin_lasx.S b/kernel/loongarch64/idamin_lasx.S
deleted file mode 100644
index 6ef1e8903..000000000
--- a/kernel/loongarch64/idamin_lasx.S
+++ /dev/null
@@ -1,275 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define VX0 $xr13
-#define VX1 $xr14
-#define VM0 $xr15
-#define VM1 $xr16
-#define VINC4 $xr17
-#define VINC8 $xr18
-#define VI0 $xr20
-#define VI1 $xr21
-#define VI2 $xr22
-#define VI3 $xr8
-#define VI4 $xr19
-#define VT0 $xr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    addi.d i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.d i0, i0, 2 //4
-    xvreplgr2vr.d VINC4, i0
-    slli.d i0, i0, 1 //8
-    xvreplgr2vr.d VINC8, i0
-    addi.d i0, i0, -15
-    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 3
-    addi.d i0, i0, 5
-    xvinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 1 //2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 2 //3
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 3 //4
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    xvadd.d VI1, VI1, VINC8
-    xvld VX1, X, 4 * SIZE
-    xvadd.d VI2, VI1, VINC4
-    xvfmina.d VM1, VX0, VX1
-    xvfcmp.ceq.d VT0, VX0, VM1
-    addi.d I, I, -1
-    xvbitsel.v VI2, VI2, VI1, VT0
-    xvfmina.d VM1, VM0, VM1
-    xvfcmp.ceq.d VT0, VM0, VM1
-    addi.d X, X, 8 * SIZE
-    xvbitsel.v VM0, VM1, VM0, VT0
-    xvbitsel.v VI0, VI2, VI0, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    xvpickve.d VI1, VI0, 0
-    xvpickve.d VI2, VI0, 1
-    xvpickve.d VI3, VI0, 2
-    xvpickve.d VI4, VI0, 3
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmina.d VM1, x1, x2
-    xvfcmp.ceq.d VT0, x1, VM1
-    xvbitsel.v VINC4, VI2, VI1, VT0
-    xvfmina.d VM0, x4, x3
-    xvfcmp.ceq.d VT0, x3, VM0
-    xvbitsel.v VINC8, VI4, VI3, VT0
-    xvfmina.d VM0, VM0, VM1
-    xvfcmp.ceq.d VT0, VM0, VM1
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    xvfcmp.ceq.d VT0, VM0, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.d VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    b .L26
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.d i0, i0, 1
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
-    xvinsgr2vr.d VM0, t3, 2
-    xvinsgr2vr.d VM0, t4, 3
-    slli.d i0, i0, 2 //4
-    xvreplgr2vr.d VINC4, i0
-    slli.d i0, i0, 1 //8
-    xvreplgr2vr.d VINC8, i0
-    addi.d i0, i0, -15
-    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 3
-    addi.d i0, i0, 5
-    xvinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 1 //2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 2 //3
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 3 //4
-    .align 3
-
-.L24:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    xvadd.d VI1, VI1, VINC8
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    xvadd.d VI2, VI1, VINC4
-    xvfmina.d VM1, VX0, VX1
-    xvfcmp.ceq.d VT0, VX0, VM1
-    xvbitsel.v VI2, VI2, VI1, VT0
-    xvfmina.d VM1, VM0, VM1
-    xvfcmp.ceq.d VT0, VM0, VM1
-    addi.d I, I, -1
-    xvbitsel.v VM0, VM1, VM0, VT0
-    xvbitsel.v VI0, VI2, VI0, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    xvpickve.d VI1, VI0, 0
-    xvpickve.d VI2, VI0, 1
-    xvpickve.d VI3, VI0, 2
-    xvpickve.d VI4, VI0, 3
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmina.d VM1, x1, x2
-    xvfcmp.ceq.d VT0, x1, VM1
-    xvbitsel.v VINC4, VI2, VI1, VT0
-    xvfmina.d VM0, x4, x3
-    xvfcmp.ceq.d VT0, x3, VM0
-    xvbitsel.v VINC8, VI4, VI3, VT0
-    xvfmina.d VM0, VM0, VM1
-    xvfcmp.ceq.d VT0, VM0, VM1
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    xvfcmp.ceq.d VT0, VM0, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.d VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
-.L26:
-    xvfcmp.ceq.d VT0, VM0, x2
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L27
-    xvfcmp.clt.d VT0, VI2, VI0
-    xvbitsel.v VI0, VI0, VI2, VT0
-    .align 3
-
-.L27:
-    xvfcmp.ceq.d VT0, VM0, x3
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L28
-    xvfcmp.clt.d VT0, VI3, VI0
-    xvbitsel.v VI0, VI0, VI3, VT0
-    .align 3
-
-.L28:
-    xvfcmp.ceq.d VT0, VM0, x4
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L29
-    xvfcmp.clt.d VT0, VI4, VI0
-    xvbitsel.v VI0, VI0, VI4, VT0
-    .align 3
-
-.L29:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L21: // N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    addi.d I, I, -1
-    xvfmina.d VM1, x1, VM0
-    xvfcmp.ceq.d VT0, VM0, VM1
-    add.d  X, X, INCX
-    xvbitsel.v VM0, VM1, VM0, VT0
-    xvbitsel.v VI0, VI1, VI0, VT0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/idamin_lsx.S b/kernel/loongarch64/idamin_lsx.S
deleted file mode 100644
index 9eb9d883f..000000000
--- a/kernel/loongarch64/idamin_lsx.S
+++ /dev/null
@@ -1,228 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr13
-#define VX1 $vr14
-#define VM0 $vr15
-#define VM1 $vr16
-#define VINC2 $vr17
-#define VINC4 $vr18
-#define VI0 $vr20
-#define VI1 $vr21
-#define VI2 $vr22
-#define VI3 $vr8
-#define VI4 $vr19
-#define VT0 $vr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    addi.d i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.d i0, i0, 1 //2
-    vreplgr2vr.d VINC2, i0
-    slli.d i0, i0, 1 //4
-    vreplgr2vr.d VINC4, i0
-    addi.d i0, i0, -7
-    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 3
-    vinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI0, i0, 1 //2
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vadd.d VI1, VI1, VINC4
-    vld VX1, X, 2 * SIZE
-    vadd.d VI2, VI1, VINC2
-    vfmina.d x1, VX0, VX1
-    vfcmp.ceq.d VT0, VX0, x1
-    vbitsel.v x2, VI2, VI1, VT0
-    vld VX0, X, 4 * SIZE
-    vadd.d VI1, VI2, VINC2
-    vld VX1, X, 6 * SIZE
-    vadd.d VI2, VI1, VINC2
-    vfmina.d x3, VX0, VX1
-    vfcmp.ceq.d VT0, VX0, x3
-    vbitsel.v x4, VI2, VI1, VT0
-    vfmina.d x3, x1, x3
-    vfcmp.ceq.d VT0, x1, x3
-    addi.d I, I, -1
-    vbitsel.v x2, x4, x2, VT0
-    vfmina.d VM1, VM0, x3
-    vfcmp.ceq.d VT0, VM0, VM1
-    addi.d X, X, 8 * SIZE
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, x2, VI0, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    vreplvei.d VI1, VI0, 0
-    vreplvei.d VI2, VI0, 1
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    vfcmp.ceq.d VT0, x2, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.d VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L27
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.d i0, i0, 1
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
-    slli.d i0, i0, 1 //2
-    vreplgr2vr.d VINC2, i0
-    slli.d i0, i0, 1 //4
-    vreplgr2vr.d VINC4, i0
-    addi.d i0, i0, -7
-    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 3
-    vinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI0, i0, 1 //2
-    .align 3
-
-.L24:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    vadd.d VI1, VI1, VINC4
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    vadd.d VI2, VI1, VINC2
-    vfmina.d x1, VX0, VX1
-    vfcmp.ceq.d VT0, VX0, x1
-    vbitsel.v x2, VI2, VI1, VT0
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    vadd.d VI1, VI2, VINC2
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    vadd.d VI2, VI1, VINC2
-    vfmina.d x3, VX0, VX1
-    vfcmp.ceq.d VT0, VX0, x3
-    vbitsel.v x4, VI2, VI1, VT0
-    vfmina.d x3, x1, x3
-    vfcmp.ceq.d VT0, x1, x3
-    addi.d I, I, -1
-    vbitsel.v x2, x4, x2, VT0
-    vfmina.d VM1, VM0, x3
-    vbitsel.v VM0, VM1, VM0, VT0
-    vfcmp.ceq.d VT0, VM0, VM1
-    vbitsel.v VI0, x2, VI0, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    vreplvei.d VI1, VI0, 0
-    vreplvei.d VI2, VI0, 1
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    vfcmp.ceq.d VT0, x2, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.d VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L27
-    .align 3
-
-.L26:
-    vfmina.d VM0, x1, x2
-    vfcmp.ceq.d VT0, x1, VM0
-    vbitsel.v VI0, VI2, VI1, VT0
-    .align 3
-
-.L27:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    addi.d I, I, -1
-    vfmina.d VM1, x1, VM0
-    vfcmp.ceq.d VT0, VM0, VM1
-    add.d  X, X, INCX
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI1, VI0, VT0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/isamin_lsx.S b/kernel/loongarch64/isamin_lsx.S
deleted file mode 100644
index 598888660..000000000
--- a/kernel/loongarch64/isamin_lsx.S
+++ /dev/null
@@ -1,275 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr13
-#define VX1 $vr14
-#define VM0 $vr15
-#define VM1 $vr16
-#define VINC4 $vr17
-#define VINC8 $vr18
-#define VI0 $vr20
-#define VI1 $vr21
-#define VI2 $vr22
-#define VI3 $vr8
-#define VI4 $vr19
-#define VT0 $vr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    addi.w i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.w i0, i0, 2 //4
-    vreplgr2vr.w VINC4, i0
-    slli.w i0, i0, 1 //8
-    vreplgr2vr.w VINC8, i0
-    addi.w i0, i0, -15
-    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 3
-    addi.w i0, i0, 5
-    vinsgr2vr.w VI0, i0, 0 //1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 1 //2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 2 //3
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 3 //4
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vadd.w VI1, VI1, VINC8
-    vld VX1, X, 4 * SIZE
-    vadd.w VI2, VI1, VINC4
-    vfmina.s VM1, VX0, VX1
-    vfcmp.ceq.s VT0, VX0, VM1
-    addi.d I, I, -1
-    vbitsel.v VI2, VI2, VI1, VT0
-    vfmina.s VM1, VM0, VM1
-    vfcmp.ceq.s VT0, VM0, VM1
-    addi.d X, X, 8 * SIZE
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI2, VI0, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    vreplvei.w VI1, VI0, 0
-    vreplvei.w VI2, VI0, 1
-    vreplvei.w VI3, VI0, 2
-    vreplvei.w VI4, VI0, 3
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmina.s VM1, x1, x2
-    vfcmp.ceq.s VT0, VM1, x1
-    vbitsel.v VINC4, VI2, VI1, VT0
-    vfmina.s VM0, x3, x4
-    vfcmp.ceq.s VT0, x3, VM0
-    vbitsel.v VINC8, VI4, VI3, VT0
-    vfmina.s VM0, VM0, VM1
-    vfcmp.ceq.s VT0, VM0, VM1
-    vbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    vfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.s VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L26
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.w i0, i0, 1
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t2, 1
-    vinsgr2vr.w VM0, t3, 2
-    vinsgr2vr.w VM0, t4, 3
-    slli.w i0, i0, 2 //4
-    vreplgr2vr.w VINC4, i0
-    slli.w i0, i0, 1 //8
-    vreplgr2vr.w VINC8, i0
-    addi.w i0, i0, -15
-    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 3
-    addi.w i0, i0, 5
-    vinsgr2vr.w VI0, i0, 0 //1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 1 //2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 2 //3
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 3 //4
-    .align 3
-
-.L24:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    vadd.w VI1, VI1, VINC8
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vadd.w VI2, VI1, VINC4
-    vfmina.s VM1, VX0, VX1
-    vfcmp.ceq.s VT0, VX0, VM1
-    vbitsel.v VI2, VI2, VI1, VT0
-    vfmina.s VM1, VM0, VM1
-    vfcmp.ceq.s VT0, VM0, VM1
-    addi.d I, I, -1
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI2, VI0, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    vreplvei.w VI1, VI0, 0
-    vreplvei.w VI2, VI0, 1
-    vreplvei.w VI3, VI0, 2
-    vreplvei.w VI4, VI0, 3
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmina.s VM1, x1, x2
-    vfcmp.ceq.s VT0, VM1, x1
-    vbitsel.v VINC4, VI2, VI1, VT0
-    vfmina.s VM0, x3, x4
-    vfcmp.ceq.s VT0, x3, VM0
-    vbitsel.v VINC8, VI4, VI3, VT0
-    vfmina.s VM0, VM0, VM1
-    vfcmp.ceq.s VT0, VM0, VM1
-    vbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    vfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.s VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
-.L26:
-    vfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L27
-    vfcmp.clt.s VT0, VI2, VI0
-    vbitsel.v VI0, VI0, VI2, VT0
-    .align 3
-
-.L27:
-    vfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L28
-    vfcmp.clt.s VT0, VI3, VI0
-    vbitsel.v VI0, VI0, VI3, VT0
-    .align 3
-
-.L28:
-    vfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L29
-    vfcmp.clt.s VT0, VI4, VI0
-    vbitsel.v VI0, VI0, VI4, VT0
-    .align 3
-
-.L29:
-    movfr2gr.s i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.s $f9, X, 0
-    addi.d I, I, -1
-    vfmina.s VM1, x1, VM0
-    vfcmp.ceq.s VT0, VM0, VM1
-    add.d  X, X, INCX
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI1, VI0, VT0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.s i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file

From 8be26541933b36c6e3e8002c44002efb02033bdd Mon Sep 17 00:00:00 2001
From: zhoupeng <zhoupeng@loongson.cn>
Date: Thu, 28 Dec 2023 10:24:24 +0800
Subject: [PATCH 515/718] loongarch64: Refine imax optimization.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000      |   4 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5         |   4 +-
 kernel/loongarch64/idmax_lasx.S               | 273 -----------
 kernel/loongarch64/idmax_lsx.S                | 225 ---------
 .../loongarch64/{ismax_lasx.S => imax_lasx.S} | 230 ++++++++--
 kernel/loongarch64/imax_lsx.S                 | 428 ++++++++++++++++++
 kernel/loongarch64/ismax_lsx.S                | 272 -----------
 7 files changed, 626 insertions(+), 810 deletions(-)
 delete mode 100644 kernel/loongarch64/idmax_lasx.S
 delete mode 100644 kernel/loongarch64/idmax_lsx.S
 rename kernel/loongarch64/{ismax_lasx.S => imax_lasx.S} (57%)
 create mode 100644 kernel/loongarch64/imax_lsx.S
 delete mode 100644 kernel/loongarch64/ismax_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 4eae2e4f9..346f1fb45 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -19,8 +19,8 @@ DMAXKERNEL  =  max_lsx.S
 SMINKERNEL  =  min_lsx.S
 DMINKERNEL  =  min_lsx.S
 
-ISMAXKERNEL =  ismax_lsx.S
-IDMAXKERNEL =  idmax_lsx.S
+ISMAXKERNEL =  imax_lsx.S
+IDMAXKERNEL =  imax_lsx.S
 
 ISMINKERNEL =  ismin_lsx.S
 IDMINKERNEL =  idmin_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index e7e1b5d5a..6b4df2d61 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -19,8 +19,8 @@ DMAXKERNEL =   max_lsx.S
 SMINKERNEL =   min_lsx.S
 DMINKERNEL =   min_lsx.S
 
-ISMAXKERNEL =  ismax_lasx.S
-IDMAXKERNEL =  idmax_lasx.S
+ISMAXKERNEL =  imax_lasx.S
+IDMAXKERNEL =  imax_lasx.S
 
 ISMINKERNEL =  ismin_lasx.S
 IDMINKERNEL =  idmin_lasx.S
diff --git a/kernel/loongarch64/idmax_lasx.S b/kernel/loongarch64/idmax_lasx.S
deleted file mode 100644
index bbfe0941a..000000000
--- a/kernel/loongarch64/idmax_lasx.S
+++ /dev/null
@@ -1,273 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define VX0 $xr13
-#define VX1 $xr14
-#define VM0 $xr15
-#define VM1 $xr16
-#define VINC4 $xr17
-#define VINC8 $xr18
-#define VI0 $xr20
-#define VI1 $xr21
-#define VI2 $xr22
-#define VI3 $xr8
-#define VI4 $xr19
-#define VT0 $xr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    addi.d i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.d i0, i0, 2 //4
-    xvreplgr2vr.d VINC4, i0
-    slli.d i0, i0, 1 //8
-    xvreplgr2vr.d VINC8, i0
-    addi.d i0, i0, -15
-    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 3
-    addi.d i0, i0, 5
-    xvinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 1 //2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 2 //3
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 3 //4
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    xvadd.d VI1, VI1, VINC8
-    xvld VX1, X, 4 * SIZE
-    xvadd.d VI2, VI1, VINC4
-    xvfcmp.clt.d VT0, VX0, VX1
-    addi.d I, I, -1
-    xvbitsel.v VM1, VX0, VX1, VT0
-    xvbitsel.v VI2, VI1, VI2, VT0
-    xvfcmp.clt.d VT0, VM0, VM1
-    addi.d X, X, 8 * SIZE
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VI0, VI2, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    xvpickve.d VI1, VI0, 0
-    xvpickve.d VI2, VI0, 1
-    xvpickve.d VI3, VI0, 2
-    xvpickve.d VI4, VI0, 3
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfcmp.clt.d VT0, x1, x2
-    xvbitsel.v VM1, x1, x2, VT0
-    xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.d VT0, x3, x4
-    xvbitsel.v VM0, x3, x4, VT0
-    xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.d VT0, VM0, VM1
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    xvfcmp.ceq.d VT0, VM0, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.d VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    b .L26
-    .align 3
-
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.d i0, i0, 1
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
-    xvinsgr2vr.d VM0, t3, 2
-    xvinsgr2vr.d VM0, t4, 3
-    slli.d i0, i0, 2 //4
-    xvreplgr2vr.d VINC4, i0
-    slli.d i0, i0, 1 //8
-    xvreplgr2vr.d VINC8, i0
-    addi.d i0, i0, -15
-    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 3
-    addi.d i0, i0, 5
-    xvinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 1 //2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 2 //3
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 3 //4
-    .align 3
-
-.L24:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    xvadd.d VI1, VI1, VINC8
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    xvadd.d VI2, VI1, VINC4
-    xvfcmp.clt.d VT0, VX0, VX1
-    addi.d I, I, -1
-    xvbitsel.v VM1, VX0, VX1, VT0
-    xvbitsel.v VI2, VI1, VI2, VT0
-    xvfcmp.clt.d VT0, VM0, VM1
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VI0, VI2, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    xvpickve.d VI1, VI0, 0
-    xvpickve.d VI2, VI0, 1
-    xvpickve.d VI3, VI0, 2
-    xvpickve.d VI4, VI0, 3
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfcmp.clt.d VT0, x1, x2
-    xvbitsel.v VM1, x1, x2, VT0
-    xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.d VT0, x3, x4
-    xvbitsel.v VM0, x3, x4, VT0
-    xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.d VT0, VM0, VM1
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    xvfcmp.ceq.d VT0, VM0, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.d VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
-.L26:
-    xvfcmp.ceq.d VT0, VM0, x2
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L27
-    xvfcmp.clt.d VT0, VI2, VI0
-    xvbitsel.v VI0, VI0, VI2, VT0
-    .align 3
-
-.L27:
-    xvfcmp.ceq.d VT0, VM0, x3
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L28
-    xvfcmp.clt.d VT0, VI3, VI0
-    xvbitsel.v VI0, VI0, VI3, VT0
-    .align 3
-
-.L28:
-    xvfcmp.ceq.d VT0, VM0, x4
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L29
-    xvfcmp.clt.d VT0, VI4, VI0
-    xvbitsel.v VI0, VI0, VI4, VT0
-    .align 3
-
-.L29:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    addi.d I, I, -1
-    fcmp.clt.d $fcc0, $f15, $f9
-    add.d  X, X, INCX
-    fsel $f15, $f15, $f9, $fcc0
-    fsel $f20, $f20, $f21, $fcc0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/idmax_lsx.S b/kernel/loongarch64/idmax_lsx.S
deleted file mode 100644
index 1b4734bab..000000000
--- a/kernel/loongarch64/idmax_lsx.S
+++ /dev/null
@@ -1,225 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr13
-#define VX1 $vr14
-#define VM0 $vr15
-#define VM1 $vr16
-#define VINC2 $vr17
-#define VINC4 $vr18
-#define VI0 $vr20
-#define VI1 $vr21
-#define VI2 $vr22
-#define VI3 $vr8
-#define VI4 $vr19
-#define VT0 $vr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    addi.d i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.d i0, i0, 1 //2
-    vreplgr2vr.d VINC2, i0
-    slli.d i0, i0, 1 //4
-    vreplgr2vr.d VINC4, i0
-    addi.d i0, i0, -7
-    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 3
-    vinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI0, i0, 1 //2
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vadd.d VI1, VI1, VINC4
-    vld VX1, X, 2 * SIZE
-    vadd.d VI2, VI1, VINC2
-    vfcmp.clt.d VT0, VX0, VX1
-    vbitsel.v x1, VX0, VX1, VT0
-    vbitsel.v x2, VI1, VI2, VT0
-    vld VX0, X, 4 * SIZE
-    vadd.d VI1, VI2, VINC2
-    vld VX1, X, 6 * SIZE
-    vadd.d VI2, VI1, VINC2
-    vfcmp.clt.d VT0, VX0, VX1
-    addi.d I, I, -1
-    vbitsel.v x3, VX0, VX1, VT0
-    vbitsel.v x4, VI1, VI2, VT0
-    vfcmp.clt.d VT0, x1, x3
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT0
-    vfcmp.clt.d VT0, VM0, x1
-    addi.d X, X, 8 * SIZE
-    vbitsel.v VM0, VM0, x1, VT0
-    vbitsel.v VI0, VI0, x2, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    vreplvei.d VI1, VI0, 0
-    vreplvei.d VI2, VI0, 1
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    vfcmp.ceq.d VT0, x2, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.d VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L27
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.d i0, i0, 1
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
-    slli.d i0, i0, 1 //2
-    vreplgr2vr.d VINC2, i0
-    slli.d i0, i0, 1 //4
-    vreplgr2vr.d VINC4, i0
-    addi.d i0, i0, -7
-    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 3
-    vinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI0, i0, 1 //2
-    .align 3
-
-.L24:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    vadd.d VI1, VI1, VINC4
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    vadd.d VI2, VI1, VINC2
-    vfcmp.clt.d VT0, VX0, VX1
-    vbitsel.v x1, VX0, VX1, VT0
-    vbitsel.v x2, VI1, VI2, VT0
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    vadd.d VI1, VI2, VINC2
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    vadd.d VI2, VI1, VINC2
-    vfcmp.clt.d VT0, VX0, VX1
-    vbitsel.v x3, VX0, VX1, VT0
-    vbitsel.v x4, VI1, VI2, VT0
-    vfcmp.clt.d VT0, x1, x3
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT0
-    vfcmp.clt.d VT0, VM0, x1
-    addi.d I, I, -1
-    vbitsel.v VM0, VM0, x1, VT0
-    vbitsel.v VI0, VI0, x2, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    vreplvei.d VI1, VI0, 0
-    vreplvei.d VI2, VI0, 1
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    vfcmp.ceq.d VT0, x2, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.d VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L27
-    .align 3
-
-.L26:
-    vfcmp.clt.d VT0, x1, x2
-    vbitsel.v VM0, x1, x2, VT0
-    vbitsel.v VI0, VI1, VI2, VT0
-    .align 3
-
-.L27:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    addi.d I, I, -1
-    fcmp.clt.d $fcc0, $f15, $f9
-    add.d  X, X, INCX
-    fsel $f15, $f15, $f9, $fcc0
-    fsel $f20, $f20, $f21, $fcc0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ismax_lasx.S b/kernel/loongarch64/imax_lasx.S
similarity index 57%
rename from kernel/loongarch64/ismax_lasx.S
rename to kernel/loongarch64/imax_lasx.S
index 843dd6c6a..2d3d5e9d3 100644
--- a/kernel/loongarch64/ismax_lasx.S
+++ b/kernel/loongarch64/imax_lasx.S
@@ -1,3 +1,29 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
 #define ASSEMBLER
 
 #include "common.h"
@@ -39,6 +65,31 @@
     slli.d INCX, INCX, BASE_SHIFT
     bne INCX, TEMP, .L20
     xvld VM0, X, 0
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
     addi.w i0, i0, 1
     srai.d I, N, 3
     bge $r0, I, .L21
@@ -76,20 +127,47 @@
     xvinsgr2vr.w VI0, i0, 6 //7
     addi.w i0, i0, 1
     xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+
     .align 3
 
 .L10:
     xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    xvfcmp.clt.d VT0, VX0, VX1
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+#else
     xvadd.w VI1, VI1, VINC8
     xvfcmp.clt.s VT0, VM0, VX0
     addi.d I, I, -1
     xvbitsel.v VM0, VM0, VX0, VT0
     xvbitsel.v VI0, VI0, VI1, VT0
     addi.d X, X, 8 * SIZE
+#endif
     blt $r0, I, .L10
     .align 3
 
 .L15:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfcmp.clt.d VT0, x1, x2
+#else
     xvxor.v VX0, VX0, VX0
     xvor.v VX0, VI0, VX0
     xvxor.v VX1, VX1, VX1
@@ -103,28 +181,33 @@
     xvpickve.w x3, VM0, 2
     xvpickve.w x4, VM0, 3
     xvfcmp.clt.s VT0, x1, x2
+#endif
     xvbitsel.v VM1, x1, x2, VT0
     xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.s VT0, x3, x4
+    XVCMPLT VT0, x3, x4
     xvbitsel.v VM0, x3, x4, VT0
     xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.s VT0, VM0, VM1
+    XVCMPLT VT0, VM0, VM1
     xvbitsel.v VM0, VM0, VM1, VT0
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f9
     bceqz $fcc0, .L26
-    xvfcmp.clt.s VT0, VI1, VI0
+    XVCMPLT VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     b .L26
     .align 3
 
-
 .L20: // INCX!=1
     move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+#else
     addi.w i0, i0, 1
     ld.w t1, TEMP, 0 * SIZE
     add.d TEMP, TEMP, INCX
@@ -143,11 +226,38 @@
     ld.w t1, TEMP, 0 * SIZE
     add.d TEMP, TEMP, INCX
     ld.w t2, TEMP, 0 * SIZE
+#endif
     add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
+    ld.d t3, TEMP, 0 * SIZE
     add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
+    ld.d t4, TEMP, 0 * SIZE
     add.d TEMP, TEMP, INCX
+#ifdef DOUBLE
+    xvinsgr2vr.d VM0, t1, 0
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
     xvinsgr2vr.w VM0, t1, 4
     xvinsgr2vr.w VM0, t2, 5
     xvinsgr2vr.w VM0, t3, 6
@@ -186,9 +296,46 @@
     xvinsgr2vr.w VI0, i0, 6 //7
     addi.w i0, i0, 1
     xvinsgr2vr.w VI0, i0, 7 //8
+#endif
     .align 3
 
 .L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    xvadd.d VI2, VI1, VINC4
+    xvfcmp.clt.d VT0, VX0, VX1
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+#else
     ld.w t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.w t2, X, 0 * SIZE
@@ -218,10 +365,21 @@
     addi.d I, I, -1
     xvbitsel.v VM0, VM0, VX0, VT0
     xvbitsel.v VI0, VI0, VI1, VT0
+#endif
     blt $r0, I, .L24
     .align 3
 
 .L25:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d  x1, VM0, 0
+    xvpickve.d  x2, VM0, 1
+    xvpickve.d  x3, VM0, 2
+    xvpickve.d  x4, VM0, 3
+#else
     xvxor.v VX0, VX0, VX0
     xvor.v VX0, VI0, VX0
     xvxor.v VX1, VX1, VX1
@@ -230,57 +388,56 @@
     xvpickve.w VI2, VI0, 1
     xvpickve.w VI3, VI0, 2
     xvpickve.w VI4, VI0, 3
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvfcmp.clt.s VT0, x1, x2
+    xvpickve.w  x1, VM0, 0
+    xvpickve.w  x2, VM0, 1
+    xvpickve.w  x3, VM0, 2
+    xvpickve.w  x4, VM0, 3
+#endif
+    XVCMPLT VT0, x1, x2
     xvbitsel.v VM1, x1, x2, VT0
     xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.s VT0, x3, x4
+    XVCMPLT VT0, x3, x4
     xvbitsel.v VM0, x3, x4, VT0
     xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.s VT0, VM0, VM1
+    XVCMPLT VT0, VM0, VM1
     xvbitsel.v VM0, VM0, VM1, VT0
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f9
     bceqz $fcc0, .L26
-    xvfcmp.clt.s VT0, VI1, VI0
+    XVCMPLT VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     .align 3
 
 .L26:
-    xvfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f10
     bceqz $fcc0, .L27
-    xvfcmp.clt.s VT0, VI2, VI0
+    XVCMPLT VT0, VI2, VI0
     xvbitsel.v VI0, VI0, VI2, VT0
     .align 3
 
 .L27:
-    xvfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f11
     bceqz $fcc0, .L28
-    xvfcmp.clt.s VT0, VI3, VI0
+    XVCMPLT VT0, VI3, VI0
     xvbitsel.v VI0, VI0, VI3, VT0
     .align 3
 
 .L28:
-    xvfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f12
     bceqz $fcc0, .L29
-    xvfcmp.clt.s VT0, VI4, VI0
+    XVCMPLT VT0, VI4, VI0
     xvbitsel.v VI0, VI0, VI4, VT0
     .align 3
 
 .L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20
+#else
     fmov.s $f16, $f20
+#endif
     .align 3
 
+#ifndef DOUBLE
 .L252:
     xvxor.v VI0, VI0, VI0
     xvor.v VI0, VI0, VX0
@@ -343,6 +500,7 @@
     fsel $f15, $f15, $f13, $fcc0
     fsel $f20, $f20, $f16, $fcc0
     movfr2gr.s i0, $f20
+#endif
 
 .L21: //N<8
     andi I, N, 7
@@ -357,14 +515,14 @@
 .L22:
     fld.d $f9, X, 0
     addi.d I, I, -1
-    fcmp.clt.s $fcc0, $f15, $f9
+    CMPLT $fcc0, $f15, $f9
     add.d  X, X, INCX
     fsel $f15, $f15, $f9, $fcc0
     fsel $f20, $f20, $f21, $fcc0
     addi.d i1, i1, 1
     movgr2fr.d $f21, i1
     blt $r0, I, .L22
-    movfr2gr.s i0, $f20
+    MTG i0, $f20
     .align 3
 
 .L999:
@@ -372,4 +530,4 @@
     jirl $r0, $r1, 0x0
     .align 3
 
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/imax_lsx.S b/kernel/loongarch64/imax_lsx.S
new file mode 100644
index 000000000..92556d4e6
--- /dev/null
+++ b/kernel/loongarch64/imax_lsx.S
@@ -0,0 +1,428 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+#ifdef DOUBLE
+    addi.d  i0, i0, 1
+    srai.d   I,  N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d $vr17, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d $vr18, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w  i0, i0, 1
+    srai.d   I,  N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w $vr17, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w $vr18, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, $vr18
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, $vr17
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    VCMPLT VT0, x1, x3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    VCMPLT VT0, VM0, x1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+#else
+    vadd.w VI1, VI1, $vr18
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    VCMPLT VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d  x1, VM0, 0
+    vreplvei.d  x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w  x1, VM0, 0
+    vreplvei.w  x2, VM0, 1
+    vreplvei.w  x3, VM0, 2
+    vreplvei.w  x4, VM0, 3
+    VCMPLT VT0, x1, x2
+    vbitsel.v VM1, x1, x2, VT0
+    vbitsel.v $vr17, VI1, VI2, VT0
+    VCMPLT VT0, x3, x4
+    vbitsel.v VM0, x3, x4, VT0
+    vbitsel.v $vr18, VI3, VI4, VT0
+    VCMPLT VT0, VM0, VM1
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, $vr18, $vr17, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d  i0,   i0, 1
+    ld.d    t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d  VM0, t1, 0
+    srai.d         I,  N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d $vr17, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d $vr18, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w  i0,   i0, 1
+    ld.w    t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w  VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w $vr17, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w $vr18, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, $vr18
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, $vr17
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    VCMPLT VT0, x1, x3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    VCMPLT VT0, VM0, x1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, $vr18
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    VCMPLT VT0, VM0, VM1
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d  x1, VM0, 0
+    vreplvei.d  x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w  x1, VM0, 0
+    vreplvei.w  x2, VM0, 1
+    vreplvei.w  x3, VM0, 2
+    vreplvei.w  x4, VM0, 3
+    vfcmp.clt.s VT0,  x1, x2
+    vbitsel.v   VM1,  x1, x2, VT0
+    vbitsel.v $vr17, VI1, VI2, VT0
+    vfcmp.clt.s VT0,  x3, x4
+    vbitsel.v   VM0,  x3, x4, VT0
+    vbitsel.v $vr18, VI3, VI4, VT0
+    vfcmp.clt.s VT0, VM0, VM1
+    vbitsel.v VM0,   VM0, VM1, VT0
+    vbitsel.v VI0, $vr18, $vr17, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+.L26:
+#ifdef DOUBLE
+    VCMPLT    VT0,  x1, x2
+    vbitsel.v VM0,  x1, x2, VT0
+    vbitsel.v VI0, VI1, VI2, VT0
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    VCMPLT VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    .align 3
+
+.L27:
+#ifndef DOUBLE
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    VCMPLT VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    VCMPLT VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#endif
+    MTG i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    CMPLT $fcc0, $f15, $f9
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/ismax_lsx.S b/kernel/loongarch64/ismax_lsx.S
deleted file mode 100644
index 33b326bbd..000000000
--- a/kernel/loongarch64/ismax_lsx.S
+++ /dev/null
@@ -1,272 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr13
-#define VX1 $vr14
-#define VM0 $vr15
-#define VM1 $vr16
-#define VINC4 $vr17
-#define VINC8 $vr18
-#define VI0 $vr20
-#define VI1 $vr21
-#define VI2 $vr22
-#define VI3 $vr8
-#define VI4 $vr19
-#define VT0 $vr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    addi.w i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.w i0, i0, 2 //4
-    vreplgr2vr.w VINC4, i0
-    slli.w i0, i0, 1 //8
-    vreplgr2vr.w VINC8, i0
-    addi.w i0, i0, -15
-    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 3
-    addi.w i0, i0, 5
-    vinsgr2vr.w VI0, i0, 0 //1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 1 //2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 2 //3
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 3 //4
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vadd.w VI1, VI1, VINC8
-    vld VX1, X, 4 * SIZE
-    vadd.w VI2, VI1, VINC4
-    vfcmp.clt.s VT0, VX0, VX1
-    addi.d I, I, -1
-    vbitsel.v VM1, VX0, VX1, VT0
-    vbitsel.v VI2, VI1, VI2, VT0
-    vfcmp.clt.s VT0, VM0, VM1
-    addi.d X, X, 8 * SIZE
-    vbitsel.v VM0, VM0, VM1, VT0
-    vbitsel.v VI0, VI0, VI2, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    vreplvei.w VI1, VI0, 0
-    vreplvei.w VI2, VI0, 1
-    vreplvei.w VI3, VI0, 2
-    vreplvei.w VI4, VI0, 3
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfcmp.clt.s VT0, x1, x2
-    vbitsel.v VM1, x1, x2, VT0
-    vbitsel.v VINC4, VI1, VI2, VT0
-    vfcmp.clt.s VT0, x3, x4
-    vbitsel.v VM0, x3, x4, VT0
-    vbitsel.v VINC8, VI3, VI4, VT0
-    vfcmp.clt.s VT0, VM0, VM1
-    vbitsel.v VM0, VM0, VM1, VT0
-    vbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    vfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.s VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L26
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.w i0, i0, 1
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t2, 1
-    vinsgr2vr.w VM0, t3, 2
-    vinsgr2vr.w VM0, t4, 3
-    slli.w i0, i0, 2 //4
-    vreplgr2vr.w VINC4, i0
-    slli.w i0, i0, 1 //8
-    vreplgr2vr.w VINC8, i0
-    addi.w i0, i0, -15
-    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 3
-    addi.w i0, i0, 5
-    vinsgr2vr.w VI0, i0, 0 //1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 1 //2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 2 //3
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 3 //4
-    .align 3
-
-.L24:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    vadd.w VI1, VI1, VINC8
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vadd.w VI2, VI1, VINC4
-    vfcmp.clt.s VT0, VX0, VX1
-    addi.d I, I, -1
-    vbitsel.v VM1, VX0, VX1, VT0
-    vbitsel.v VI2, VI1, VI2, VT0
-    vfcmp.clt.s VT0, VM0, VM1
-    vbitsel.v VM0, VM0, VM1, VT0
-    vbitsel.v VI0, VI0, VI2, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    vreplvei.w VI1, VI0, 0
-    vreplvei.w VI2, VI0, 1
-    vreplvei.w VI3, VI0, 2
-    vreplvei.w VI4, VI0, 3
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfcmp.clt.s VT0, x1, x2
-    vbitsel.v VM1, x1, x2, VT0
-    vbitsel.v VINC4, VI1, VI2, VT0
-    vfcmp.clt.s VT0, x3, x4
-    vbitsel.v VM0, x3, x4, VT0
-    vbitsel.v VINC8, VI3, VI4, VT0
-    vfcmp.clt.s VT0, VM0, VM1
-    vbitsel.v VM0, VM0, VM1, VT0
-    vbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    vfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.s VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
-.L26:
-    vfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L27
-    vfcmp.clt.s VT0, VI2, VI0
-    vbitsel.v VI0, VI0, VI2, VT0
-    .align 3
-
-.L27:
-    vfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L28
-    vfcmp.clt.s VT0, VI3, VI0
-    vbitsel.v VI0, VI0, VI3, VT0
-    .align 3
-
-.L28:
-    vfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L29
-    vfcmp.clt.s VT0, VI4, VI0
-    vbitsel.v VI0, VI0, VI4, VT0
-    .align 3
-
-.L29:
-    movfr2gr.s i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    addi.d I, I, -1
-    fcmp.clt.s $fcc0, $f15, $f9
-    fsel $f15, $f15, $f9, $fcc0
-    fsel $f20, $f20, $f21, $fcc0
-    addi.d i1, i1, 1
-    add.d  X, X, INCX
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.s i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file

From 116aee7527935a51b288a15b7feffc6ea2313e8a Mon Sep 17 00:00:00 2001
From: zhoupeng <zhoupeng@loongson.cn>
Date: Thu, 28 Dec 2023 15:17:28 +0800
Subject: [PATCH 516/718] loongarch64: Refine imin optimization.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000      |   4 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5         |   4 +-
 kernel/loongarch64/idmin_lasx.S               | 272 -----------
 kernel/loongarch64/idmin_lsx.S                | 225 ---------
 .../loongarch64/{ismin_lasx.S => imin_lasx.S} | 266 ++++++++---
 kernel/loongarch64/imin_lsx.S                 | 428 ++++++++++++++++++
 kernel/loongarch64/ismin_lsx.S                | 271 -----------
 7 files changed, 645 insertions(+), 825 deletions(-)
 delete mode 100644 kernel/loongarch64/idmin_lasx.S
 delete mode 100644 kernel/loongarch64/idmin_lsx.S
 rename kernel/loongarch64/{ismin_lasx.S => imin_lasx.S} (54%)
 create mode 100644 kernel/loongarch64/imin_lsx.S
 delete mode 100644 kernel/loongarch64/ismin_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 346f1fb45..9164f28ef 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -22,8 +22,8 @@ DMINKERNEL  =  min_lsx.S
 ISMAXKERNEL =  imax_lsx.S
 IDMAXKERNEL =  imax_lsx.S
 
-ISMINKERNEL =  ismin_lsx.S
-IDMINKERNEL =  idmin_lsx.S
+ISMINKERNEL =  imin_lsx.S
+IDMINKERNEL =  imin_lsx.S
 
 ISAMAXKERNEL = isamax_lsx.S
 IDAMAXKERNEL = idamax_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 6b4df2d61..5882b8932 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -22,8 +22,8 @@ DMINKERNEL =   min_lsx.S
 ISMAXKERNEL =  imax_lasx.S
 IDMAXKERNEL =  imax_lasx.S
 
-ISMINKERNEL =  ismin_lasx.S
-IDMINKERNEL =  idmin_lasx.S
+ISMINKERNEL =  imin_lasx.S
+IDMINKERNEL =  imin_lasx.S
 
 ISAMAXKERNEL = isamax_lasx.S
 IDAMAXKERNEL = idamax_lasx.S
diff --git a/kernel/loongarch64/idmin_lasx.S b/kernel/loongarch64/idmin_lasx.S
deleted file mode 100644
index 7930d4963..000000000
--- a/kernel/loongarch64/idmin_lasx.S
+++ /dev/null
@@ -1,272 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define VX0 $xr13
-#define VX1 $xr14
-#define VM0 $xr15
-#define VM1 $xr16
-#define VINC4 $xr17
-#define VINC8 $xr18
-#define VI0 $xr20
-#define VI1 $xr21
-#define VI2 $xr22
-#define VI3 $xr8
-#define VI4 $xr19
-#define VT0 $xr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    addi.d i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.d i0, i0, 2 //4
-    xvreplgr2vr.d VINC4, i0
-    slli.d i0, i0, 1 //8
-    xvreplgr2vr.d VINC8, i0
-    addi.d i0, i0, -15
-    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 3
-    addi.d i0, i0, 5
-    xvinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 1 //2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 2 //3
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 3 //4
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    xvadd.d VI1, VI1, VINC8
-    xvld VX1, X, 4 * SIZE
-    xvadd.d VI2, VI1, VINC4
-    xvfcmp.clt.d VT0, VX1, VX0
-    addi.d I, I, -1
-    xvbitsel.v VM1, VX0, VX1, VT0
-    xvbitsel.v VI2, VI1, VI2, VT0
-    xvfcmp.clt.d VT0, VM1, VM0
-    addi.d X, X, 8 * SIZE
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VI0, VI2, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    xvpickve.d VI1, VI0, 0
-    xvpickve.d VI2, VI0, 1
-    xvpickve.d VI3, VI0, 2
-    xvpickve.d VI4, VI0, 3
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfcmp.clt.d VT0, x2, x1
-    xvbitsel.v VM1, x1, x2, VT0
-    xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.d VT0, x4, x3
-    xvbitsel.v VM0, x3, x4, VT0
-    xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.d VT0, VM1, VM0
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    xvfcmp.ceq.d VT0, VM0, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.d VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    b .L26
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.d i0, i0, 1
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
-    xvinsgr2vr.d VM0, t3, 2
-    xvinsgr2vr.d VM0, t4, 3
-    slli.d i0, i0, 2 //4
-    xvreplgr2vr.d VINC4, i0
-    slli.d i0, i0, 1 //8
-    xvreplgr2vr.d VINC8, i0
-    addi.d i0, i0, -15
-    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 3
-    addi.d i0, i0, 5
-    xvinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 1 //2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 2 //3
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 3 //4
-    .align 3
-
-.L24:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    xvadd.d VI1, VI1, VINC8
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    xvadd.d VI2, VI1, VINC4
-    xvfcmp.clt.d VT0, VX1, VX0
-    addi.d I, I, -1
-    xvbitsel.v VM1, VX0, VX1, VT0
-    xvbitsel.v VI2, VI1, VI2, VT0
-    xvfcmp.clt.d VT0, VM1, VM0
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VI0, VI2, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    xvpickve.d VI1, VI0, 0
-    xvpickve.d VI2, VI0, 1
-    xvpickve.d VI3, VI0, 2
-    xvpickve.d VI4, VI0, 3
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfcmp.clt.d VT0, x2, x1
-    xvbitsel.v VM1, x1, x2, VT0
-    xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.d VT0, x4, x3
-    xvbitsel.v VM0, x3, x4, VT0
-    xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.d VT0, VM1, VM0
-    xvbitsel.v VM0, VM0, VM1, VT0
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    xvfcmp.ceq.d VT0, VM0, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.d VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
-.L26:
-    xvfcmp.ceq.d VT0, VM0, x2
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L27
-    xvfcmp.clt.d VT0, VI2, VI0
-    xvbitsel.v VI0, VI0, VI2, VT0
-    .align 3
-
-.L27:
-    xvfcmp.ceq.d VT0, VM0, x3
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L28
-    xvfcmp.clt.d VT0, VI3, VI0
-    xvbitsel.v VI0, VI0, VI3, VT0
-    .align 3
-
-.L28:
-    xvfcmp.ceq.d VT0, VM0, x4
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L29
-    xvfcmp.clt.d VT0, VI4, VI0
-    xvbitsel.v VI0, VI0, VI4, VT0
-    .align 3
-
-.L29:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    addi.d I, I, -1
-    fcmp.clt.d $fcc0, $f9, $f15
-    add.d  X, X, INCX
-    fsel $f15, $f15, $f9, $fcc0
-    fsel $f20, $f20, $f21, $fcc0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/idmin_lsx.S b/kernel/loongarch64/idmin_lsx.S
deleted file mode 100644
index 8b6edcbf0..000000000
--- a/kernel/loongarch64/idmin_lsx.S
+++ /dev/null
@@ -1,225 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr13
-#define VX1 $vr14
-#define VM0 $vr15
-#define VM1 $vr16
-#define VINC2 $vr17
-#define VINC4 $vr18
-#define VI0 $vr20
-#define VI1 $vr21
-#define VI2 $vr22
-#define VI3 $vr8
-#define VI4 $vr19
-#define VT0 $vr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    addi.d i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.d i0, i0, 1 //2
-    vreplgr2vr.d VINC2, i0
-    slli.d i0, i0, 1 //4
-    vreplgr2vr.d VINC4, i0
-    addi.d i0, i0, -7
-    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 3
-    vinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI0, i0, 1 //2
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vadd.d VI1, VI1, VINC4
-    vld VX1, X, 2 * SIZE
-    vadd.d VI2, VI1, VINC2
-    vfcmp.clt.d VT0, VX1, VX0
-    vbitsel.v x1, VX0, VX1, VT0
-    vbitsel.v x2, VI1, VI2, VT0
-    vld VX0, X, 4 * SIZE
-    vadd.d VI1, VI2, VINC2
-    vld VX1, X, 6 * SIZE
-    vadd.d VI2, VI1, VINC2
-    vfcmp.clt.d VT0, VX1, VX0
-    addi.d I, I, -1
-    vbitsel.v x3, VX0, VX1, VT0
-    vbitsel.v x4, VI1, VI2, VT0
-    vfcmp.clt.d VT0, x3, x1
-    addi.d X, X, 8 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT0
-    vfcmp.clt.d VT0, x1, VM0
-    vbitsel.v VM0, VM0, x1, VT0
-    vbitsel.v VI0, VI0, x2, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    vreplvei.d VI1, VI0, 0
-    vreplvei.d VI2, VI0, 1
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    vfcmp.ceq.d VT0, x2, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.d VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L27
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.d i0, i0, 1
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
-    slli.d i0, i0, 1 //2
-    vreplgr2vr.d VINC2, i0
-    slli.d i0, i0, 1 //4
-    vreplgr2vr.d VINC4, i0
-    addi.d i0, i0, -7
-    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 3
-    vinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI0, i0, 1 //2
-    .align 3
-
-.L24:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    vadd.d VI1, VI1, VINC4
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    vadd.d VI2, VI1, VINC2
-    vfcmp.clt.d VT0, VX1, VX0
-    vbitsel.v x1, VX0, VX1, VT0
-    vbitsel.v x2, VI1, VI2, VT0
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    vadd.d VI1, VI2, VINC2
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    vadd.d VI2, VI1, VINC2
-    vfcmp.clt.d VT0, VX1, VX0
-    vbitsel.v x3, VX0, VX1, VT0
-    vbitsel.v x4, VI1, VI2, VT0
-    vfcmp.clt.d VT0, x3, x1
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT0
-    vfcmp.clt.d VT0, x1, VM0
-    addi.d I, I, -1
-    vbitsel.v VM0, VM0, x1, VT0
-    vbitsel.v VI0, VI0, x2, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    vreplvei.d VI1, VI0, 0
-    vreplvei.d VI2, VI0, 1
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    vfcmp.ceq.d VT0, x2, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.d VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L27
-    .align 3
-
-.L26:
-    vfcmp.clt.d VT0, x2, x1
-    vbitsel.v VM0, x1, x2, VT0
-    vbitsel.v VI0, VI1, VI2, VT0
-    .align 3
-
-.L27:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    addi.d I, I, -1
-    fcmp.clt.d $fcc0, $f9, $f15
-    add.d  X, X, INCX
-    fsel $f15, $f15, $f9, $fcc0
-    fsel $f20, $f20, $f21, $fcc0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/ismin_lasx.S b/kernel/loongarch64/imin_lasx.S
similarity index 54%
rename from kernel/loongarch64/ismin_lasx.S
rename to kernel/loongarch64/imin_lasx.S
index 15f6e2ec9..5306828e2 100644
--- a/kernel/loongarch64/ismin_lasx.S
+++ b/kernel/loongarch64/imin_lasx.S
@@ -1,3 +1,30 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
 #define ASSEMBLER
 
 #include "common.h"
@@ -39,6 +66,31 @@
     slli.d INCX, INCX, BASE_SHIFT
     bne INCX, TEMP, .L20
     xvld VM0, X, 0
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
     addi.w i0, i0, 1
     srai.d I, N, 3
     bge $r0, I, .L21
@@ -76,20 +128,45 @@
     xvinsgr2vr.w VI0, i0, 6 //7
     addi.w i0, i0, 1
     xvinsgr2vr.w VI0, i0, 7 //8
+#endif
     .align 3
 
 .L10:
     xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    XVCMPLT VT0, VX1, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    XVCMPLT VT0, VM1, VM0
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+#else
     xvadd.w VI1, VI1, VINC8
-    xvfcmp.clt.s VT0, VX0, VM0
+    XVCMPLT VT0, VX0, VM0
     addi.d I, I, -1
     xvbitsel.v VM0, VM0, VX0, VT0
     xvbitsel.v VI0, VI0, VI1, VT0
     addi.d X, X, 8 * SIZE
+#endif
     blt $r0, I, .L10
     .align 3
 
 .L15:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d  x1, VM0, 0
+    xvpickve.d  x2, VM0, 1
+    xvpickve.d  x3, VM0, 2
+    xvpickve.d  x4, VM0, 3
+#else
     xvxor.v VX0, VX0, VX0
     xvor.v VX0, VI0, VX0
     xvxor.v VX1, VX1, VX1
@@ -98,32 +175,67 @@
     xvpickve.w VI2, VI0, 1
     xvpickve.w VI3, VI0, 2
     xvpickve.w VI4, VI0, 3
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvfcmp.clt.s VT0, x2, x1
+    xvpickve.w  x1, VM0, 0
+    xvpickve.w  x2, VM0, 1
+    xvpickve.w  x3, VM0, 2
+    xvpickve.w  x4, VM0, 3
+#endif
+    XVCMPLT VT0, x2, x1
     xvbitsel.v VM1, x1, x2, VT0
     xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.s VT0, x4, x3
+    XVCMPLT VT0, x4, x3
     xvbitsel.v VM0, x3, x4, VT0
     xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.s VT0, VM1, VM0
+    XVCMPLT VT0, VM1, VM0
     xvbitsel.v VM0, VM0, VM1, VT0
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, x1, VM0
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f9
     bceqz $fcc0, .L26
-    xvfcmp.clt.s VT0, VI1, VI0
+    XVCMPLT VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     b .L26
     .align 3
 
 .L20: // INCX!=1
     move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
     addi.w i0, i0, 1
     ld.w t1, TEMP, 0 * SIZE
     add.d TEMP, TEMP, INCX
@@ -185,9 +297,46 @@
     xvinsgr2vr.w VI0, i0, 6 //7
     addi.w i0, i0, 1
     xvinsgr2vr.w VI0, i0, 7 //8
+#endif
     .align 3
 
 .L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    xvadd.d VI2, VI1, VINC4
+    XVCMPLT VT0, VX1, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    XVCMPLT VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+#else
     ld.w t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.w t2, X, 0 * SIZE
@@ -213,73 +362,83 @@
     xvinsgr2vr.w VX0, t3, 6
     xvinsgr2vr.w VX0, t4, 7
     xvadd.w VI1, VI1, VINC8
-    xvfcmp.clt.s VT0, VX0, VM0
+    XVCMPLT VT0, VX0, VM0
     addi.d I, I, -1
     xvbitsel.v VM0, VM0, VX0, VT0
     xvbitsel.v VI0, VI0, VI1, VT0
+#endif
     blt $r0, I, .L24
     .align 3
 
 .L25:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1,  VM0, 0
+    xvpickve.d x2,  VM0, 1
+    xvpickve.d x3,  VM0, 2
+    xvpickve.d x4,  VM0, 3
+#else
     xvxor.v VX0, VX0, VX0
-    xvor.v VX0, VI0, VX0
+    xvor.v  VX0, VI0, VX0
     xvxor.v VX1, VX1, VX1
-    xvor.v VX1, VM0, VX1
+    xvor.v  VX1, VM0, VX1
     xvpickve.w VI1, VI0, 0
     xvpickve.w VI2, VI0, 1
     xvpickve.w VI3, VI0, 2
     xvpickve.w VI4, VI0, 3
-    xvpickve.w x1, VM0, 0
-    xvpickve.w x2, VM0, 1
-    xvpickve.w x3, VM0, 2
-    xvpickve.w x4, VM0, 3
-    xvfcmp.clt.s VT0, x2, x1
+    xvpickve.w  x1, VM0, 0
+    xvpickve.w  x2, VM0, 1
+    xvpickve.w  x3, VM0, 2
+    xvpickve.w  x4, VM0, 3
+#endif
+    XVCMPLT    VT0, x2, x1
     xvbitsel.v VM1, x1, x2, VT0
     xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.s VT0, x4, x3
+    XVCMPLT    VT0, x4, x3
     xvbitsel.v VM0, x3, x4, VT0
     xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.s VT0, VM1, VM0
+    XVCMPLT VT0, VM1, VM0
     xvbitsel.v VM0, VM0, VM1, VT0
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f9
     bceqz $fcc0, .L26
-    xvfcmp.clt.s VT0, VI1, VI0
+    XVCMPLT VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     .align 3
 
 .L26:
-    xvfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f10
     bceqz $fcc0, .L27
-    xvfcmp.clt.s VT0, VI2, VI0
+    XVCMPLT VT0, VI2, VI0
     xvbitsel.v VI0, VI0, VI2, VT0
     .align 3
 
 .L27:
-    xvfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f11
     bceqz $fcc0, .L28
-    xvfcmp.clt.s VT0, VI3, VI0
+    XVCMPLT VT0, VI3, VI0
     xvbitsel.v VI0, VI0, VI3, VT0
     .align 3
 
 .L28:
-    xvfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f12
     bceqz $fcc0, .L29
-    xvfcmp.clt.s VT0, VI4, VI0
+    XVCMPLT VT0, VI4, VI0
     xvbitsel.v VI0, VI0, VI4, VT0
     .align 3
 
 .L29:
+#ifdef DOUBLE
+    MTG i0, $f20
+#else
     fmov.s $f16, $f20
+#endif
     .align 3
 
+#ifndef DOUBLE
 .L252:
     xvxor.v VI0, VI0, VI0
     xvor.v VI0, VI0, VX0
@@ -294,13 +453,13 @@
     xvpickve.w x2, VM0, 5
     xvpickve.w x3, VM0, 6
     xvpickve.w x4, VM0, 7
-    xvfcmp.clt.s VT0, x2, x1
+    XVCMPLT VT0, x2, x1
     xvbitsel.v x1, x1, x2, VT0
     xvbitsel.v VINC4, VI1, VI2, VT0
-    xvfcmp.clt.s VT0, x4, x3
+    XVCMPLT VT0, x4, x3
     xvbitsel.v VM0, x3, x4, VT0
     xvbitsel.v VINC8, VI3, VI4, VT0
-    xvfcmp.clt.s VT0, x1, VM0
+    XVCMPLT VT0, x1, VM0
     xvbitsel.v VM0, VM0, x1, VT0
     xvbitsel.v VI0, VINC8, VINC4, VT0
     li.d TEMP, 1 //处理尾数相等时取最小序号
@@ -309,7 +468,7 @@
     xvfcmp.ceq.s VT0, VM0, x1
     fcmp.ceq.s $fcc0, $f23, $f17
     bceqz $fcc0, .L262
-    xvfcmp.clt.s VT0, VI1, VI0
+    XVCMPLT VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     .align 3
 
@@ -317,7 +476,7 @@
     xvfcmp.ceq.s VT0, VM0, x2
     fcmp.ceq.s $fcc0, $f23, $f17
     bceqz $fcc0, .L272
-    xvfcmp.clt.s VT0, VI2, VI0
+    XVCMPLT VT0, VI2, VI0
     xvbitsel.v VI0, VI0, VI2, VT0
     .align 3
 
@@ -325,7 +484,7 @@
     xvfcmp.ceq.s VT0, VM0, x3
     fcmp.ceq.s $fcc0, $f23, $f17
     bceqz $fcc0, .L282
-    xvfcmp.clt.s VT0, VI3, VI0
+    XVCMPLT VT0, VI3, VI0
     xvbitsel.v VI0, VI0, VI3, VT0
     .align 3
 
@@ -333,17 +492,18 @@
     xvfcmp.ceq.s VT0, VM0, x4
     fcmp.ceq.s $fcc0, $f23, $f17
     bceqz $fcc0, .L292
-    xvfcmp.clt.s VT0, VI4, VI0
+    XVCMPLT VT0, VI4, VI0
     xvbitsel.v VI0, VI0, VI4, VT0
     .align 3
 
 .L292:
-    fcmp.clt.s $fcc0, $f13, $f15
+    CMPLT $fcc0, $f13, $f15
     fsel $f15, $f15, $f13, $fcc0
     fsel $f20, $f20, $f16, $fcc0
-    movfr2gr.s i0, $f20
+    MTG i0, $f20
+#endif
 
-.L21:   //N<8
+.L21: //N<8
     andi I, N, 7
     bge $r0, I, .L999
     srai.d i1, N, 3
@@ -356,14 +516,14 @@
 .L22:
     fld.d $f9, X, 0
     addi.d I, I, -1
-    fcmp.clt.s $fcc0, $f9, $f15
+    CMPLT $fcc0, $f9, $f15
+    add.d  X, X, INCX
     fsel $f15, $f15, $f9, $fcc0
     fsel $f20, $f20, $f21, $fcc0
     addi.d i1, i1, 1
     movgr2fr.d $f21, i1
-    add.d  X, X, INCX
     blt $r0, I, .L22
-    movfr2gr.s i0, $f20
+    MTG i0, $f20
     .align 3
 
 .L999:
@@ -371,4 +531,4 @@
     jirl $r0, $r1, 0x0
     .align 3
 
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/imin_lsx.S b/kernel/loongarch64/imin_lsx.S
new file mode 100644
index 000000000..a0c411e7a
--- /dev/null
+++ b/kernel/loongarch64/imin_lsx.S
@@ -0,0 +1,428 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d  I, N, 3
+    bge   $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d $vr17, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d $vr18, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    srai.d  I, N, 3
+    bge   $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w $vr17, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w $vr18, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, $vr18
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX1, VX0
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, $vr17
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX1, VX0
+    addi.d I, I, -1
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    VCMPLT VT0, x3, x1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    VCMPLT VT0, x1, VM0
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+#else
+    vadd.w VI1, VI1, $vr18
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, $vr17
+    VCMPLT VT0, VX1, VX0
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    VCMPLT VT0, VM1, VM0
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    VCMPLT VT0, x2, x1
+    vbitsel.v VM1, x1, x2, VT0
+    vbitsel.v $vr17, VI1, VI2, VT0
+    VCMPLT VT0, x4, x3
+    vbitsel.v VM0, x3, x4, VT0
+    vbitsel.v $vr18, VI3, VI4, VT0
+    VCMPLT VT0, VM1, VM0
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, $vr18, $vr17, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d $vr17, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d $vr18, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w $vr17, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w $vr18, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, $vr18
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX1, VX0
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, $vr17
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX1, VX0
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    VCMPLT VT0, x3, x1
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    VCMPLT VT0, x1, VM0
+    addi.d I, I, -1
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, $vr18
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, $vr17
+    VCMPLT VT0, VX1, VX0
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    VCMPLT VT0, VM1, VM0
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    VCMPLT VT0, x2, x1
+    vbitsel.v VM1, x1, x2, VT0
+    vbitsel.v $vr17, VI1, VI2, VT0
+    VCMPLT VT0, x4, x3
+    vbitsel.v VM0, x3, x4, VT0
+    vbitsel.v $vr18, VI3, VI4, VT0
+    VCMPLT VT0, VM1, VM0
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, $vr18, $vr17, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+.L26:
+#ifdef DOUBLE
+    VCMPLT VT0, x2, x1
+    vbitsel.v VM0, x1, x2, VT0
+    vbitsel.v VI0, VI1, VI2, VT0
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    VCMPLT VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    .align 3
+
+.L27:
+#ifndef DOUBLE
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    VCMPLT VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    VCMPLT VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#endif
+    MTG i0, $f20
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    fld.d $f9, X, 0
+    addi.d I, I, -1
+    CMPLT $fcc0, $f9, $f15
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/ismin_lsx.S b/kernel/loongarch64/ismin_lsx.S
deleted file mode 100644
index f90ebbd57..000000000
--- a/kernel/loongarch64/ismin_lsx.S
+++ /dev/null
@@ -1,271 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr13
-#define VX1 $vr14
-#define VM0 $vr15
-#define VM1 $vr16
-#define VINC4 $vr17
-#define VINC8 $vr18
-#define VI0 $vr20
-#define VI1 $vr21
-#define VI2 $vr22
-#define VI3 $vr8
-#define VI4 $vr19
-#define VT0 $vr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    addi.w i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.w i0, i0, 2 //4
-    vreplgr2vr.w VINC4, i0
-    slli.w i0, i0, 1 //8
-    vreplgr2vr.w VINC8, i0
-    addi.w i0, i0, -15
-    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 3
-    addi.w i0, i0, 5
-    vinsgr2vr.w VI0, i0, 0 //1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 1 //2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 2 //3
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 3 //4
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vadd.w VI1, VI1, VINC8
-    vld VX1, X, 4 * SIZE
-    vadd.w VI2, VI1, VINC4
-    vfcmp.clt.s VT0, VX1, VX0
-    addi.d I, I, -1
-    vbitsel.v VM1, VX0, VX1, VT0
-    vbitsel.v VI2, VI1, VI2, VT0
-    vfcmp.clt.s VT0, VM1, VM0
-    addi.d X, X, 8 * SIZE
-    vbitsel.v VM0, VM0, VM1, VT0
-    vbitsel.v VI0, VI0, VI2, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    vreplvei.w VI1, VI0, 0
-    vreplvei.w VI2, VI0, 1
-    vreplvei.w VI3, VI0, 2
-    vreplvei.w VI4, VI0, 3
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfcmp.clt.s VT0, x2, x1
-    vbitsel.v VM1, x1, x2, VT0
-    vbitsel.v VINC4, VI1, VI2, VT0
-    vfcmp.clt.s VT0, x4, x3
-    vbitsel.v VM0, x3, x4, VT0
-    vbitsel.v VINC8, VI3, VI4, VT0
-    vfcmp.clt.s VT0, VM1, VM0
-    vbitsel.v VM0, VM0, VM1, VT0
-    vbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    vfcmp.ceq.s VT0, x1, VM0
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.s VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L26
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.w i0, i0, 1
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t2, 1
-    vinsgr2vr.w VM0, t3, 2
-    vinsgr2vr.w VM0, t4, 3
-    slli.w i0, i0, 2 //4
-    vreplgr2vr.w VINC4, i0
-    slli.w i0, i0, 1 //8
-    vreplgr2vr.w VINC8, i0
-    addi.w i0, i0, -15
-    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 3
-    addi.w i0, i0, 5
-    vinsgr2vr.w VI0, i0, 0 //1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 1 //2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 2 //3
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 3 //4
-    .align 3
-
-.L24:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    vadd.w VI1, VI1, VINC8
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vadd.w VI2, VI1, VINC4
-    vfcmp.clt.s VT0, VX1, VX0
-    addi.d I, I, -1
-    vbitsel.v VM1, VX0, VX1, VT0
-    vbitsel.v VI2, VI1, VI2, VT0
-    vfcmp.clt.s VT0, VM1, VM0
-    vbitsel.v VM0, VM0, VM1, VT0
-    vbitsel.v VI0, VI0, VI2, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    vreplvei.w VI1, VI0, 0
-    vreplvei.w VI2, VI0, 1
-    vreplvei.w VI3, VI0, 2
-    vreplvei.w VI4, VI0, 3
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfcmp.clt.s VT0, x2, x1
-    vbitsel.v VM1, x1, x2, VT0
-    vbitsel.v VINC4, VI1, VI2, VT0
-    vfcmp.clt.s VT0, x4, x3
-    vbitsel.v VM0, x3, x4, VT0
-    vbitsel.v VINC8, VI3, VI4, VT0
-    vfcmp.clt.s VT0, VM1, VM0
-    vbitsel.v VM0, VM0, VM1, VT0
-    vbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    vfcmp.ceq.s VT0, x1, VM0
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.s VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
-.L26:
-    vfcmp.ceq.s VT0, x2, VM0
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L27
-    vfcmp.clt.s VT0, VI2, VI0
-    vbitsel.v VI0, VI0, VI2, VT0
-    .align 3
-
-.L27:
-    vfcmp.ceq.s VT0, x3, VM0
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L28
-    vfcmp.clt.s VT0, VI3, VI0
-    vbitsel.v VI0, VI0, VI3, VT0
-    .align 3
-
-.L28:
-    vfcmp.ceq.s VT0, x4, VM0
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L29
-    vfcmp.clt.s VT0, VI4, VI0
-    vbitsel.v VI0, VI0, VI4, VT0
-    .align 3
-
-.L29:
-    movfr2gr.s i0, $f20
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    fcmp.clt.s $fcc0, $f9, $f15
-    fsel $f15, $f15, $f9, $fcc0
-    fsel $f20, $f20, $f21, $fcc0
-    addi.d I, I, -1
-    addi.d i1, i1, 1
-    add.d  X, X, INCX
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.s i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file

From ea70e165c71201e46961c479e53c17d4034290f8 Mon Sep 17 00:00:00 2001
From: zhoupeng <zhoupeng@loongson.cn>
Date: Thu, 28 Dec 2023 20:07:59 +0800
Subject: [PATCH 517/718] loongarch64: Refine rot optimization.

---
 common_loongarch64.h                          |   10 +
 kernel/loongarch64/KERNEL.LOONGSON2K1000      |    4 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5         |    4 +-
 kernel/loongarch64/drot_lsx.S                 | 1050 ----------
 .../loongarch64/{drot_lasx.S => rot_lasx.S}   |  779 ++++++-
 kernel/loongarch64/rot_lsx.S                  | 1791 +++++++++++++++++
 kernel/loongarch64/srot_lasx.S                |  863 --------
 kernel/loongarch64/srot_lsx.S                 |  927 ---------
 8 files changed, 2532 insertions(+), 2896 deletions(-)
 delete mode 100644 kernel/loongarch64/drot_lsx.S
 rename kernel/loongarch64/{drot_lasx.S => rot_lasx.S} (52%)
 create mode 100644 kernel/loongarch64/rot_lsx.S
 delete mode 100644 kernel/loongarch64/srot_lasx.S
 delete mode 100644 kernel/loongarch64/srot_lsx.S

diff --git a/common_loongarch64.h b/common_loongarch64.h
index 846fc0dbd..13514d6e0 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -129,6 +129,7 @@ static inline int WhereAmI(void){
 #define CMPLE   fcmp.cle.d
 #define CMPLT   fcmp.clt.d
 #define NEG     fneg.d
+#define FFINT   ffint.d.l
 
 #define XVFSUB  xvfsub.d
 #define XVFADD  xvfadd.d
@@ -139,6 +140,8 @@ static inline int WhereAmI(void){
 #define XVFMAXA xvfmaxa.d
 #define XVCMPEQ xvfcmp.ceq.d
 #define XVCMPLT xvfcmp.clt.d
+#define XVMUL   xvfmul.d
+#define XVMSUB  xvfmsub.d
 
 #define VFSUB  vfsub.d
 #define VFADD  vfadd.d
@@ -149,6 +152,8 @@ static inline int WhereAmI(void){
 #define VFMAXA vfmaxa.d
 #define VCMPEQ vfcmp.ceq.d
 #define VCMPLT vfcmp.clt.d
+#define VMUL   vfmul.d
+#define VMSUB  vfmsub.d
 
 #else
 
@@ -174,6 +179,7 @@ static inline int WhereAmI(void){
 #define CMPLE   fcmp.cle.s
 #define CMPLT   fcmp.clt.s
 #define NEG     fneg.s
+#define FFINT   ffint.s.l
 
 #define XVFSUB  xvfsub.s
 #define XVFADD  xvfadd.s
@@ -184,6 +190,8 @@ static inline int WhereAmI(void){
 #define XVFMAXA xvfmaxa.s
 #define XVCMPEQ xvfcmp.ceq.s
 #define XVCMPLT xvfcmp.clt.s
+#define XVMUL   xvfmul.s
+#define XVMSUB  xvfmsub.s
 
 #define VFSUB  vfsub.s
 #define VFADD  vfadd.s
@@ -194,6 +202,8 @@ static inline int WhereAmI(void){
 #define VFMAXA vfmaxa.s
 #define VCMPEQ vfcmp.ceq.s
 #define VCMPLT vfcmp.clt.s
+#define VMUL   vfmul.s
+#define VMSUB  vfmsub.s
 
 #endif /* defined(DOUBLE) */
 
diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 9164f28ef..b315c81f2 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -49,8 +49,8 @@ DSUMKERNEL  =  sum_lsx.S
 SASUMKERNEL =  sasum_lsx.S
 DASUMKERNEL =  dasum_lsx.S
 
-SROTKERNEL  =  srot_lsx.S
-DROTKERNEL  =  drot_lsx.S
+SROTKERNEL  =  rot_lsx.S
+DROTKERNEL  =  rot_lsx.S
 
 SNRM2KERNEL =  snrm2_lsx.S
 DNRM2KERNEL =  dnrm2_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 5882b8932..577f6316e 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -49,8 +49,8 @@ DSUMKERNEL  =  sum_lasx.S
 SASUMKERNEL =  sasum_lasx.S
 DASUMKERNEL =  dasum_lasx.S
 
-SROTKERNEL  =  srot_lasx.S
-DROTKERNEL  =  drot_lasx.S
+SROTKERNEL  =  rot_lasx.S
+DROTKERNEL  =  rot_lasx.S
 
 SNRM2KERNEL =  snrm2_lasx.S
 DNRM2KERNEL =  dnrm2_lasx.S
diff --git a/kernel/loongarch64/drot_lsx.S b/kernel/loongarch64/drot_lsx.S
deleted file mode 100644
index 6db803b1c..000000000
--- a/kernel/loongarch64/drot_lsx.S
+++ /dev/null
@@ -1,1050 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define Y      $r7
-#define INCY   $r8
-#define C      $f0
-#define S      $f1
-
-#define I      $r12
-#define TEMP   $r13
-#define t1     $r14
-#define t2     $r16
-#define t3     $r15
-#define t4     $r17
-#define XX     $r18
-#define YY     $r19
-#define a1     $f12
-#define VX0    $vr8
-#define VX1    $vr20
-#define VX2    $vr21
-#define VX3    $vr22
-#define VT0    $vr10
-#define VT1    $vr18
-#define VXC    $vr23
-#define VXS    $vr9
-#define VXZ    $vr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, C
-    vreplgr2vr.d VXC, t1
-    movfr2gr.d t2, S
-    vreplgr2vr.d VXS, t2
-    movfr2gr.d t3, a1
-    vreplgr2vr.d VXZ, t3
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, C, a1
-    bcnez $fcc0, .L110
-    fcmp.ceq.d $fcc0, S, a1
-    bcnez $fcc0, .L112 // C!=0 S==0
-    b .L111 // C!=0 S!=0
-    .align 3
-
-.L110:
-    fcmp.ceq.d $fcc0, S, a1
-    bcnez $fcc0, .L114 // C==0 S==0
-    b .L113 // C==0 S!=0
-    .align 3
-
-.L111: // C!=0 S!=0
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    vld VX3, Y, 2 * SIZE
-    vfmul.d VT0, VX0, VXC
-    vfmadd.d VT0, VX2, VXS, VT0
-    vfmul.d VT1, VX0, VXS
-    vfmsub.d VT1, VX2, VXC, VT1
-    vst VT0, X, 0 * SIZE
-    vst VT1, Y, 0 * SIZE
-    vfmul.d VT0, VX1, VXC
-    vfmadd.d VT0, VX3, VXS, VT0
-    vfmul.d VT1, VX1, VXS
-    vfmsub.d VT1, VX3, VXC, VT1
-    vst VT0, X, 2 * SIZE
-    vst VT1, Y, 2 * SIZE
-    vld VX0, X, 4 * SIZE
-    vld VX2, Y, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vld VX3, Y, 6 * SIZE
-    vfmul.d VT0, VX0, VXC
-    vfmadd.d VT0, VX2, VXS, VT0
-    vfmul.d VT1, VX0, VXS
-    vfmsub.d VT1, VX2, VXC, VT1
-    vst VT0, X, 4 * SIZE
-    vst VT1, Y, 4 * SIZE
-    vfmul.d VT0, VX1, VXC
-    vfmadd.d VT0, VX3, VXS, VT0
-    vfmul.d VT1, VX1, VXS
-    vfmsub.d VT1, VX3, VXC, VT1
-    vst VT0, X, 6 * SIZE
-    vst VT1, Y, 6 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L111
-    b .L997
-    .align 3
-
-.L112: // C!=0 S==0
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    vld VX3, Y, 2 * SIZE
-    vfmul.d VT0, VX0, VXC
-    vfmul.d VT1, VX2, VXC
-    vst VT0, X, 0 * SIZE
-    vst VT1, Y, 0 * SIZE
-    vfmul.d VT0, VX1, VXC
-    vfmul.d VT1, VX3, VXC
-    vst VT0, X, 2 * SIZE
-    vst VT1, Y, 2 * SIZE
-    vld VX0, X, 4 * SIZE
-    vld VX2, Y, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vld VX3, Y, 6 * SIZE
-    vfmul.d VT0, VX0, VXC
-    vfmul.d VT1, VX2, VXC
-    vst VT0, X, 4 * SIZE
-    vst VT1, Y, 4 * SIZE
-    vfmul.d VT0, VX1, VXC
-    vfmul.d VT1, VX3, VXC
-    vst VT0, X, 6 * SIZE
-    vst VT1, Y, 6 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L112
-    b .L997
-    .align 3
-
-.L113: // C==0 S!=0
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    vld VX3, Y, 2 * SIZE
-    vfmul.d VT0, VX2, VXS
-    vfmul.d VT1, VX0, VXS
-    vfsub.d VT1, VXZ, VT1
-    vst VT0, X, 0 * SIZE
-    vst VT1, Y, 0 * SIZE
-    vfmul.d VT0, VX3, VXS
-    vfmul.d VT1, VX1, VXS
-    vfsub.d VT1, VXZ, VT1
-    vst VT0, X, 2 * SIZE
-    vst VT1, Y, 2 * SIZE
-    vld VX0, X, 4 * SIZE
-    vld VX2, Y, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vld VX3, Y, 6 * SIZE
-    vfmul.d VT0, VX2, VXS
-    vfmul.d VT1, VX0, VXS
-    vfsub.d VT1, VXZ, VT1
-    vst VT0, X, 4 * SIZE
-    vst VT1, Y, 4 * SIZE
-    vfmul.d VT0, VX3, VXS
-    vfmul.d VT1, VX1, VXS
-    vfsub.d VT1, VXZ, VT1
-    vst VT0, X, 6 * SIZE
-    vst VT1, Y, 6 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L113
-    b .L997
-    .align 3
-
-.L114: // C==0 S==0
-    vst VXZ, X, 0 * SIZE
-    vst VXZ, Y, 0 * SIZE
-    vst VXZ, X, 2 * SIZE
-    vst VXZ, Y, 2 * SIZE
-    vst VXZ, X, 4 * SIZE
-    vst VXZ, Y, 4 * SIZE
-    vst VXZ, X, 6 * SIZE
-    vst VXZ, Y, 6 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L114
-    b .L997
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L997
-    move YY, Y
-    move XX, X
-    fcmp.ceq.d $fcc0, C, a1
-    bcnez $fcc0, .L120
-    fcmp.ceq.d $fcc0, S, a1
-    bcnez $fcc0, .L122 // C!=0 S==0
-    b .L121 // C!=0 S!=0
-    .align 3
-
-.L120:
-    fcmp.ceq.d $fcc0, S, a1
-    bcnez $fcc0, .L124 // C==0 S==0
-    b .L123 // C==0 S!=0
-    .align 3
-
-.L121: // C!=0 S!=0
-    vld VX0, X, 0 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t1, 0
-    vinsgr2vr.d VX2, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX0, VXC
-    vfmadd.d VT0, VX2, VXS, VT0
-    vfmul.d VT1, VX0, VXS
-    vfmsub.d VT1, VX2, VXC, VT1
-    vst VT0, X, 0 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX0, X, 2 * SIZE
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t3, 0
-    vinsgr2vr.d VX2, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX0, VXC
-    vfmadd.d VT0, VX2, VXS, VT0
-    vfmul.d VT1, VX0, VXS
-    vfmsub.d VT1, VX2, VXC, VT1
-    vst VT0, X, 2 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX1, X, 4 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t1, 0
-    vinsgr2vr.d VX3, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX1, VXC
-    vfmadd.d VT0, VX3, VXS, VT0
-    vfmul.d VT1, VX1, VXS
-    vfmsub.d VT1, VX3, VXC, VT1
-    vst VT0, X, 4 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX1, X, 6 * SIZE
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t3, 0
-    vinsgr2vr.d VX3, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX1, VXC
-    vfmadd.d VT0, VX3, VXS, VT0
-    vfmul.d VT1, VX1, VXS
-    vfmsub.d VT1, VX3, VXC, VT1
-    vst VT0, X, 6 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    b .L997
-    .align 3
-
-.L122: // C!=0 S==0
-    vld VX0, X, 0 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t1, 0
-    vinsgr2vr.d VX2, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX0, VXC
-    vfmul.d VT1, VX2, VXC
-    vst VT0, X, 0 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX0, X, 2 * SIZE
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t3, 0
-    vinsgr2vr.d VX2, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX0, VXC
-    vfmul.d VT1, VX2, VXC
-    vst VT0, X, 2 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX1, X, 4 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t1, 0
-    vinsgr2vr.d VX3, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX1, VXC
-    vfmul.d VT1, VX3, VXC
-    vst VT0, X, 4 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX1, X, 6 * SIZE
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t3, 0
-    vinsgr2vr.d VX3, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX1, VXC
-    vfmul.d VT1, VX3, VXC
-    vst VT0, X, 6 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L122
-    b .L997
-    .align 3
-
-.L123: // C==0 S!=0
-    vld VX0, X, 0 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t1, 0
-    vinsgr2vr.d VX2, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX2, VXS
-    vfmul.d VT1, VX0, VXS
-    vfsub.d VT1, VXZ, VT1
-    vst VT0, X, 0 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX0, X, 2 * SIZE
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t3, 0
-    vinsgr2vr.d VX2, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX2, VXS
-    vfmul.d VT1, VX0, VXS
-    vfsub.d VT1, VXZ, VT1
-    vst VT0, X, 2 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX1, X, 4 * SIZE
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t1, 0
-    vinsgr2vr.d VX3, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX3, VXS
-    vfmul.d VT1, VX1, VXS
-    vfsub.d VT1, VXZ, VT1
-    vst VT0, X, 4 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vld VX1, X, 6 * SIZE
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t3, 0
-    vinsgr2vr.d VX3, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX3, VXS
-    vfmul.d VT1, VX1, VXS
-    vfsub.d VT1, VXZ, VT1
-    vst VT0, X, 6 * SIZE
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L123
-    b .L997
-    .align 3
-
-.L124: // C==0 S==0
-    vst VXZ, X, 0 * SIZE
-    vst VXZ, X, 4 * SIZE
-    vstelm.d VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L124
-    b .L997
-    .align 3
-
-.L21:// INCX!=1 and INCY==1
-    bge $r0, I, .L997
-    move XX, X
-    fcmp.ceq.d $fcc0, C, a1
-    bcnez $fcc0, .L210
-    fcmp.ceq.d $fcc0, S, a1
-    bcnez $fcc0, .L212 // C!=0 S==0
-    b .L211 // C!=0 S!=0
-    .align 3
-
-.L210:
-    fcmp.ceq.d $fcc0, S, a1
-    bcnez $fcc0, .L214 // C==0 S==0
-    b .L213 // C==0 S!=0
-    .align 3
-
-.L211: // C!=0 S!=0
-    vld VX2, Y, 0 * SIZE
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VXC, VX0
-    vfmadd.d VT0, VX2, VXS, VT0
-    vfmul.d VT1, VXS, VX0
-    vfmsub.d VT1, VX2, VXC, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 0 * SIZE
-    vld VX2, Y, 2 * SIZE
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX0, t3, 0
-    vinsgr2vr.d VX0, t4, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VXC, VX0
-    vfmadd.d VT0, VX2, VXS, VT0
-    vfmul.d VT1, VXS, VX0
-    vfmsub.d VT1, VX2, VXC, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 2 * SIZE
-    vld VX3, Y, 4 * SIZE
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VX1, VXC
-    vfmadd.d VT0, VX3, VXS, VT0
-    vfmul.d VT1, VX1, VXS
-    vfmsub.d VT1, VX3, VXC, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 4 * SIZE
-    vld VX3, Y, 6 * SIZE
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VX1, VXC
-    vfmadd.d VT0, VX3, VXS, VT0
-    vfmul.d VT1, VX1, VXS
-    vfmsub.d VT1, VX3, VXC, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 6 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    b .L997
-    .align 3
-
-.L212: // C!=0 S==0
-    vld VX2, Y, 0 * SIZE
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VXC, VX0
-    vfmul.d VT1, VX2, VXC
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 0 * SIZE
-    vld VX2, Y, 2 * SIZE
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX0, t3, 0
-    vinsgr2vr.d VX0, t4, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VXC, VX0
-    vfmul.d VT1, VX2, VXC
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 2 * SIZE
-    vld VX3, Y, 4 * SIZE
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VX1, VXC
-    vfmul.d VT1, VX3, VXS
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 4 * SIZE
-    vld VX3, Y, 6 * SIZE
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VX1, VXC
-    vfmul.d VT1, VX3, VXS
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    vst VT1, Y, 6 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L212
-    b .L997
-    .align 3
-
-.L213: // C==0 S!=0
-    vld VX2, Y, 0 * SIZE
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VXS, VX2
-    vfmul.d VT1, VXS, VX0
-    vfsub.d VT1, VXZ, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 0 * SIZE
-    vld VX2, Y, 2 * SIZE
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX0, t3, 0
-    vinsgr2vr.d VX0, t4, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VXS, VX2
-    vfmul.d VT1, VXS, VX0
-    vfsub.d VT1, VXZ, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 2 * SIZE
-    vld VX3, Y, 4 * SIZE
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VX3, VXS
-    vfmul.d VT1, VX1, VXS
-    vfsub.d VT1, VXZ, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 4 * SIZE
-    vld VX3, Y, 6 * SIZE
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    add.d X, X, INCX
-    vfmul.d VT0, VX3, VXS
-    vfmul.d VT1, VX1, VXS
-    vfsub.d VT1, VXZ, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 6 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L213
-    b .L997
-    .align 3
-
-.L214: // C==0 S==0
-    vstelm.d VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 0 * SIZE
-    vstelm.d VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vst VT1, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    b .L997
-    .align 3
-
-.L22:
-    bge $r0, I, .L997
-    move YY, Y
-    move XX, X
-    fcmp.ceq.d $fcc0, C, a1
-    bcnez $fcc0, .L220
-    fcmp.ceq.d $fcc0, S, a1
-    bcnez $fcc0, .L222 // C!=0 S==0
-    b .L221 // C!=0 S!=0
-    .align 3
-
-.L220:
-    fcmp.ceq.d $fcc0, S, a1
-    bcnez $fcc0, .L224 // C==0 S==0
-    b .L223 // C==0 S!=0
-    .align 3
-
-.L221: // C!=0 S!=0
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    add.d X, X, INCX
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t1, 0
-    vinsgr2vr.d VX2, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX0, VXC
-    vfmadd.d VT0, VX2, VXS, VT0
-    vfmul.d VT1, VX0, VXS
-    vfmsub.d VT1, VX2, VXC, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t3, 0
-    vinsgr2vr.d VX0, t4, 1
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t3, 0
-    vinsgr2vr.d VX2, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d  VT0, VX0, VXC
-    vfmadd.d VT0, VX2, VXS, VT0
-    vfmul.d  VT1, VX0, VXS
-    vfmsub.d VT1, VX2, VXC, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t1, 0
-    vinsgr2vr.d VX3, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX1, VXC
-    vfmadd.d VT0, VX3, VXS, VT0
-    vfmul.d VT1, VX0, VXS
-    vfmsub.d VT1, VX3, VXC, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    add.d X, X, INCX
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t3, 0
-    vinsgr2vr.d VX3, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX1, VXC
-    vfmadd.d VT0, VX3, VXS, VT0
-    vfmul.d VT1, VX0, VXS
-    vfmsub.d VT1, VX3, VXC, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L221
-    b .L997
-    .align 3
-
-.L222: // C!=0 S==0
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t1, 0
-    vinsgr2vr.d VX2, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX0, VXC
-    vfmul.d VT1, VX2, VXC
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t3, 0
-    vinsgr2vr.d VX0, t4, 1
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t3, 0
-    vinsgr2vr.d VX2, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX0, VXC
-    vfmul.d VT1, VX2, VXC
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t1, 0
-    vinsgr2vr.d VX3, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX1, VXC
-    vfmul.d VT1, VX3, VXC
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t3, 0
-    vinsgr2vr.d VX3, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX1, VXC
-    vfmul.d VT1, VX3, VXC
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    b .L997
-    .align 3
-
-.L223: // C==0 S!=0
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t1, 0
-    vinsgr2vr.d VX2, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX2, VXS
-    vfmul.d VT1, VX0, VXS
-    vfsub.d VT1, VXZ, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t3, 0
-    vinsgr2vr.d VX0, t4, 1
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX2, t3, 0
-    vinsgr2vr.d VX2, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX2, VXS
-    vfmul.d VT1, VX0, VXS
-    vfsub.d VT1, VXZ, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    ld.d t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t1, 0
-    vinsgr2vr.d VX3, t2, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX3, VXS
-    vfmul.d VT1, VX0, VXS
-    vfsub.d VT1, VXZ, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    ld.d t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
-    vinsgr2vr.d VX3, t3, 0
-    vinsgr2vr.d VX3, t4, 1
-    add.d Y, Y, INCY
-    vfmul.d VT0, VX3, VXS
-    vfmul.d VT1, VX0, VXS
-    vfsub.d VT1, VXZ, VT1
-    vstelm.d VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L223
-    b .L997
-    .align 3
-
-.L224: // C==0 S==0
-    vstelm.d VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.d VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.d VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.d VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.d VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L224
-    b .L997
-    .align 3
-
-.L997:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L998:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f13, Y, 0 * SIZE
-    fmul.d $f10, $f12, C
-    fmadd.d $f10, $f13, S, $f10
-    fst.d $f10, X, 0 * SIZE
-    addi.d I, I, -1
-    fmul.d $f20, $f12, S
-    fmsub.d $f20, $f13, C, $f20
-    fst.d $f20, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L998
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/drot_lasx.S b/kernel/loongarch64/rot_lasx.S
similarity index 52%
rename from kernel/loongarch64/drot_lasx.S
rename to kernel/loongarch64/rot_lasx.S
index d3644b780..5d7e3d7cc 100644
--- a/kernel/loongarch64/drot_lasx.S
+++ b/kernel/loongarch64/rot_lasx.S
@@ -1,3 +1,30 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
 #define ASSEMBLER
 
 #include "common.h"
@@ -33,16 +60,25 @@
     bge $r0, N, .L999
     li.d TEMP, 1
     movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    FFINT a1, a1
     slli.d  TEMP, TEMP, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
+#ifdef DOUBLE
     movfr2gr.d t1, C
     xvreplgr2vr.d VXC, t1
     movfr2gr.d t2, S
     xvreplgr2vr.d VXS, t2
     movfr2gr.d t3, a1
     xvreplgr2vr.d VXZ, t3
+#else
+    movfr2gr.s t1, C
+    xvreplgr2vr.w VXC, t1
+    movfr2gr.s t2, S
+    xvreplgr2vr.w VXS, t2
+    movfr2gr.s t3, a1
+    xvreplgr2vr.w VXZ, t3
+#endif
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
@@ -53,15 +89,15 @@
 
 .L11:
     bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, C, a1
+    CMPEQ $fcc0, C, a1
     bcnez $fcc0, .L110
-    fcmp.ceq.d $fcc0, S, a1
+    CMPEQ $fcc0, S, a1
     bcnez $fcc0, .L112 // C!=0 S==0
     b .L111 // C!=0 S!=0
     .align 3
 
 .L110:
-    fcmp.ceq.d $fcc0, S, a1
+    CMPEQ $fcc0, S, a1
     bcnez $fcc0, .L114 // C==0 S==0
     b .L113 // C==0 S!=0
     .align 3
@@ -69,20 +105,24 @@
 .L111: // C!=0 S!=0
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     xvld VX1, X, 4 * SIZE
     xvld VX3, Y, 4 * SIZE
-    xvfmul.d VT0, VX0, VXC
-    xvfmadd.d VT0, VX2, VXS, VT0
-    xvfmul.d VT1, VX0, VXS
-    xvfmsub.d VT1, VX2, VXC, VT1
+#endif
+    XVMUL VT0, VX0, VXC
+    XVFMADD VT0, VX2, VXS, VT0
+    XVMUL VT1, VX0, VXS
+    XVMSUB VT1, VX2, VXC, VT1
     xvst VT0, X, 0 * SIZE
     xvst VT1, Y, 0 * SIZE
-    xvfmul.d VT0, VX1, VXC
-    xvfmadd.d VT0, VX3, VXS, VT0
-    xvfmul.d VT1, VX1, VXS
-    xvfmsub.d VT1, VX3, VXC, VT1
+#ifdef DOUBLE
+    XVMUL VT0, VX1, VXC
+    XVFMADD VT0, VX3, VXS, VT0
+    XVMUL VT1, VX1, VXS
+    XVMSUB VT1, VX3, VXC, VT1
     xvst VT0, X, 4 * SIZE
     xvst VT1, Y, 4 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
@@ -93,16 +133,20 @@
 .L112: // C!=0 S==0
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     xvld VX1, X, 4 * SIZE
     xvld VX3, Y, 4 * SIZE
-    xvfmul.d VT0, VX0, VXC
-    xvfmul.d VT1, VX2, VXC
+#endif
+    XVMUL VT0, VX0, VXC
+    XVMUL VT1, VX2, VXC
     xvst VT0, X, 0 * SIZE
     xvst VT1, Y, 0 * SIZE
-    xvfmul.d VT0, VX1, VXC
-    xvfmul.d VT1, VX3, VXC
+#ifdef DOUBLE
+    XVMUL VT0, VX1, VXC
+    XVMUL VT1, VX3, VXC
     xvst VT0, X, 4 * SIZE
     xvst VT1, Y, 4 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
@@ -113,18 +157,22 @@
 .L113: // C==0 S!=0
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     xvld VX1, X, 4 * SIZE
     xvld VX3, Y, 4 * SIZE
-    xvfmul.d VT0, VX2, VXS
-    xvfmul.d VT1, VX0, VXS
-    xvfsub.d VT1, VXZ, VT1
+#endif
+    XVMUL VT0, VX2, VXS
+    XVMUL VT1, VX0, VXS
+    XVFSUB VT1, VXZ, VT1
     xvst VT0, X, 0 * SIZE
     xvst VT1, Y, 0 * SIZE
-    xvfmul.d VT0, VX3, VXS
-    xvfmul.d VT1, VX1, VXS
+#ifdef DOUBLE
+    XVMUL VT0, VX3, VXS
+    XVMUL VT1, VX1, VXS
     xvfsub.d VT1, VXZ, VT1
     xvst VT0, X, 4 * SIZE
     xvst VT1, Y, 4 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
@@ -135,8 +183,10 @@
 .L114: // C==0 S==0
     xvst VXZ, X, 0 * SIZE
     xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
     xvst VXZ, X, 4 * SIZE
     xvst VXZ, Y, 4 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
@@ -148,37 +198,66 @@
     bge $r0, I, .L997
     move YY, Y
     move XX, X
-    fcmp.ceq.d $fcc0, C, a1
+    CMPEQ $fcc0, C, a1
     bcnez $fcc0, .L120
-    fcmp.ceq.d $fcc0, S, a1
+    CMPEQ $fcc0, S, a1
     bcnez $fcc0, .L122 // C!=0 S==0
     b .L121 // C!=0 S!=0
     .align 3
 
 .L120:
-    fcmp.ceq.d $fcc0, S, a1
+    CMPEQ $fcc0, S, a1
     bcnez $fcc0, .L124 // C==0 S==0
     b .L123 // C==0 S!=0
     .align 3
 
 .L121: // C!=0 S!=0
     xvld VX0, X, 0 * SIZE
-    ld.d t1, Y, 0 * SIZE
+#ifdef DOUBLE
+    ld.d  t1, Y, 0 * SIZE
     add.d Y, Y, INCY
-    ld.d t2, Y, 0 * SIZE
+    ld.d  t2, Y, 0 * SIZE
     add.d Y, Y, INCY
-    ld.d t3, Y, 0 * SIZE
+    ld.d  t3, Y, 0 * SIZE
     add.d Y, Y, INCY
-    ld.d t4, Y, 0 * SIZE
+    ld.d  t4, Y, 0 * SIZE
     xvinsgr2vr.d VX2, t1, 0
     xvinsgr2vr.d VX2, t2, 1
     xvinsgr2vr.d VX2, t3, 2
     xvinsgr2vr.d VX2, t4, 3
     add.d Y, Y, INCY
-    xvfmul.d VT0, VX0, VXC
-    xvfmadd.d VT0, VX2, VXS, VT0
-    xvfmul.d VT1, VX0, VXS
-    xvfmsub.d VT1, VX2, VXC, VT1
+#else
+    ld.w  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+#endif
+    XVMUL VT0, VX0, VXC
+    XVFMADD VT0, VX2, VXS, VT0
+    XVMUL VT1, VX0, VXS
+    XVMSUB VT1, VX2, VXC, VT1
+
+#ifdef DOUBLE
     xvld VX1, X, 4 * SIZE
     xvst VT0, X, 0 * SIZE
     xvstelm.d VT1, YY, 0, 0
@@ -201,11 +280,10 @@
     xvinsgr2vr.d VX3, t3, 2
     xvinsgr2vr.d VX3, t4, 3
     add.d Y, Y, INCY
-    xvfmul.d VT0, VX1, VXC
-    xvfmadd.d VT0, VX3, VXS, VT0
-    xvfmul.d VT1, VX1, VXS
-    xvfmsub.d VT1, VX3, VXC, VT1
-    addi.d  I, I, -1
+    XVMUL VT0, VX1, VXC
+    XVFMADD VT0, VX3, VXS, VT0
+    XVMUL VT1, VX1, VXS
+    XVMSUB VT1, VX3, VXC, VT1
     xvst VT0, X, 4 * SIZE
     xvstelm.d VT1, YY, 0, 0
     add.d YY, YY, INCY
@@ -214,13 +292,34 @@
     xvstelm.d VT1, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VT1, YY, 0, 3
+#else
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
     blt $r0, I, .L121
     b .L997
     .align 3
 
 .L122: // C!=0 S==0
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
@@ -269,13 +368,60 @@
     xvstelm.d VT1, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VT1, YY, 0, 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX0, VXC
+    xvfmul.s VT1, VX2, VXC
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
     blt $r0, I, .L122
     b .L997
     .align 3
 
 .L123: // C==0 S!=0
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
@@ -326,14 +472,63 @@
     xvstelm.d VT1, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VT1, YY, 0, 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX2, VXS
+    xvfmul.s VT1, VX0, VXS
+    xvfsub.s VT1, VXZ, VT1
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
     blt $r0, I, .L123
     b .L997
     .align 3
 
 .L124: // C==0 S==0
     xvst VXZ, X, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, X, 0 * SIZE
     xvst VXZ, X, 4 * SIZE
     xvstelm.d VXZ, YY, 0, 0
     add.d YY, YY, INCY
@@ -350,29 +545,50 @@
     xvstelm.d VXZ, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VXZ, YY, 0, 3
+#else
+    xvst VXZ, X, 0 * SIZE
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d  I, I, -1
+    addi.d X, X, 8 * SIZE
     blt $r0, I, .L124
+    move Y, YY
     b .L997
     .align 3
 
 .L21:// INCX!=1 and INCY==1
     bge $r0, I, .L997
     move XX, X
-    fcmp.ceq.d $fcc0, C, a1
+    CMPEQ $fcc0, C, a1
     bcnez $fcc0, .L210
-    fcmp.ceq.d $fcc0, S, a1
+    CMPEQ $fcc0, S, a1
     bcnez $fcc0, .L212 // C!=0 S==0
     b .L211 // C!=0 S!=0
     .align 3
 
 .L210:
-    fcmp.ceq.d $fcc0, S, a1
+    CMPEQ $fcc0, S, a1
     bcnez $fcc0, .L214 // C==0 S==0
     b .L213 // C==0 S!=0
     .align 3
 
 .L211: // C!=0 S!=0
+#ifdef DOUBLE
     xvld VX2, Y, 0 * SIZE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
@@ -425,6 +641,54 @@
     xvstelm.d VT0, XX, 0, 3
     add.d XX, XX, INCX
     xvst VT1, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXC, VX0
+    xvfmadd.s VT0, VX2, VXS, VT0
+    xvfmul.s VT1, VX0, VXS
+    xvfmsub.s VT1, VX2, VXC, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L211
@@ -432,6 +696,7 @@
     .align 3
 
 .L212: // C!=0 S==0
+#ifdef DOUBLE
     xvld VX2, Y, 0 * SIZE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
@@ -480,6 +745,52 @@
     add.d XX, XX, INCX
     xvfmul.d VT1, VX3, VXS
     xvst VT1, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXC, VX0
+    xvfmul.s VT1, VX2, VXC
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L212
@@ -487,6 +798,7 @@
     .align 3
 
 .L213: // C==0 S!=0
+#ifdef DOUBLE
     xvld VX2, Y, 0 * SIZE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
@@ -537,6 +849,53 @@
     xvstelm.d VT0, XX, 0, 3
     add.d XX, XX, INCX
     xvst VT1, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXS, VX2
+    xvfmul.s VT1, VXS, VX0
+    xvfsub.s VT1, VXZ, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L213
@@ -544,6 +903,7 @@
     .align 3
 
 .L214: // C==0 S==0
+#ifdef DOUBLE
     xvstelm.d VXZ, XX, 0, 0
     add.d XX, XX, INCX
     xvstelm.d VXZ, XX, 0, 1
@@ -562,6 +922,25 @@
     xvstelm.d VXZ, XX, 0, 3
     add.d XX, XX, INCX
     xvst VT1, Y, 4 * SIZE
+#else
+    xvstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    xvstelm.w VXZ, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 7
+    add.d XX, XX, INCX
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L211
@@ -572,20 +951,21 @@
     bge $r0, I, .L997
     move YY, Y
     move XX, X
-    fcmp.ceq.d $fcc0, C, a1
+    CMPEQ $fcc0, C, a1
     bcnez $fcc0, .L220
-    fcmp.ceq.d $fcc0, S, a1
+    CMPEQ $fcc0, S, a1
     bcnez $fcc0, .L222 // C!=0 S==0
     b .L221 // C!=0 S!=0
     .align 3
 
 .L220:
-    fcmp.ceq.d $fcc0, S, a1
+    CMPEQ $fcc0, S, a1
     bcnez $fcc0, .L224 // C==0 S==0
     b .L223 // C==0 S!=0
     .align 3
 
 .L221: // C!=0 S!=0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -674,12 +1054,99 @@
     add.d YY, YY, INCY
     xvstelm.d VT1, YY, 0, 3
     add.d YY, YY, INCY
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s  VT0, VX0, VXC
+    xvfmadd.s VT0, VX2, VXS, VT0
+    xvfmul.s  VT1, VX0, VXS
+    xvfmsub.s VT1, VX2, VXC, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+#endif
     addi.d  I, I, -1
     blt $r0, I, .L221
     b .L997
     .align 3
 
 .L222: // C!=0 S==0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -764,12 +1231,97 @@
     add.d YY, YY, INCY
     xvstelm.d VT1, YY, 0, 3
     add.d YY, YY, INCY
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX0, VXC
+    xvfmul.s VT1, VX2, VXC
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+#endif
     addi.d  I, I, -1
     blt $r0, I, .L222
     b .L997
     .align 3
 
 .L223: // C==0 S!=0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -856,12 +1408,98 @@
     add.d YY, YY, INCY
     xvstelm.d VT1, YY, 0, 3
     add.d YY, YY, INCY
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX2, VXS
+    xvfmul.s VT1, VX0, VXS
+    xvfsub.s VT1, VXZ, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+#endif
     addi.d  I, I, -1
     blt $r0, I, .L223
     b .L997
     .align 3
 
 .L224: // C==0 S==0
+#ifdef DOUBLE
     xvstelm.d VXZ, XX, 0, 0
     add.d XX, XX, INCX
     xvstelm.d VXZ, XX, 0, 1
@@ -893,9 +1531,46 @@
     xvstelm.d VXZ, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VXZ, YY, 0, 3
+#else
+    xvstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d  I, I, -1
     blt $r0, I, .L224
+#ifdef DOUBLE
+    move X, XX
+    move Y, YY
+#endif
     b .L997
     .align 3
 
@@ -905,15 +1580,15 @@
     .align 3
 
 .L998:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f13, Y, 0 * SIZE
-    fmul.d $f10, $f12, C
-    fmadd.d $f10, $f13, S, $f10
-    fst.d $f10, X, 0 * SIZE
+    LD   $f12, X, 0 * SIZE
+    LD   $f13, Y, 0 * SIZE
+    MUL  $f10, $f12, C
+    MADD $f10, $f13, S, $f10
+    ST   $f10, X, 0 * SIZE
     addi.d I, I, -1
-    fmul.d $f20, $f12, S
-    fmsub.d $f20, $f13, C, $f20
-    fst.d $f20, Y, 0 * SIZE
+    MUL  $f20, $f12, S
+    MSUB $f20, $f13, C, $f20
+    ST   $f20, Y, 0 * SIZE
     add.d  X, X, INCX
     add.d  Y, Y, INCY
     blt $r0, I, .L998
@@ -924,4 +1599,4 @@
     jirl $r0, $r1, 0x0
     .align 3
 
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S
new file mode 100644
index 000000000..4b0e59310
--- /dev/null
+++ b/kernel/loongarch64/rot_lsx.S
@@ -0,0 +1,1791 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define C      $f0
+#define S      $f1
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VT0    $vr10
+#define VT1    $vr18
+#define VXC    $vr23
+#define VXS    $vr9
+#define VXZ    $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT a1, a1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+#ifdef DOUBLE
+    movfr2gr.d t1, C
+    vreplgr2vr.d VXC, t1
+    movfr2gr.d t2, S
+    vreplgr2vr.d VXS, t2
+    movfr2gr.d t3, a1
+    vreplgr2vr.d VXZ, t3
+#else
+    movfr2gr.s t1, C
+    vreplgr2vr.w VXC, t1
+    movfr2gr.s t2, S
+    vreplgr2vr.w VXS, t2
+    movfr2gr.s t3, a1
+    vreplgr2vr.w VXZ, t3
+#endif
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L110
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+#endif
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+#ifdef DOUBLE
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+#endif
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+#ifdef DOUBLE
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+#endif
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+#ifdef DOUBLE
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+#endif
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+#ifdef DOUBLE
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+#endif
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+#ifdef DOUBLE
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+#endif
+    vst VT0, X, 4 * SIZE
+    vst VT1, Y, 4 * SIZE
+#ifdef DOUBLE
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    vst VXZ, X, 2 * SIZE
+    vst VXZ, Y, 2 * SIZE
+#endif
+    vst VXZ, X, 4 * SIZE
+    vst VXZ, Y, 4 * SIZE
+#ifdef DOUBLE
+    vst VXZ, X, 6 * SIZE
+    vst VXZ, Y, 6 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L120
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L122 // C!=0 S==0
+    b .L121 // C!=0 S!=0
+    .align 3
+
+.L120:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L124 // C==0 S==0
+    b .L123 // C==0 S!=0
+    .align 3
+
+.L121: // C!=0 S!=0
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    ld.d  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d  t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t2, Y, 0 * SIZE
+#endif
+    add.d Y, Y, INCY
+#ifndef DOUBLE
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+#endif
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vst VT0, X, 0 * SIZE
+#ifdef DOUBLE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+#else
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+#endif
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vst VT0, X, 4 * SIZE
+#ifdef DOUBLE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L122: // C!=0 S==0
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    ld.d  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d  t2, Y, 0 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+#else
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+#endif
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vst VT0, X, 0 * SIZE
+#ifdef DOUBLE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vst VT0, X, 4 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vst VT0, X, 4 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122
+    b .L997
+    .align 3
+
+.L123: // C==0 S!=0
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    ld.d  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d  t2, Y, 0 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+#else
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+#endif
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 0 * SIZE
+#ifdef DOUBLE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 4 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 4 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // C==0 S==0
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, X, 4 * SIZE
+#ifdef DOUBLE
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+#else
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L124
+#ifdef DOUBLE
+    move Y, YY
+#endif
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    move XX, X
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L210
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L212 // C!=0 S==0
+    b .L211 // C!=0 S!=0
+    .align 3
+
+.L210:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L214 // C==0 S==0
+    b .L213 // C==0 S!=0
+    .align 3
+
+.L211: // C!=0 S!=0
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+#endif
+    add.d X, X, INCX
+    VMUL VT0, VXC, VX0
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VXS, VX0
+    VMSUB VT1, VX2, VXC, VT1
+#ifdef DOUBLE
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VXC, VX0
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VXS, VX0
+    VMSUB VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 6 * SIZE
+#else
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // C!=0 S==0
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+#endif
+    add.d X, X, INCX
+    VMUL VT0, VXC, VX0
+    VMUL VT1, VX2, VXC
+
+#ifdef DOUBLE
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VXC, VX0
+    VMUL VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXS
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXS
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    vst VT1, Y, 6 * SIZE
+#else
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXS
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // C==0 S!=0
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+#endif
+    add.d X, X, INCX
+    VMUL VT0, VXS, VX2
+    VMUL VT1, VXS, VX0
+    VFSUB VT1, VXZ, VT1
+
+#ifdef DOUBLE
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VXS, VX2
+    VMUL VT1, VXS, VX0
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 6 * SIZE
+#else
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // C==0 S==0
+#ifdef DOUBLE
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+#else
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+#endif
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+#ifdef DOUBLE
+    move X, XX
+#endif
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L220
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0
+#ifdef DOUBLE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+    add.d X, X, INCX
+#else
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL  VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL  VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // C!=0 S==0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+#ifndef DOUBLE
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // C==0 S!=0
+#ifdef DOUBLE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+    add.d X, X, INCX
+#else
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // C==0 S==0
+#ifdef DOUBLE
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+#else
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+#ifdef DOUBLE
+    move X, XX
+    move Y, YY
+#endif
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD   $f12, X, 0 * SIZE
+    LD   $f13, Y, 0 * SIZE
+    MUL  $f10, $f12, C
+    MADD $f10, $f13, S, $f10
+    ST   $f10, X, 0 * SIZE
+    addi.d I, I, -1
+    MUL  $f20, $f12, S
+    MSUB $f20, $f13, C, $f20
+    ST   $f20, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/srot_lasx.S b/kernel/loongarch64/srot_lasx.S
deleted file mode 100644
index 9aeb4dcf5..000000000
--- a/kernel/loongarch64/srot_lasx.S
+++ /dev/null
@@ -1,863 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define Y      $r7
-#define INCY   $r8
-#define C      $f0
-#define S      $f1
-
-#define I      $r12
-#define TEMP   $r13
-#define t1     $r14
-#define t2     $r16
-#define t3     $r15
-#define t4     $r17
-#define XX     $r18
-#define YY     $r19
-#define a1     $f12
-#define VX0    $xr8
-#define VX1    $xr20
-#define VX2    $xr21
-#define VX3    $xr22
-#define VT0    $xr10
-#define VT1    $xr18
-#define VXC    $xr23
-#define VXS    $xr9
-#define VXZ    $xr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.s.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.s t1, C
-    xvreplgr2vr.w VXC, t1
-    movfr2gr.s t2, S
-    xvreplgr2vr.w VXS, t2
-    movfr2gr.s t3, a1
-    xvreplgr2vr.w VXZ, t3
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L997
-    fcmp.ceq.s $fcc0, C, a1
-    bcnez $fcc0, .L110
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L112 // C!=0 S==0
-    b .L111 // C!=0 S!=0
-    .align 3
-
-.L110:
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L114 // C==0 S==0
-    b .L113 // C==0 S!=0
-    .align 3
-
-.L111: // C!=0 S!=0
-    xvld VX0, X, 0 * SIZE
-    xvld VX2, Y, 0 * SIZE
-    xvfmul.s VT0, VX0, VXC
-    xvfmadd.s VT0, VX2, VXS, VT0
-    xvfmul.s VT1, VX0, VXS
-    xvfmsub.s VT1, VX2, VXC, VT1
-    xvst VT0, X, 0 * SIZE
-    xvst VT1, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L111
-    b .L997
-    .align 3
-
-.L112: // C!=0 S==0
-    xvld VX0, X, 0 * SIZE
-    xvld VX2, Y, 0 * SIZE
-    xvfmul.s VT0, VX0, VXC
-    xvfmul.s VT1, VX2, VXC
-    xvst VT0, X, 0 * SIZE
-    xvst VT1, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L112
-    b .L997
-    .align 3
-
-.L113: // C==0 S!=0
-    xvld VX0, X, 0 * SIZE
-    xvld VX2, Y, 0 * SIZE
-    xvfmul.s VT0, VX2, VXS
-    xvfmul.s VT1, VX0, VXS
-    xvfsub.s VT1, VXZ, VT1
-    xvst VT0, X, 0 * SIZE
-    xvst VT1, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L113
-    b .L997
-    .align 3
-
-.L114: // C==0 S==0
-    xvst VXZ, X, 0 * SIZE
-    xvst VXZ, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L114
-    b .L997
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L997
-    move YY, Y
-    move XX, X
-    fcmp.ceq.s $fcc0, C, a1
-    bcnez $fcc0, .L120
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L122 // C!=0 S==0
-    b .L121 // C!=0 S!=0
-    .align 3
-
-.L120:
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L124 // C==0 S==0
-    b .L123 // C==0 S!=0
-    .align 3
-
-.L121: // C!=0 S!=0
-    xvld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VT0, VX0, VXC
-    xvfmadd.s VT0, VX2, VXS, VT0
-    xvfmul.s VT1, VX0, VXS
-    xvfmsub.s VT1, VX2, VXC, VT1
-    xvst VT0, X, 0 * SIZE
-    xvstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    b .L997
-    .align 3
-
-.L122: // C!=0 S==0
-    xvld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VT0, VX0, VXC
-    xvfmul.s VT1, VX2, VXC
-    xvst VT0, X, 0 * SIZE
-    xvstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L122
-    b .L997
-    .align 3
-
-.L123: // C==0 S!=0
-    xvld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VT0, VX2, VXS
-    xvfmul.s VT1, VX0, VXS
-    xvfsub.s VT1, VXZ, VT1
-    xvst VT0, X, 0 * SIZE
-    xvstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L123
-    b .L997
-    .align 3
-
-.L124: // C==0 S==0
-    xvst VXZ, X, 0 * SIZE
-    xvstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L124
-    b .L997
-    .align 3
-
-.L21:// INCX!=1 and INCY==1
-    bge $r0, I, .L997
-    move XX, X
-    fcmp.ceq.s $fcc0, C, a1
-    bcnez $fcc0, .L210
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L212 // C!=0 S==0
-    b .L211 // C!=0 S!=0
-    .align 3
-
-.L210:
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L214 // C==0 S==0
-    b .L213 // C==0 S!=0
-    .align 3
-
-.L211: // C!=0 S!=0
-    xvld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    add.d X, X, INCX
-    xvfmul.s VT0, VXC, VX0
-    xvfmadd.s VT0, VX2, VXS, VT0
-    xvfmul.s VT1, VX0, VXS
-    xvfmsub.s VT1, VX2, VXC, VT1
-    xvstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 7
-    add.d XX, XX, INCX
-    xvst VT1, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    b .L997
-    .align 3
-
-.L212: // C!=0 S==0
-    xvld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    add.d X, X, INCX
-    xvfmul.s VT0, VXC, VX0
-    xvfmul.s VT1, VX2, VXC
-    xvstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 7
-    add.d XX, XX, INCX
-    xvst VT1, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L212
-    b .L997
-    .align 3
-
-.L213: // C==0 S!=0
-    xvld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    add.d X, X, INCX
-    xvfmul.s VT0, VXS, VX2
-    xvfmul.s VT1, VXS, VX0
-    xvfsub.s VT1, VXZ, VT1
-    xvstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 7
-    add.d XX, XX, INCX
-    xvst VT1, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L213
-    b .L997
-    .align 3
-
-.L214: // C==0 S==0
-    xvstelm.w VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 3
-    add.d XX, XX, INCX
-    xvst VT1, Y, 0 * SIZE
-    xvstelm.w VXZ, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 7
-    add.d XX, XX, INCX
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    b .L997
-    .align 3
-
-.L22:
-    bge $r0, I, .L997
-    move YY, Y
-    move XX, X
-    fcmp.ceq.s $fcc0, C, a1
-    bcnez $fcc0, .L220
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L222 // C!=0 S==0
-    b .L221 // C!=0 S!=0
-    .align 3
-
-.L220:
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L224 // C==0 S==0
-    b .L223 // C==0 S!=0
-    .align 3
-
-.L221: // C!=0 S!=0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s  VT0, VX0, VXC
-    xvfmadd.s VT0, VX2, VXS, VT0
-    xvfmul.s  VT1, VX0, VXS
-    xvfmsub.s VT1, VX2, VXC, VT1
-    xvstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 7
-    add.d XX, XX, INCX
-    xvstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L221
-    b .L997
-    .align 3
-
-.L222: // C!=0 S==0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VT0, VX0, VXC
-    xvfmul.s VT1, VX2, VXC
-    xvstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 7
-    add.d XX, XX, INCX
-    xvstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    b .L997
-    .align 3
-
-.L223: // C==0 S!=0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VT0, VX2, VXS
-    xvfmul.s VT1, VX0, VXS
-    xvfsub.s VT1, VXZ, VT1
-    xvstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 7
-    add.d XX, XX, INCX
-    xvstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VT1, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L223
-    b .L997
-    .align 3
-
-.L224: // C==0 S==0
-    xvstelm.w VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 3
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, XX, 0, 7
-    add.d XX, XX, INCX
-    xvstelm.w VXZ, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L224
-    b .L997
-    .align 3
-
-.L997:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L998:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f13, Y, 0 * SIZE
-    fmul.s $f10, $f12, C
-    fmadd.s $f10, $f13, S, $f10
-    fst.s $f10, X, 0 * SIZE
-    addi.d I, I, -1
-    fmul.s $f20, $f12, S
-    fmsub.s $f20, $f13, C, $f20
-    fst.s $f20, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L998
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/srot_lsx.S b/kernel/loongarch64/srot_lsx.S
deleted file mode 100644
index 8822b58e4..000000000
--- a/kernel/loongarch64/srot_lsx.S
+++ /dev/null
@@ -1,927 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define Y      $r7
-#define INCY   $r8
-#define C      $f0
-#define S      $f1
-
-#define I      $r12
-#define TEMP   $r13
-#define t1     $r14
-#define t2     $r16
-#define t3     $r15
-#define t4     $r17
-#define XX     $r18
-#define YY     $r19
-#define a1     $f12
-#define VX0    $vr8
-#define VX1    $vr20
-#define VX2    $vr21
-#define VX3    $vr22
-#define VT0    $vr10
-#define VT1    $vr18
-#define VXC    $vr23
-#define VXS    $vr9
-#define VXZ    $vr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.s.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.s t1, C
-    vreplgr2vr.w VXC, t1
-    movfr2gr.s t2, S
-    vreplgr2vr.w VXS, t2
-    movfr2gr.s t3, a1
-    vreplgr2vr.w VXZ, t3
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L997
-    fcmp.ceq.s $fcc0, C, a1
-    bcnez $fcc0, .L110
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L112 // C!=0 S==0
-    b .L111 // C!=0 S!=0
-    .align 3
-
-.L110:
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L114 // C==0 S==0
-    b .L113 // C==0 S!=0
-    .align 3
-
-.L111: // C!=0 S!=0
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vld VX3, Y, 4 * SIZE
-    vfmul.s VT0, VX0, VXC
-    vfmadd.s VT0, VX2, VXS, VT0
-    vfmul.s VT1, VX0, VXS
-    vfmsub.s VT1, VX2, VXC, VT1
-    vst VT0, X, 0 * SIZE
-    vst VT1, Y, 0 * SIZE
-    vfmul.s VT0, VX1, VXC
-    vfmadd.s VT0, VX3, VXS, VT0
-    vfmul.s VT1, VX1, VXS
-    vfmsub.s VT1, VX3, VXC, VT1
-    vst VT0, X, 4 * SIZE
-    vst VT1, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L111
-    b .L997
-    .align 3
-
-.L112: // C!=0 S==0
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vld VX3, Y, 4 * SIZE
-    vfmul.s VT0, VX0, VXC
-    vfmul.s VT1, VX2, VXC
-    vst VT0, X, 0 * SIZE
-    vst VT1, Y, 0 * SIZE
-    vfmul.s VT0, VX1, VXC
-    vfmul.s VT1, VX3, VXC
-    vst VT0, X, 4 * SIZE
-    vst VT1, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L112
-    b .L997
-    .align 3
-
-.L113: // C==0 S!=0
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vld VX3, Y, 4 * SIZE
-    vfmul.s VT0, VX2, VXS
-    vfmul.s VT1, VX0, VXS
-    vfsub.s VT1, VXZ, VT1
-    vst VT0, X, 0 * SIZE
-    vst VT1, Y, 0 * SIZE
-    vfmul.s VT0, VX3, VXS
-    vfmul.s VT1, VX1, VXS
-    vfsub.s VT1, VXZ, VT1
-    vst VT0, X, 4 * SIZE
-    vst VT1, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L113
-    b .L997
-    .align 3
-
-.L114: // C==0 S==0
-    vst VXZ, X, 0 * SIZE
-    vst VXZ, Y, 0 * SIZE
-    vst VXZ, X, 4 * SIZE
-    vst VXZ, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L114
-    b .L997
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L997
-    move YY, Y
-    move XX, X
-    fcmp.ceq.s $fcc0, C, a1
-    bcnez $fcc0, .L120
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L122 // C!=0 S==0
-    b .L121 // C!=0 S!=0
-    .align 3
-
-.L120:
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L124 // C==0 S==0
-    b .L123 // C==0 S!=0
-    .align 3
-
-.L121: // C!=0 S!=0
-    vld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX0, VXC
-    vfmadd.s VT0, VX2, VXS, VT0
-    vfmul.s VT1, VX0, VXS
-    vfmsub.s VT1, VX2, VXC, VT1
-    vst VT0, X, 0 * SIZE
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    vld VX1, X, 4 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX1, VXC
-    vfmadd.s VT0, VX3, VXS, VT0
-    vfmul.s VT1, VX1, VXS
-    vfmsub.s VT1, VX3, VXC, VT1
-    vst VT0, X, 4 * SIZE
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    b .L997
-    .align 3
-
-.L122: // C!=0 S==0
-    vld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX0, VXC
-    vfmul.s VT1, VX2, VXC
-    vst VT0, X, 0 * SIZE
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    vld VX1, X, 4 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX1, VXC
-    vfmul.s VT1, VX3, VXC
-    vst VT0, X, 4 * SIZE
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L122
-    b .L997
-    .align 3
-
-.L123: // C==0 S!=0
-    vld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX2, VXS
-    vfmul.s VT1, VX0, VXS
-    vfsub.s VT1, VXZ, VT1
-    vst VT0, X, 0 * SIZE
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    vld VX1, X, 4 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX3, VXS
-    vfmul.s VT1, VX1, VXS
-    vfsub.s VT1, VXZ, VT1
-    vst VT0, X, 4 * SIZE
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L123
-    b .L997
-    .align 3
-
-.L124: // C==0 S==0
-    vst VXZ, X, 0 * SIZE
-    vst VXZ, X, 4 * SIZE
-    vstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L124
-    b .L997
-    .align 3
-
-.L21:// INCX!=1 and INCY==1
-    bge $r0, I, .L997
-    move XX, X
-    fcmp.ceq.s $fcc0, C, a1
-    bcnez $fcc0, .L210
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L212 // C!=0 S==0
-    b .L211 // C!=0 S!=0
-    .align 3
-
-.L210:
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L214 // C==0 S==0
-    b .L213 // C==0 S!=0
-    .align 3
-
-.L211: // C!=0 S!=0
-    vld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    vfmul.s VT0, VXC, VX0
-    vfmadd.s VT0, VX2, VXS, VT0
-    vfmul.s VT1, VXS, VX0
-    vfmsub.s VT1, VX2, VXC, VT1
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vst VT1, Y, 0 * SIZE
-    vld VX3, Y, 4 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    add.d X, X, INCX
-    vfmul.s VT0, VX1, VXC
-    vfmadd.s VT0, VX3, VXS, VT0
-    vfmul.s VT1, VX1, VXS
-    vfmsub.s VT1, VX3, VXC, VT1
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vst VT1, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    b .L997
-    .align 3
-
-.L212: // C!=0 S==0
-    vld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    vfmul.s VT0, VXC, VX0
-    vfmul.s VT1, VX2, VXC
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vst VT1, Y, 0 * SIZE
-    vld VX3, Y, 4 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    add.d X, X, INCX
-    vfmul.s VT0, VX1, VXC
-    vfmul.s VT1, VX3, VXS
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vst VT1, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L212
-    b .L997
-    .align 3
-
-.L213: // C==0 S!=0
-    vld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    vfmul.s VT0, VXS, VX2
-    vfmul.s VT1, VXS, VX0
-    vfsub.s VT1, VXZ, VT1
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vst VT1, Y, 0 * SIZE
-    vld VX3, Y, 4 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    add.d X, X, INCX
-    vfmul.s VT0, VX3, VXS
-    vfmul.s VT1, VX1, VXS
-    vfsub.s VT1, VXZ, VT1
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vst VT1, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L213
-    b .L997
-    .align 3
-
-.L214: // C==0 S==0
-    vstelm.w VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 3
-    add.d XX, XX, INCX
-    vst VT1, Y, 0 * SIZE
-    vstelm.w VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 3
-    add.d XX, XX, INCX
-    vst VT1, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L211
-    b .L997
-    .align 3
-
-.L22:
-    bge $r0, I, .L997
-    move YY, Y
-    move XX, X
-    fcmp.ceq.s $fcc0, C, a1
-    bcnez $fcc0, .L220
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L222 // C!=0 S==0
-    b .L221 // C!=0 S!=0
-    .align 3
-
-.L220:
-    fcmp.ceq.s $fcc0, S, a1
-    bcnez $fcc0, .L224 // C==0 S==0
-    b .L223 // C==0 S!=0
-    .align 3
-
-.L221: // C!=0 S!=0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX0, VXC
-    vfmadd.s VT0, VX2, VXS, VT0
-    vfmul.s VT1, VX0, VXS
-    vfmsub.s VT1, VX2, VXC, VT1
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    add.d X, X, INCX
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX1, VXC
-    vfmadd.s VT0, VX3, VXS, VT0
-    vfmul.s VT1, VX0, VXS
-    vfmsub.s VT1, VX3, VXC, VT1
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L221
-    b .L997
-    .align 3
-
-.L222: // C!=0 S==0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX0, VXC
-    vfmul.s VT1, VX2, VXC
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX1, VXC
-    vfmul.s VT1, VX3, VXC
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L222
-    b .L997
-    .align 3
-
-.L223: // C==0 S!=0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX2, VXS
-    vfmul.s VT1, VX0, VXS
-    vfsub.s VT1, VXZ, VT1
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VT0, VX3, VXS
-    vfmul.s VT1, VX0, VXS
-    vfsub.s VT1, VXZ, VT1
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vstelm.w VT1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VT1, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L223
-    b .L997
-    .align 3
-
-.L224: // C==0 S==0
-    vstelm.w VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 3
-    add.d XX, XX, INCX
-    vstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    vstelm.w VXZ, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VXZ, XX, 0, 3
-    add.d XX, XX, INCX
-    vstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L224
-    b .L997
-    .align 3
-
-.L997:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L998:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f13, Y, 0 * SIZE
-    fmul.s $f10, $f12, C
-    fmadd.s $f10, $f13, S, $f10
-    fst.s $f10, X, 0 * SIZE
-    addi.d I, I, -1
-    fmul.s $f20, $f12, S
-    fmsub.s $f20, $f13, C, $f20
-    fst.s $f20, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L998
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file

From 173a65d4e6ddf8bc5a9b1cd02d594d4b48dc9f89 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Mon, 25 Dec 2023 15:11:04 +0800
Subject: [PATCH 518/718] loongarch64: Add and refine iamax optimization
 functions.

---
 common_loongarch64.h                          |   4 +
 kernel/loongarch64/KERNEL.LOONGSON2K1000      |   7 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5         |   7 +-
 kernel/loongarch64/camax_lasx.S               | 194 ++++++
 kernel/loongarch64/camax_lsx.S                | 206 +++++++
 .../{isamax_lasx.S => iamax_lasx.S}           | 254 ++++++--
 kernel/loongarch64/iamax_lsx.S                | 482 +++++++++++++++
 kernel/loongarch64/icamax_lasx.S              | 562 ++++++++++++++++++
 kernel/loongarch64/icamax_lsx.S               | 434 ++++++++++++++
 kernel/loongarch64/idamax_lasx.S              | 275 ---------
 kernel/loongarch64/idamax_lsx.S               | 267 ---------
 kernel/loongarch64/isamax_lsx.S               | 275 ---------
 12 files changed, 2101 insertions(+), 866 deletions(-)
 create mode 100644 kernel/loongarch64/camax_lasx.S
 create mode 100644 kernel/loongarch64/camax_lsx.S
 rename kernel/loongarch64/{isamax_lasx.S => iamax_lasx.S} (55%)
 create mode 100644 kernel/loongarch64/iamax_lsx.S
 create mode 100644 kernel/loongarch64/icamax_lasx.S
 create mode 100644 kernel/loongarch64/icamax_lsx.S
 delete mode 100644 kernel/loongarch64/idamax_lasx.S
 delete mode 100644 kernel/loongarch64/idamax_lsx.S
 delete mode 100644 kernel/loongarch64/isamax_lsx.S

diff --git a/common_loongarch64.h b/common_loongarch64.h
index 13514d6e0..599b4795c 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -139,6 +139,7 @@ static inline int WhereAmI(void){
 #define XVFMAX  xvfmax.d
 #define XVFMAXA xvfmaxa.d
 #define XVCMPEQ xvfcmp.ceq.d
+#define XVCMPLE xvfcmp.cle.d
 #define XVCMPLT xvfcmp.clt.d
 #define XVMUL   xvfmul.d
 #define XVMSUB  xvfmsub.d
@@ -151,6 +152,7 @@ static inline int WhereAmI(void){
 #define VFMAX  vfmax.d
 #define VFMAXA vfmaxa.d
 #define VCMPEQ vfcmp.ceq.d
+#define VCMPLE vfcmp.cle.d
 #define VCMPLT vfcmp.clt.d
 #define VMUL   vfmul.d
 #define VMSUB  vfmsub.d
@@ -189,6 +191,7 @@ static inline int WhereAmI(void){
 #define XVFMAX  xvfmax.s
 #define XVFMAXA xvfmaxa.s
 #define XVCMPEQ xvfcmp.ceq.s
+#define XVCMPLE xvfcmp.cle.s
 #define XVCMPLT xvfcmp.clt.s
 #define XVMUL   xvfmul.s
 #define XVMSUB  xvfmsub.s
@@ -201,6 +204,7 @@ static inline int WhereAmI(void){
 #define VFMAX  vfmax.s
 #define VFMAXA vfmaxa.s
 #define VCMPEQ vfcmp.ceq.s
+#define VCMPLE vfcmp.cle.s
 #define VCMPLT vfcmp.clt.s
 #define VMUL   vfmul.s
 #define VMSUB  vfmsub.s
diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index b315c81f2..a8a6dd82f 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -9,6 +9,7 @@ DSCALKERNEL  = dscal_lsx.S
 
 SAMAXKERNEL =  amax_lsx.S
 DAMAXKERNEL =  amax_lsx.S
+CAMAXKERNEL =  camax_lsx.S
 
 SAMINKERNEL =  amin_lsx.S
 DAMINKERNEL =  amin_lsx.S
@@ -25,8 +26,10 @@ IDMAXKERNEL =  imax_lsx.S
 ISMINKERNEL =  imin_lsx.S
 IDMINKERNEL =  imin_lsx.S
 
-ISAMAXKERNEL = isamax_lsx.S
-IDAMAXKERNEL = idamax_lsx.S
+ISAMAXKERNEL = iamax_lsx.S
+IDAMAXKERNEL = iamax_lsx.S
+ICAMAXKERNEL = icamax_lsx.S
+IZAMAXKERNEL = icamax_lsx.S
 
 ISAMINKERNEL = iamin_lsx.S
 IDAMINKERNEL = iamin_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 577f6316e..e4c45e1fa 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -9,6 +9,7 @@ DSCALKERNEL  = dscal_lasx.S
 
 SAMAXKERNEL =  amax_lasx.S
 DAMAXKERNEL =  amax_lasx.S
+CAMAXKERNEL =  camax_lasx.S
 
 SAMINKERNEL =  amin_lasx.S
 DAMINKERNEL =  amin_lasx.S
@@ -25,8 +26,10 @@ IDMAXKERNEL =  imax_lasx.S
 ISMINKERNEL =  imin_lasx.S
 IDMINKERNEL =  imin_lasx.S
 
-ISAMAXKERNEL = isamax_lasx.S
-IDAMAXKERNEL = idamax_lasx.S
+ISAMAXKERNEL = iamax_lasx.S
+IDAMAXKERNEL = iamax_lasx.S
+ICAMAXKERNEL = icamax_lasx.S
+IZAMAXKERNEL = icamax_lasx.S
 
 ISAMINKERNEL = iamin_lasx.S
 IDAMINKERNEL = iamin_lasx.S
diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S
new file mode 100644
index 000000000..7013430cb
--- /dev/null
+++ b/kernel/loongarch64/camax_lasx.S
@@ -0,0 +1,194 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $xr9
+#define x2   $xr10
+#define x3   $xr11
+#define x4   $xr12
+#define VT0  $xr13
+#define VT1  $xr14
+#define res0 $xr18
+#define neg1 $xr19
+#define VX0  $xr20
+#define VX1  $xr21
+#define VM0  $xr22
+#define VM1  $xr23
+
+    PROLOGUE
+    xvxor.v VM0, VM0, VM0
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    li.w I, -1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    xvreplgr2vr.w neg1, I
+    xvffint.s.w neg1, neg1
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L23
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    addi.d I, I, -1
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s x3, neg1, x1
+    xvfmul.s x4, neg1, x2
+    xvfcmp.clt.s VT0, x1, res0
+    xvfcmp.clt.s VT1, x2, res0
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VT1
+    addi.d X, X, 16 * SIZE
+    xvfadd.s VM1, x1, x2
+    xvfmax.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmax.s VM1, x1, x2
+    xvfmax.s VM0, x3, x4
+    xvfmax.s VM0, VM0, VM1
+    b .L23
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s s1, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s s1, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    addi.d I, I, -1
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s s3, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s s4, t1, t3
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    fmax.s s1, s1, s2
+    fmax.s s3, s3, s4
+    fmax.s s1, s1, s3
+    .align 3
+
+.L23: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    add.d  X, X, INCX
+    fmax.s s1, a0, s1
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S
new file mode 100644
index 000000000..2e55629de
--- /dev/null
+++ b/kernel/loongarch64/camax_lsx.S
@@ -0,0 +1,206 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $vr9
+#define x2   $vr10
+#define x3   $vr11
+#define x4   $vr12
+#define VT0  $vr13
+#define VT1  $vr14
+#define res0 $vr18
+#define neg1 $vr19
+#define VX0  $vr20
+#define VX1  $vr21
+#define VM0  $vr22
+#define VM1  $vr23
+
+    PROLOGUE
+    vxor.v VM0, VM0, VM0
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    li.w I, -1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    vreplgr2vr.w neg1, I
+    vffint.s.w neg1, neg1
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L23
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, neg1, x1
+    vfmul.s x4, neg1, x2
+    vfcmp.clt.s VT0, x1, res0
+    vfcmp.clt.s VT1, x2, res0
+    vld VX0, X, 8 * SIZE
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT1
+    vld VX1, X, 12 * SIZE
+    vfadd.s VM1, x1, x2
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, neg1, x1
+    vfmul.s x4, neg1, x2
+    vfcmp.clt.s VT0, x1, res0
+    vfcmp.clt.s VT1, x2, res0
+    addi.d X, X, 16 * SIZE
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT1
+    vfadd.s x1, x1, x2
+    vfmax.s VM1, x1, VM1
+    vfmax.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmax.s VM1, x1, x2
+    vfmax.s VM0, x3, x4
+    vfmax.s VM0, VM0, VM1
+    b .L23
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s s1, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s s1, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    addi.d I, I, -1
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s s3, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmax.s s4, t1, t3
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    fmax.s s1, s1, s2
+    fmax.s s3, s3, s4
+    fmax.s s1, s1, s3
+    .align 3
+
+.L23: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.s a0, X, 0 * SIZE
+    fld.s a1, X, 1 * SIZE
+    addi.d I, I, -1
+    fabs.s a0, a0
+    fabs.s a1, a1
+    fadd.s a0, a0, a1
+    add.d  X, X, INCX
+    fmax.s s1, a0, s1
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/isamax_lasx.S b/kernel/loongarch64/iamax_lasx.S
similarity index 55%
rename from kernel/loongarch64/isamax_lasx.S
rename to kernel/loongarch64/iamax_lasx.S
index 2800b1d43..090da3004 100644
--- a/kernel/loongarch64/isamax_lasx.S
+++ b/kernel/loongarch64/iamax_lasx.S
@@ -1,3 +1,30 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
 #define ASSEMBLER
 
 #include "common.h"
@@ -39,6 +66,31 @@
     slli.d INCX, INCX, BASE_SHIFT
     bne INCX, TEMP, .L20
     xvld VM0, X, 0
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
     addi.w i0, i0, 1
     srai.d I, N, 3
     bge $r0, I, .L21
@@ -76,9 +128,25 @@
     xvinsgr2vr.w VI0, i0, 6 //7
     addi.w i0, i0, 1
     xvinsgr2vr.w VI0, i0, 7 //8
+#endif
     .align 3
 
 .L10:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    xvfmaxa.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmaxa.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
+#else
     xvld VX0, X, 0 * SIZE
     addi.d I, I, -1
     xvadd.w VI1, VI1, VINC8
@@ -87,10 +155,21 @@
     addi.d X, X, 8 * SIZE
     xvbitsel.v VM0, VM1, VM0, VT0
     xvbitsel.v VI0, VI1, VI0, VT0
+#endif
     blt $r0, I, .L10
     .align 3
 
 .L15:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+#else
     xvxor.v VX0, VX0, VX0
     xvor.v VX0, VI0, VX0
     xvxor.v VX1, VX1, VX1
@@ -103,28 +182,62 @@
     xvpickve.w x2, VM0, 1
     xvpickve.w x3, VM0, 2
     xvpickve.w x4, VM0, 3
-    xvfmaxa.s VM1, x1, x2
-    xvfcmp.ceq.s VT0, x1, VM1
+#endif
+    XVFMAXA VM1, x1, x2
+    XVCMPEQ VT0, x1, VM1
     xvbitsel.v VINC4, VI2, VI1, VT0
-    xvfmaxa.s VM0, x3, x4
-    xvfcmp.ceq.s VT0, x3, VM0
+    XVFMAXA VM0, x3, x4
+    XVCMPEQ VT0, x3, VM0
     xvbitsel.v VINC8, VI4, VI3, VT0
-    xvfmaxa.s VM0, VM0, VM1
-    xvfcmp.ceq.s VT0, VM0, VM1
+    XVFMAXA VM0, VM0, VM1
+    XVCMPEQ VT0, VM0, VM1
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
+    CMPEQ $fcc0, $f15, $f9
     bceqz $fcc0, .L26
-    xvfcmp.clt.s VT0, VI1, VI0
+    XVCMPLT VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     b .L26
     .align 3
 
 .L20: // INCX!=1
     move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
     addi.w i0, i0, 1
     ld.w t1, TEMP, 0 * SIZE
     add.d TEMP, TEMP, INCX
@@ -186,9 +299,46 @@
     xvinsgr2vr.w VI0, i0, 6 //7
     addi.w i0, i0, 1
     xvinsgr2vr.w VI0, i0, 7 //8
+#endif
     .align 3
 
 .L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI2, VI1, VINC4
+    xvfmaxa.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmaxa.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
+#else
     ld.w t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.w t2, X, 0 * SIZE
@@ -219,10 +369,30 @@
     addi.d I, I, -1
     xvbitsel.v VM0, VM1, VM0, VT0
     xvbitsel.v VI0, VI1, VI0, VT0
+#endif
     blt $r0, I, .L24
     .align 3
 
 .L25:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.d VM0, x4, x3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#else
     xvxor.v VX0, VX0, VX0
     xvor.v VX0, VI0, VX0
     xvxor.v VX1, VX1, VX1
@@ -245,44 +415,45 @@
     xvfcmp.ceq.s VT0, VM0, VM1
     xvbitsel.v VM0, VM0, VM1, VT0
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
+#endif
+    CMPEQ $fcc0, $f15, $f9
     bceqz $fcc0, .L26
-    xvfcmp.clt.s VT0, VI1, VI0
+    XVCMPLT VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     .align 3
 
 .L26:
-    xvfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f10
     bceqz $fcc0, .L27
-    xvfcmp.clt.s VT0, VI2, VI0
+    XVCMPLT  VT0, VI2, VI0
     xvbitsel.v VI0, VI0, VI2, VT0
     .align 3
 
 .L27:
-    xvfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f11
     bceqz $fcc0, .L28
-    xvfcmp.clt.s VT0, VI3, VI0
+    XVCMPLT  VT0, VI3, VI0
     xvbitsel.v VI0, VI0, VI3, VT0
     .align 3
 
 .L28:
-    xvfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f12
     bceqz $fcc0, .L29
-    xvfcmp.clt.s VT0, VI4, VI0
+    XVCMPLT  VT0, VI4, VI0
     xvbitsel.v VI0, VI0, VI4, VT0
     .align 3
 
 .L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20
+#else
     fmov.s $f16, $f20
+#endif
     .align 3
 
+#ifdef DOUBLE
+
+#else
 .L252:
     xvxor.v VI0, VI0, VI0
     xvor.v VI0, VI0, VX0
@@ -306,35 +477,27 @@
     xvfmaxa.s VM0, VM0, VM1
     xvfcmp.ceq.s VT0, VM0, VM1
     xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    xvfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f9
     bceqz $fcc0, .L262
     xvfcmp.clt.s VT0, VI1, VI0
     xvbitsel.v VI0, VI0, VI1, VT0
     .align 3
 
 .L262:
-    xvfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f10
     bceqz $fcc0, .L272
     xvfcmp.clt.s VT0, VI2, VI0
     xvbitsel.v VI0, VI0, VI2, VT0
     .align 3
-
 .L272:
-    xvfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f11
     bceqz $fcc0, .L282
     xvfcmp.clt.s VT0, VI3, VI0
     xvbitsel.v VI0, VI0, VI3, VT0
     .align 3
 
 .L282:
-    xvfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
+    fcmp.ceq.d $fcc0, $f15, $f12
     bceqz $fcc0, .L292
     xvfcmp.clt.s VT0, VI4, VI0
     xvbitsel.v VI0, VI0, VI4, VT0
@@ -345,8 +508,9 @@
     xvfcmp.ceq.s VT0, VM0, VX0
     xvbitsel.v VI0, VI0, VI1, VT0
     movfr2gr.s i0, $f20
+#endif
 
-.L21: // N<8
+.L21: //N<8
     andi I, N, 7
     bge $r0, I, .L999
     srai.d i1, N, 3
@@ -357,17 +521,17 @@
     .align 3
 
 .L22:
-    fld.s $f9, X, 0
+    LD  $f9, X, 0
     addi.d I, I, -1
-    xvfmaxa.s VM1, x1, VM0
-    xvfcmp.ceq.s VT0, VM0, VM1
+    XVFMAXA VM1, x1, VM0
+    XVCMPEQ VT0, VM0, VM1
     add.d  X, X, INCX
     xvbitsel.v VM0, VM1, VM0, VT0
     xvbitsel.v VI0, VI1, VI0, VT0
     addi.d i1, i1, 1
     movgr2fr.d $f21, i1
     blt $r0, I, .L22
-    movfr2gr.s i0, $f20
+    MTG     i0,   $f20
     .align 3
 
 .L999:
@@ -375,4 +539,4 @@
     jirl $r0, $r1, 0x0
     .align 3
 
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S
new file mode 100644
index 000000000..ce5b3c724
--- /dev/null
+++ b/kernel/loongarch64/iamax_lsx.S
@@ -0,0 +1,482 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC2 $vr17
+#define VINC4 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    vld VM0, X, 0
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L11
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC2, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, VINC2
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmaxa.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    vbitsel.v x2, x4, x2, VT0
+    vfmaxa.d VM1, VM0, x3
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vadd.w VI1, VI1, VINC4
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, VINC2
+    vfmaxa.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    addi.d I, I, -1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmaxa.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+#endif
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L16
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L17
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC2, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC4, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC4, VINC2, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+#ifdef DOUBLE
+.L16:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L17:
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L11: //INCX==1 and N<8
+    andi I, N, 7
+    bge $r0, I, .L14
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L13:
+    fld.d $f9, X, 0
+    vfmaxa.d VM1, x1, VM0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    addi.d  X, X, SIZE
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L13
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L14:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L24:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x1, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0
+    vfmaxa.d x3, x1, x3
+    vfcmp.ceq.d VT0, x1, x3
+    vbitsel.v x2, x4, x2, VT0
+    vfmaxa.d VM1, VM0, x3
+    vbitsel.v VM0, VM1, VM0, VT0
+    vfcmp.ceq.d VT0, VM0, VM1
+    vbitsel.v VI0, x2, VI0, VT0
+    addi.d I, I, -1
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+    .align 3
+
+.L26:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+
+#else
+.L20: // INCX!=1
+    move TEMP, X
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC2, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L24:
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC4
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC2
+    vfmaxa.s VM1, VX0, VX1
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0
+    vfmaxa.s VM1, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC2, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC4, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC4, VINC2, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+#endif
+.L21: // N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD $f9, X, 0
+    VFMAXA VM1, x1, VM0
+    VCMPEQ VT0, VM0, VM1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    add.d  X, X, INCX
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG   i0,  $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S
new file mode 100644
index 000000000..7800cb917
--- /dev/null
+++ b/kernel/loongarch64/icamax_lasx.S
@@ -0,0 +1,562 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define a0 $f12
+#define a1 $f13
+#define s1 $f15
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    xvxor.v VM0, VM0, VM0
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    xvxor.v VI3, VI3, VI3 // 0
+#ifdef DOUBLE
+    li.d I, -1
+    xvreplgr2vr.d VI4, I
+    xvffint.d.l VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 1 //3
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI0, i0, 2 //2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    li.w I, -1
+    xvreplgr2vr.w VI4, I
+    xvffint.s.w VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 2 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //6
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI0, i0, 4 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //4
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC4
+    xvld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmul.d x3, VI4, x1
+    xvfmul.d x4, VI4, x2
+    xvfcmp.clt.d VT0, x1, VI3
+    xvfcmp.clt.d VINC8, x2, VI3
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC8
+#else
+    xvadd.w VI1, VI1, VINC8
+    xvld VX1, X, 8 * SIZE
+    addi.d I, I, -1
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s x3, VI4, x1
+    xvfmul.s x4, VI4, x2
+    xvfcmp.clt.s VT0, x1, VI3
+    xvfcmp.clt.s VINC4, x2, VI3
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC4
+#endif
+    XVFADD  x1,  x1,  x2
+    XVFMAX  x3,  VM0, x1
+    XVCMPEQ VT0, x3,  VM0
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, x3, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmax.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmax.d VM0, x3, x4
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmax.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#endif
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    XVCMPLT  VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 1 //3
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI0, i0, 2 //2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 2 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //6
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI0, i0, 4 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //4
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    xvadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    addi.d I, I, -1
+    xvfmul.d x3, VI4, x1
+    xvfmul.d x4, VI4, x2
+    xvfcmp.clt.d VT0, x1, VI3
+    xvfcmp.clt.d VINC8, x2, VI3
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC8
+    xvfadd.d x1, x1, x2
+    xvfmax.d x3, VM0, x1
+    xvfcmp.ceq.d VT0, x3, VM0
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    xvadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    xvadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    addi.d I, I, -1
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s x3, VI4, x1
+    xvfmul.s x4, VI4, x2
+    xvfcmp.clt.s VT0, x1, VI3
+    xvfcmp.clt.s VINC8, x2, VI3
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC8
+    xvfadd.s x1, x1, x2
+    xvfmax.s x3, VM0, x1
+    xvfcmp.ceq.s VT0, x3, VM0
+#endif
+    xvbitsel.v VM0, x3, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.d VM0, x3, x4
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#endif
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    XVCMPLT  VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    XVCMPLT  VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    XVCMPLT  VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    XVCMPLT  VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20
+#else
+    fmov.s $f16, $f20
+#endif
+    .align 3
+
+#ifdef DOUBLE
+#else
+.L252:
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0
+    fmov.s $f13, $f15
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v x1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, x1
+    xvbitsel.v VM0, VM0, x1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    fcmp.clt.s $fcc0, $f15, $f13
+    fsel $f15, $f15, $f13, $fcc0
+    fsel $f20, $f20, $f16, $fcc0
+    movfr2gr.s i0, $f20
+
+#endif
+.L21: //N<8
+#ifdef DOUBLE
+    andi I, N, 3
+    bge $r0, I, .L999
+    srai.d i1, N, 2
+    slli.d i1, i1, 2
+#else
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+#endif
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    FMAX a1, s1, a0
+    CMPEQ $fcc0, s1, a1
+    add.d  X, X, INCX
+    fsel s1, a1, s1, $fcc0
+    fsel $f20, $f21, $f20, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S
new file mode 100644
index 000000000..a2fc9dbbd
--- /dev/null
+++ b/kernel/loongarch64/icamax_lsx.S
@@ -0,0 +1,434 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define a0 $f12
+#define a1 $f13
+#define s1 $f15
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC4 $vr17
+#define VINC8 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    vxor.v VM0, VM0, VM0
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    vxor.v VI3, VI3, VI3 // 0
+#ifdef DOUBLE
+    li.d I, -1
+    vreplgr2vr.d VI4, I
+    vffint.d.l VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    li.w I, -1
+    vreplgr2vr.w VI4, I
+    vffint.s.w VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.w i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    addi.d I, I, -1
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmax.d x3, VM0, x1
+    vfcmp.ceq.d VT0, x3, VM0
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+#else
+    vadd.w VI1, VI1, VINC4
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, VI4, x1
+    vfmul.s x4, VI4, x2
+#endif
+    VCMPLT  VT0, x1, VI3
+    VCMPLT  VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    VFADD  x1, x1, x2
+    VFMAX  x3, VM0, x1
+    VCMPEQ VT0, x3, VM0
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+.L20: // INCX!=1
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vadd.d VI1, VI1, VINC4
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmax.d x3, VM0, x1
+    ld.d t1, X, 0 * SIZE
+    vfcmp.ceq.d VT0, x3, VM0
+    ld.d t2, X, 1 * SIZE
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vadd.d VI1, VI1, VINC4
+    addi.d I, I, -1
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmax.d x3, VM0, x1
+    vfcmp.ceq.d VT0, x3, VM0
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vadd.w VI1, VI1, VINC4
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    addi.d I, I, -1
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, VI4, x1
+    vfmul.s x4, VI4, x2
+    vfcmp.clt.s VT0, x1, VI3
+    vfcmp.clt.s VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.s x1, x1, x2
+    vfmax.s x3, VM0, x1
+    vfcmp.ceq.s VT0, x3, VM0
+#endif
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+#ifdef DOUBLE
+.L26:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20
+    .align 3
+#else
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+#endif
+.L21: //N<4
+    andi I, N, 3
+    bge $r0, I, .L999
+    srai.d i1, N, 2
+    slli.d i1, i1, 2
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    FMAX a1, s1, a0
+    CMPEQ $fcc0, s1, a1
+    add.d  X, X, INCX
+    fsel s1, a1, s1, $fcc0
+    fsel $f20, $f21, $f20, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG  i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/idamax_lasx.S b/kernel/loongarch64/idamax_lasx.S
deleted file mode 100644
index 8248ee757..000000000
--- a/kernel/loongarch64/idamax_lasx.S
+++ /dev/null
@@ -1,275 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $xr9
-#define x2 $xr10
-#define x3 $xr11
-#define x4 $xr12
-#define VX0 $xr13
-#define VX1 $xr14
-#define VM0 $xr15
-#define VM1 $xr16
-#define VINC4 $xr17
-#define VINC8 $xr18
-#define VI0 $xr20
-#define VI1 $xr21
-#define VI2 $xr22
-#define VI3 $xr8
-#define VI4 $xr19
-#define VT0 $xr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    xvld VM0, X, 0
-    addi.d i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.d i0, i0, 2 //4
-    xvreplgr2vr.d VINC4, i0
-    slli.d i0, i0, 1 //8
-    xvreplgr2vr.d VINC8, i0
-    addi.d i0, i0, -15
-    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 3
-    addi.d i0, i0, 5
-    xvinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 1 //2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 2 //3
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 3 //4
-    .align 3
-
-.L10:
-    xvld VX0, X, 0 * SIZE
-    xvadd.d VI1, VI1, VINC8
-    xvld VX1, X, 4 * SIZE
-    xvadd.d VI2, VI1, VINC4
-    xvfmaxa.d VM1, VX0, VX1
-    xvfcmp.ceq.d VT0, VX0, VM1
-    addi.d I, I, -1
-    xvbitsel.v VI2, VI2, VI1, VT0
-    xvfmaxa.d VM1, VM0, VM1
-    xvfcmp.ceq.d VT0, VM0, VM1
-    addi.d X, X, 8 * SIZE
-    xvbitsel.v VM0, VM1, VM0, VT0
-    xvbitsel.v VI0, VI2, VI0, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    xvpickve.d VI1, VI0, 0
-    xvpickve.d VI2, VI0, 1
-    xvpickve.d VI3, VI0, 2
-    xvpickve.d VI4, VI0, 3
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmaxa.d VM1, x1, x2
-    xvfcmp.ceq.d VT0, x1, VM1
-    xvbitsel.v VINC4, VI2, VI1, VT0
-    xvfmaxa.d VM0, x4, x3
-    xvfcmp.ceq.d VT0, x3, VM0
-    xvbitsel.v VINC8, VI4, VI3, VT0
-    xvfmaxa.d VM0, VM0, VM1
-    xvfcmp.ceq.d VT0, VM0, VM1
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    xvfcmp.ceq.d VT0, VM0, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.d VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    b .L26
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.d i0, i0, 1
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.d t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    xvinsgr2vr.d VM0, t2, 1
-    xvinsgr2vr.d VM0, t3, 2
-    xvinsgr2vr.d VM0, t4, 3
-    slli.d i0, i0, 2 //4
-    xvreplgr2vr.d VINC4, i0
-    slli.d i0, i0, 1 //8
-    xvreplgr2vr.d VINC8, i0
-    addi.d i0, i0, -15
-    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI1, i0, 3
-    addi.d i0, i0, 5
-    xvinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 1 //2
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 2 //3
-    addi.d i0, i0, 1
-    xvinsgr2vr.d VI0, i0, 3 //4
-    .align 3
-
-.L24:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    xvadd.d VI1, VI1, VINC8
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    xvadd.d VI2, VI1, VINC4
-    xvfmaxa.d VM1, VX0, VX1
-    xvfcmp.ceq.d VT0, VX0, VM1
-    addi.d I, I, -1
-    xvbitsel.v VI2, VI2, VI1, VT0
-    xvfmaxa.d VM1, VM0, VM1
-    xvfcmp.ceq.d VT0, VM0, VM1
-    xvbitsel.v VM0, VM1, VM0, VT0
-    xvbitsel.v VI0, VI2, VI0, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    xvpickve.d VI1, VI0, 0
-    xvpickve.d VI2, VI0, 1
-    xvpickve.d VI3, VI0, 2
-    xvpickve.d VI4, VI0, 3
-    xvpickve.d x1, VM0, 0
-    xvpickve.d x2, VM0, 1
-    xvpickve.d x3, VM0, 2
-    xvpickve.d x4, VM0, 3
-    xvfmaxa.d VM1, x1, x2
-    xvfcmp.ceq.d VT0, x1, VM1
-    xvbitsel.v VINC4, VI2, VI1, VT0
-    xvfmaxa.d VM0, x4, x3
-    xvfcmp.ceq.d VT0, x3, VM0
-    xvbitsel.v VINC8, VI4, VI3, VT0
-    xvfmaxa.d VM0, VM0, VM1
-    xvfcmp.ceq.d VT0, VM0, VM1
-    xvbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    xvfcmp.ceq.d VT0, VM0, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    xvfcmp.clt.d VT0, VI1, VI0
-    xvbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
-.L26:
-    xvfcmp.ceq.d VT0, VM0, x2
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L27
-    xvfcmp.clt.d VT0, VI2, VI0
-    xvbitsel.v VI0, VI0, VI2, VT0
-    .align 3
-
-.L27:
-    xvfcmp.ceq.d VT0, VM0, x3
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L28
-    xvfcmp.clt.d VT0, VI3, VI0
-    xvbitsel.v VI0, VI0, VI3, VT0
-    .align 3
-
-.L28:
-    xvfcmp.ceq.d VT0, VM0, x4
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L29
-    xvfcmp.clt.d VT0, VI4, VI0
-    xvbitsel.v VI0, VI0, VI4, VT0
-    .align 3
-
-.L29:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    addi.d I, I, -1
-    xvfmaxa.d VM1, x1, VM0
-    xvfcmp.ceq.d VT0, VM0, VM1
-    add.d  X, X, INCX
-    xvbitsel.v VM0, VM1, VM0, VT0
-    xvbitsel.v VI0, VI1, VI0, VT0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/idamax_lsx.S b/kernel/loongarch64/idamax_lsx.S
deleted file mode 100644
index fb2d5bac1..000000000
--- a/kernel/loongarch64/idamax_lsx.S
+++ /dev/null
@@ -1,267 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr13
-#define VX1 $vr14
-#define VM0 $vr15
-#define VM1 $vr16
-#define VINC2 $vr17
-#define VINC4 $vr18
-#define VI0 $vr20
-#define VI1 $vr21
-#define VI2 $vr22
-#define VI3 $vr8
-#define VI4 $vr19
-#define VT0 $vr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    addi.d i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L11
-    slli.d i0, i0, 1 //2
-    vreplgr2vr.d VINC2, i0
-    slli.d i0, i0, 1 //4
-    vreplgr2vr.d VINC4, i0
-    addi.d i0, i0, -7
-    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 3
-    vinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI0, i0, 1 //2
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vadd.d VI1, VI1, VINC4
-    vld VX1, X, 2 * SIZE
-    vadd.d VI2, VI1, VINC2
-    vfmaxa.d x1, VX0, VX1
-    vfcmp.ceq.d VT0, VX0, x1
-    vbitsel.v x2, VI2, VI1, VT0
-    vld VX0, X, 4 * SIZE
-    vadd.d VI1, VI2, VINC2
-    vld VX1, X, 6 * SIZE
-    vadd.d VI2, VI1, VINC2
-    vfmaxa.d x3, VX0, VX1
-    vfcmp.ceq.d VT0, VX0, x3
-    vbitsel.v x4, VI2, VI1, VT0
-    vfmaxa.d x3, x1, x3
-    vfcmp.ceq.d VT0, x1, x3
-    vbitsel.v x2, x4, x2, VT0
-    vfmaxa.d VM1, VM0, x3
-    vfcmp.ceq.d VT0, VM0, VM1
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, x2, VI0, VT0
-    addi.d I, I, -1
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    vreplvei.d VI1, VI0, 0
-    vreplvei.d VI2, VI0, 1
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    vfcmp.ceq.d VT0, x2, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L16
-    vfcmp.clt.d VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L17
-    .align 3
-
-.L16:
-    vfmaxa.d VM0, x1, x2
-    vfcmp.ceq.d VT0, x1, VM0
-    vbitsel.v VI0, VI2, VI1, VT0
-    .align 3
-
-.L17:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L11: //INCX==1 and N<8
-    andi I, N, 7
-    bge $r0, I, .L14
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L13:
-    fld.d $f9, X, 0
-    vfmaxa.d VM1, x1, VM0
-    vfcmp.ceq.d VT0, VM0, VM1
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI1, VI0, VT0
-    addi.d I, I, -1
-    addi.d i1, i1, 1
-    addi.d  X, X, SIZE
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L13
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L14:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.d i0, i0, 1
-    ld.d t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.d t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.d VM0, t2, 1
-    slli.d i0, i0, 1 //2
-    vreplgr2vr.d VINC2, i0
-    slli.d i0, i0, 1 //4
-    vreplgr2vr.d VINC4, i0
-    addi.d i0, i0, -7
-    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI1, i0, 1
-    addi.d i0, i0, 3
-    vinsgr2vr.d VI0, i0, 0 //1
-    addi.d i0, i0, 1
-    vinsgr2vr.d VI0, i0, 1 //2
-    .align 3
-
-.L24:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t2, 1
-    vadd.d VI1, VI1, VINC4
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t2, 1
-    vadd.d VI2, VI1, VINC2
-    vfmaxa.d x1, VX0, VX1
-    vfcmp.ceq.d VT0, VX0, x1
-    vbitsel.v x2, VI2, VI1, VT0
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t2, 1
-    vadd.d VI1, VI2, VINC2
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t1, 0
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX1, t2, 1
-    vadd.d VI2, VI1, VINC2
-    vfmaxa.d x3, VX0, VX1
-    vfcmp.ceq.d VT0, VX0, x3
-    vbitsel.v x4, VI2, VI1, VT0
-    vfmaxa.d x3, x1, x3
-    vfcmp.ceq.d VT0, x1, x3
-    vbitsel.v x2, x4, x2, VT0
-    vfmaxa.d VM1, VM0, x3
-    vbitsel.v VM0, VM1, VM0, VT0
-    vfcmp.ceq.d VT0, VM0, VM1
-    vbitsel.v VI0, x2, VI0, VT0
-    addi.d I, I, -1
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    vreplvei.d VI1, VI0, 0
-    vreplvei.d VI2, VI0, 1
-    vreplvei.d x1, VM0, 0
-    vreplvei.d x2, VM0, 1
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.d $f17, TEMP
-    ffint.d.l $f17, $f17
-    vfcmp.ceq.d VT0, x2, x1
-    fcmp.ceq.d $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.d VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L27
-    .align 3
-
-.L26:
-    vfmaxa.d VM0, x1, x2
-    vfcmp.ceq.d VT0, x1, VM0
-    vbitsel.v VI0, VI2, VI1, VT0
-    .align 3
-
-.L27:
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L21: // N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.d $f9, X, 0
-    vfmaxa.d VM1, x1, VM0
-    vfcmp.ceq.d VT0, VM0, VM1
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI1, VI0, VT0
-    addi.d I, I, -1
-    addi.d i1, i1, 1
-    add.d  X, X, INCX
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.d i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/isamax_lsx.S b/kernel/loongarch64/isamax_lsx.S
deleted file mode 100644
index a18aa7354..000000000
--- a/kernel/loongarch64/isamax_lsx.S
+++ /dev/null
@@ -1,275 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-
-#define N $r4
-#define X $r5
-#define INCX $r6
-#define I $r12
-#define t1 $r13
-#define t2 $r15
-#define t3 $r18
-#define t4 $r16
-#define i0 $r17
-#define i1 $r14
-#define TEMP $r19
-#define x1 $vr9
-#define x2 $vr10
-#define x3 $vr11
-#define x4 $vr12
-#define VX0 $vr13
-#define VX1 $vr14
-#define VM0 $vr15
-#define VM1 $vr16
-#define VINC4 $vr17
-#define VINC8 $vr18
-#define VI0 $vr20
-#define VI1 $vr21
-#define VI2 $vr22
-#define VI3 $vr8
-#define VI4 $vr19
-#define VT0 $vr23
-
-    PROLOGUE
-    li.d  i0, 0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    bne INCX, TEMP, .L20
-    vld VM0, X, 0
-    addi.w i0, i0, 1
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    slli.w i0, i0, 2 //4
-    vreplgr2vr.w VINC4, i0
-    slli.w i0, i0, 1 //8
-    vreplgr2vr.w VINC8, i0
-    addi.w i0, i0, -15
-    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 3
-    addi.w i0, i0, 5
-    vinsgr2vr.w VI0, i0, 0 //1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 1 //2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 2 //3
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 3 //4
-    .align 3
-
-.L10:
-    vld VX0, X, 0 * SIZE
-    vadd.w VI1, VI1, VINC8
-    vld VX1, X, 4 * SIZE
-    vadd.w VI2, VI1, VINC4
-    vfmaxa.s VM1, VX0, VX1
-    vfcmp.ceq.s VT0, VX0, VM1
-    addi.d I, I, -1
-    vbitsel.v VI2, VI2, VI1, VT0
-    vfmaxa.s VM1, VM0, VM1
-    vfcmp.ceq.s VT0, VM0, VM1
-    addi.d X, X, 8 * SIZE
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI2, VI0, VT0
-    blt $r0, I, .L10
-    .align 3
-
-.L15:
-    vreplvei.w VI1, VI0, 0
-    vreplvei.w VI2, VI0, 1
-    vreplvei.w VI3, VI0, 2
-    vreplvei.w VI4, VI0, 3
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmaxa.s VM1, x1, x2
-    vfcmp.ceq.s VT0, VM1, x1
-    vbitsel.v VINC4, VI2, VI1, VT0
-    vfmaxa.s VM0, x3, x4
-    vfcmp.ceq.s VT0, x3, VM0
-    vbitsel.v VINC8, VI4, VI3, VT0
-    vfmaxa.s VM0, VM0, VM1
-    vfcmp.ceq.s VT0, VM0, VM1
-    vbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    vfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.s VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    b .L26
-    .align 3
-
-.L20: // INCX!=1
-    move TEMP, X
-    addi.w i0, i0, 1
-    ld.w t1, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t1, 0
-    srai.d I, N, 3
-    bge $r0, I, .L21
-    ld.w t2, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t3, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    ld.w t4, TEMP, 0 * SIZE
-    add.d TEMP, TEMP, INCX
-    vinsgr2vr.w VM0, t2, 1
-    vinsgr2vr.w VM0, t3, 2
-    vinsgr2vr.w VM0, t4, 3
-    slli.w i0, i0, 2 //4
-    vreplgr2vr.w VINC4, i0
-    slli.w i0, i0, 1 //8
-    vreplgr2vr.w VINC8, i0
-    addi.w i0, i0, -15
-    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI1, i0, 3
-    addi.w i0, i0, 5
-    vinsgr2vr.w VI0, i0, 0 //1
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 1 //2
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 2 //3
-    addi.w i0, i0, 1
-    vinsgr2vr.w VI0, i0, 3 //4
-    .align 3
-
-.L24:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    vadd.w VI1, VI1, VINC8
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vadd.w VI2, VI1, VINC4
-    vfmaxa.s VM1, VX0, VX1
-    vfcmp.ceq.s VT0, VX0, VM1
-    vbitsel.v VI2, VI2, VI1, VT0
-    vfmaxa.s VM1, VM0, VM1
-    vfcmp.ceq.s VT0, VM0, VM1
-    addi.d I, I, -1
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI2, VI0, VT0
-    blt $r0, I, .L24
-    .align 3
-
-.L25:
-    vreplvei.w VI1, VI0, 0
-    vreplvei.w VI2, VI0, 1
-    vreplvei.w VI3, VI0, 2
-    vreplvei.w VI4, VI0, 3
-    vreplvei.w x1, VM0, 0
-    vreplvei.w x2, VM0, 1
-    vreplvei.w x3, VM0, 2
-    vreplvei.w x4, VM0, 3
-    vfmaxa.s VM1, x1, x2
-    vfcmp.ceq.s VT0, VM1, x1
-    vbitsel.v VINC4, VI2, VI1, VT0
-    vfmaxa.s VM0, x3, x4
-    vfcmp.ceq.s VT0, x3, VM0
-    vbitsel.v VINC8, VI4, VI3, VT0
-    vfmaxa.s VM0, VM0, VM1
-    vfcmp.ceq.s VT0, VM0, VM1
-    vbitsel.v VI0, VINC8, VINC4, VT0
-    li.d TEMP, 1 //处理尾数相等时取最小序号
-    movgr2fr.w $f17, TEMP
-    ffint.s.w $f17, $f17
-    vfcmp.ceq.s VT0, VM0, x1
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L26
-    vfcmp.clt.s VT0, VI1, VI0
-    vbitsel.v VI0, VI0, VI1, VT0
-    .align 3
-
-.L26:
-    vfcmp.ceq.s VT0, VM0, x2
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L27
-    vfcmp.clt.s VT0, VI2, VI0
-    vbitsel.v VI0, VI0, VI2, VT0
-    .align 3
-
-.L27:
-    vfcmp.ceq.s VT0, VM0, x3
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L28
-    vfcmp.clt.s VT0, VI3, VI0
-    vbitsel.v VI0, VI0, VI3, VT0
-    .align 3
-
-.L28:
-    vfcmp.ceq.s VT0, VM0, x4
-    fcmp.ceq.s $fcc0, $f23, $f17
-    bceqz $fcc0, .L29
-    vfcmp.clt.s VT0, VI4, VI0
-    vbitsel.v VI0, VI0, VI4, VT0
-    .align 3
-
-.L29:
-    movfr2gr.s i0, $f20
-    .align 3
-
-.L21: //N<8
-    andi I, N, 7
-    bge $r0, I, .L999
-    srai.d i1, N, 3
-    slli.d i1, i1, 3
-    addi.d i1, i1, 1 //current index
-    movgr2fr.d $f21, i1
-    movgr2fr.d $f20, i0
-    .align 3
-
-.L22:
-    fld.s $f9, X, 0
-    addi.d I, I, -1
-    vfmaxa.s VM1, x1, VM0
-    vfcmp.ceq.s VT0, VM0, VM1
-    add.d  X, X, INCX
-    vbitsel.v VM0, VM1, VM0, VT0
-    vbitsel.v VI0, VI1, VI0, VT0
-    addi.d i1, i1, 1
-    movgr2fr.d $f21, i1
-    blt $r0, I, .L22
-    movfr2gr.s i0, $f20
-    .align 3
-
-.L999:
-    move $r4, $r17
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE

From 179ed51d3b2df5e0df8a28d184bd169efa7f2b61 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Thu, 21 Dec 2023 14:18:39 +0800
Subject: [PATCH 519/718] Add dgemm_kernel_8x4.S file.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   14 +
 kernel/loongarch64/dgemm_kernel_8x4.S    | 2894 ++++++++++++++++++++++
 param.h                                  |    4 +-
 3 files changed, 2910 insertions(+), 2 deletions(-)
 create mode 100644 kernel/loongarch64/dgemm_kernel_8x4.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index a8a6dd82f..a78c0dbc5 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -58,4 +58,18 @@ DROTKERNEL  =  rot_lsx.S
 SNRM2KERNEL =  snrm2_lsx.S
 DNRM2KERNEL =  dnrm2_lsx.S
 
+DGEMMKERNEL    = dgemm_kernel_8x4.S
+DGEMMINCOPY    = ../generic/gemm_ncopy_8.c
+DGEMMITCOPY    = ../generic/gemm_tcopy_8.c
+DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
+DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 endif
diff --git a/kernel/loongarch64/dgemm_kernel_8x4.S b/kernel/loongarch64/dgemm_kernel_8x4.S
new file mode 100644
index 000000000..405f1bd97
--- /dev/null
+++ b/kernel/loongarch64/dgemm_kernel_8x4.S
@@ -0,0 +1,2894 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA  $f0   // param 4: alpha
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+
+#ifdef TRMMKERNEL
+#define OFFSET $r11  // param 9: offset
+#endif
+#define OFF    $r12
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25 /* !! DO NOT USE $r21 and $r22 !! */
+#define T1     $r26
+#define T2     $r27
+#define ZERO   $r0
+
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+#define D12    $vr28
+#define D13    $vr29
+#define D14    $vr30
+#define D15    $vr31
+#define VALPHA $vr15
+
+/* Prefetch interval */
+#define A_PRE  0x200
+#define B_PRE  0x100
+
+.macro KERNEL2x8x4
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vld     U3,   A0,    0x30
+    vfmadd.d  D6,  U10, U13, D6
+    vfmadd.d  D7,  U11, U13, D7
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D8,  U8, U14, D8
+    vfmadd.d  D9,  U9, U14, D9
+
+    preld     0,   B0, B_PRE
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D10, U10, U14, D10
+    vfmadd.d  D11, U11, U14, D11
+
+    preld     0,   A0, A_PRE
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D12, U8, U15, D12
+    vfmadd.d  D13, U9, U15, D13
+
+    preld     0,   A0, A_PRE + 0x40
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D14, U10, U15, D14
+    vfmadd.d  D15, U11, U15, D15
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
+    vld     U8,   A0,    0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vld     U9,   A0,    0x10
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vld     U10,  A0,    0x20
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vld     U11,  A0,    0x30
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+
+    vldrepl.d U12,  B0, 0x00
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+
+    preld     0,   B0, B_PRE
+    vldrepl.d U13,  B0, 0x08
+    vfmadd.d  D10, U2, U6, D10
+    vfmadd.d  D11, U3, U6, D11
+
+    preld     0,   A0, A_PRE
+    vldrepl.d U14,  B0, 0x10
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+
+    preld     0,   A0, A_PRE + 0x40
+    vldrepl.d U15,  B0, 0x18
+    vfmadd.d  D14, U2, U7, D14
+    vfmadd.d  D15, U3, U7, D15
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x8x4_END
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vld     U3,   A0,    0x30
+    vfmadd.d  D6,  U10, U13, D6
+    vfmadd.d  D7,  U11, U13, D7
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D8,  U8, U14, D8
+    vfmadd.d  D9,  U9, U14, D9
+
+    preld     0,   B0, B_PRE
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D10, U10, U14, D10
+    vfmadd.d  D11, U11, U14, D11
+
+    preld     0,   A0, A_PRE
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D12, U8, U15, D12
+    vfmadd.d  D13, U9, U15, D13
+
+    preld     0,   A0, A_PRE + 0x40
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D14, U10, U15, D14
+    vfmadd.d  D15, U11, U15, D15
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+
+    preld     0,   B0, B_PRE
+    vfmadd.d  D10, U2, U6, D10
+    vfmadd.d  D11, U3, U6, D11
+
+    preld     0,   A0, A_PRE
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+
+    preld     0,   A0, A_PRE + 0x40
+    vfmadd.d  D14, U2, U7, D14
+    vfmadd.d  D15, U3, U7, D15
+.endm
+
+.macro KERNEL8x8x4
+.rept 4
+    KERNEL2x8x4
+.endr
+.endm
+
+.macro KERNEL8x8x4_END
+.rept 3
+    KERNEL2x8x4
+.endr
+    KERNEL2x8x4_END
+.endm
+
+.macro KERNEL2x4x4
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D8,  U8, U14, D8
+    vfmadd.d  D9,  U9, U14, D9
+
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D12, U8, U15, D12
+    vfmadd.d  D13, U9, U15, D13
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+
+    vldrepl.d U12, B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vldrepl.d U13, B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vldrepl.d U14, B0, 0x10
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+
+    vldrepl.d U15,  B0, 0x18
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x4x4_END
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D8,  U8, U14, D8
+    vfmadd.d  D9,  U9, U14, D9
+
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D12, U8, U15, D12
+    vfmadd.d  D13, U9, U15, D13
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+.endm
+
+.macro KERNEL8x4x4
+.rept 4
+    KERNEL2x4x4
+.endr
+.endm
+
+.macro KERNEL8x4x4_END
+.rept 3
+    KERNEL2x4x4
+.endr
+    KERNEL2x4x4_END
+.endm
+
+.macro KERNEL2x2x4
+    vldrepl.d     U0,   A0,    0x00
+    vldrepl.d     U1,   A0,    0x08
+
+    vfmadd.d   D0,     U8,     U12,    D0
+    vfmadd.d   D1,     U8,     U13,    D1
+    vfmadd.d   D2,     U9,     U12,    D2
+    vfmadd.d   D3,     U9,     U13,    D3
+
+    vld    U4,  B0, 0x00
+    vld    U5,  B0, 0x10
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
+
+    vldrepl.d     U8,   A0,    0x00
+    vldrepl.d     U9,   A0,    0x08
+
+    vfmadd.d   D0,     U0,     U4,    D0
+    vfmadd.d   D1,     U0,     U5,    D1
+    vfmadd.d   D2,     U1,     U4,    D2
+    vfmadd.d   D3,     U1,     U5,    D3
+
+    vld       U12, B0, 0x00
+    vld       U13, B0, 0x10
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x2x4_END
+    vldrepl.d     U0,   A0,    0x00
+    vldrepl.d     U1,   A0,    0x08
+
+    vfmadd.d   D0,     U8,     U12,    D0
+    vfmadd.d   D1,     U8,     U13,    D1
+    vfmadd.d   D2,     U9,     U12,    D2
+    vfmadd.d   D3,     U9,     U13,    D3
+
+    vld     U4,  B0, 0x00
+    vld     U5,  B0, 0x10
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
+
+    vfmadd.d   D0,     U0,     U4,    D0
+    vfmadd.d   D1,     U0,     U5,    D1
+    vfmadd.d   D2,     U1,     U4,    D2
+    vfmadd.d   D3,     U1,     U5,    D3
+.endm
+
+.macro KERNEL8x2x4
+.rept 4
+    KERNEL2x2x4
+.endr
+.endm
+
+.macro KERNEL8x2x4_END
+.rept 3
+    KERNEL2x2x4
+.endr
+    KERNEL2x2x4_END
+.endm
+
+.macro KERNEL2x1x4
+    vldrepl.d  U0,     A0,     0x00
+    vfmadd.d   D0,     U8,     U12,    D0
+    vfmadd.d   D1,     U8,     U13,    D1
+    vld        U4,     B0,     0x00
+    vld        U5,     B0,     0x10
+
+    vldrepl.d  U8,     A0,     0x08
+    vfmadd.d   D0,     U0,     U4,     D0
+    vfmadd.d   D1,     U0,     U5,     D1
+    vld        U12,    B0,     0x20
+    vld        U13,    B0,     0x30
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x40
+.endm
+
+.macro KERNEL2x1x4_END
+    vldrepl.d  U0,     A0,     0x00
+    vfmadd.d   D0,     U8,     U12,    D0
+    vfmadd.d   D1,     U8,     U13,    D1
+    vld        U4,     B0,     0x00
+    vld        U5,     B0,     0x10
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+
+    vfmadd.d   D0,     U0,     U4,     D0
+    vfmadd.d   D1,     U0,     U5,     D1
+.endm
+
+.macro KERNEL8x1x4
+.rept 4
+    KERNEL2x1x4
+.endr
+.endm
+
+.macro KERNEL8x1x4_END
+.rept 3
+    KERNEL2x1x4
+.endr
+    KERNEL2x1x4_END
+.endm
+
+.macro KERNEL2x8x2
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vld     U3,   A0,    0x30
+    vfmadd.d  D6,  U10, U13, D6
+    vfmadd.d  D7,  U11, U13, D7
+
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    vld     U8,   A0,    0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vld     U9,   A0,    0x10
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vld     U10,   A0,   0x20
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vld     U11,   A0,   0x30
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+
+    vldrepl.d U12,  B0, 0x00
+    vldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x8x2_END
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vld     U3,   A0,    0x30
+    vfmadd.d  D6,  U10, U13, D6
+    vfmadd.d  D7,  U11, U13, D7
+
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+.endm
+
+.macro KERNEL8x8x2
+.rept 4
+    KERNEL2x8x2
+.endr
+.endm
+
+.macro KERNEL8x8x2_END
+.rept 3
+    KERNEL2x8x2
+.endr
+    KERNEL2x8x2_END
+.endm
+
+.macro KERNEL2x4x2
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+
+    vld     U8,   A0,    0x20
+    vld     U9,   A0,    0x30
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vldrepl.d U12,  B0, 0x10
+    vldrepl.d U13,  B0, 0x18
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x4x2_END
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+.endm
+
+.macro KERNEL8x4x2
+.rept 4
+    KERNEL2x4x2
+.endr
+.endm
+
+.macro KERNEL8x4x2_END
+.rept 3
+    KERNEL2x4x2
+.endr
+    KERNEL2x4x2_END
+.endm
+
+.macro KERNEL2x2x2
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D4,  U8, U13, D4
+
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+
+    vld     U8,   A0,    0x10
+    vldrepl.d U12,  B0, 0x10
+    vldrepl.d U13,  B0, 0x18
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D4,  U0, U5, D4
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x2x2_END
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D4,  U8, U13, D4
+
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D4,  U0, U5, D4
+.endm
+
+.macro KERNEL8x2x2
+.rept 4
+    KERNEL2x2x2
+.endr
+.endm
+
+.macro KERNEL8x2x2_END
+.rept 3
+    KERNEL2x2x2
+.endr
+    KERNEL2x2x2_END
+.endm
+
+.macro KERNEL2x1x2
+    vldrepl.d U0,  A0, 0x00
+    vfmadd.d  D0,  U8, U12, D0
+
+    vld       U4,  B0, 0x00
+    vldrepl.d U8,  A0, 0x08
+    vld       U12, B0, 0x10
+    vfmadd.d  D0,  U0, U4, D0
+
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x1x2_END
+    vldrepl.d U0,  A0, 0x00
+    vfmadd.d  D0,  U8, U12, D0
+
+    vld       U4,  B0, 0x00
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x10
+
+    vfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x1x2
+.rept 4
+    KERNEL2x1x2
+.endr
+.endm
+
+.macro KERNEL8x1x2_END
+.rept 3
+    KERNEL2x1x2
+.endr
+    KERNEL2x1x2_END
+.endm
+
+.macro KERNEL2x8x1
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vldrepl.d U4,  B0, 0x00
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vld     U8,   A0,    0x40
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vld     U9,   A0,    0x50
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vld     U10,   A0,    0x60
+    vld     U11,   A0,    0x70
+
+    vldrepl.d U12,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x8x1_END
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+.endm
+
+.macro KERNEL8x8x1
+.rept 4
+    KERNEL2x8x1
+.endr
+.endm
+
+.macro KERNEL8x8x1_END
+.rept 3
+    KERNEL2x8x1
+.endr
+    KERNEL2x8x1_END
+.endm
+
+.macro KERNEL2x4x1
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    vldrepl.d U4,  B0, 0x00
+
+    vld     U8,   A0,    0x20
+    vld     U9,   A0,    0x30
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vldrepl.d U12,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x4x1_END
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    vldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+.endm
+
+.macro KERNEL8x4x1
+.rept 4
+    KERNEL2x4x1
+.endr
+.endm
+
+.macro KERNEL8x4x1_END
+.rept 3
+    KERNEL2x4x1
+.endr
+    KERNEL2x4x1_END
+.endm
+
+.macro KERNEL2x2x1
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+
+    vld     U8,   A0,    0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x2x1_END
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+
+    vfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x2x1
+.rept 4
+    KERNEL2x2x1
+.endr
+.endm
+
+.macro KERNEL8x2x1_END
+.rept 3
+    KERNEL2x2x1
+.endr
+    KERNEL2x2x1_END
+.endm
+
+.macro KERNEL2x1x1
+    vldrepl.d  U0,  A0,    0x00
+    vfmadd.d   D0,  U8, U12, D0
+    vldrepl.d  U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    vldrepl.d  U8,  A0,    0x00
+    vfmadd.d   D0,  U0, U4, D0
+    vldrepl.d  U12, B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x1x1_END
+    vldrepl.d  U0,  A0,    0x00
+    vfmadd.d   D0,  U8, U12, D0
+    vldrepl.d  U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    vfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x1x1
+.rept 4
+    KERNEL2x1x1
+.endr
+.endm
+
+.macro KERNEL8x1x1_END
+.rept 3
+    KERNEL2x1x1
+.endr
+    KERNEL2x1x1_END
+.endm
+
+
+    PROLOGUE
+
+    addi.d   $sp,   $sp,   -112
+    /* Store regs */
+    SDARG    $r23,  $sp,   0
+    SDARG    $r24,  $sp,   8
+    SDARG    $r25,  $sp,   16
+    SDARG    $r26,  $sp,   24
+    SDARG    $r27,  $sp,   32
+    ST       $f24,  $sp,   40
+    ST       $f25,  $sp,   48
+    ST       $f26,  $sp,   56
+    ST       $f27,  $sp,   64
+    ST       $f28,  $sp,   72
+    ST       $f29,  $sp,   80
+    ST       $f30,  $sp,   88
+    ST       $f31,  $sp,   96
+    ST       ALPHA, $sp,   104
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d   OFF,   ZERO,  OFFSET
+#else
+    xor     OFF,   OFF,   OFF
+#endif
+
+    /* if (!(N >> 2)) goto L_N3 */
+    srai.d   J,     N,     2     /* J = bn >> 2 */
+    andi     N,     N,     0x03
+    vldrepl.d  VALPHA, $sp, 104 /* When N < 4, VALPHA will not changed */
+    beq      ZERO,  J,     .L_N3
+
+.L_J1: /* J-- && This loop include Condition 1 */
+
+/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! *************************
+*                                                   dgemm_core_16x4                                */
+    move     C0,    C
+    move     A0,    A
+    slli.d   T0,    LDC,   3
+    add.d    C1,    C0,    T0
+    addi.d   J,     J,     -1   /* J-- */
+    add.d    C2,    C1,    T0
+    add.d    C3,    C2,    T0
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 3)) goto L_M8 */
+    srai.d   I,     M,     3     /* I = bm >> 3 */
+    beq      ZERO,  I,     .L_M8
+
+.L_I1: /* I-- */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x06
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  8
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    /* Calculate the first set of D0~D15,
+     * avoidig set 0 operation
+     * Load 8 * 64 from A0
+     * U0 = {a1,  a0}
+     * U1 = {a3,  a2}
+     * U2 = {a5,  a4}
+     * U3 = {a7,  a6}
+     */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d     U4, B0, 0x00
+    preld         0,  C0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+    preld    0,   C0, 0x20
+    vfmul.d  D2,  U2, U4
+    vfmul.d  D3,  U3, U4
+
+    vldrepl.d     U5, B0, 0x08
+    preld         0,  C1, 0x00
+    /* line 2 */
+    vfmul.d  D4,  U0, U5
+    vfmul.d  D5,  U1, U5
+    preld    0,   C1, 0x20
+    vfmul.d  D6,  U2, U5
+    vfmul.d  D7,  U3, U5
+
+    vldrepl.d     U6, B0, 0x10
+    preld         0,  C2, 0x00
+    /* line 3 */
+    vfmul.d  D8,  U0, U6
+    vfmul.d  D9,  U1, U6
+    preld    0,   C2, 0x20
+    vfmul.d  D10, U2, U6
+    vfmul.d  D11, U3, U6
+
+    vldrepl.d     U7, B0, 0x18
+    preld         0,  C3, 0x00
+    /* line 4 */
+    vfmul.d  D12, U0, U7
+    vfmul.d  D13, U1, U7
+    preld    0,   C3, 0x20
+    vfmul.d  D14, U2, U7
+    vfmul.d  D15, U3, U7
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x40
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_L7 */
+    beq       ZERO,TL, .L_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+    vld     U10,  A0,    0x20
+    vld     U11,  A0,    0x30
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    vldrepl.d  U14,   B0,  0x10
+    vldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_TL1_END
+.L_TL1: /* TL-- */
+    KERNEL8x8x4
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_TL1
+
+.L_TL1_END:
+    KERNEL8x8x4_END
+
+   /* Maybe we need calculate the last
+    * 7 sets of D0~D15?
+    */
+.L_L7:
+    /* if (!(L & 7)) goto L_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_L0
+
+.L_L71:
+    /* Load 16 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    /* Cumulative D0~D15 */
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+    vfmadd.d  D10, U2, U6, D10
+    vfmadd.d  D11, U3, U6, D11
+
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+    vfmadd.d  D14, U2, U7, D14
+    vfmadd.d  D15, U3, U7, D15
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_L71
+
+.L_L0:
+    vldrepl.d  VALPHA, $sp, 104
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D2,   D2,  VALPHA
+    vfmul.d  D3,   D3,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+    vfmul.d  D5,   D5,  VALPHA
+    vfmul.d  D6,   D6,  VALPHA
+    vfmul.d  D7,   D7,  VALPHA
+    vfmul.d  D8,   D8,  VALPHA
+    vfmul.d  D9,   D9,  VALPHA
+    vfmul.d  D10,  D10, VALPHA
+    vfmul.d  D11,  D11, VALPHA
+    vfmul.d  D12,  D12, VALPHA
+    vfmul.d  D13,  D13, VALPHA
+    vfmul.d  D14,  D14, VALPHA
+    vfmul.d  D15,  D15, VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vld      U2,  C0,  0x20
+    vld      U3,  C0,  0x30
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+    vfmadd.d D2,  D2,  VALPHA,  U2
+    vfmadd.d D3,  D3,  VALPHA,  U3
+
+    /* Load C1  */
+    vld      U4,  C1,  0x00
+    vld      U5,  C1,  0x10
+    vld      U6,  C1,  0x20
+    vld      U7,  C1,  0x30
+    vfmadd.d D4,  D4,  VALPHA,  U4
+    vfmadd.d D5,  D5,  VALPHA,  U5
+    vfmadd.d D6,  D6,  VALPHA,  U6
+    vfmadd.d D7,  D7,  VALPHA,  U7
+
+    /* Load C2  */
+    vld      U8,  C2,  0x00
+    vld      U9,  C2,  0x10
+    vld      U10, C2,  0x20
+    vld      U11, C2,  0x30
+    vfmadd.d D8,  D8,  VALPHA,  U8
+    vfmadd.d D9,  D9,  VALPHA,  U9
+    vfmadd.d D10, D10, VALPHA,  U10
+    vfmadd.d D11, D11, VALPHA,  U11
+
+    /* Load C3  */
+    vld      U0,  C3,  0x00
+    vld      U1,  C3,  0x10
+    vld      U2,  C3,  0x20
+    vld      U3,  C3,  0x30
+    vfmadd.d D12, D12, VALPHA,  U0
+    vfmadd.d D13, D13, VALPHA,  U1
+    vfmadd.d D14, D14, VALPHA,  U2
+    vfmadd.d D15, D15, VALPHA,  U3
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    vst      D2,  C0,  0x20
+    vst      D3,  C0,  0x30
+    /* Store C1 */
+    vst      D4,  C1,  0x00
+    vst      D5,  C1,  0x10
+    vst      D6,  C1,  0x20
+    vst      D7,  C1,  0x30
+    /* Store C2 */
+    vst      D8,  C2,  0x00
+    vst      D9,  C2,  0x10
+    vst      D10, C2,  0x20
+    vst      D11, C2,  0x30
+    /* Store C3 */
+    vst      D12, C3,  0x00
+    vst      D13, C3,  0x10
+    vst      D14, C3,  0x20
+    vst      D15, C3,  0x30
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x40
+    addi.d    C1,  C1,  0x40
+    addi.d    C2,  C2,  0x40
+    addi.d    C3,  C3,  0x40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values  in A */
+    addi.d    L,   L,   -8
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x06
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF, OFF, 0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_I1
+
+.L_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   7
+    beq       ZERO,I,   .L_M0
+
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  4
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 4 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+
+    vldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    vfmul.d  D4,  U0, U5
+    vfmul.d  D5,  U1, U5
+
+    vldrepl.d     U6, B0, 0x10
+    /* line 3 */
+    vfmul.d  D8,  U0, U6
+    vfmul.d  D9,  U1, U6
+
+    vldrepl.d     U7, B0, 0x18
+    /* line 4 */
+    vfmul.d  D12, U0, U7
+    vfmul.d  D13, U1, U7
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x20
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M4_L7 */
+    beq       ZERO,TL, .L_M4_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    vldrepl.d  U14,   B0,  0x10
+    vldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_M4_TL1_END
+
+.L_M4_TL1: /* TL-- */
+    KERNEL8x4x4
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_TL1
+
+.L_M4_TL1_END:
+    KERNEL8x4x4_END
+
+.L_M4_L7:
+    /* if (!(L & 7)) goto L_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M4_L0
+
+.L_M4_L71:
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_L71
+
+.L_M4_L0:
+    vldrepl.d  VALPHA, $sp, 104
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+    vfmul.d  D5,   D5,  VALPHA
+    vfmul.d  D8,   D8,  VALPHA
+    vfmul.d  D9,   D9,  VALPHA
+    vfmul.d  D12,  D12, VALPHA
+    vfmul.d  D13,  D13, VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+
+    /* Load C1  */
+    vld      U2,  C1,  0x00
+    vld      U3,  C1,  0x10
+    vfmadd.d D4,  D4,  VALPHA,  U2
+    vfmadd.d D5,  D5,  VALPHA,  U3
+
+    /* Load C2  */
+    vld      U4,  C2,  0x00
+    vld      U5,  C2,  0x10
+    vfmadd.d D8,  D8,  VALPHA,  U4
+    vfmadd.d D9,  D9,  VALPHA,  U5
+
+    /* Load C3  */
+    vld      U6,  C3,  0x00
+    vld      U7,  C3,  0x10
+    vfmadd.d D12, D12, VALPHA,  U6
+    vfmadd.d D13, D13, VALPHA,  U7
+#endif   // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    /* Store C1 */
+    vst      D4,  C1,  0x00
+    vst      D5,  C1,  0x10
+    /* Store C2 */
+    vst      D8,  C2,  0x00
+    vst      D9,  C2,  0x10
+    /* Store C3 */
+    vst      D12, C3,  0x00
+    vst      D13, C3,  0x10
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x20
+    addi.d    C1,  C1,  0x20
+    addi.d    C2,  C2,  0x20
+    addi.d    C3,  C3,  0x20
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -4
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x05
+    add.d     A0,  A0, T0
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N >> 2 ) && (M & 4) ) End************/
+
+.L_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x04
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  2
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 2 * 64 from A0 */
+    vldrepl.d     U0,   A0,    0x00
+    vldrepl.d     U1,   A0,    0x08
+
+    vld    U4,     B0,     0x00
+    vld    U5,     B0,     0x10
+
+    vfmul.d    D0,     U0,     U4
+    vfmul.d    D1,     U0,     U5
+    vfmul.d    D2,     U1,     U4
+    vfmul.d    D3,     U1,     U5
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M2_L7 */
+    beq       ZERO,TL, .L_M2_L7
+
+    vldrepl.d     U8,   A0,    0x00
+    vldrepl.d     U9,   A0,    0x08
+
+    addi.d    TL,  TL,  -1
+
+    vld       U12, B0, 0x00
+    vld       U13, B0, 0x10
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_M2_TL1_END
+.L_M2_TL1: /* TL-- */
+    KERNEL8x2x4
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M2_TL1
+.L_M2_TL1_END:
+    KERNEL8x2x4_END
+
+.L_M2_L7:
+    /* if (!(L & 7)) goto L_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M2_L0
+
+.L_M2_L71:
+    vldrepl.d     U0,   A0,    0x00
+    vldrepl.d     U1,   A0,    0x08
+
+    vld    U4,  B0, 0x00
+    vld    U5,  B0, 0x10
+
+    vfmadd.d   D0,     U0,     U4,    D0
+    vfmadd.d   D1,     U0,     U5,    D1
+    vfmadd.d   D2,     U1,     U4,    D2
+    vfmadd.d   D3,     U1,     U5,    D3
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x20
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M2_L71
+
+.L_M2_L0:
+    vldrepl.d  VALPHA, $sp, 104
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D2,   D2,  VALPHA
+    vfmul.d  D3,   D3,  VALPHA
+
+    vstelm.d  D0,  C0,  0x00,    0x00
+    vstelm.d  D0,  C1,  0x00,    0x01
+    vstelm.d  D1,  C2,  0x00,    0x00
+    vstelm.d  D1,  C3,  0x00,    0x01
+    vstelm.d  D2,  C0,  0x08,    0x00
+    vstelm.d  D2,  C1,  0x08,    0x01
+    vstelm.d  D3,  C2,  0x08,    0x00
+    vstelm.d  D3,  C3,  0x08,    0x01
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    /* Load C1  */
+    vld      U1,  C1,  0x00
+    /* Load C2  */
+    vld      U2,  C2,  0x00
+    /* Load C3  */
+    vld      U3,  C3,  0x00
+
+    vilvl.d  D4,  D2,  D0   //C0
+    vilvh.d  D5,  D2,  D0   //C1
+    vilvl.d  D6,  D3,  D1   //C2
+    vilvh.d  D7,  D3,  D1   //C3
+
+    vfmadd.d D0,  D4,  VALPHA,  U0
+    vfmadd.d D2,  D5,  VALPHA,  U1
+    vfmadd.d D1,  D6,  VALPHA,  U2
+    vfmadd.d D3,  D7,  VALPHA,  U3
+
+    vst      D0,  C0,  0x00
+    vst      D2,  C1,  0x00
+    vst      D1,  C2,  0x00
+    vst      D3,  C3,  0x00
+#endif   // #if defined(TRMMKERNEL)
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x10
+    addi.d    C1,  C1,  0x10
+    addi.d    C2,  C2,  0x10
+    addi.d    C3,  C3,  0x10
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -2
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x04
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N >> 2 ) && (M & 2) ) End************/
+
+.L_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x03
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  1
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    vldrepl.d  U0,     A0,     0x00
+    vld        U4,     B0,     0x00
+    vld        U5,     B0,     0x10
+    vfmul.d    D0,     U0,     U4
+    vfmul.d    D1,     U0,     U5
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M1_L7 */
+    beq       ZERO,TL, .L_M1_L7
+
+    vldrepl.d  U8,     A0,     0x00
+
+    addi.d     TL,  TL,  -1
+    vld        U12, B0,  0x00
+    vld        U13, B0,  0x10
+    addi.d     A0,  A0,  0x08
+    addi.d     B0,  B0,  0x20
+
+    beq    ZERO,    TL,  .L_M1_TL1_END
+
+.L_M1_TL1: /* TL-- */
+    KERNEL8x1x4
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M1_TL1
+.L_M1_TL1_END:
+    KERNEL8x1x4_END
+
+.L_M1_L7:
+    /* if (!(L & 7)) goto L_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M1_L0
+
+.L_M1_L71:
+    vldrepl.d  U0,     A0,     0x00
+    vld        U4,     B0,     0x00
+    vld        U5,     B0,     0x10
+    vfmadd.d   D0,     U0,     U4,    D0
+    vfmadd.d   D1,     U0,     U5,    D1
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M1_L71
+
+.L_M1_L0:
+    vldrepl.d  VALPHA, $sp, 104
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+
+    vstelm.d D0,  C0,  0x00,    0x00
+    vstelm.d D0,  C1,  0x00,    0x01
+    vstelm.d D1,  C2,  0x00,    0x00
+    vstelm.d D1,  C3,  0x00,    0x01
+#else
+    /* Load C0  */
+    vldrepl.d     U0,  C0,  0x00
+    vldrepl.d     U1,  C1,  0x00
+    vilvl.d       D4,  U1,  U0
+    vfmadd.d D6,  D0,  VALPHA,  D4
+
+    vldrepl.d     U2,  C2,  0x00
+    vldrepl.d     U3,  C3,  0x00
+    vilvl.d       D5,  U3,  U2
+    vfmadd.d D7,  D1,  VALPHA,  D5
+
+    vstelm.d D6,  C0,  0x00,    0x00
+    vstelm.d D6,  C1,  0x00,    0x01
+    vstelm.d D7,  C2,  0x00,    0x00
+    vstelm.d D7,  C3,  0x00,    0x01
+#endif   // #if defined(TRMMKERNEL)
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x08
+    addi.d    C1,  C1,  0x08
+    addi.d    C2,  C2,  0x08
+    addi.d    C3,  C3,  0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -1
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x03
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N >> 2 ) && (M & 1) ) End************/
+
+.L_M0:
+    /* Add stride for B and C
+     * B += (K * 32)
+     * C += (LDC * 32)
+     */
+    /* since the array type is double,
+     * so we must mul 32
+     */
+    slli.d    T0,   K,   5
+    slli.d    T1,   LDC, 5
+    add.d     B,    B,   T0
+    add.d     C,    C,   T1
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d    OFF,  OFF, 0x04
+#endif
+
+    blt      ZERO,  J,   .L_J1
+
+//////////////// go back to L_J1 /////////////////
+/////////////////////////////////////////////////
+/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/
+
+    vldrepl.d  VALPHA, $sp, 104
+
+.L_N3:
+    andi     J,    N,   2
+    beq      ZERO, J,   .L_N1
+
+/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! *************************
+*                                                   dgemm_core_16x2                                */
+
+    move     C0,    C
+    move     A0,    A
+    slli.d   T0,    LDC,   3
+    add.d    C1,    C0,    T0
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 3)) goto L_N3_M8 */
+    srai.d   I,     M,     3     /* I = bm >> 3 */
+    beq      ZERO,  I,     .L_N3_M8
+
+.L_N3_I1:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x06
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x04
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  8
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 8 * 64 from A0
+     * U0 = {a1,  a0}
+     * U1 = {a3,  a2}
+     * U2 = {a5,  a4}
+     * U3 = {a7,  a6}
+     */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+    vfmul.d  D2,  U2, U4
+    vfmul.d  D3,  U3, U4
+
+    vldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    vfmul.d  D4,  U0, U5
+    vfmul.d  D5,  U1, U5
+    vfmul.d  D6,  U2, U5
+    vfmul.d  D7,  U3, U5
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x40
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_L7 */
+    beq       ZERO,TL, .L_N3_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+    vld     U10,  A0,    0x20
+    vld     U11,  A0,    0x30
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_N3_TL1_END
+
+.L_N3_TL1: /* TL-- */
+    KERNEL8x8x2
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N3_TL1
+.L_N3_TL1_END:
+    KERNEL8x8x2_END
+
+.L_N3_L7:
+    /* if (!(L & 7)) goto L_N3_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N3_L0
+
+.L_N3_L71:
+    /* Load 16 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N3_L71
+
+.L_N3_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D2,   D2,  VALPHA
+    vfmul.d  D3,   D3,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+    vfmul.d  D5,   D5,  VALPHA
+    vfmul.d  D6,   D6,  VALPHA
+    vfmul.d  D7,   D7,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vld      U2,  C0,  0x20
+    vld      U3,  C0,  0x30
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+    vfmadd.d D2,  D2,  VALPHA,  U2
+    vfmadd.d D3,  D3,  VALPHA,  U3
+
+    /* Load C1  */
+    vld      U4,  C1,  0x00
+    vld      U5,  C1,  0x10
+    vld      U6,  C1,  0x20
+    vld      U7,  C1,  0x30
+    vfmadd.d D4,  D4,  VALPHA,  U4
+    vfmadd.d D5,  D5,  VALPHA,  U5
+    vfmadd.d D6,  D6,  VALPHA,  U6
+    vfmadd.d D7,  D7,  VALPHA,  U7
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    vst      D2,  C0,  0x20
+    vst      D3,  C0,  0x30
+    /* Store C1 */
+    vst      D4,  C1,  0x00
+    vst      D5,  C1,  0x10
+    vst      D6,  C1,  0x20
+    vst      D7,  C1,  0x30
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x40
+    addi.d    C1,  C1,  0x40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -8
+#else
+    addi.d    L,   L,   -2
+#endif
+    slli.d    T0,  L,  0x06
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x04
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x8
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N3_I1
+
+.L_N3_M8:
+    /* We have done M & 8, considering M=4/2/1 */
+    andi      I,   M,   7
+    beq       ZERO,I,   .L_N3_M0
+
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N3_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x04
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  4
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 4 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+
+    vldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    vfmul.d  D4,  U0, U5
+    vfmul.d  D5,  U1, U5
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x20
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M4_L7 */
+    beq       ZERO,TL, .L_N3_M4_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_N3_M4_TL1_END
+
+.L_N3_M4_TL1: /* TL-- */
+    KERNEL8x4x2
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N3_M4_TL1
+.L_N3_M4_TL1_END:
+    KERNEL8x4x2_END
+
+.L_N3_M4_L7:
+    /* if (!(L & 7)) goto L_N3_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N3_M4_L0
+
+.L_N3_M4_L71:
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N3_M4_L71
+
+.L_N3_M4_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+    vfmul.d  D5,   D5,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+
+    /* Load C1  */
+    vld      U2,  C1,  0x00
+    vld      U3,  C1,  0x10
+    vfmadd.d D4,  D4,  VALPHA,  U2
+    vfmadd.d D5,  D5,  VALPHA,  U3
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    /* Store C1 */
+    vst      D4,  C1,  0x00
+    vst      D5,  C1,  0x10
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x20
+    addi.d    C1,  C1,  0x20
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -4
+#else
+    addi.d    L,   L,   -2
+#endif
+    slli.d    T0,  L,  0x05
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x04
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 2 ) && (M & 4) ) End************/
+
+.L_N3_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N3_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x04
+    add.d    A0,    A0,   T0
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  2
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 2 * 64 from A0 */
+    vld     U0,   A0,    0x00
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+
+    vldrepl.d     U4, B0, 0x08
+    /* line 2 */
+    vfmul.d  D4,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M2_L7 */
+    beq       ZERO,TL, .L_N3_M2_L7
+
+    vld     U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_N3_M2_TL1_END
+
+.L_N3_M2_TL1: /* TL-- */
+    KERNEL8x2x2
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N3_M2_TL1
+.L_N3_M2_TL1_END:
+    KERNEL8x2x2_END
+
+.L_N3_M2_L7:
+    /* if (!(L & 7)) goto L_N3_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N3_M2_L0
+
+.L_N3_M2_L71:
+    vld     U0,   A0,    0x00
+
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D0,  U0, U4, D0
+
+    vfmadd.d  D4,  U0, U5, D4
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N3_M2_L71
+
+.L_N3_M2_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+
+    /* Load C1  */
+    vld      U1,  C1,  0x00
+    vfmadd.d D4,  D4,  VALPHA,  U1
+#endif // #if defined(TRMMKERNEL)
+
+    vst       D0,  C0,  0x00
+    vst       D4,  C1,  0x00
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x10
+    addi.d    C1,  C1,  0x10
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -2
+#else
+    addi.d    L,   L,   -2
+#endif
+    slli.d    T0,  L,  0x04
+    add.d     A0,  A0, T0
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 2 ) && (M & 2) ) End************/
+
+.L_N3_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N3_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x03
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x04
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  1
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 1 * 64 from A0 */
+    vldrepl.d   U0,   A0,  0x00
+
+    vld         U4,   B0,  0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M1_L7 */
+    beq       ZERO,TL, .L_N3_M1_L7
+
+    vldrepl.d  U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    vld        U12,  B0,    0x00
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_N3_M1_TL1_END
+
+.L_N3_M1_TL1: /* TL-- */
+    KERNEL8x1x2
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N3_M1_TL1
+.L_N3_M1_TL1_END:
+    KERNEL8x1x2_END
+
+.L_N3_M1_L7:
+    /* if (!(L & 7)) goto L_N3_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N3_M1_L0
+
+.L_N3_M1_L71:
+    vldrepl.d  U0,  A0,    0x00
+
+    vld        U4,  B0,    0x00
+    vfmadd.d   D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N3_M1_L71
+
+.L_N3_M1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C1,  0x00
+    vilvl.d  U2,  U1,  U0
+    vfmadd.d D0,  D0,  VALPHA,  U2
+#endif // #if defined(TRMMKERNEL)
+
+    vstelm.d D0,  C0,  0x00,  0x00
+    vstelm.d D0,  C1,  0x00,  0x01
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x08
+    addi.d    C1,  C1,  0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -1
+#else
+    addi.d    L,   L,   -2
+#endif
+    slli.d    T0,  L,  0x03
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x04
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 2 ) && (M & 1) ) End************/
+
+.L_N3_M0:
+    /* Add stride for B and C
+     * B += (K * 16)
+     * C += (LDC * 16)
+     */
+    /* since the array type is double,
+     * so we must mul 16
+     */
+    slli.d    T0,   K,   4
+    slli.d    T1,   LDC, 4
+    add.d     B,    B,   T0
+    add.d     C,    C,   T1
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d    OFF,  OFF, 0x02
+#endif
+
+    /* We must reinit I */
+    srai.d   I,     M,   4     /* I = bm >> 4 */
+
+/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! *************************
+*                                                   dgemm_core_16x2                                */
+
+.L_N1:
+    andi     J,    N,   1
+    beq      ZERO, J,   .L_N0
+
+/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! *************************
+*                                                   dgemm_core_16x1                                */
+
+    move     C0,    C
+    move     A0,    A
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 3)) goto L_N1_M8 */
+    srai.d   I,     M,     3     /* I = bm >> 3 */
+    beq      ZERO,  I,     .L_N1_M8
+
+.L_N1_I1:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x06
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x03
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  8
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 8 * 64 from A0
+     * U0 = {a3,  a2}
+     * U1 = {a1,  a0}
+     * U2 = {a5,  a4}
+     * U3 = {a7,  a6}
+     */
+
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d   U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+    vfmul.d  D2,  U2, U4
+    vfmul.d  D3,  U3, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x40
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_L7 */
+    beq       ZERO,TL, .L_N1_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+    vld     U10,  A0,    0x20
+    vld     U11,  A0,    0x30
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_TL1_END
+.L_N1_TL1: /* TL-- */
+    KERNEL8x8x1
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_TL1
+.L_N1_TL1_END:
+    KERNEL8x8x1_END
+
+.L_N1_L7:
+    /* if (!(L & 7)) goto L_N1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_L0
+
+.L_N1_L71:
+    /* Load 16 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_L71
+
+.L_N1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D2,   D2,  VALPHA
+    vfmul.d  D3,   D3,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vld      U2,  C0,  0x20
+    vld      U3,  C0,  0x30
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+    vfmadd.d D2,  D2,  VALPHA,  U2
+    vfmadd.d D3,  D3,  VALPHA,  U3
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    vst      D2,  C0,  0x20
+    vst      D3,  C0,  0x30
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -8
+#else
+    addi.d    L,   L,   -1
+#endif
+    slli.d    T0,  L,  0x06
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x03
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x8
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N1_I1
+
+.L_N1_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   7
+    beq       ZERO,I,   .L_N1_M0
+
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N1_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x03
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  4
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 4 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x20
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M4_L7 */
+    beq       ZERO,TL, .L_N1_M4_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_M4_TL1_END
+
+.L_N1_M4_TL1: /* TL-- */
+    KERNEL8x4x1
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M4_TL1
+.L_N1_M4_TL1_END:
+    KERNEL8x4x1_END
+
+.L_N1_M4_L7:
+    /* if (!(L & 7)) goto L_N1_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M4_L0
+
+.L_N1_M4_L71:
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M4_L71
+
+.L_N1_M4_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x20
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -4
+#else
+    addi.d    L,   L,   -1
+#endif
+    slli.d    T0,  L,  0x05
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x03
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 1) && (M & 4) ) End************/
+
+.L_N1_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N1_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x04
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x03
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  2
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 2 * 64 from A0 */
+    vld     U0,   A0,    0x00
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M2_L7 */
+    beq       ZERO,TL, .L_N1_M2_L7
+
+    vld     U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_M2_TL1_END
+
+.L_N1_M2_TL1: /* TL-- */
+    KERNEL8x2x1
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M2_TL1
+.L_N1_M2_TL1_END:
+    KERNEL8x2x1_END
+
+.L_N1_M2_L7:
+    /* if (!(L & 7)) goto L_N1_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M2_L0
+
+.L_N1_M2_L71:
+    vld     U0,   A0,    0x00
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M2_L71
+
+.L_N1_M2_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+#endif // #if defined(TRMMKERNEL)
+
+    vstelm.d D0,  C0,  0x00,    0x00
+    vstelm.d D0,  C0,  0x08,    0x01
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x10
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -2
+#else
+    addi.d    L,   L,   -1
+#endif
+    slli.d    T0,  L,  0x04
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x03
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 1 ) && (M & 2) ) End************/
+
+.L_N1_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N1_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x03
+    add.d    A0,    A0,   T0
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  1
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 1 * 64 from A0 */
+    vldrepl.d  U0,   A0,    0x00
+
+    vldrepl.d  U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M1_L7 */
+    beq       ZERO,TL, .L_N1_M1_L7
+
+    vldrepl.d  U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_M1_TL1_END
+
+.L_N1_M1_TL1: /* TL-- */
+    KERNEL8x1x1
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M1_TL1
+.L_N1_M1_TL1_END:
+    KERNEL8x1x1_END
+
+.L_N1_M1_L7:
+    /* if (!(L & 7)) goto L_N1_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M1_L0
+
+.L_N1_M1_L71:
+    vldrepl.d  U0,   A0,    0x00
+
+    vldrepl.d  U4,  B0, 0x00
+    vfmadd.d   D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M1_L71
+
+.L_N1_M1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+#else
+    /* Load C0  */
+    vldrepl.d  U0,  C0,  0x00
+    vfmadd.d  D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+#endif // #if defined(TRMMKERNEL)
+
+    vstelm.d D0,  C0,  0x00,  0x00
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -1
+#else
+    addi.d    L,   L,   -1
+#endif
+    slli.d    T0,  L,  0x03
+    add.d     A0,  A0, T0
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 1 ) && (M & 1) ) End************/
+
+.L_N1_M0:
+
+/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! *************************
+*                                                   dgemm_core_16x1                                */
+
+.L_N0:
+    /* Restore regs */
+    LDARG    $r23,  $sp,   0
+    LDARG    $r24,  $sp,   8
+    LDARG    $r25,  $sp,   16
+    LDARG    $r26,  $sp,   24
+    LDARG    $r27,  $sp,   32
+    LD       $f24,  $sp,   40
+    LD       $f25,  $sp,   48
+    LD       $f26,  $sp,   56
+    LD       $f27,  $sp,   64
+    LD       $f28,  $sp,   72
+    LD       $f29,  $sp,   80
+    LD       $f30,  $sp,   88
+    LD       $f31,  $sp,   96
+    addi.d   $sp,   $sp,   112
+
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/param.h b/param.h
index ee4640f57..e4e242d5d 100644
--- a/param.h
+++ b/param.h
@@ -2888,8 +2888,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_UNROLL_M  2
 #define SGEMM_DEFAULT_UNROLL_N  8
 
-#define DGEMM_DEFAULT_UNROLL_M  2
-#define DGEMM_DEFAULT_UNROLL_N  8
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
 
 #define CGEMM_DEFAULT_UNROLL_M  1
 #define CGEMM_DEFAULT_UNROLL_N  4

From e771be185ee3ae604ce0b6ffb0dc38258b04f866 Mon Sep 17 00:00:00 2001
From: guxiwei <guxiwei-hf@loongson.cn>
Date: Thu, 21 Dec 2023 14:28:06 +0800
Subject: [PATCH 520/718] Optimize copy functions with lsx.

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   8 +-
 kernel/loongarch64/dgemm_ncopy_4_lsx.S   | 185 +++++++
 kernel/loongarch64/dgemm_ncopy_8_lsx.S   | 283 +++++++++++
 kernel/loongarch64/dgemm_tcopy_4_lsx.S   | 280 +++++++++++
 kernel/loongarch64/dgemm_tcopy_8_lsx.S   | 597 +++++++++++++++++++++++
 5 files changed, 1349 insertions(+), 4 deletions(-)
 create mode 100644 kernel/loongarch64/dgemm_ncopy_4_lsx.S
 create mode 100644 kernel/loongarch64/dgemm_ncopy_8_lsx.S
 create mode 100644 kernel/loongarch64/dgemm_tcopy_4_lsx.S
 create mode 100644 kernel/loongarch64/dgemm_tcopy_8_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index a78c0dbc5..00cb769eb 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -59,10 +59,10 @@ SNRM2KERNEL =  snrm2_lsx.S
 DNRM2KERNEL =  dnrm2_lsx.S
 
 DGEMMKERNEL    = dgemm_kernel_8x4.S
-DGEMMINCOPY    = ../generic/gemm_ncopy_8.c
-DGEMMITCOPY    = ../generic/gemm_tcopy_8.c
-DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
-DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
+DGEMMINCOPY    = dgemm_ncopy_8_lsx.S
+DGEMMITCOPY    = dgemm_tcopy_8_lsx.S
+DGEMMONCOPY    = dgemm_ncopy_4_lsx.S
+DGEMMOTCOPY    = dgemm_tcopy_4_lsx.S
 DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
 DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S
new file mode 100644
index 000000000..048a49af6
--- /dev/null
+++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S
@@ -0,0 +1,185 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r21
+#define TL     $r7
+#define T0     $r6
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+
+    PROLOGUE
+
+    move       TD,   DST
+    move       TS,   SRC
+    slli.d     TL,   LDA,  0x03
+    slli.d     T0,   TL,   0x01
+    srai.d     J,    N,    0x02
+    beq        J,    ZERO, .L_N2
+.L_J1: /* J-- */
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x02
+    add.d      S3,   S2,   TL
+    add.d      S4,   S2,   T0
+    add.d      TS,   S3,   T0
+    addi.d     J,    J,    -1
+    beq        I,    ZERO, .L_I3
+.L_I1: /* I-- */
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
+    GINTERLACE v, d, D0, D2, U1, U0
+    GINTERLACE v, d, D1, D3, U3, U2
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
+    addi.d     TD,   TD,   0x40
+
+    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
+    GINTERLACE v, d, D0, D2, U1, U0
+    GINTERLACE v, d, D1, D3, U3, U2
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+.L_I3:
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_I0
+.L_II1:
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+
+    addi.d    TD,    TD,  0x20
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_II1
+.L_I0:
+    blt       ZERO,  J,   .L_J1
+.L_N2:
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x01
+    add.d      TS,   S2,   TL
+    beq        I,    ZERO, .L_2I3
+.L_2I1: /* I-- */
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00
+    GINTERLACE v, d, D0, D1, U1, U0
+    GST v, , D0, TD, 0x00, D1, TD, 0x10
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_2I1
+.L_2I3:
+    andi       I,    M,    0x01
+    beq        ZERO, I,    .L_N1
+.L_2II1: /* I-- */
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S2,   0x00
+    fst.d      F0,   TD,   0x00
+    addi.d     I,    I,    -1
+    fst.d      F1,   TD,   0x08
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+    blt        ZERO, I,    .L_2II1
+.L_N1:
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0
+.L_M1:
+    fld.d     F0,    S1,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F0,    TD,  0x00
+    addi.d    TD,    TD,  0x08
+    addi.d    M,     M,   -1
+    blt       ZERO,  M,   .L_M1
+.L_N0:
+    jirl       $r0,  $r1,  0x00
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S
new file mode 100644
index 000000000..30bebe8df
--- /dev/null
+++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S
@@ -0,0 +1,283 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r21
+#define TL     $r7
+#define T0     $r6
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+
+    PROLOGUE
+    push_if_used 26, 32
+    move       TD,   DST
+    move       TS,   SRC
+    slli.d     TL,   LDA,  0x03
+    slli.d     T0,   TL,   0x01
+    srai.d     J,    N,    0x03
+    beq        J,    ZERO, .L_N4
+.L_J1:
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x03
+    add.d      S3,   S2,   TL
+    addi.d     J,    J,    -1
+    add.d      S4,   S3,   TL
+    add.d      S5,   S3,   T0
+    add.d      S6,   S4,   T0
+    add.d      S7,   S5,   T0
+    add.d      S8,   S6,   T0
+    add.d      TS,   S7,   T0
+    beq        I,    ZERO, .L_I7
+.L_I1:
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \
+             U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00
+    GINTERLACE v, d, D0, D4, U1, U0
+    GINTERLACE v, d, D1, D5, U3, U2
+    GINTERLACE v, d, D2, D6, U5, U4
+    GINTERLACE v, d, D3, D7, U7, U6
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
+             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
+    addi.d    TD,    TD,   0x80
+    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \
+             U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10
+    GINTERLACE v, d, D0, D4, U1, U0
+    GINTERLACE v, d, D1, D5, U3, U2
+    GINTERLACE v, d, D2, D6, U5, U4
+    GINTERLACE v, d, D3, D7, U7, U6
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
+             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
+    addi.d    TD,    TD,   0x80
+    GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \
+             U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20
+    GINTERLACE v, d, D0, D4, U1, U0
+    GINTERLACE v, d, D1, D5, U3, U2
+    GINTERLACE v, d, D2, D6, U5, U4
+    GINTERLACE v, d, D3, D7, U7, U6
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
+             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
+    addi.d    TD,    TD,   0x80
+    GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \
+             U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30
+    GINTERLACE v, d, D0, D4, U1, U0
+    GINTERLACE v, d, D1, D5, U3, U2
+    GINTERLACE v, d, D2, D6, U5, U4
+    GINTERLACE v, d, D3, D7, U7, U6
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
+             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
+    addi.d    TD,    TD,   0x80
+
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     S5,   S5,   0x40
+    addi.d     S6,   S6,   0x40
+    addi.d     S7,   S7,   0x40
+    addi.d     S8,   S8,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+.L_I7:
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_I0
+.L_II1: /* I-- */
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+    fld.d     F4,    S5,  0x00
+    fld.d     F5,    S6,  0x00
+    fld.d     F6,    S7,  0x00
+    fld.d     F7,    S8,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+    fst.d     F4,    TD,  0x20
+    addi.d    S5,    S5,  0x08
+    fst.d     F5,    TD,  0x28
+    addi.d    S6,    S6,  0x08
+    fst.d     F6,    TD,  0x30
+    addi.d    S7,    S7,  0x08
+    fst.d     F7,    TD,  0x38
+    addi.d    S8,    S8,  0x08
+    addi.d    TD,    TD,  0x40
+
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_II1
+.L_I0:
+    blt       ZERO,  J,   .L_J1
+.L_N4:
+    andi      J,     N,   0x04
+    beq       ZERO,  J,   .L_N2
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x02
+    add.d      S3,   S2,   TL
+    add.d      S4,   S2,   T0
+    add.d      TS,   S3,   T0
+    beq        I,    ZERO, .L_I3
+.L_4I1: /* I-- */
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
+    GINTERLACE v, d, D0, D2, U1, U0
+    GINTERLACE v, d, D1, D3, U3, U2
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
+    addi.d     TD,   TD,   0x40
+
+    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
+    GINTERLACE v, d, D0, D2, U1, U0
+    GINTERLACE v, d, D1, D3, U3, U2
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_4I1
+.L_I3:
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_N2
+.L_4II1:
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+
+    addi.d    TD,    TD,  0x20
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_4II1
+.L_N2:
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x01
+    add.d      TS,   S2,   TL
+    beq        I,    ZERO, .L_NI1
+.L_2I1: /* I-- */
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00
+    GINTERLACE v, d, D0, D1, U1, U0
+    GST v, , D0, TD, 0x00, D1, TD, 0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_2I1
+.L_NI1:
+    andi      I,     M,    0x01
+    beq       I,     ZERO, .L_N1
+
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    addi.d    TD,    TD,  0x10
+.L_N1:
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0
+.L_M1:
+    fld.d     F0,    S1,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F0,    TD,  0x00
+    addi.d    TD,    TD,  0x08
+    addi.d    M,     M,   -1
+    blt       ZERO,  M,   .L_M1
+.L_N0:
+    pop_if_used 26, 32
+    jirl       $r0,  $r1,  0x00
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S
new file mode 100644
index 000000000..134066471
--- /dev/null
+++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S
@@ -0,0 +1,280 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S0     $r11
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define P0     $r16
+#define P1     $r17
+#define P2     $r18
+#define P3     $r19
+#define T0     $r20
+#define T1     $r23
+#define TL     $r7
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+
+    PROLOGUE
+    push_if_used 18, 8
+
+    move       S0,     SRC
+    move       P0,     DST
+
+    // Find P0, P2, P3
+    srai.d     T0,     N,     0x02
+    slli.d     T0,     T0,    0x02
+    srai.d     T1,     N,     0x01
+    slli.d     T1,     T1,    0x01
+    mul.d      T0,     M,     T0
+    mul.d      T1,     M,     T1
+    slli.d     T0,     T0,    0x03
+    slli.d     T1,     T1,    0x03
+    add.d      P2,     DST,   T0
+    add.d      P3,     DST,   T1
+
+    slli.d     TL,     LDA,   0x03
+    srai.d     J,      M,     0x02
+    slli.d     T0,     TL,    0x01
+    slli.d     T1,     M,     0x05
+    beq        ZERO,   J,     .L_M3
+.L_J1: /* J-- */
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S0,     S3,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x80
+
+    srai.d     I,      N,     0x02
+    addi.d     J,      J,     -1
+    beq        ZERO,   I,     .L_N3
+.L_I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+    vld       U4,     S3,    0x00
+    vld       U5,     S3,    0x10
+    vld       U6,     S4,    0x00
+    vld       U7,     S4,    0x10
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+    vst       U4,     P1,    0x40
+    vst       U5,     P1,    0x50
+    vst       U6,     P1,    0x60
+    vst       U7,     P1,    0x70
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    add.d      P1,     P1,    T1
+
+    addi.d     I,      I,    -1
+    blt        ZERO,   I,    .L_I1
+.L_N3:
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_N1
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+    vld       U2,     S3,    0x00
+    vld       U3,     S4,    0x00
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+    vst       U2,     P2,    0x20
+    vst       U3,     P2,    0x30
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     P2,     P2,    0x40
+.L_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+    fld.d      F2,     S3,    0x00
+    fld.d      F3,     S4,    0x00
+
+    fst.d      F0,     P3,    0x00
+    fst.d      F1,     P3,    0x08
+    fst.d      F2,     P3,    0x10
+    fst.d      F3,     P3,    0x18
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     P3,     P3,    0x20
+
+.L_N0:
+    blt        ZERO,   J,     .L_J1
+
+.L_M3:
+    andi       J,      M,      0x02
+    beq        ZERO,   J,      .L_M1
+
+    move       S1,     S0
+    add.d      S2,     S0,     TL
+    add.d      S0,     S0,     T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,     0x40
+
+    srai.d     I,      N,      0x02
+    beq        ZERO,   I,      .L_2N3
+
+.L_2I1:   /* I-- */
+    vld       U0,     S1,     0x00
+    vld       U1,     S1,     0x10
+    vld       U2,     S2,     0x00
+    vld       U3,     S2,     0x10
+
+    vst       U0,     P1,     0x00
+    vst       U1,     P1,     0x10
+    vst       U2,     P1,     0x20
+    vst       U3,     P1,     0x30
+
+    addi.d     S1,     S1,     0x20
+    addi.d     S2,     S2,     0x20
+    addi.d     I,      I,      -1
+    add.d      P1,     P1,     T1
+
+    blt        ZERO,   I,     .L_2I1
+
+.L_2N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_2N1
+
+    vld       U0,     S1,     0x00
+    vld       U1,     S2,     0x00
+
+    vst       U0,     P2,     0x00
+    vst       U1,     P2,     0x10
+
+    addi.d     S1,     S1,     0x10
+    addi.d     S2,     S2,     0x10
+    addi.d     P2,     P2,     0x20
+
+.L_2N1:
+    addi.d     I,      N,      0x01
+    beq        ZERO,   I,      .L_M1
+
+    fld.d      F0,     S1,     0x00
+    fld.d      F1,     S2,     0x00
+
+    fst.d      F0,     P3,     0x00
+    fst.d      F1,     P3,     0x08
+
+    addi.d     S1,     S1,     0x08
+    addi.d     S2,     S2,     0x08
+    addi.d     P3,     P3,     0x10
+.L_M1:
+    andi       J,      M,      0x01
+    beq        ZERO,   J,      .L_M0
+
+    move       S1,     S0
+    move       P1,     P0
+
+    srai.d     I,      N,      0x02
+    beq        ZERO,   I,      .L_1N3
+
+.L_1I1:
+    vld       U0,    S1,      0x00
+    vld       U1,    S1,      0x10
+
+    vst       U0,    P1,      0x00
+    vst       U1,    P1,      0x10
+
+    addi.d     S1,    S1,      0x20
+    addi.d     I,     I,       -1
+    add.d      P1,    P1,      T1
+
+    blt        ZERO,  I,       .L_1I1
+
+.L_1N3:
+    andi       I,     N,       0x02
+    beq        I,     ZERO,    .L_1N1
+
+    fld.d      F0,    S1,      0x00
+    fld.d      F1,    S1,      0x08
+
+    fst.d      F0,    P2,      0x00
+    fst.d      F1,    P2,      0x08
+
+    addi.d     S1,    S1,      0x10
+    addi.d     P2,    P2,      0x10
+
+.L_1N1:
+    andi       I,     N,       0x01
+    beq        I,     ZERO,    .L_M0
+
+    fld.d      F0,    S1,      0x00
+
+    fst.d      F0,    P3,      0x00
+
+.L_M0:
+    pop_if_used 18, 8
+    jirl       $r0,    $r1,   0x00
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S
new file mode 100644
index 000000000..a7e3ef69c
--- /dev/null
+++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S
@@ -0,0 +1,597 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S0     $r11
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define P0     $r20
+#define P1     $r23
+#define P2     $r24
+#define P3     $r25
+#define P4     $r26
+#define P5     $r27
+#define T0     $r28
+#define T1     $r29
+#define TL     $r7
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LASX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+
+    PROLOGUE
+    push_if_used 24, 8
+
+    move       S0,     SRC
+    move       P0,     DST
+
+    srai.d     T0,     N,     0x03
+    srai.d     T1,     N,     0x02
+    slli.d     T0,     T0,    0x03
+    slli.d     T1,     T1,    0x02
+    mul.d      P2,     M,     T0
+    mul.d      P3,     M,     T1
+    slli.d     P2,     P2,    0x03
+    slli.d     P3,     P3,    0x03
+    add.d      P2,     DST,   P2
+    add.d      P3,     DST,   P3
+
+    srai.d     T0,     N,     0x01
+    slli.d     T0,     T0,    0x01
+    mul.d      P4,     M,     T0
+    slli.d     P4,     P4,    0x03
+    add.d      P4,     DST,   P4
+
+    slli.d     TL,     LDA,   0x03
+    srai.d     J,      M,     0x03
+    slli.d     T0,     TL,    0x01
+    slli.d     T1,     M,     0x06
+    beq        ZERO,   J,     .L_M7
+.L_J1: /* J-- */
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S5,     S3,    T0
+    add.d      S6,     S4,    T0
+    add.d      S7,     S5,    T0
+    add.d      S8,     S6,    T0
+    add.d      S0,     S7,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x200
+
+    srai.d     I,      N,     0x03
+    addi.d     J,      J,     -1
+    beq        ZERO,   I,     .L_N7
+
+.L_I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+    vld       U4,     S2,    0x00
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+    vst       U4,     P1,    0x40
+    vst       U5,     P1,    0x50
+    vst       U6,     P1,    0x60
+    vst       U7,     P1,    0x70
+
+    vld       U0,     S3,    0x00
+    vld       U1,     S3,    0x10
+    vld       U2,     S3,    0x20
+    vld       U3,     S3,    0x30
+    vld       U4,     S4,    0x00
+    vld       U5,     S4,    0x10
+    vld       U6,     S4,    0x20
+    vld       U7,     S4,    0x30
+
+    vst       U0,     P1,    0x80
+    vst       U1,     P1,    0x90
+    vst       U2,     P1,    0xa0
+    vst       U3,     P1,    0xb0
+    vst       U4,     P1,    0xc0
+    vst       U5,     P1,    0xd0
+    vst       U6,     P1,    0xe0
+    vst       U7,     P1,    0xf0
+
+    vld       U0,     S5,    0x00
+    vld       U1,     S5,    0x10
+    vld       U2,     S5,    0x20
+    vld       U3,     S5,    0x30
+    vld       U4,     S6,    0x00
+    vld       U5,     S6,    0x10
+    vld       U6,     S6,    0x20
+    vld       U7,     S6,    0x30
+
+    vst       U0,     P1,    0x100
+    vst       U1,     P1,    0x110
+    vst       U2,     P1,    0x120
+    vst       U3,     P1,    0x130
+    vst       U4,     P1,    0x140
+    vst       U5,     P1,    0x150
+    vst       U6,     P1,    0x160
+    vst       U7,     P1,    0x170
+
+    vld       U0,     S7,    0x00
+    vld       U1,     S7,    0x10
+    vld       U2,     S7,    0x20
+    vld       U3,     S7,    0x30
+    vld       U4,     S8,    0x00
+    vld       U5,     S8,    0x10
+    vld       U6,     S8,    0x20
+    vld       U7,     S8,    0x30
+
+    vst       U0,     P1,    0x180
+    vst       U1,     P1,    0x190
+    vst       U2,     P1,    0x1a0
+    vst       U3,     P1,    0x1b0
+    vst       U4,     P1,    0x1c0
+    vst       U5,     P1,    0x1d0
+    vst       U6,     P1,    0x1e0
+    vst       U7,     P1,    0x1f0
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     S3,     S3,    0x40
+    addi.d     S4,     S4,    0x40
+    addi.d     S5,     S5,    0x40
+    addi.d     S6,     S6,    0x40
+    addi.d     S7,     S7,    0x40
+    addi.d     S8,     S8,    0x40
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_I1
+.L_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N3
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+    vld       U4,     S3,    0x00
+    vld       U5,     S3,    0x10
+    vld       U6,     S4,    0x00
+    vld       U7,     S4,    0x10
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+    vst       U2,     P2,    0x20
+    vst       U3,     P2,    0x30
+    vst       U4,     P2,    0x40
+    vst       U5,     P2,    0x50
+    vst       U6,     P2,    0x60
+    vst       U7,     P2,    0x70
+
+    vld       U0,     S5,    0x00
+    vld       U1,     S5,    0x10
+    vld       U2,     S6,    0x00
+    vld       U3,     S6,    0x10
+    vld       U4,     S7,    0x00
+    vld       U5,     S7,    0x10
+    vld       U6,     S8,    0x00
+    vld       U7,     S8,    0x10
+
+    vst       U0,     P2,    0x80
+    vst       U1,     P2,    0x90
+    vst       U2,     P2,    0xa0
+    vst       U3,     P2,    0xb0
+    vst       U4,     P2,    0xc0
+    vst       U5,     P2,    0xd0
+    vst       U6,     P2,    0xe0
+    vst       U7,     P2,    0xf0
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    addi.d     S5,     S5,    0x20
+    addi.d     S6,     S6,    0x20
+    addi.d     S7,     S7,    0x20
+    addi.d     S8,     S8,    0x20
+    addi.d     P2,     P2,    0x100
+
+.L_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N1
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+    vld       U2,     S3,    0x00
+    vld       U3,     S4,    0x00
+    vld       U4,     S5,    0x00
+    vld       U5,     S6,    0x00
+    vld       U6,     S7,    0x00
+    vld       U7,     S8,    0x00
+
+    vst       U0,     P3,    0x00
+    vst       U1,     P3,    0x10
+    vst       U2,     P3,    0x20
+    vst       U3,     P3,    0x30
+    vst       U4,     P3,    0x40
+    vst       U5,     P3,    0x50
+    vst       U6,     P3,    0x60
+    vst       U7,     P3,    0x70
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     S5,     S5,    0x10
+    addi.d     S6,     S6,    0x10
+    addi.d     S7,     S7,    0x10
+    addi.d     S8,     S8,    0x10
+    addi.d     P3,     P3,    0x80
+
+.L_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+    fld.d      F2,     S3,    0x00
+    fld.d      F3,     S4,    0x00
+    fld.d      F4,     S5,    0x00
+    fld.d      F5,     S6,    0x00
+    fld.d      F6,     S7,    0x00
+    fld.d      F7,     S8,    0x00
+
+    fst.d      F0,     P4,    0x00
+    fst.d      F1,     P4,    0x08
+    fst.d      F2,     P4,    0x10
+    fst.d      F3,     P4,    0x18
+    fst.d      F4,     P4,    0x20
+    fst.d      F5,     P4,    0x28
+
+    fst.d      F6,     P4,    0x30
+    fst.d      F7,     P4,    0x38
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     S5,     S5,    0x08
+    addi.d     S6,     S6,    0x08
+    addi.d     S7,     S7,    0x08
+    addi.d     S8,     S8,    0x08
+    addi.d     P4,     P4,    0x40
+
+.L_N0:
+    blt        ZERO,   J,     .L_J1
+.L_M7:
+    andi       J,      M,     0x04
+    beq        ZERO,   J,     .L_M3
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S0,     S3,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x100
+
+    srai.d     I,      N,     0x03
+    beq        ZERO,   I,     .L_4N7
+.L_4I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+    vld       U4,     S2,    0x00
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+    vst       U4,     P1,    0x40
+    vst       U5,     P1,    0x50
+    vst       U6,     P1,    0x60
+    vst       U7,     P1,    0x70
+
+    vld       U0,     S3,    0x00
+    vld       U1,     S3,    0x10
+    vld       U2,     S3,    0x20
+    vld       U3,     S3,    0x30
+    vld       U4,     S4,    0x00
+    vld       U5,     S4,    0x10
+    vld       U6,     S4,    0x20
+    vld       U7,     S4,    0x30
+
+    vst       U0,     P1,    0x80
+    vst       U1,     P1,    0x90
+    vst       U2,     P1,    0xa0
+    vst       U3,     P1,    0xb0
+    vst       U4,     P1,    0xc0
+    vst       U5,     P1,    0xd0
+    vst       U6,     P1,    0xe0
+    vst       U7,     P1,    0xf0
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     S3,     S3,    0x40
+    addi.d     S4,     S4,    0x40
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_4I1
+.L_4N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_4N3
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+    vld       U4,     S3,    0x00
+    vld       U5,     S3,    0x10
+    vld       U6,     S4,    0x00
+    vld       U7,     S4,    0x10
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+    vst       U2,     P2,    0x20
+    vst       U3,     P2,    0x30
+    vst       U4,     P2,    0x40
+    vst       U5,     P2,    0x50
+    vst       U6,     P2,    0x60
+    vst       U7,     P2,    0x70
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    addi.d     P2,     P2,    0x80
+
+.L_4N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_4N1
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+    vld       U2,     S3,    0x00
+    vld       U3,     S4,    0x00
+
+    vst       U0,     P3,    0x00
+    vst       U1,     P3,    0x10
+    vst       U2,     P3,    0x20
+    vst       U3,     P3,    0x30
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     P3,     P3,    0x40
+
+.L_4N1:
+    andi        I,      N,     0x01
+    beq         ZERO,   I,     .L_M3
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+    fld.d      F2,     S3,    0x00
+    fld.d      F3,     S4,    0x00
+
+    fst.d      F0,     P4,    0x00
+    fst.d      F1,     P4,    0x08
+    fst.d      F2,     P4,    0x10
+    fst.d      F3,     P4,    0x18
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     P4,     P4,    0x20
+.L_M3:
+    andi       J,      M,     0x02
+    beq        ZERO,   J,     .L_M1
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S0,     S0,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x80
+
+    srai.d     I,      N,     0x03
+    beq        ZERO,   I,     .L_2N7
+.L_2I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+    vld       U4,     S2,    0x00
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+    vst       U4,     P1,    0x40
+    vst       U5,     P1,    0x50
+    vst       U6,     P1,    0x60
+    vst       U7,     P1,    0x70
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_2I1
+.L_2N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_2N3
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+    vst       U2,     P2,    0x20
+    vst       U3,     P2,    0x30
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     P2,     P2,    0x40
+
+.L_2N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_2N1
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+
+    vst       U0,     P3,    0x00
+    vst       U1,     P3,    0x10
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     P3,     P3,    0x20
+
+.L_2N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M1
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+
+    fst.d      F0,     P4,    0x00
+    fst.d      F1,     P4,    0x08
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     P4,     P4,    0x10
+.L_M1:
+    andi       J,      M,     0x01
+    beq        ZERO,   J,     .L_M0
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x40
+
+    srai.d     I,      N,     0x03
+    beq        ZERO,   I,     .L_1N7
+.L_1I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+
+    addi.d     S1,     S1,    0x40
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_1I1
+
+.L_1N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_1N3
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+
+    addi.d     S1,     S1,    0x20
+    addi.d     P2,     P2,    0x20
+
+.L_1N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_1N1
+
+    vld       U0,     S1,    0x00
+    vst       U0,     P3,    0x00
+
+    addi.d     S1,     S1,    0x10
+    addi.d     P3,     P3,    0x10
+
+.L_1N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M0
+
+    fld.d      F0,     S1,    0x00
+
+    fst.d      F0,     P4,    0x00
+
+    addi.d     S1,     S1,    0x08
+    addi.d     P4,     P4,    0x08
+.L_M0:
+    pop_if_used 24, 8
+    jirl       $r0,    $r1,   0x00
+    EPILOGUE

From 06fd5b5995e66e7b54bb4b8496a3b946cd56212e Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Wed, 27 Dec 2023 10:44:02 +0800
Subject: [PATCH 521/718] loongarch64: Add and Refine asum optimization
 functions.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   6 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   6 +-
 kernel/loongarch64/asum_lasx.S           | 257 ++++++++++++++++
 kernel/loongarch64/asum_lsx.S            | 258 ++++++++++++++++
 kernel/loongarch64/casum_lasx.S          | 329 +++++++++++++++++++++
 kernel/loongarch64/casum_lsx.S           | 358 +++++++++++++++++++++++
 kernel/loongarch64/dasum_lasx.S          | 148 ----------
 kernel/loongarch64/dasum_lsx.S           | 158 ----------
 kernel/loongarch64/sasum_lasx.S          | 157 ----------
 kernel/loongarch64/sasum_lsx.S           | 148 ----------
 10 files changed, 1210 insertions(+), 615 deletions(-)
 create mode 100644 kernel/loongarch64/asum_lasx.S
 create mode 100644 kernel/loongarch64/asum_lsx.S
 create mode 100644 kernel/loongarch64/casum_lasx.S
 create mode 100644 kernel/loongarch64/casum_lsx.S
 delete mode 100644 kernel/loongarch64/dasum_lasx.S
 delete mode 100644 kernel/loongarch64/dasum_lsx.S
 delete mode 100644 kernel/loongarch64/sasum_lasx.S
 delete mode 100644 kernel/loongarch64/sasum_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 00cb769eb..201427dcd 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lsx.S
 SSUMKERNEL  =  sum_lsx.S
 DSUMKERNEL  =  sum_lsx.S
 
-SASUMKERNEL =  sasum_lsx.S
-DASUMKERNEL =  dasum_lsx.S
+SASUMKERNEL =  asum_lsx.S
+DASUMKERNEL =  asum_lsx.S
+CASUMKERNEL =  casum_lsx.S
+ZASUMKERNEL =  casum_lsx.S
 
 SROTKERNEL  =  rot_lsx.S
 DROTKERNEL  =  rot_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index e4c45e1fa..e822cb630 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -49,8 +49,10 @@ DAXPBYKERNEL = daxpby_lasx.S
 SSUMKERNEL  =  sum_lasx.S
 DSUMKERNEL  =  sum_lasx.S
 
-SASUMKERNEL =  sasum_lasx.S
-DASUMKERNEL =  dasum_lasx.S
+SASUMKERNEL =  asum_lasx.S
+DASUMKERNEL =  asum_lasx.S
+CASUMKERNEL =  casum_lasx.S
+ZASUMKERNEL =  casum_lasx.S
 
 SROTKERNEL  =  rot_lasx.S
 DROTKERNEL  =  rot_lasx.S
diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S
new file mode 100644
index 000000000..9a2c031f3
--- /dev/null
+++ b/kernel/loongarch64/asum_lasx.S
@@ -0,0 +1,257 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define VT0    $xr23
+#define VT1    $xr22
+#define res1   $xr16
+#define res2   $xr17
+#define res0   $xr18
+#define neg1   $xr19
+
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+#ifdef DOUBLE
+    li.d t1, -1
+    xvreplgr2vr.d neg1, t1
+    xvffint.d.l neg1, neg1
+#else
+    li.w t1, -1
+    xvreplgr2vr.w neg1, t1
+    xvffint.s.w neg1, neg1
+#endif
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvfmul.s VX2, neg1, VX0
+    xvfcmp.clt.s VT0, VX0, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvfadd.s res1, VX0, res1
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD    $f12, X, 0 * SIZE
+    FABS  $f12, $f12
+    ADD   $f16, $f12,  $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VX2, neg1, VX0
+    xvfcmp.clt.s VT0, VX0, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvfadd.s res1, VX0, res1
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV    $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S
new file mode 100644
index 000000000..512b01404
--- /dev/null
+++ b/kernel/loongarch64/asum_lsx.S
@@ -0,0 +1,258 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define VT0    $vr23
+#define VT1    $vr22
+#define res1   $vr16
+#define res2   $vr17
+#define res0   $vr18
+#define neg1   $vr19
+
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+#ifdef DOUBLE
+    li.d t1, -1
+    vreplgr2vr.d neg1, t1
+    vffint.d.l neg1, neg1
+#else
+    li.w t1, -1
+    vreplgr2vr.w neg1, t1
+    vffint.s.w neg1, neg1
+#endif
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV    $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S
new file mode 100644
index 000000000..caf0ff969
--- /dev/null
+++ b/kernel/loongarch64/casum_lasx.S
@@ -0,0 +1,329 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define res1   $xr16
+#define res2   $xr17
+#define res3   $xr18
+#define res0   $xr19
+#define neg1   $xr20
+#define VT0    $xr21
+#define VT1    $xr22
+
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+#ifdef DOUBLE
+    li.d t1, -1
+    xvreplgr2vr.d neg1, t1
+    xvffint.d.l neg1, neg1
+#else
+    li.w t1, -1
+    xvreplgr2vr.w neg1, t1
+    xvffint.s.w neg1, neg1
+#endif
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvfmul.d VX0, neg1, VX2
+    xvfmul.d VX1, neg1, VX3
+    xvfcmp.clt.d VT0, VX2, res0
+    xvfcmp.clt.d VT1, VX3, res0
+    xvbitsel.v VX2, VX2, VX0, VT0
+    xvbitsel.v VX3, VX3, VX1, VT1
+    xvfadd.d res2, VX2, VX3
+    xvfadd.d res1, res1, res2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvfmul.s VX2, neg1, VX0
+    xvfmul.s VX3, neg1, VX1
+    xvfcmp.clt.s VT0, VX0, res0
+    xvfcmp.clt.s VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.s res2, VX0, VX1
+    xvfadd.s res1, res2, res1
+#endif
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2
+    ADD  s1, a1, s1
+    addi.d  X, X, 2 * SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    xvfmul.s VX2, neg1, VX0
+    xvfmul.s VX3, neg1, VX1
+    xvfcmp.clt.s VT0, VX0, res0
+    xvfcmp.clt.s VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.s res2, VX0, VX1
+    xvfadd.s res1, res2, res1
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2
+    ADD  s1, a1, s1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV  $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S
new file mode 100644
index 000000000..4822f2080
--- /dev/null
+++ b/kernel/loongarch64/casum_lsx.S
@@ -0,0 +1,358 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16
+#define res2   $vr17
+#define res3   $vr18
+#define res0   $vr19
+#define neg1   $vr20
+#define VT0    $vr21
+#define VT1    $vr22
+
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+#ifdef DOUBLE
+    li.d t1, -1
+    vreplgr2vr.d neg1, t1
+    vffint.d.l neg1, neg1
+#else
+    li.w t1, -1
+    vreplgr2vr.w neg1, t1
+    vffint.s.w neg1, neg1
+#endif
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vfmul.d VX0, neg1, VX2
+    vfmul.d VX1, neg1, VX3
+    vfcmp.clt.d VT0, VX2, res0
+    vfcmp.clt.d VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 10 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    vfmul.d VX0, neg1, VX2
+    vfmul.d VX1, neg1, VX3
+    vfcmp.clt.d VT0, VX2, res0
+    vfcmp.clt.d VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+    addi.d  I, I, -1
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 12 * SIZE
+    addi.d  I, I, -1
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res3, VX1, VX0
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+    addi.d X, X, 16 * SIZE
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2
+    ADD  s1, a1, s1
+    addi.d  X, X, 2 * SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vfmul.s VX0, neg1, VX2
+    vfmul.s VX1, neg1, VX3
+    vfcmp.clt.s VT0, VX2, res0
+    vfcmp.clt.s VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.s res3, VX2, VX3
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2
+    ADD  s1, a1, s1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV  $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dasum_lasx.S b/kernel/loongarch64/dasum_lasx.S
deleted file mode 100644
index 49de98c40..000000000
--- a/kernel/loongarch64/dasum_lasx.S
+++ /dev/null
@@ -1,148 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r15
-#define t2     $r12
-#define t3     $r13
-#define t4     $r14
-#define VX0    $xr12
-#define VX1    $xr13
-#define VX2    $xr14
-#define VX3    $xr15
-#define VT0    $xr23
-#define VT1    $xr22
-#define res1   $xr16
-#define res2   $xr17
-#define res0   $xr18
-#define neg1   $xr19
-
-    PROLOGUE
-    xvxor.v res1, res1, res1
-    xvxor.v res2, res2, res2
-    xvxor.v res0, res0, res0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d t1, -1
-    xvreplgr2vr.d neg1, t1
-    xvffint.d.l neg1, neg1
-    li.d  TEMP, SIZE
-    slli.d  INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bge $r0, I, .L13
-    .align 3
-
-.L11:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 4 * SIZE
-    xvfmul.d VX2, neg1, VX0
-    xvfmul.d VX3, neg1, VX1
-    xvfcmp.clt.d VT0, VX0, res0
-    xvfcmp.clt.d VT1, VX1, res0
-    xvbitsel.v VX0, VX0, VX2, VT0
-    xvbitsel.v VX1, VX1, VX3, VT1
-    xvfadd.d res2, VX0, VX1
-    xvfadd.d res1, res1, res2
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    .align 3
-
-.L12:
-    xvpickve.d VX1, res1, 1
-    xvpickve.d VX2, res1, 2
-    xvpickve.d VX3, res1, 3
-    xvfadd.d res1, VX1, res1
-    xvfadd.d res1, VX2, res1
-    xvfadd.d res1, VX3, res1
-    .align 3
-
-.L13:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L14:
-    fld.d $f12, X, 0 * SIZE
-    fabs.d $f12, $f12
-    fadd.d $f16, $f12, $f16
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    b .L999
-    .align 3
-
-.L20:
-    bge $r0, I, .L23
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX0, t1, 0
-    xvinsgr2vr.d VX0, t2, 1
-    xvinsgr2vr.d VX0, t3, 2
-    xvinsgr2vr.d VX0, t4, 3
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d VX1, t1, 0
-    xvinsgr2vr.d VX1, t2, 1
-    xvinsgr2vr.d VX1, t3, 2
-    xvinsgr2vr.d VX1, t4, 3
-    xvfmul.d VX2, neg1, VX0
-    xvfmul.d VX3, neg1, VX1
-    xvfcmp.clt.d VT0, VX0, res0
-    xvfcmp.clt.d VT1, VX1, res0
-    xvbitsel.v VX0, VX0, VX2, VT0
-    xvbitsel.v VX1, VX1, VX3, VT1
-    xvfadd.d res2, VX0, VX1
-    xvfadd.d res1, res1, res2
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvpickve.d VX1, res1, 1
-    xvpickve.d VX2, res1, 2
-    xvpickve.d VX3, res1, 3
-    xvfadd.d res1, VX1, res1
-    xvfadd.d res1, VX2, res1
-    xvfadd.d res1, VX3, res1
-    .align 3
-
-.L23:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    fld.d $f12, X, 0 * SIZE
-    fabs.d $f12, $f12
-    fadd.d $f16, $f12, $f16
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.d $f0, $f16
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/dasum_lsx.S b/kernel/loongarch64/dasum_lsx.S
deleted file mode 100644
index 94750815e..000000000
--- a/kernel/loongarch64/dasum_lsx.S
+++ /dev/null
@@ -1,158 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r15
-#define t2     $r12
-#define t3     $r13
-#define t4     $r14
-#define VX0    $vr12
-#define VX1    $vr13
-#define VX2    $vr14
-#define VX3    $vr15
-#define VT0    $vr23
-#define VT1    $vr22
-#define res1   $vr16
-#define res2   $vr17
-#define res0   $vr18
-#define neg1   $vr19
-
-    PROLOGUE
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
-    vxor.v res0, res0, res0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d t1, -1
-    vreplgr2vr.d neg1, t1
-    vffint.d.l neg1, neg1
-    li.d  TEMP, SIZE
-    slli.d  INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bge $r0, I, .L13
-    .align 3
-
-.L11:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 2 * SIZE
-    vfmul.d VX2, neg1, VX0
-    vfmul.d VX3, neg1, VX1
-    vfcmp.clt.d VT0, VX0, res0
-    vfcmp.clt.d VT1, VX1, res0
-    vbitsel.v VX0, VX0, VX2, VT0
-    vbitsel.v VX1, VX1, VX3, VT1
-    vfadd.d res2, VX0, VX1
-    vfadd.d res1, res1, res2
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vfmul.d VX2, neg1, VX0
-    vfmul.d VX3, neg1, VX1
-    vfcmp.clt.d VT0, VX0, res0
-    vfcmp.clt.d VT1, VX1, res0
-    vbitsel.v VX0, VX0, VX2, VT0
-    vbitsel.v VX1, VX1, VX3, VT1
-    vfadd.d res2, VX0, VX1
-    vfadd.d res1, res1, res2
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    .align 3
-
-.L12:
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
-    .align 3
-
-.L13:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L14:
-    fld.d $f12, X, 0 * SIZE
-    fabs.d $f12, $f12
-    fadd.d $f16, $f12, $f16
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    b .L999
-    .align 3
-
-.L20:
-    bge $r0, I, .L23
-    .align 3
-
-.L21:
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t1, 0
-    vinsgr2vr.d VX0, t2, 1
-    ld.d t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t2, X, 0 * SIZE
-    vinsgr2vr.d VX1, t1, 0
-    vinsgr2vr.d VX1, t2, 1
-    add.d X, X, INCX
-    vfmul.d VX2, neg1, VX0
-    vfmul.d VX3, neg1, VX1
-    vfcmp.clt.d VT0, VX0, res0
-    vfcmp.clt.d VT1, VX1, res0
-    vbitsel.v VX0, VX0, VX2, VT0
-    vbitsel.v VX1, VX1, VX3, VT1
-    vfadd.d res2, VX0, VX1
-    vfadd.d res1, res1, res2
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d VX0, t3, 0
-    vinsgr2vr.d VX0, t4, 1
-    ld.d t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.d t4, X, 0 * SIZE
-    vinsgr2vr.d VX1, t3, 0
-    vinsgr2vr.d VX1, t4, 1
-    add.d X, X, INCX
-    vfmul.d VX2, neg1, VX0
-    vfmul.d VX3, neg1, VX1
-    vfcmp.clt.d VT0, VX0, res0
-    vfcmp.clt.d VT1, VX1, res0
-    vbitsel.v VX0, VX0, VX2, VT0
-    vbitsel.v VX1, VX1, VX3, VT1
-    vfadd.d res2, VX0, VX1
-    vfadd.d res1, res1, res2
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
-    .align 3
-
-.L23:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    fld.d $f12, X, 0 * SIZE
-    fabs.d $f12, $f12
-    fadd.d $f16, $f12, $f16
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.d $f0, $f16
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/sasum_lasx.S b/kernel/loongarch64/sasum_lasx.S
deleted file mode 100644
index a452701aa..000000000
--- a/kernel/loongarch64/sasum_lasx.S
+++ /dev/null
@@ -1,157 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r15
-#define t2     $r12
-#define t3     $r13
-#define t4     $r14
-#define VX0    $xr12
-#define VX1    $xr13
-#define VX2    $xr14
-#define VX3    $xr15
-#define VT0    $xr23
-#define VT1    $xr22
-#define res1   $xr16
-#define res2   $xr17
-#define res0   $xr18
-#define neg1   $xr19
-
-    PROLOGUE
-    xvxor.v res1, res1, res1
-    xvxor.v res2, res2, res2
-    xvxor.v res0, res0, res0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.w t1, -1
-    xvreplgr2vr.w neg1, t1
-    xvffint.s.w neg1, neg1
-    li.d  TEMP, SIZE
-    slli.d  INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bge $r0, I, .L13
-    .align 3
-
-.L11:
-    xvld VX0, X, 0 * SIZE
-    xvfmul.s VX2, neg1, VX0
-    xvfcmp.clt.s VT0, VX0, res0
-    xvbitsel.v VX0, VX0, VX2, VT0
-    xvfadd.s res1, VX0, res1
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    .align 3
-
-.L12:
-    xvfadd.s res2, res1, res2
-    xvpickve.w VX1, res1, 1
-    xvpickve.w VX2, res1, 2
-    xvpickve.w VX3, res1, 3
-    xvfadd.s res1, VX1, res1
-    xvfadd.s res1, VX2, res1
-    xvfadd.s res1, VX3, res1
-    xvpickve.w VX0, res2, 4
-    xvpickve.w VX1, res2, 5
-    xvpickve.w VX2, res2, 6
-    xvpickve.w VX3, res2, 7
-    xvfadd.s res1, VX0, res1
-    xvfadd.s res1, VX1, res1
-    xvfadd.s res1, VX2, res1
-    xvfadd.s res1, VX2, res1
-    .align 3
-
-.L13:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L14:
-    fld.s $f12, X, 0 * SIZE
-    fabs.s $f12, $f12
-    fadd.s $f16, $f12, $f16
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    b .L999
-    .align 3
-
-.L20:
-    bge $r0, I, .L23
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    xvfmul.s VX2, neg1, VX0
-    xvfcmp.clt.s VT0, VX0, res0
-    xvbitsel.v VX0, VX0, VX2, VT0
-    xvfadd.s res1, VX0, res1
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    xvfadd.s res2, res1, res2
-    xvpickve.w VX1, res1, 1
-    xvpickve.w VX2, res1, 2
-    xvpickve.w VX3, res1, 3
-    xvfadd.s res1, VX1, res1
-    xvfadd.s res1, VX2, res1
-    xvfadd.s res1, VX3, res1
-    xvpickve.w VX0, res2, 4
-    xvpickve.w VX1, res2, 5
-    xvpickve.w VX2, res2, 6
-    xvpickve.w VX3, res2, 7
-    xvfadd.s res1, VX0, res1
-    xvfadd.s res1, VX1, res1
-    xvfadd.s res1, VX2, res1
-    xvfadd.s res1, VX2, res1
-    .align 3
-
-.L23:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    fld.s $f12, X, 0 * SIZE
-    fabs.s $f12, $f12
-    fadd.s $f16, $f12, $f16
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.s $f0, $f16
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/sasum_lsx.S b/kernel/loongarch64/sasum_lsx.S
deleted file mode 100644
index 87026a144..000000000
--- a/kernel/loongarch64/sasum_lsx.S
+++ /dev/null
@@ -1,148 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-#define N      $r4
-#define X      $r5
-#define INCX   $r6
-#define I      $r17
-#define TEMP   $r18
-#define t1     $r15
-#define t2     $r12
-#define t3     $r13
-#define t4     $r14
-#define VX0    $vr12
-#define VX1    $vr13
-#define VX2    $vr14
-#define VX3    $vr15
-#define VT0    $vr23
-#define VT1    $vr22
-#define res1   $vr16
-#define res2   $vr17
-#define res0   $vr18
-#define neg1   $vr19
-
-    PROLOGUE
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
-    vxor.v res0, res0, res0
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.w t1, -1
-    vreplgr2vr.w neg1, t1
-    vffint.s.w neg1, neg1
-    li.d  TEMP, SIZE
-    slli.d  INCX, INCX, BASE_SHIFT
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bge $r0, I, .L13
-    .align 3
-
-.L11:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vfmul.s VX2, neg1, VX0
-    vfmul.s VX3, neg1, VX1
-    vfcmp.clt.s VT0, VX0, res0
-    vfcmp.clt.s VT1, VX1, res0
-    vbitsel.v VX0, VX0, VX2, VT0
-    vbitsel.v VX1, VX1, VX3, VT1
-    vfadd.s res2, VX0, VX1
-    vfadd.s res1, res1, res2
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    .align 3
-
-.L12:
-    vreplvei.w VX1, res1, 1
-    vreplvei.w VX2, res1, 2
-    vreplvei.w VX3, res1, 3
-    vfadd.s res1, VX1, res1
-    vfadd.s res1, VX2, res1
-    vfadd.s res1, VX3, res1
-    .align 3
-
-.L13:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L14:
-    fld.s $f12, X, 0 * SIZE
-    fabs.s $f12, $f12
-    fadd.s $f16, $f12, $f16
-    addi.d I, I, -1
-    addi.d  X, X, SIZE
-    blt $r0, I, .L14
-    b .L999
-    .align 3
-
-.L20:
-    bge $r0, I, .L23
-    .align 3
-
-.L21:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vfmul.s VX2, neg1, VX0
-    vfmul.s VX3, neg1, VX1
-    vfcmp.clt.s VT0, VX0, res0
-    vfcmp.clt.s VT1, VX1, res0
-    vbitsel.v VX0, VX0, VX2, VT0
-    vbitsel.v VX1, VX1, VX3, VT1
-    vfadd.s res2, VX0, VX1
-    vfadd.s res1, res1, res2
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    vreplvei.w VX1, res1, 1
-    vreplvei.w VX2, res1, 2
-    vreplvei.w VX3, res1, 3
-    vfadd.s res1, VX1, res1
-    vfadd.s res1, VX2, res1
-    vfadd.s res1, VX3, res1
-    .align 3
-
-.L23:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L24:
-    fld.s $f12, X, 0 * SIZE
-    fabs.s $f12, $f12
-    fadd.s $f16, $f12, $f16
-    addi.d I, I, -1
-    add.d  X, X, INCX
-    blt $r0, I, .L24
-    .align 3
-
-.L999:
-    fmov.s $f0, $f16
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file

From 0753848e03e1298c162386df467f78bc15851cc4 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Wed, 27 Dec 2023 16:54:01 +0800
Subject: [PATCH 522/718] loongarch64: Refine and add axpy optimization
 functions.

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 kernel/loongarch64/KERNEL.LOONGSON2K1000      |   6 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5         |   6 +-
 .../loongarch64/{daxpy_lasx.S => axpy_lasx.S} | 237 +++++-
 .../loongarch64/{daxpy_lsx.S => axpy_lsx.S}   | 254 ++++++-
 kernel/loongarch64/caxpy_lasx.S               | 707 ++++++++++++++++++
 kernel/loongarch64/caxpy_lsx.S                | 679 +++++++++++++++++
 kernel/loongarch64/saxpy_lasx.S               | 323 --------
 kernel/loongarch64/saxpy_lsx.S                | 338 ---------
 8 files changed, 1839 insertions(+), 711 deletions(-)
 rename kernel/loongarch64/{daxpy_lasx.S => axpy_lasx.S} (52%)
 rename kernel/loongarch64/{daxpy_lsx.S => axpy_lsx.S} (53%)
 create mode 100644 kernel/loongarch64/caxpy_lasx.S
 create mode 100644 kernel/loongarch64/caxpy_lsx.S
 delete mode 100644 kernel/loongarch64/saxpy_lasx.S
 delete mode 100644 kernel/loongarch64/saxpy_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 201427dcd..bdde126ad 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -40,8 +40,10 @@ DCOPYKERNEL =  copy_lsx.S
 SSWAPKERNEL =  swap_lsx.S
 DSWAPKERNEL =  swap_lsx.S
 
-SAXPYKERNEL =  saxpy_lsx.S
-DAXPYKERNEL =  daxpy_lsx.S
+SAXPYKERNEL =  axpy_lsx.S
+DAXPYKERNEL =  axpy_lsx.S
+CAXPYKERNEL =  caxpy_lsx.S
+ZAXPYKERNEL =  caxpy_lsx.S
 
 SAXPBYKERNEL = saxpby_lsx.S
 DAXPBYKERNEL = daxpby_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index e822cb630..7642b2a4d 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -40,8 +40,10 @@ DCOPYKERNEL =  copy_lasx.S
 SSWAPKERNEL =  swap_lasx.S
 DSWAPKERNEL =  swap_lasx.S
 
-SAXPYKERNEL =  saxpy_lasx.S
-DAXPYKERNEL =  daxpy_lasx.S
+SAXPYKERNEL =  axpy_lasx.S
+DAXPYKERNEL =  axpy_lasx.S
+CAXPYKERNEL =  caxpy_lasx.S
+ZAXPYKERNEL =  caxpy_lasx.S
 
 SAXPBYKERNEL = saxpby_lasx.S
 DAXPBYKERNEL = daxpby_lasx.S
diff --git a/kernel/loongarch64/daxpy_lasx.S b/kernel/loongarch64/axpy_lasx.S
similarity index 52%
rename from kernel/loongarch64/daxpy_lasx.S
rename to kernel/loongarch64/axpy_lasx.S
index bafd871ab..707fd09b5 100644
--- a/kernel/loongarch64/daxpy_lasx.S
+++ b/kernel/loongarch64/axpy_lasx.S
@@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
 
+#define ASSEMBLER
 #include "common.h"
+
 #define N      $r4
 #define XX     $r5
 #define YY     $r6
@@ -35,16 +62,20 @@
     bge $r0, N, .L999
     li.d TEMP, 1
     movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    FFINT   a1,  a1
     movgr2fr.d a2, TEMP
-    ffint.d.l a2, a2
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    FFINT   a2,  a2
+    CMPEQ   $fcc0, ALPHA, a1
     bcnez $fcc0, .L999
     slli.d  TEMP, TEMP, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG t1, ALPHA
+#ifdef DOUBLE
     xvreplgr2vr.d VXA, t1
+#else
+    xvreplgr2vr.w VXA, t1
+#endif
 
     srai.d I, N, 3
     bne INCX, TEMP, .L20
@@ -56,11 +87,12 @@
 
 .L11:
     bge $r0, I, .L113
-    fcmp.ceq.d $fcc0, ALPHA, a2
+    CMPEQ $fcc0, ALPHA, a2
     bceqz $fcc0, .L112
     .align 3
 
 .L111:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
     xvld VX1, X, 4 * SIZE
@@ -70,6 +102,13 @@
     addi.d  I, I, -1
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvfadd.s VX2, VX0, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L111
@@ -77,6 +116,7 @@
     .align 3
 
 .L112:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     xvld VX2, Y, 0 * SIZE
     xvld VX1, X, 4 * SIZE
@@ -86,6 +126,13 @@
     addi.d  I, I, -1
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvfmadd.s VX2, VX0, VXA, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L112
@@ -97,11 +144,11 @@
     .align 3
 
 .L114:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
     addi.d  X, X, SIZE
     addi.d  Y, Y, SIZE
     blt $r0, I, .L114
@@ -114,6 +161,7 @@
     .align 3
 
 .L121:
+#ifdef DOUBLE
     xvld VX0, X, 0 * SIZE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
@@ -158,6 +206,50 @@
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
     blt $r0, I, .L121
@@ -169,11 +261,11 @@
     .align 3
 
 .L123:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
     addi.d  X, X, SIZE
     add.d  Y, Y, INCY
     blt $r0, I, .L123
@@ -185,6 +277,7 @@
     .align 3
 
 .L211:
+#ifdef DOUBLE
     xvld VX2, Y, 0 * SIZE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
@@ -217,6 +310,37 @@
     addi.d  I, I, -1
     xvst VX3, Y, 4 * SIZE
     addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
     blt $r0, I, .L211
     .align 3
 
@@ -226,11 +350,11 @@
     .align 3
 
 .L213:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
     add.d X, X, INCX
     addi.d Y, Y, SIZE
     blt $r0, I, .L213
@@ -243,6 +367,7 @@
     .align 3
 
 .L222:
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -309,6 +434,73 @@
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L222
     .align 3
@@ -319,15 +511,14 @@
     .align 3
 
 .L224:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
     add.d X, X, INCX
     add.d Y, Y, INCY
     blt $r0, I, .L224
-    b .L999
     .align 3
 
 .L999:
diff --git a/kernel/loongarch64/daxpy_lsx.S b/kernel/loongarch64/axpy_lsx.S
similarity index 53%
rename from kernel/loongarch64/daxpy_lsx.S
rename to kernel/loongarch64/axpy_lsx.S
index fc88f0bb9..0d74e2bce 100644
--- a/kernel/loongarch64/daxpy_lsx.S
+++ b/kernel/loongarch64/axpy_lsx.S
@@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
 
+#define ASSEMBLER
 #include "common.h"
+
 #define N      $r4
 #define XX     $r5
 #define YY     $r6
@@ -35,16 +62,20 @@
     bge $r0, N, .L999
     li.d TEMP, 1
     movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    FFINT   a1,  a1
     movgr2fr.d a2, TEMP
-    ffint.d.l a2, a2
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    FFINT   a2,  a2
+    CMPEQ   $fcc0, ALPHA, a1
     bcnez $fcc0, .L999
     slli.d  TEMP, TEMP, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG t1, ALPHA
+#ifdef DOUBLE
     vreplgr2vr.d VXA, t1
+#else
+    vreplgr2vr.w VXA, t1
+#endif
 
     srai.d I, N, 3
     bne INCX, TEMP, .L20
@@ -56,11 +87,12 @@
 
 .L11:
     bge $r0, I, .L113
-    fcmp.ceq.d $fcc0, ALPHA, a2
+    CMPEQ $fcc0, ALPHA, a2
     bceqz $fcc0, .L112
     .align 3
 
 .L111:
+#ifdef DOUBLE
     vld VX0, X, 0 * SIZE
     vld VX2, Y, 0 * SIZE
     vld VX1, X, 2 * SIZE
@@ -75,16 +107,27 @@
     vld VX3, Y, 6 * SIZE
     vfadd.d VX2, VX0, VX2
     vfadd.d VX3, VX1, VX3
-    addi.d  I, I, -1
     vst VX2, Y, 4 * SIZE
     vst VX3, Y, 6 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfadd.s VX2, VX0, VX2
+    vfadd.s VX3, VX1, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
     blt $r0, I, .L111
     b .L113
     .align 3
 
 .L112:
+#ifdef DOUBLE
     vld VX0, X, 0 * SIZE
     vld VX2, Y, 0 * SIZE
     vld VX1, X, 2 * SIZE
@@ -104,6 +147,19 @@
     vst VX2, Y, 4 * SIZE
     vst VX3, Y, 6 * SIZE
     addi.d Y, Y, 8 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmadd.s VX2, VX0, VXA, VX2
+    vfmadd.s VX3, VX1, VXA, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+#endif
     blt $r0, I, .L112
     .align 3
 
@@ -113,11 +169,11 @@
     .align 3
 
 .L114:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
     addi.d  X, X, SIZE
     addi.d  Y, Y, SIZE
     blt $r0, I, .L114
@@ -130,6 +186,7 @@
     .align 3
 
 .L121:
+#ifdef DOUBLE
     vld VX0, X, 0 * SIZE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
@@ -180,6 +237,54 @@
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
     addi.d  I, I, -1
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX2, VX0, VXA, VX2
+    vld VX1, X, 4 * SIZE
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+#endif
     blt $r0, I, .L121
     .align 3
 
@@ -189,11 +294,11 @@
     .align 3
 
 .L123:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
     addi.d  X, X, SIZE
     add.d  Y, Y, INCY
     blt $r0, I, .L123
@@ -205,6 +310,7 @@
     .align 3
 
 .L211:
+#ifdef DOUBLE
     vld VX2, Y, 0 * SIZE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
@@ -242,6 +348,39 @@
     vfmadd.d VX3, VX1, VXA, VX3
     addi.d  I, I, -1
     vst VX3, Y, 6 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmadd.s VX2, VX0, VXA, VX2
+    vld VX3, Y, 4 * SIZE
+    vst VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L211
     .align 3
@@ -252,11 +391,11 @@
     .align 3
 
 .L213:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
     add.d X, X, INCX
     addi.d Y, Y, SIZE
     blt $r0, I, .L213
@@ -269,6 +408,7 @@
     .align 3
 
 .L222:
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -337,6 +477,74 @@
     vstelm.d VX3, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX2, VX0, VXA, VX2
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L222
     .align 3
@@ -347,11 +555,11 @@
     .align 3
 
 .L224:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
     addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
     add.d X, X, INCX
     add.d Y, Y, INCY
     blt $r0, I, .L224
diff --git a/kernel/loongarch64/caxpy_lasx.S b/kernel/loongarch64/caxpy_lasx.S
new file mode 100644
index 000000000..2b970fe70
--- /dev/null
+++ b/kernel/loongarch64/caxpy_lasx.S
@@ -0,0 +1,707 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXAR   $xr23
+#define VXAI   $xr19
+#define x1     $xr18
+#define x2     $xr17
+#define x3     $xr16
+#define x4     $xr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT  a1, a1
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L10
+    bcnez $fcc1, .L999
+.L10:
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, ALPHAR
+    MTG  t2, ALPHAI
+#ifdef DOUBLE
+    xvreplgr2vr.d VXAR, t1
+    xvreplgr2vr.d VXAI, t2
+    srai.d I, N, 2
+#else
+    xvreplgr2vr.w VXAR, t1
+    xvreplgr2vr.w VXAI, t2
+    srai.d I, N, 3
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    .align 3
+
+.L111:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0
+    xvfmadd.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0
+    xvfmadd.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0
+    xvfmsub.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0
+    xvfmsub.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 2
+    xvinsgr2vr.d x4, t4, 2
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 1
+    xvinsgr2vr.d x4, t2, 1
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    xvld VX1, X, 8 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0
+    xvfmadd.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0
+    xvfmadd.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0
+    xvfmsub.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0
+    xvfmsub.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+#else
+    addi.d  I, I, -1
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 16 * SIZE
+#endif
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 2
+    xvinsgr2vr.d x2, t4, 2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 1
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    xvld VX3, Y, 8 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0
+    xvfmadd.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0
+    xvfmadd.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0
+    xvfmsub.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0
+    xvfmsub.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L222:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0
+    xvfmadd.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0
+    xvfmadd.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0
+    xvfmsub.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0
+    xvfmsub.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3
+#endif
+#endif
+    addi.d  I, I, -1
+#ifdef DOUBLE
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+#else
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+.L997:
+#ifdef DOUBLE
+    andi I, N, 3
+#else
+    andi I, N, 7
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    LD  a3, Y, 0 * SIZE
+    LD  a4, Y, 1 * SIZE
+    addi.d I, I, -1
+#if !defined(CONJ)
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MSUB s3, ALPHAR, a1, s1
+    MADD s4, a2, ALPHAR, s2
+    ADD  s3, s3, a3
+    ADD  s4, s4, a4
+#else
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MADD s3, ALPHAR, a1, s1
+    MSUB s4, a2, ALPHAR, s2
+    ADD  s3, s3, a3
+    SUB  s4, a4, s4
+#endif
+    ST  s3, Y, 0 * SIZE
+    ST  s4, Y, 1 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S
new file mode 100644
index 000000000..85598d0b9
--- /dev/null
+++ b/kernel/loongarch64/caxpy_lsx.S
@@ -0,0 +1,679 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXAR   $vr23
+#define VXAI   $vr19
+#define x1     $vr18
+#define x2     $vr17
+#define x3     $vr16
+#define x4     $vr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT  a1, a1
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L10
+    bcnez $fcc1, .L999
+.L10:
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, ALPHAR
+    MTG  t2, ALPHAI
+#ifdef DOUBLE
+    vreplgr2vr.d VXAR, t1
+    vreplgr2vr.d VXAI, t2
+#else
+    vreplgr2vr.w VXAR, t1
+    vreplgr2vr.w VXAI, t2
+#endif
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    .align 3
+
+.L111:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+#else
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L222:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+#endif
+    add.d Y, Y, INCY
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+#else
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+#endif
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    LD  a3, Y, 0 * SIZE
+    LD  a4, Y, 1 * SIZE
+    addi.d I, I, -1
+#if !defined(CONJ)
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MSUB s3, ALPHAR, a1, s1
+    MADD s4, a2, ALPHAR, s2
+    ADD  s3, s3, a3
+    ADD  s4, s4, a4
+#else
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MADD s3, ALPHAR, a1, s1
+    MSUB s4, a2, ALPHAR, s2
+    ADD  s3, s3, a3
+    SUB  s4, a4, s4
+#endif
+    ST  s3, Y, 0 * SIZE
+    ST  s4, Y, 1 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/saxpy_lasx.S b/kernel/loongarch64/saxpy_lasx.S
deleted file mode 100644
index 609e26328..000000000
--- a/kernel/loongarch64/saxpy_lasx.S
+++ /dev/null
@@ -1,323 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define XX     $r5
-#define YY     $r6
-#define ALPHA  $f0
-#define X      $r7
-#define INCX   $r8
-#define Y      $r9
-#define INCY   $r10
-
-#define I      $r12
-#define TEMP   $r13
-#define t1     $r14
-#define t2     $r16
-#define t3     $r15
-#define t4     $r17
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define b1     $f16
-#define b2     $f17
-#define b3     $f18
-#define b4     $f19
-#define VX0    $xr8
-#define VX1    $xr20
-#define VX2    $xr21
-#define VX3    $xr22
-#define VXA    $xr23
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.s.l a1, a1
-    movgr2fr.d a2, TEMP
-    ffint.s.l a2, a2
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L999
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.s t1, ALPHA
-    xvreplgr2vr.w VXA, t1
-
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L113
-    fcmp.ceq.s $fcc0, ALPHA, a2
-    bceqz $fcc0, .L112
-    .align 3
-
-.L111:
-    xvld VX0, X, 0 * SIZE
-    xvld VX2, Y, 0 * SIZE
-    addi.d  I, I, -1
-    xvfadd.s VX2, VX0, VX2
-    xvst VX2, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    b .L113
-    .align 3
-
-.L112:
-    xvld VX0, X, 0 * SIZE
-    xvld VX2, Y, 0 * SIZE
-    addi.d  I, I, -1
-    xvfmadd.s VX2, VX0, VXA, VX2
-    xvst VX2, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L112
-    .align 3
-
-.L113:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L114:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmadd.s $f14, $f12, $f0, $f14
-    fst.s $f14, Y, 0 * SIZE
-    addi.d  X, X, SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L114
-    b .L999
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L122
-    move YY, Y
-    .align 3
-
-.L121:
-    xvld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmadd.s VX2, VX0, VXA, VX2
-    addi.d  I, I, -1
-    xvstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmadd.s $f14, $f12, $f0, $f14
-    fst.s $f14, Y, 0 * SIZE
-    addi.d  X, X, SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:// INCX!=1 and INCY==1
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    xvld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    xvfmadd.s VX2, VX0, VXA, VX2
-    addi.d  I, I, -1
-    xvst VX2, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmadd.s $f14, $f12, $f0, $f14
-    fst.s $f14, Y, 0 * SIZE
-    add.d X, X, INCX
-    addi.d Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bge $r0, I, .L223
-    move YY, Y
-    .align 3
-
-.L222:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmadd.s VX2, VX0, VXA, VX2
-    addi.d  I, I, -1
-    xvstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 7
-    add.d YY, YY, INCY
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmadd.s $f14, $f12, $f0, $f14
-    fst.s $f14, Y, 0 * SIZE
-    add.d X, X, INCX
-    add.d Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/saxpy_lsx.S b/kernel/loongarch64/saxpy_lsx.S
deleted file mode 100644
index f47415ed6..000000000
--- a/kernel/loongarch64/saxpy_lsx.S
+++ /dev/null
@@ -1,338 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define XX     $r5
-#define YY     $r6
-#define ALPHA  $f0
-#define X      $r7
-#define INCX   $r8
-#define Y      $r9
-#define INCY   $r10
-
-#define I      $r12
-#define TEMP   $r13
-#define t1     $r14
-#define t2     $r16
-#define t3     $r15
-#define t4     $r17
-#define a1     $f12
-#define a2     $f13
-#define a3     $f14
-#define a4     $f15
-#define b1     $f16
-#define b2     $f17
-#define b3     $f18
-#define b4     $f19
-#define VX0    $vr8
-#define VX1    $vr20
-#define VX2    $vr21
-#define VX3    $vr22
-#define VXA    $vr23
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.s.l a1, a1
-    movgr2fr.d a2, TEMP
-    ffint.s.l a2, a2
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L999
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.s t1, ALPHA
-    vreplgr2vr.w VXA, t1
-
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L113
-    fcmp.ceq.s $fcc0, ALPHA, a2
-    bceqz $fcc0, .L112
-    .align 3
-
-.L111:
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vld VX3, Y, 4 * SIZE
-    vfadd.s VX2, VX0, VX2
-    vfadd.s VX3, VX1, VX3
-    vst VX2, Y, 0 * SIZE
-    vst VX3, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L111
-    b .L113
-    .align 3
-
-.L112:
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vld VX3, Y, 4 * SIZE
-    vfmadd.s VX2, VX0, VXA, VX2
-    vfmadd.s VX3, VX1, VXA, VX3
-    vst VX2, Y, 0 * SIZE
-    vst VX3, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L112
-    b .L113
-    .align 3
-
-.L113:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L114:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmadd.s $f14, $f12, $f0, $f14
-    fst.s $f14, Y, 0 * SIZE
-    addi.d  X, X, SIZE
-    addi.d  Y, Y, SIZE
-    blt $r0, I, .L114
-    b .L999
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L122
-    move YY, Y
-    .align 3
-
-.L121:
-    vld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmadd.s VX2, VX0, VXA, VX2
-    vld VX1, X, 4 * SIZE
-    vstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmadd.s VX3, VX1, VXA, VX3
-    addi.d  I, I, -1
-    vstelm.w VX3, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L121
-    .align 3
-
-.L122:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L123:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmadd.s $f14, $f12, $f0, $f14
-    fst.s $f14, Y, 0 * SIZE
-    addi.d  X, X, SIZE
-    add.d  Y, Y, INCY
-    blt $r0, I, .L123
-    b .L999
-    .align 3
-
-.L21:// INCX!=1 and INCY==1
-    bge $r0, I, .L212
-    .align 3
-
-.L211:
-    vld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    vfmadd.s VX2, VX0, VXA, VX2
-    vld VX3, Y, 4 * SIZE
-    vst VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    add.d X, X, INCX
-    vfmadd.s VX3, VX1, VXA, VX3
-    addi.d  I, I, -1
-    vst VX3, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L211
-    .align 3
-
-.L212:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L213:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmadd.s $f14, $f12, $f0, $f14
-    fst.s $f14, Y, 0 * SIZE
-    add.d X, X, INCX
-    addi.d Y, Y, SIZE
-    blt $r0, I, .L213
-    b .L999
-    .align 3
-
-.L22:
-    bge $r0, I, .L223
-    move YY, Y
-    .align 3
-
-.L222:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmadd.s VX2, VX0, VXA, VX2
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmadd.s VX3, VX1, VXA, VX3
-    addi.d  I, I, -1
-    vstelm.w VX3, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 3
-    add.d YY, YY, INCY
-    blt $r0, I, .L222
-    .align 3
-
-.L223:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L224:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f14, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmadd.s $f14, $f12, $f0, $f14
-    fst.s $f14, Y, 0 * SIZE
-    add.d X, X, INCX
-    add.d Y, Y, INCY
-    blt $r0, I, .L224
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE

From 8785e948b534a7747611b36b46a69f5813a18cc2 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Wed, 27 Dec 2023 17:04:46 +0800
Subject: [PATCH 523/718] loongarch64: Add camin optimization function.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   1 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   1 +
 kernel/loongarch64/camin_lasx.S          | 199 +++++++++++++++++++++
 kernel/loongarch64/camin_lsx.S           | 211 +++++++++++++++++++++++
 4 files changed, 412 insertions(+)
 create mode 100644 kernel/loongarch64/camin_lasx.S
 create mode 100644 kernel/loongarch64/camin_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index bdde126ad..fc1766ff5 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -13,6 +13,7 @@ CAMAXKERNEL =  camax_lsx.S
 
 SAMINKERNEL =  amin_lsx.S
 DAMINKERNEL =  amin_lsx.S
+CAMINKERNEL =  camin_lsx.S
 
 SMAXKERNEL  =  max_lsx.S
 DMAXKERNEL  =  max_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 7642b2a4d..7de9d4440 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -13,6 +13,7 @@ CAMAXKERNEL =  camax_lasx.S
 
 SAMINKERNEL =  amin_lasx.S
 DAMINKERNEL =  amin_lasx.S
+CAMINKERNEL =  camin_lasx.S
 
 SMAXKERNEL  =  max_lsx.S
 DMAXKERNEL =   max_lsx.S
diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S
new file mode 100644
index 000000000..d7931d30a
--- /dev/null
+++ b/kernel/loongarch64/camin_lasx.S
@@ -0,0 +1,199 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define TEMP $r16
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define a0   $f20
+#define a1   $f21
+#define x1   $xr9
+#define x2   $xr10
+#define x3   $xr11
+#define x4   $xr12
+#define VT0  $xr13
+#define VT1  $xr14
+#define res0 $xr18
+#define neg1 $xr19
+#define VX0  $xr20
+#define VX1  $xr21
+#define VM0  $xr22
+#define VM1  $xr23
+
+    PROLOGUE
+    MTC s1, $r0
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    fld.s a0, X, 0 * SIZE
+    fld.s a1, X, 1 * SIZE
+    fabs.s a0, a0
+    fabs.s a1, a1
+    fadd.s s1, a1, a0
+    xvreplve0.w VM0, VM0
+    li.d TEMP, 1
+    li.w I, -1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    xvreplgr2vr.w neg1, I
+    xvffint.s.w neg1, neg1
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L23
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    addi.d I, I, -1
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s x3, neg1, x1
+    xvfmul.s x4, neg1, x2
+    xvfcmp.clt.s VT0, x1, res0
+    xvfcmp.clt.s VT1, x2, res0
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VT1
+    addi.d X, X, 16 * SIZE
+    xvfadd.s VM1, x1, x2
+    xvfmin.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmin.s VM1, x1, x2
+    xvfmin.s VM0, x3, x4
+    xvfmin.s VM0, VM0, VM1
+    b .L23
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmin.s s1, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmin.s s1, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    addi.d I, I, -1
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmin.s s3, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmin.s s4, t1, t3
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    fmin.s s1, s1, s2
+    fmin.s s3, s3, s4
+    fmin.s s1, s1, s3
+    .align 3
+
+.L23: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    add.d  X, X, INCX
+    fmin.s s1, a0, s1
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S
new file mode 100644
index 000000000..e9ad6b04d
--- /dev/null
+++ b/kernel/loongarch64/camin_lsx.S
@@ -0,0 +1,211 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $vr9
+#define x2   $vr10
+#define x3   $vr11
+#define x4   $vr12
+#define VT0  $vr13
+#define VT1  $vr14
+#define res0 $vr18
+#define neg1 $vr19
+#define VX0  $vr20
+#define VX1  $vr21
+#define VM0  $vr22
+#define VM1  $vr23
+
+    PROLOGUE
+    MTC s1, $r0
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    fld.s a0, X, 0 * SIZE
+    fld.s a1, X, 1 * SIZE
+    fabs.s a0, a0
+    fabs.s a1, a1
+    fadd.s s1, a1, a0
+    vreplvei.w VM0, VM0, 0
+    li.d TEMP, 1
+    li.w I, -1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    vreplgr2vr.w neg1, I
+    vffint.s.w neg1, neg1
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L23
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, neg1, x1
+    vfmul.s x4, neg1, x2
+    vfcmp.clt.s VT0, x1, res0
+    vfcmp.clt.s VT1, x2, res0
+    vld VX0, X, 8 * SIZE
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT1
+    vld VX1, X, 12 * SIZE
+    vfadd.s VM1, x1, x2
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, neg1, x1
+    vfmul.s x4, neg1, x2
+    vfcmp.clt.s VT0, x1, res0
+    vfcmp.clt.s VT1, x2, res0
+    addi.d X, X, 16 * SIZE
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT1
+    vfadd.s x1, x1, x2
+    vfmin.s VM1, x1, VM1
+    vfmin.s VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmin.s VM1, x1, x2
+    vfmin.s VM0, x3, x4
+    vfmin.s VM0, VM0, VM1
+    b .L23
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmin.s s1, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmin.s s1, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    addi.d I, I, -1
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmin.s s3, t1, t3
+    fld.s t1, X, 0 * SIZE
+    fld.s t2, X, 1 * SIZE
+    add.d X, X, INCX
+    fld.s t3, X, 0 * SIZE
+    fld.s t4, X, 1 * SIZE
+    add.d X, X, INCX
+    fabs.s t1, t1
+    fabs.s t2, t2
+    fabs.s t3, t3
+    fabs.s t4, t4
+    fadd.s t1, t1, t2
+    fadd.s t3, t3, t4
+    fmin.s s4, t1, t3
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    fmin.s s1, s1, s2
+    fmin.s s3, s3, s4
+    fmin.s s1, s1, s3
+    .align 3
+
+.L23: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    fld.s a0, X, 0 * SIZE
+    fld.s a1, X, 1 * SIZE
+    addi.d I, I, -1
+    fabs.s a0, a0
+    fabs.s a1, a1
+    fadd.s a0, a0, a1
+    add.d  X, X, INCX
+    fmin.s s1, a0, s1
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From 2a34fb4b80d494d76da7a9ea6a3d54dffbd57f37 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Wed, 27 Dec 2023 18:17:51 +0800
Subject: [PATCH 524/718] loongarch64: Add and refine scal optimization
 functions.

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   6 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   6 +-
 kernel/loongarch64/cscal_lasx.S          | 645 +++++++++++++++++++++++
 kernel/loongarch64/cscal_lsx.S           | 571 ++++++++++++++++++++
 kernel/loongarch64/scal_lasx.S           | 282 ++++++++++
 kernel/loongarch64/scal_lsx.S            | 301 +++++++++++
 kernel/loongarch64/sscal_lasx.S          | 188 -------
 kernel/loongarch64/sscal_lsx.S           | 194 -------
 8 files changed, 1807 insertions(+), 386 deletions(-)
 create mode 100644 kernel/loongarch64/cscal_lasx.S
 create mode 100644 kernel/loongarch64/cscal_lsx.S
 create mode 100644 kernel/loongarch64/scal_lasx.S
 create mode 100644 kernel/loongarch64/scal_lsx.S
 delete mode 100644 kernel/loongarch64/sscal_lasx.S
 delete mode 100644 kernel/loongarch64/sscal_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index fc1766ff5..7abdae55a 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -4,8 +4,10 @@ SDOTKERNEL  = dot_lsx.S
 DSDOTKERNEL = dot_lsx.S
 DDOTKERNEL  = dot_lsx.S
 
-SSCALKERNEL  = sscal_lsx.S
-DSCALKERNEL  = dscal_lsx.S
+SSCALKERNEL  = scal_lsx.S
+DSCALKERNEL  = scal_lsx.S
+CSCALKERNEL  = cscal_lsx.S
+ZSCALKERNEL  = cscal_lsx.S
 
 SAMAXKERNEL =  amax_lsx.S
 DAMAXKERNEL =  amax_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 7de9d4440..13f9f23ed 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -4,8 +4,10 @@ SDOTKERNEL  = dot_lasx.S
 DSDOTKERNEL = dot_lasx.S
 DDOTKERNEL  = dot_lasx.S
 
-SSCALKERNEL  = sscal_lasx.S
-DSCALKERNEL  = dscal_lasx.S
+SSCALKERNEL  = scal_lasx.S
+DSCALKERNEL  = scal_lasx.S
+CSCALKERNEL  = cscal_lasx.S
+ZSCALKERNEL  = cscal_lasx.S
 
 SAMAXKERNEL =  amax_lasx.S
 DAMAXKERNEL =  amax_lasx.S
diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S
new file mode 100644
index 000000000..3605a6c0e
--- /dev/null
+++ b/kernel/loongarch64/cscal_lasx.S
@@ -0,0 +1,645 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r7
+#define INCX   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXAR   $xr23
+#define VXAI   $xr19
+#define VXZ    $xr12
+#define x1     $xr18
+#define x2     $xr17
+#define x3     $xr16
+#define x4     $xr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT    a1, a1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    MTG  t1, ALPHAR
+#ifdef DOUBLE
+    xvreplgr2vr.d VXAR, t1
+    movfr2gr.d t2, ALPHAI
+    xvreplgr2vr.d VXAI, t2
+    xvxor.v VXZ, VXZ, VXZ
+    srai.d I, N, 2
+#else
+    xvreplgr2vr.w VXAR, t1
+    movfr2gr.s t2, ALPHAI
+    xvreplgr2vr.w VXAI, t2
+    xvxor.v VXZ, VXZ, VXZ
+    srai.d I, N, 3
+#endif
+    bne INCX, TEMP, .L22
+
+.L11:
+    bge $r0, I, .L997
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L13
+    b .L14
+    .align 3
+
+.L13:
+    bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+
+.L14:
+    bceqz $fcc1, .L112  //alpha_r == 0.0 && alpha_i != 0.0
+    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L111:  //alpha_r == 0.0 && alpha_i == 0.0
+    xvst VXZ, X, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    xvst VXZ, X, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112:  //alpha_r == 0.0 && alpha_i != 0.0
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmul.d x3, VXAI, x2
+    xvfsub.d x3, VXZ, x3
+    xvfmul.d x4, VXAI, x1
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s x3, VXAI, x2
+    xvfsub.s x3, VXZ, x3
+    xvfmul.s x4, VXAI, x1
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: //alpha_r != 0.0 && alpha_i == 0.0
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmul.d x3, VXAR, x1
+    xvfmul.d x4, VXAR, x2
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s x3, VXAR, x1
+    xvfmul.s x4, VXAR, x2
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114:  //alpha_r != 0.0 && alpha_i != 0.0
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmul.d VX0, VXAI, x2
+    xvfmsub.d x3, VXAR, x1, VX0
+    xvfmul.d VX1, VXAI, x1
+    xvfmadd.d x4, VXAR, x2, VX1
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s VX0, VXAI, x2
+    xvfmsub.s x3, VXAR, x1, VX0
+    xvfmul.s VX1, VXAI, x1
+    xvfmadd.s x4, VXAR, x2, VX1
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move XX, X
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L23
+    b .L24
+    .align 3
+
+.L23:
+    bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+
+.L24:
+    bceqz $fcc1, .L222  //alpha_r == 0.0 && alpha_i != 0.0
+    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L221:  //alpha_r == 0.0 && alpha_i == 0.0
+#ifdef DOUBLE
+    xvstelm.d VXZ, X, 0, 0
+    xvstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.d VXZ, X, 0, 0
+    xvstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.d VXZ, X, 0, 0
+    xvstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.d VXZ, X, 0, 0
+    xvstelm.d VXZ, X, 1 * SIZE, 0
+#else
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+#endif
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222:  //alpha_r == 0.0 && alpha_i != 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+
+    xvfmul.d x3, VXAI, x2
+    xvfsub.d x3, VXZ, x3
+    xvfmul.d x4, VXAI, x1
+    addi.d  I, I, -1
+    xvstelm.d x3, XX, 0 * SIZE, 0
+    xvstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 1
+    xvstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 2
+    xvstelm.d x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 3
+    xvstelm.d x4, XX, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+
+    xvfmul.s x3, VXAI, x2
+    xvfsub.s x3, VXZ, x3
+    xvfmul.s x4, VXAI, x1
+    addi.d  I, I, -1
+    xvstelm.w x3, XX, 0 * SIZE, 0
+    xvstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 1
+    xvstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 2
+    xvstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 3
+    xvstelm.w x4, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 4
+    xvstelm.w x4, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 5
+    xvstelm.w x4, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 6
+    xvstelm.w x4, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 7
+    xvstelm.w x4, XX, 1 * SIZE, 7
+#endif
+    add.d XX, XX, INCX
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: //alpha_r != 0.0 && alpha_i == 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+
+    xvfmul.d x3, VXAR, x1
+    xvfmul.d x4, VXAR, x2
+    addi.d  I, I, -1
+    xvstelm.d x3, XX, 0 * SIZE, 0
+    xvstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 1
+    xvstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 2
+    xvstelm.d x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 3
+    xvstelm.d x4, XX, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+
+    xvfmul.s x3, VXAR, x1
+    xvfmul.s x4, VXAR, x2
+    addi.d  I, I, -1
+    xvstelm.w x3, XX, 0 * SIZE, 0
+    xvstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 1
+    xvstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 2
+    xvstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 3
+    xvstelm.w x4, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 4
+    xvstelm.w x4, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 5
+    xvstelm.w x4, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 6
+    xvstelm.w x4, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 7
+    xvstelm.w x4, XX, 1 * SIZE, 7
+#endif
+    add.d XX, XX, INCX
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224:  //alpha_r != 0.0 && alpha_i != 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+
+    xvfmul.d VX0, VXAI, x2
+    xvfmsub.d x3, VXAR, x1, VX0
+    xvfmul.d VX1, VXAI, x1
+    xvfmadd.d x4, VXAR, x2, VX1
+    addi.d  I, I, -1
+    xvstelm.d x3, XX, 0 * SIZE, 0
+    xvstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 1
+    xvstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 2
+    xvstelm.d x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 3
+    xvstelm.d x4, XX, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+
+    xvfmul.s VX0, VXAI, x2
+    xvfmsub.s x3, VXAR, x1, VX0
+    xvfmul.s VX1, VXAI, x1
+    xvfmadd.s x4, VXAR, x2, VX1
+    addi.d  I, I, -1
+    xvstelm.w x3, XX, 0 * SIZE, 0
+    xvstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 1
+    xvstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 2
+    xvstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 3
+    xvstelm.w x4, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 4
+    xvstelm.w x4, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 5
+    xvstelm.w x4, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 6
+    xvstelm.w x4, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 7
+    xvstelm.w x4, XX, 1 * SIZE, 7
+#endif
+    add.d XX, XX, INCX
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+#ifdef DOUBLE
+    andi I, N, 3
+#else
+    andi I, N, 7
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    addi.d I, I, -1
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MSUB s1, ALPHAR, a1, s1
+    MADD s2, ALPHAR, a2, s2
+    ST   s1, X, 0 * SIZE
+    ST   s2, X, 1 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S
new file mode 100644
index 000000000..f442a754f
--- /dev/null
+++ b/kernel/loongarch64/cscal_lsx.S
@@ -0,0 +1,571 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r7
+#define INCX   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXAR   $vr23
+#define VXAI   $vr19
+#define VXZ    $vr12
+#define x1     $vr18
+#define x2     $vr17
+#define x3     $vr16
+#define x4     $vr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT    a1, a1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    MTG  t1, ALPHAR
+#ifdef DOUBLE
+    vreplgr2vr.d VXAR, t1
+    movfr2gr.d t2, ALPHAI
+    vreplgr2vr.d VXAI, t2
+#else
+    vreplgr2vr.w VXAR, t1
+    movfr2gr.s t2, ALPHAI
+    vreplgr2vr.w VXAI, t2
+#endif
+    vxor.v VXZ, VXZ, VXZ
+    srai.d I, N, 2
+    bne INCX, TEMP, .L22
+
+.L11:
+    bge $r0, I, .L997
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L13
+    b .L14
+    .align 3
+
+.L13:
+    bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+
+.L14:
+    bceqz $fcc1, .L112  //alpha_r == 0.0 && alpha_i != 0.0
+    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L111:  //alpha_r == 0.0 && alpha_i == 0.0
+    vst VXZ, X, 0 * SIZE
+#ifdef DOUBLE
+    vst VXZ, X, 2 * SIZE
+    vst VXZ, X, 4 * SIZE
+    vst VXZ, X, 6 * SIZE
+#else
+    vst VXZ, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112:  //alpha_r == 0.0 && alpha_i != 0.0
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAI, x2
+    vfsub.d x3, VXZ, x3
+    vfmul.d x4, VXAI, x1
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAI, x2
+    vfsub.d x3, VXZ, x3
+    vfmul.d x4, VXAI, x1
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, VXAI, x2
+    vfsub.s x3, VXZ, x3
+    vfmul.s x4, VXAI, x1
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: //alpha_r != 0.0 && alpha_i == 0.0
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAR, x1
+    vfmul.d x4, VXAR, x2
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAR, x1
+    vfmul.d x4, VXAR, x2
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, VXAR, x1
+    vfmul.s x4, VXAR, x2
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114:  //alpha_r != 0.0 && alpha_i != 0.0
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d VX0, VXAI, x2
+    vfmsub.d x3, VXAR, x1, VX0
+    vfmul.d VX1, VXAI, x1
+    vfmadd.d x4, VXAR, x2, VX1
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d VX0, VXAI, x2
+    vfmsub.d x3, VXAR, x1, VX0
+    vfmul.d VX1, VXAI, x1
+    vfmadd.d x4, VXAR, x2, VX1
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s VX0, VXAI, x2
+    vfmsub.s x3, VXAR, x1, VX0
+    vfmul.s VX1, VXAI, x1
+    vfmadd.s x4, VXAR, x2, VX1
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move XX, X
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L23
+    b .L24
+    .align 3
+
+.L23:
+    bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+
+.L24:
+    bceqz $fcc1, .L222  //alpha_r == 0.0 && alpha_i != 0.0
+    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L221:  //alpha_r == 0.0 && alpha_i == 0.0
+#ifdef DOUBLE
+    vstelm.d VXZ, X, 0, 0
+    vstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.d VXZ, X, 0, 0
+    vstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.d VXZ, X, 0, 0
+    vstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.d VXZ, X, 0, 0
+    vstelm.d VXZ, X, 1 * SIZE, 0
+#else
+    vstelm.w VXZ, X, 0, 0
+    vstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.w VXZ, X, 0, 0
+    vstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.w VXZ, X, 0, 0
+    vstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.w VXZ, X, 0, 0
+    vstelm.w VXZ, X, 1 * SIZE, 0
+#endif
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222:  //alpha_r == 0.0 && alpha_i != 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d x3, VXAI, x2
+    vfsub.d x3, VXZ, x3
+    vfmul.d x4, VXAI, x1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vfmul.d x3, VXAI, x2
+    vfsub.d x3, VXZ, x3
+    vfmul.d x4, VXAI, x1
+    addi.d  I, I, -1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+
+    vfmul.s x3, VXAI, x2
+    vfsub.s x3, VXZ, x3
+    vfmul.s x4, VXAI, x1
+    addi.d  I, I, -1
+    vstelm.w x3, XX, 0 * SIZE, 0
+    vstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 1
+    vstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 2
+    vstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 3
+    vstelm.w x4, XX, 1 * SIZE, 3
+#endif
+    add.d XX, XX, INCX
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: //alpha_r != 0.0 && alpha_i == 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d x3, VXAR, x1
+    vfmul.d x4, VXAR, x2
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vfmul.d x3, VXAR, x1
+    vfmul.d x4, VXAR, x2
+    addi.d  I, I, -1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+
+    vfmul.s x3, VXAR, x1
+    vfmul.s x4, VXAR, x2
+    addi.d  I, I, -1
+    vstelm.w x3, XX, 0 * SIZE, 0
+    vstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 1
+    vstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 2
+    vstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 3
+    vstelm.w x4, XX, 1 * SIZE, 3
+#endif
+    add.d XX, XX, INCX
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224:  //alpha_r != 0.0 && alpha_i != 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d VX0, VXAI, x2
+    vfmsub.d x3, VXAR, x1, VX0
+    vfmul.d VX1, VXAI, x1
+    vfmadd.d x4, VXAR, x2, VX1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX0, VXAI, x2
+    vfmsub.d x3, VXAR, x1, VX0
+    vfmul.d VX1, VXAI, x1
+    vfmadd.d x4, VXAR, x2, VX1
+    addi.d  I, I, -1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+
+    vfmul.s VX0, VXAI, x2
+    vfmsub.s x3, VXAR, x1, VX0
+    vfmul.s VX1, VXAI, x1
+    vfmadd.s x4, VXAR, x2, VX1
+    addi.d  I, I, -1
+    vstelm.w x3, XX, 0 * SIZE, 0
+    vstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 1
+    vstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 2
+    vstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 3
+    vstelm.w x4, XX, 1 * SIZE, 3
+#endif
+    add.d XX, XX, INCX
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    addi.d I, I, -1
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MSUB s1, ALPHAR, a1, s1
+    MADD s2, ALPHAR, a2, s2
+    ST   s1, X, 0 * SIZE
+    ST   s2, X, 1 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S
new file mode 100644
index 000000000..48e2c0718
--- /dev/null
+++ b/kernel/loongarch64/scal_lasx.S
@@ -0,0 +1,282 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X  $r7
+#define INCX $r8
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16
+#define VX0 $xr12
+#define VX1 $xr13
+#define VT0 $xr14
+#define VT1 $xr15
+#define VALPHA $xr19
+#define a1 $f8
+#define a2 $f23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT  a1, a1
+    movgr2fr.d a2, TEMP
+    FFINT  a2, a2
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    CMPEQ  $fcc0, ALPHA, a1
+    bcnez  $fcc0, .L20   //ALPHA==0
+    CMPEQ  $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3
+    beq INCX, TEMP, .L30    //ALPHA！=0|1 and INCX==1
+    MTG  TEMP, ALPHA
+#ifdef DOUBLE
+    xvreplgr2vr.d VALPHA, TEMP
+#else
+    xvreplgr2vr.w VALPHA, TEMP
+#endif
+    move XX, X
+    .align 3
+
+.L10:   //ALPHA！=0|1 and INCX!=1
+    bge $r0, I, .L32
+    .align 3
+.L11:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    xvfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvfmul.d VT1, VX1, VALPHA
+    xvstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VT0, VX0, VALPHA
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+#endif
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    b .L32
+    .align 3
+
+.L20:
+    srai.d I, N, 3
+    beq INCX, TEMP, .L24
+    bge $r0, I, .L22
+    .align 3
+
+.L21:
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L23:
+    ST  a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0
+    .align 3
+
+.L24:
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    .align 3
+.L25:
+    xvxor.v VX0, VX0, VX0
+    xvst VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvst VX0, X, 4 * SIZE
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L27:
+    ST  a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0
+    .align 3
+
+.L30:
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    MTG TEMP, ALPHA
+#ifdef DOUBLE
+    xvreplgr2vr.d VALPHA , TEMP
+#else
+    xvreplgr2vr.w VALPHA , TEMP
+#endif
+    .align 3
+
+.L31:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VT0, VX0, VALPHA
+    xvfmul.d VT1, VX1, VALPHA
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, X, 4 * SIZE
+#else
+    xvfmul.s VT0, VX0, VALPHA
+    xvst VT0, X, 0 * SIZE
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L31
+    .align 3
+
+.L32:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L33:
+    LD    a1, X, 0 * SIZE
+    addi.d I, I, -1
+    MUL   a1, ALPHA, a1
+    ST    a1, X, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0
+    .align 3
+
+.L999:
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S
new file mode 100644
index 000000000..1ffce7db2
--- /dev/null
+++ b/kernel/loongarch64/scal_lsx.S
@@ -0,0 +1,301 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X  $r7
+#define INCX $r8
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16
+#define VX0 $vr12
+#define VX1 $vr13
+#define VT0 $vr14
+#define VT1 $vr15
+#define VALPHA $vr19
+#define a1 $f8
+#define a2 $f23
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT  a1, a1
+    movgr2fr.d a2, TEMP
+    FFINT  a2, a2
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    CMPEQ  $fcc0, ALPHA, a1
+    bcnez  $fcc0, .L20   //ALPHA==0
+    CMPEQ  $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3
+    beq INCX, TEMP, .L30    //ALPHA！=0|1 and INCX==1
+    MTG  TEMP, ALPHA
+#ifdef DOUBLE
+    vreplgr2vr.d VALPHA, TEMP
+#else
+    vreplgr2vr.w VALPHA, TEMP
+#endif
+    move XX, X
+    .align 3
+
+.L10:   //ALPHA！=0|1 and INCX!=1
+    bge $r0, I, .L32
+    .align 3
+
+.L11:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT1, VX1, VALPHA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT1, VX1, VALPHA
+    vstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT1, XX, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vfmul.s VT0, VX0, VALPHA
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vfmul.s VT1, VX1, VALPHA
+    vstelm.w VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 3
+#endif
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    b .L32
+    .align 3
+
+.L20:
+    srai.d I, N, 3
+    beq INCX, TEMP, .L24
+    bge $r0, I, .L22
+    .align 3
+
+.L21:
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L23:
+    ST  a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0
+    .align 3
+
+.L24:
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    .align 3
+
+.L25:
+    vxor.v VX0, VX0, VX0
+    vst VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vst VX0, X, 2 * SIZE
+    vst VX0, X, 4 * SIZE
+    vst VX0, X, 6 * SIZE
+#else
+    vst VX0, X, 4 * SIZE
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L27:
+    ST  a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0
+    .align 3
+
+.L30:
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    MTG  TEMP, ALPHA
+#ifdef DOUBLE
+    vreplgr2vr.d VALPHA , TEMP
+#else
+    vreplgr2vr.w VALPHA , TEMP
+#endif
+    .align 3
+
+.L31:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VT0, VX0, VALPHA
+    vfmul.d VT1, VX1, VALPHA
+    vld VX0, X, 4 * SIZE
+    vst VT0, X, 0 * SIZE
+    vst VT1, X, 2 * SIZE
+    vfmul.d VT0, VX0, VALPHA
+    vld VX1, X, 6 * SIZE
+    vst VT0, X, 4 * SIZE
+    vfmul.d VT1, VX1, VALPHA
+    vst VT1, X, 6 * SIZE
+    addi.d I, I, -1
+#else
+    vld VX1, X, 4 * SIZE
+    vfmul.s VT0, VX0, VALPHA
+    vfmul.s VT1, VX1, VALPHA
+    addi.d I, I, -1
+    vst VT0, X, 0 * SIZE
+    vst VT1, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L31
+    .align 3
+
+.L32:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L33:
+    LD    a1, X, 0 * SIZE
+    addi.d I, I, -1
+    MUL   a1, ALPHA, a1
+    ST    a1, X, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0
+    .align 3
+
+.L999:
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sscal_lasx.S b/kernel/loongarch64/sscal_lasx.S
deleted file mode 100644
index 329f24659..000000000
--- a/kernel/loongarch64/sscal_lasx.S
+++ /dev/null
@@ -1,188 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-
-#define N $r4
-#define ALPHA $f0
-#define X  $r7
-#define INCX $r8
-#define I $r12
-#define TEMP $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define XX $r16
-#define VX0 $xr12
-#define VX1 $xr13
-#define VT0 $xr14
-#define VT1 $xr15
-#define VALPHA $xr19
-#define a1 $f8
-#define a2 $f23
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.s.l a1, a1
-    movgr2fr.d a2, TEMP
-    ffint.s.l a2, a2
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L20   //ALPHA==0
-    fcmp.ceq.s $fcc0, ALPHA, a2
-    bcnez $fcc0, .L999  //ALPHA==1 return
-    srai.d I, N, 3
-    beq INCX, TEMP, .L30    //ALPHA！=0|1 and INCX==1
-    movfr2gr.s TEMP, ALPHA
-    xvreplgr2vr.w VALPHA, TEMP
-    move XX, X
-
-.L10:   //ALPHA！=0|1 and INCX!=1
-    bge $r0, I, .L32
-    .align 3
-.L11:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    xvfmul.s VT0, VX0, VALPHA
-    xvstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 4
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 5
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 6
-    add.d XX, XX, INCX
-    xvstelm.w VT0, XX, 0, 7
-    add.d XX, XX, INCX
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    b .L32
-    .align 3
-
-.L20:
-    srai.d I, N, 3
-    beq INCX, TEMP, .L24
-    bge $r0, I, .L22
-    .align 3
-
-.L21:
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-.L23:
-    fst.s a1, X, 0 * SIZE
-    addi.d I, I, -1
-    add.d X, X, INCX
-    blt $r0, I, .L23
-    jirl $r0, $r1, 0
-    .align 3
-
-.L24:
-    bge $r0, I, .L26 /*N<8 INCX==1*/
-    .align 3
-.L25:
-    xvxor.v VX0, VX0, VX0
-    xvst VX0, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L25
-    .align 3
-
-.L26:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-.L27:
-    fst.s a1, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d X, X, SIZE
-    blt $r0, I, .L27
-    jirl $r0, $r1, 0
-    .align 3
-
-.L30:
-    bge $r0, I, .L32/*N<8 INCX==1*/
-    movfr2gr.s TEMP, ALPHA
-    xvreplgr2vr.w VALPHA , TEMP
-    .align 3
-
-.L31:
-    xvld VX0, X, 0 * SIZE
-    addi.d I, I, -1
-    xvfmul.s VT0, VX0, VALPHA
-    xvst VT0, X, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L31
-    .align 3
-
-.L32:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-.L33:
-    fld.s a1, X, 0 * SIZE
-    addi.d I, I, -1
-    fmul.s a1, ALPHA, a1
-    fst.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    blt $r0, I, .L33
-    jirl $r0, $r1, 0
-    .align 3
-
-.L999:
-    jirl    $r0, $r1, 0x0
-
-    EPILOGUE
diff --git a/kernel/loongarch64/sscal_lsx.S b/kernel/loongarch64/sscal_lsx.S
deleted file mode 100644
index d0ea1307d..000000000
--- a/kernel/loongarch64/sscal_lsx.S
+++ /dev/null
@@ -1,194 +0,0 @@
-#define ASSEMBLER
-#include "common.h"
-
-#define N $r4
-#define ALPHA $f0
-#define X  $r7
-#define INCX $r8
-#define I $r12
-#define TEMP $r13
-#define t1 $r14
-#define t2 $r18
-#define t3 $r15
-#define t4 $r17
-#define XX $r16
-#define VX0 $vr12
-#define VX1 $vr13
-#define VT0 $vr14
-#define VT1 $vr15
-#define VALPHA $vr19
-#define a1 $f8
-#define a2 $f23
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    bge $r0, INCX, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.s.l a1, a1
-    movgr2fr.d a2, TEMP
-    ffint.s.l a2, a2
-    slli.d TEMP, TEMP, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L20   //ALPHA==0
-    fcmp.ceq.s $fcc0, ALPHA, a2
-    bcnez $fcc0, .L999  //ALPHA==1 return
-    srai.d I, N, 3
-    beq INCX, TEMP, .L30    //ALPHA！=0|1 and INCX==1
-    movfr2gr.s TEMP, ALPHA
-    vreplgr2vr.w VALPHA, TEMP
-    move XX, X
-    .align 3
-
-.L10:   //ALPHA！=0|1 and INCX!=1
-    bge $r0, I, .L32
-    .align 3
-.L11:
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    vfmul.s VT0, VX0, VALPHA
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vstelm.w VT0, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT0, XX, 0, 3
-    add.d XX, XX, INCX
-    vfmul.s VT1, VX1, VALPHA
-    vstelm.w VT1, XX, 0, 0
-    add.d XX, XX, INCX
-    vstelm.w VT1, XX, 0, 1
-    add.d XX, XX, INCX
-    vstelm.w VT1, XX, 0, 2
-    add.d XX, XX, INCX
-    vstelm.w VT1, XX, 0, 3
-    add.d XX, XX, INCX
-    addi.d  I, I, -1
-    blt $r0, I, .L11
-    b .L32
-    .align 3
-
-.L20:
-    srai.d I, N, 3
-    beq INCX, TEMP, .L24
-    bge $r0, I, .L22
-    .align 3
-
-.L21:
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    fst.s a1, X, 0
-    add.d X, X, INCX
-    addi.d  I, I, -1
-    blt $r0, I, .L21
-    .align 3
-
-.L22:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-.L23:
-    fst.s a1, X, 0 * SIZE
-    addi.d I, I, -1
-    add.d X, X, INCX
-    blt $r0, I, .L23
-    jirl $r0, $r1, 0
-    .align 3
-
-.L24:
-    bge $r0, I, .L26 /*N<8 INCX==1*/
-    .align 3
-.L25:
-    vxor.v VX0, VX0, VX0
-    vst VX0, X, 0 * SIZE
-    vst VX0, X, 4 * SIZE
-    addi.d I, I, -1
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L25
-    .align 3
-
-.L26:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-.L27:
-    fst.s a1, X, 0 * SIZE
-    addi.d I, I, -1
-    addi.d X, X, SIZE
-    blt $r0, I, .L27
-    jirl $r0, $r1, 0
-    .align 3
-
-.L30:
-    bge $r0, I, .L32/*N<8 INCX==1*/
-    movfr2gr.s TEMP, ALPHA
-    vreplgr2vr.w VALPHA , TEMP
-    .align 3
-
-.L31:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vfmul.s VT0, VX0, VALPHA
-    vfmul.s VT1, VX1, VALPHA
-    addi.d I, I, -1
-    vst VT0, X, 0 * SIZE
-    vst VT1, X, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L31
-    .align 3
-
-.L32:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-.L33:
-    fld.s a1, X, 0 * SIZE
-    addi.d I, I, -1
-    fmul.s a1, ALPHA, a1
-    fst.s a1, X, 0 * SIZE
-    add.d X, X, INCX
-    blt $r0, I, .L33
-    jirl $r0, $r1, 0
-    .align 3
-
-.L999:
-    jirl    $r0, $r1, 0x0
-
-    EPILOGUE

From 65a0aeb128a46b75b189d3a21b619d005351b2d2 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Thu, 28 Dec 2023 17:45:17 +0800
Subject: [PATCH 525/718] loongarch64: Add c/zcopy optimization functions.

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   2 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   2 +
 kernel/loongarch64/ccopy_lasx.S          | 386 +++++++++++++++++++++
 kernel/loongarch64/ccopy_lsx.S           | 411 +++++++++++++++++++++++
 4 files changed, 801 insertions(+)
 create mode 100644 kernel/loongarch64/ccopy_lasx.S
 create mode 100644 kernel/loongarch64/ccopy_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 7abdae55a..2aa68af67 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -39,6 +39,8 @@ IDAMINKERNEL = iamin_lsx.S
 
 SCOPYKERNEL =  copy_lsx.S
 DCOPYKERNEL =  copy_lsx.S
+CCOPYKERNEL =  ccopy_lsx.S
+ZCOPYKERNEL =  ccopy_lsx.S
 
 SSWAPKERNEL =  swap_lsx.S
 DSWAPKERNEL =  swap_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 13f9f23ed..3bcec2c62 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -39,6 +39,8 @@ IDAMINKERNEL = iamin_lasx.S
 
 SCOPYKERNEL =  copy_lasx.S
 DCOPYKERNEL =  copy_lasx.S
+CCOPYKERNEL =  ccopy_lasx.S
+ZCOPYKERNEL =  ccopy_lasx.S
 
 SSWAPKERNEL =  swap_lasx.S
 DSWAPKERNEL =  swap_lasx.S
diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S
new file mode 100644
index 000000000..fbc5d96bc
--- /dev/null
+++ b/kernel/loongarch64/ccopy_lasx.S
@@ -0,0 +1,386 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    xvst VX2, Y, 8 * SIZE
+    xvst VX3, Y, 12 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvstelm.d VX0, Y, 0 * SIZE, 0
+    xvstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0 * SIZE, 2
+    xvstelm.d VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 0
+    xvstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 2
+    xvstelm.d VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 0
+    xvstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 2
+    xvstelm.d VX2, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX3, Y, 0 * SIZE, 0
+    xvstelm.d VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX3, Y, 0 * SIZE, 2
+    xvstelm.d VX3, Y, 1 * SIZE, 3
+#else
+    xvld VX1, X, 8 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 0
+    xvstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 2
+    xvstelm.w VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 4
+    xvstelm.w VX0, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 6
+    xvstelm.w VX0, Y, 1 * SIZE, 7
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 0
+    xvstelm.w VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 2
+    xvstelm.w VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 4
+    xvstelm.w VX1, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 6
+    xvstelm.w VX1, Y, 1 * SIZE, 7
+#endif
+    add.d Y, Y, INCY
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    xvst VX2, Y, 8 * SIZE
+    xvst VX3, Y, 12 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S
new file mode 100644
index 000000000..4c4d880f1
--- /dev/null
+++ b/kernel/loongarch64/ccopy_lsx.S
@@ -0,0 +1,411 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:// INCX==1 and INCY==1
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 10 * SIZE
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    addi.d  I, I, -1
+    vst VX0, Y, 8 * SIZE
+    vst VX1, Y, 10 * SIZE
+    vst VX2, Y, 12 * SIZE
+    vst VX3, Y, 14 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX2, X, 8 * SIZE
+    vld VX3, X, 12 * SIZE
+    addi.d  I, I, -1
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    vst VX2, Y, 8 * SIZE
+    vst VX3, Y, 12 * SIZE
+#endif
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    addi.d Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:   // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX2, Y, 0 * SIZE, 0
+    vstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX3, Y, 0 * SIZE, 0
+    vstelm.d VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 10 * SIZE
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX2, Y, 0 * SIZE, 0
+    vstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX3, Y, 0 * SIZE, 0
+    vstelm.d VX3, Y, 1 * SIZE, 1
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX2, X, 8 * SIZE
+    vld VX3, X, 12 * SIZE
+    vstelm.w VX0, Y, 0 * SIZE, 0
+    vstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0 * SIZE, 2
+    vstelm.w VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0 * SIZE, 0
+    vstelm.w VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0 * SIZE, 2
+    vstelm.w VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX2, Y, 0 * SIZE, 0
+    vstelm.w VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX2, Y, 0 * SIZE, 2
+    vstelm.w VX2, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX3, Y, 0 * SIZE, 0
+    vstelm.w VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX3, Y, 0 * SIZE, 2
+    vstelm.w VX3, Y, 1 * SIZE, 3
+#endif
+    add.d Y, Y, INCY
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 4 * SIZE
+    vst VX1, Y, 6 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 8 * SIZE
+    vst VX1, Y, 10 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 12 * SIZE
+    vst VX1, Y, 14 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    vst VX2, Y, 8 * SIZE
+    vst VX3, Y, 12 * SIZE
+#endif
+    addi.d Y, Y, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From d97272cb35af5849515b6a5850ddfe642e29430f Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Thu, 28 Dec 2023 19:09:18 +0800
Subject: [PATCH 526/718] loongarch64: Add c/zdot optimization functions.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   2 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   2 +
 kernel/loongarch64/cdot_lasx.S           | 565 +++++++++++++++++++++++
 kernel/loongarch64/cdot_lsx.S            | 397 ++++++++++++++++
 4 files changed, 966 insertions(+)
 create mode 100644 kernel/loongarch64/cdot_lasx.S
 create mode 100644 kernel/loongarch64/cdot_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 2aa68af67..c70120f9a 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -3,6 +3,8 @@ ifndef NO_LSX
 SDOTKERNEL  = dot_lsx.S
 DSDOTKERNEL = dot_lsx.S
 DDOTKERNEL  = dot_lsx.S
+CDOTKERNEL  = cdot_lsx.S
+ZDOTKERNEL  = cdot_lsx.S
 
 SSCALKERNEL  = scal_lsx.S
 DSCALKERNEL  = scal_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 3bcec2c62..98673ae09 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -3,6 +3,8 @@ ifndef NO_LASX
 SDOTKERNEL  = dot_lasx.S
 DSDOTKERNEL = dot_lasx.S
 DDOTKERNEL  = dot_lasx.S
+CDOTKERNEL  = cdot_lasx.S
+ZDOTKERNEL  = cdot_lasx.S
 
 SSCALKERNEL  = scal_lasx.S
 DSCALKERNEL  = scal_lasx.S
diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S
new file mode 100644
index 000000000..0583e56ea
--- /dev/null
+++ b/kernel/loongarch64/cdot_lasx.S
@@ -0,0 +1,565 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r19
+#define TEMP   $r10
+#define t1     $r11
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define res1   $xr16
+#define res2   $xr17
+#define res3   $xr18
+#define res4   $xr19
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define x1     $xr20
+#define x2     $xr21
+#define x3     $xr22
+#define x4     $xr23
+
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    xvxor.v res3, res3, res3
+    xvxor.v res4, res4, res4
+    bge $r0, N, .L999
+    li.d  TEMP, 2 * SIZE
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+#ifdef DOUBLE
+    srai.d I, N, 2
+#else
+    srai.d I, N, 3
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L996
+    .align 3
+
+.L12:
+    bge $r0, I, .L997
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 2
+    xvinsgr2vr.d x4, t4, 2
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 1
+    xvinsgr2vr.d x4, t2, 1
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    addi.d X, X, 8 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    xvld VX1, X, 8 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    addi.d X, X, 16 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L21:
+    bge $r0, I, .L997
+    .align 3
+
+.L211:
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 2
+    xvinsgr2vr.d x2, t4, 2
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 1
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    addi.d Y, Y, 8 * SIZE
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    xvld VX3, Y, 8 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    addi.d Y, Y, 8 * SIZE
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L996
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    .align 3
+
+.L222:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L996:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    xvpickve.d VX1, res2, 1
+    xvpickve.d VX2, res2, 2
+    xvpickve.d VX3, res2, 3
+    xvfadd.d res2, VX1, res2
+    xvfadd.d res2, VX2, res2
+    xvfadd.d res2, VX3, res2
+    xvpickve.d VX1, res3, 1
+    xvpickve.d VX2, res3, 2
+    xvpickve.d VX3, res3, 3
+    xvfadd.d res3, VX1, res3
+    xvfadd.d res3, VX2, res3
+    xvfadd.d res3, VX3, res3
+    xvpickve.d VX1, res4, 1
+    xvpickve.d VX2, res4, 2
+    xvpickve.d VX3, res4, 3
+    xvfadd.d res4, VX1, res4
+    xvfadd.d res4, VX2, res4
+    xvfadd.d res4, VX3, res4
+#else
+    xvpickve.w VX0, res1, 1
+    xvpickve.w VX1, res1, 2
+    xvpickve.w VX2, res1, 3
+    xvpickve.w VX3, res1, 4
+    xvpickve.w x1, res1, 5
+    xvpickve.w x2, res1, 6
+    xvpickve.w x3, res1, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvfadd.s res1, x1, res1
+    xvfadd.s res1, x2, res1
+    xvfadd.s res1, x3, res1
+    xvpickve.w VX0, res2, 1
+    xvpickve.w VX1, res2, 2
+    xvpickve.w VX2, res2, 3
+    xvpickve.w VX3, res2, 4
+    xvpickve.w x1, res2, 5
+    xvpickve.w x2, res2, 6
+    xvpickve.w x3, res2, 7
+    xvfadd.s res2, VX0, res2
+    xvfadd.s res2, VX1, res2
+    xvfadd.s res2, VX2, res2
+    xvfadd.s res2, VX3, res2
+    xvfadd.s res2, x1, res2
+    xvfadd.s res2, x2, res2
+    xvfadd.s res2, x3, res2
+    xvpickve.w VX0, res3, 1
+    xvpickve.w VX1, res3, 2
+    xvpickve.w VX2, res3, 3
+    xvpickve.w VX3, res3, 4
+    xvpickve.w x1, res3, 5
+    xvpickve.w x2, res3, 6
+    xvpickve.w x3, res3, 7
+    xvfadd.s res3, VX0, res3
+    xvfadd.s res3, VX1, res3
+    xvfadd.s res3, VX2, res3
+    xvfadd.s res3, VX3, res3
+    xvfadd.s res3, x1, res3
+    xvfadd.s res3, x2, res3
+    xvfadd.s res3, x3, res3
+    xvpickve.w VX0, res4, 1
+    xvpickve.w VX1, res4, 2
+    xvpickve.w VX2, res4, 3
+    xvpickve.w VX3, res4, 4
+    xvpickve.w x1, res4, 5
+    xvpickve.w x2, res4, 6
+    xvpickve.w x3, res4, 7
+    xvfadd.s res4, VX0, res4
+    xvfadd.s res4, VX1, res4
+    xvfadd.s res4, VX2, res4
+    xvfadd.s res4, VX3, res4
+    xvfadd.s res4, x1, res4
+    xvfadd.s res4, x2, res4
+    xvfadd.s res4, x3, res4
+#endif
+    .align 3
+
+.L997:
+#ifdef DOUBLE
+    andi I, N, 3
+#else
+    andi I, N, 7
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MADD s1, a1, a3, s1
+    MADD s2, a2, a3, s2
+    MADD s3, a1, a4, s3
+    MADD s4, a2, a4, s4
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+#ifndef CONJ
+    SUB $f0, s1, s4
+    ADD $f1, s3, s2
+#else
+    ADD $f0, s1, s4
+    SUB $f1, s3, s2
+#endif
+    jirl    $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S
new file mode 100644
index 000000000..5feea12be
--- /dev/null
+++ b/kernel/loongarch64/cdot_lsx.S
@@ -0,0 +1,397 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r19
+#define TEMP   $r10
+#define t1     $r11
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define res1   $vr16
+#define res2   $vr17
+#define res3   $vr18
+#define res4   $vr19
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define x1     $vr20
+#define x2     $vr21
+#define x3     $vr22
+#define x4     $vr23
+
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    vxor.v res3, res3, res3
+    vxor.v res4, res4, res4
+    bge $r0, N, .L999
+    li.d  TEMP, 2 * SIZE
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+#ifdef DOUBLE
+    srai.d I, N, 1
+#else
+    srai.d I, N, 2
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmadd.d res1, x1, x3, res1
+    vfmadd.d res2, x2, x3, res2
+    vfmadd.d res3, x1, x4, res3
+    vfmadd.d res4, x2, x4, res4
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmadd.s res1, x1, x3, res1
+    vfmadd.s res2, x2, x3, res2
+    vfmadd.s res3, x1, x4, res3
+    vfmadd.s res4, x2, x4, res4
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L996
+    .align 3
+
+.L12:
+    bge $r0, I, .L997
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    addi.d X, X, 4 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmadd.d res1, x1, x3, res1
+    vfmadd.d res2, x2, x3, res2
+    vfmadd.d res3, x1, x4, res3
+    vfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    addi.d X, X, 8 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmadd.s res1, x1, x3, res1
+    vfmadd.s res2, x2, x3, res2
+    vfmadd.s res3, x1, x4, res3
+    vfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L21:
+    bge $r0, I, .L997
+    .align 3
+
+.L211:
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX3, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    addi.d Y, Y, 4 * SIZE
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmadd.d res1, x1, x3, res1
+    vfmadd.d res2, x2, x3, res2
+    vfmadd.d res3, x1, x4, res3
+    vfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    addi.d Y, Y, 8 * SIZE
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmadd.s res1, x1, x3, res1
+    vfmadd.s res2, x2, x3, res2
+    vfmadd.s res3, x1, x4, res3
+    vfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L996
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    .align 3
+
+.L222:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    vfmadd.d res1, x1, x3, res1
+    vfmadd.d res2, x2, x3, res2
+    vfmadd.d res3, x1, x4, res3
+    vfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    vfmadd.s res1, x1, x3, res1
+    vfmadd.s res2, x2, x3, res2
+    vfmadd.s res3, x1, x4, res3
+    vfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L996:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    vreplvei.d VX1, res2, 1
+    vfadd.d res2, VX1, res2
+    vreplvei.d VX1, res3, 1
+    vfadd.d res3, VX1, res3
+    vreplvei.d VX1, res4, 1
+    vfadd.d res4, VX1, res4
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+    vreplvei.w VX1, res2, 1
+    vreplvei.w VX2, res2, 2
+    vreplvei.w VX3, res2, 3
+    vfadd.s res2, VX1, res2
+    vfadd.s res2, VX2, res2
+    vfadd.s res2, VX3, res2
+    vreplvei.w VX1, res3, 1
+    vreplvei.w VX2, res3, 2
+    vreplvei.w VX3, res3, 3
+    vfadd.s res3, VX1, res3
+    vfadd.s res3, VX2, res3
+    vfadd.s res3, VX3, res3
+    vreplvei.w VX1, res4, 1
+    vreplvei.w VX2, res4, 2
+    vreplvei.w VX3, res4, 3
+    vfadd.s res4, VX1, res4
+    vfadd.s res4, VX2, res4
+    vfadd.s res4, VX3, res4
+#endif
+    .align 3
+
+.L997:
+#ifdef DOUBLE
+    andi I, N, 1
+#else
+    andi I, N, 3
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MADD s1, a1, a3, s1
+    MADD s2, a2, a3, s2
+    MADD s3, a1, a4, s3
+    MADD s4, a2, a4, s4
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+#ifndef CONJ
+    SUB $f0, s1, s4
+    ADD $f1, s3, s2
+#else
+    ADD $f0, s1, s4
+    SUB $f1, s3, s2
+#endif
+    jirl    $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From fbd612f8c4f3eceb16a645c8a5366af35e7b6a2e Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Thu, 28 Dec 2023 20:07:58 +0800
Subject: [PATCH 527/718] loongarch64: Add ic/zamin optimization functions.

---
 common_loongarch64.h                     |   4 +
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   2 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   2 +
 kernel/loongarch64/icamin_lasx.S         | 555 +++++++++++++++++++++++
 kernel/loongarch64/icamin_lsx.S          | 425 +++++++++++++++++
 5 files changed, 988 insertions(+)
 create mode 100644 kernel/loongarch64/icamin_lasx.S
 create mode 100644 kernel/loongarch64/icamin_lsx.S

diff --git a/common_loongarch64.h b/common_loongarch64.h
index 599b4795c..e581e2e3e 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -133,6 +133,7 @@ static inline int WhereAmI(void){
 
 #define XVFSUB  xvfsub.d
 #define XVFADD  xvfadd.d
+#define XVFMUL  xvfmul.d
 #define XVFMADD xvfmadd.d
 #define XVFMIN  xvfmin.d
 #define XVFMINA xvfmina.d
@@ -146,6 +147,7 @@ static inline int WhereAmI(void){
 
 #define VFSUB  vfsub.d
 #define VFADD  vfadd.d
+#define VFMUL  vfmul.d
 #define VFMADD vfmadd.d
 #define VFMIN  vfmin.d
 #define VFMINA vfmina.d
@@ -185,6 +187,7 @@ static inline int WhereAmI(void){
 
 #define XVFSUB  xvfsub.s
 #define XVFADD  xvfadd.s
+#define XVFMUL  xvfmul.s
 #define XVFMADD xvfmadd.s
 #define XVFMIN  xvfmin.s
 #define XVFMINA xvfmina.s
@@ -198,6 +201,7 @@ static inline int WhereAmI(void){
 
 #define VFSUB  vfsub.s
 #define VFADD  vfadd.s
+#define VFMUL  vfmul.s
 #define VFMADD vfmadd.s
 #define VFMIN  vfmin.s
 #define VFMINA vfmina.s
diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index c70120f9a..5e2632574 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -38,6 +38,8 @@ IZAMAXKERNEL = icamax_lsx.S
 
 ISAMINKERNEL = iamin_lsx.S
 IDAMINKERNEL = iamin_lsx.S
+ICAMINKERNEL = icamin_lsx.S
+IZAMINKERNEL = icamin_lsx.S
 
 SCOPYKERNEL =  copy_lsx.S
 DCOPYKERNEL =  copy_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 98673ae09..20a4d9a7e 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -38,6 +38,8 @@ IZAMAXKERNEL = icamax_lasx.S
 
 ISAMINKERNEL = iamin_lasx.S
 IDAMINKERNEL = iamin_lasx.S
+ICAMINKERNEL = icamin_lasx.S
+IZAMINKERNEL = icamin_lasx.S
 
 SCOPYKERNEL =  copy_lasx.S
 DCOPYKERNEL =  copy_lasx.S
diff --git a/kernel/loongarch64/icamin_lasx.S b/kernel/loongarch64/icamin_lasx.S
new file mode 100644
index 000000000..01abd45b2
--- /dev/null
+++ b/kernel/loongarch64/icamin_lasx.S
@@ -0,0 +1,555 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define a0 $f12
+#define a1 $f13
+#define s1 $f15
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    LD   a0, X, 0 * SIZE
+    LD   a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD  s1, a1, a0
+#ifdef DOUBLE
+    xvreplve0.d VM0, VM0
+    xvxor.v VI3, VI3, VI3 // 0
+    li.d I, -1
+    xvreplgr2vr.d VI4, I
+    xvffint.d.l VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 1 //3
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI0, i0, 2 //2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    xvreplve0.w VM0, VM0
+    xvxor.v VI3, VI3, VI3 // 0
+    li.w I, -1
+    xvreplgr2vr.w VI4, I
+    xvffint.s.w VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 2 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //6
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI0, i0, 4 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //4
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC4
+    xvld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmul.d x3, VI4, x1
+    xvfmul.d x4, VI4, x2
+    xvfcmp.clt.d VT0, x1, VI3
+    xvfcmp.clt.d VINC8, x2, VI3
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC8
+#else
+    xvadd.w VI1, VI1, VINC8
+    xvld VX1, X, 8 * SIZE
+    addi.d I, I, -1
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmul.s x3, VI4, x1
+    xvfmul.s x4, VI4, x2
+    xvfcmp.clt.s VT0, x1, VI3
+    xvfcmp.clt.s VINC4, x2, VI3
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC4
+#endif
+    XVFADD  x1, x1, x2
+    XVFMIN  x3, VM0, x1
+    XVCMPEQ VT0, x3, VM0
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, x3, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmin.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmin.d VM0, x3, x4
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmin.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#endif
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    XVCMPLT  VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 1 //3
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI0, i0, 2 //2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 2 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //6
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI0, i0, 4 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //4
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    xvadd.d VI1, VI1, VINC4
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    xvadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    xvadd.w VI1, VI1, VINC8
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+#endif
+    addi.d I, I, -1
+    XVFMUL x3, VI4, x1
+    XVFMUL x4, VI4, x2
+    XVCMPLT VT0, x1, VI3
+    XVCMPLT VINC8, x2, VI3
+    xvbitsel.v x1, x1, x3, VT0
+    xvbitsel.v x2, x2, x4, VINC8
+    XVFADD x1, x1, x2
+    XVFMIN x3, VM0, x1
+    XVCMPEQ VT0, x3, VM0
+    xvbitsel.v VM0, x3, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmina.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmina.d VM0, x3, x4
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmina.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+#endif
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    XVCMPLT  VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    XVCMPLT  VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    XVCMPLT  VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    XVCMPLT  VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20
+    .align 3
+
+.L21: //N<4
+    andi I, N, 3
+    bge $r0, I, .L999
+    srai.d i1, N, 2
+    slli.d i1, i1, 2
+#else
+    fmov.s $f16, $f20
+    .align 3
+
+.L252:
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0
+    fmov.s $f13, $f15
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v x1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, x1
+    xvbitsel.v VM0, VM0, x1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    fcmp.clt.s $fcc0, $f15, $f13
+    fsel $f15, $f15, $f13, $fcc0
+    fsel $f20, $f20, $f16, $fcc0
+    movfr2gr.s i0, $f20
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+#endif
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    FMIN a1, s1, a0
+    CMPEQ $fcc0, s1, a1
+    add.d  X, X, INCX
+    fsel s1, a1, s1, $fcc0
+    fsel $f20, $f21, $f20, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG  i0, $f20
+    .align 3
+
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S
new file mode 100644
index 000000000..a08cd33c5
--- /dev/null
+++ b/kernel/loongarch64/icamin_lsx.S
@@ -0,0 +1,425 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define a0 $f12
+#define a1 $f13
+#define s1 $f15
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC4 $vr17
+#define VINC8 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    LD   a0, X, 0 * SIZE
+    LD   a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD  s1, a1, a0
+    vreplvei.w VM0, VM0, 0
+    vxor.v VI3, VI3, VI3 // 0
+#ifdef DOUBLE
+    li.d I, -1
+    vreplgr2vr.d VI4, I
+    vffint.d.l VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    li.w I, -1
+    vreplgr2vr.w VI4, I
+    vffint.s.w VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.w i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    addi.d I, I, -1
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmin.d x3, VM0, x1
+    vfcmp.ceq.d VT0, x3, VM0
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vadd.w VI1, VI1, VINC4
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    VFMUL  x3, VI4, x1
+    VFMUL  x4, VI4, x2
+    VCMPLT VT0, x1, VI3
+    VCMPLT VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    VFADD  x1, x1, x2
+    VFMIN  x3, VM0, x1
+    VCMPEQ VT0, x3, VM0
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+.L20: // INCX!=1
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vadd.d VI1, VI1, VINC4
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmin.d x3, VM0, x1
+    ld.d t1, X, 0 * SIZE
+    vfcmp.ceq.d VT0, x3, VM0
+    ld.d t2, X, 1 * SIZE
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vadd.d VI1, VI1, VINC4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vadd.w VI1, VI1, VINC4
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    addi.d I, I, -1
+    VFMUL  x3, VI4, x1
+    VFMUL  x4, VI4, x2
+    VCMPLT VT0, x1, VI3
+    VCMPLT VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    VFADD  x1, x1, x2
+    VFMIN  x3, VM0, x1
+    VCMPEQ VT0, x3, VM0
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+.L26:
+#ifdef DOUBLE
+    vfmina.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+#endif
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20
+    .align 3
+#else
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+#endif
+.L21: //N<4
+    andi I, N, 3
+    bge $r0, I, .L999
+    srai.d i1, N, 2
+    slli.d i1, i1, 2
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    FMIN a1, s1, a0
+    CMPEQ $fcc0, s1, a1
+    add.d  X, X, INCX
+    fsel s1, a1, s1, $fcc0
+    fsel $f20, $f21, $f20, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG  i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From 3c53ded315901759f0ee2a77d07121c8905fb18d Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Thu, 28 Dec 2023 20:26:01 +0800
Subject: [PATCH 528/718] loongarch64: Add c/znrm2 optimization functions.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   2 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   2 +
 kernel/loongarch64/cnrm2_lasx.S          | 147 +++++++++++++
 kernel/loongarch64/cnrm2_lsx.S           | 155 ++++++++++++++
 kernel/loongarch64/znrm2_lasx.S          | 252 ++++++++++++++++++++++
 kernel/loongarch64/znrm2_lsx.S           | 260 +++++++++++++++++++++++
 6 files changed, 818 insertions(+)
 create mode 100644 kernel/loongarch64/cnrm2_lasx.S
 create mode 100644 kernel/loongarch64/cnrm2_lsx.S
 create mode 100644 kernel/loongarch64/znrm2_lasx.S
 create mode 100644 kernel/loongarch64/znrm2_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 5e2632574..826588318 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -70,6 +70,8 @@ DROTKERNEL  =  rot_lsx.S
 
 SNRM2KERNEL =  snrm2_lsx.S
 DNRM2KERNEL =  dnrm2_lsx.S
+CNRM2KERNEL =  cnrm2_lsx.S
+ZNRM2KERNEL =  znrm2_lsx.S
 
 DGEMMKERNEL    = dgemm_kernel_8x4.S
 DGEMMINCOPY    = dgemm_ncopy_8_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 20a4d9a7e..b61ecd427 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -70,6 +70,8 @@ DROTKERNEL  =  rot_lasx.S
 
 SNRM2KERNEL =  snrm2_lasx.S
 DNRM2KERNEL =  dnrm2_lasx.S
+CNRM2KERNEL =  cnrm2_lasx.S
+ZNRM2KERNEL =  znrm2_lasx.S
 
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S
new file mode 100644
index 000000000..3a60069ac
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lasx.S
@@ -0,0 +1,147 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define a1     $f15
+#define a2     $f16
+#define res    $f19
+#define VX0    $xr15
+#define VX1    $xr16
+#define VX2    $xr17
+#define VX3    $xr18
+#define VX4    $xr21
+#define res1   $xr19
+#define res2   $xr20
+
+   PROLOGUE
+
+#ifdef F_INTERFACE
+   LDINT   N,     0(N)
+   LDINT   INCX,  0(INCX)
+#endif
+
+   xvxor.v res1, res1, res1
+   xvxor.v res2, res2, res2
+   bge $r0, N, .L999
+   beq $r0, INCX, .L999
+   li.d  TEMP, SIZE
+   slli.d INCX, INCX, ZBASE_SHIFT
+   srai.d I, N, 2
+   bne INCX, TEMP, .L20
+   bge $r0,  I, .L997
+   .align 3
+
+.L10:
+   xvld VX0, X, 0 * SIZE
+   xvfcvtl.d.s VX1, VX0
+   xvfcvth.d.s VX2, VX0
+   xvfmadd.d res1, VX1, VX1, res1
+   xvfmadd.d res2, VX2, VX2, res2
+   addi.d I, I, -1
+   addi.d X, X, 8 * SIZE
+   blt $r0, I, .L10
+   .align 3
+   b .L996
+
+.L20:
+   bge $r0, I, .L997
+   .align 3
+
+.L21:
+   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 1 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 1 * SIZE
+   add.d X, X, INCX
+   xvinsgr2vr.w VX0, t1, 0
+   xvinsgr2vr.w VX0, t2, 1
+   xvinsgr2vr.w VX0, t3, 2
+   xvinsgr2vr.w VX0, t4, 3
+   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 1 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 1 * SIZE
+   xvinsgr2vr.w VX0, t1, 4
+   xvinsgr2vr.w VX0, t2, 5
+   xvinsgr2vr.w VX0, t3, 6
+   xvinsgr2vr.w VX0, t4, 7
+   add.d X, X, INCX
+   xvfcvtl.d.s VX1, VX0
+   xvfcvth.d.s VX2, VX0
+   xvfmadd.d res1, VX1, VX1, res1
+   xvfmadd.d res2, VX2, VX2, res2
+   addi.d  I, I, -1
+   blt $r0, I, .L21
+   b .L996
+
+.L996:
+   xvfadd.d res1, res1, res2
+   xvpickve.d VX1, res1, 1
+   xvpickve.d VX2, res1, 2
+   xvpickve.d VX3, res1, 3
+   xvfadd.d res1, VX1, res1
+   xvfadd.d res1, VX2, res1
+   xvfadd.d res1, VX3, res1
+   .align 3
+
+.L997:
+   andi I, N, 3
+   bge $r0, I, .L999
+   .align 3
+
+.L998:
+   fld.s a1, X, 0 * SIZE
+   fld.s a2, X, 1 * SIZE
+   addi.d I, I, -1
+   fcvt.d.s a1, a1
+   fcvt.d.s a2, a2
+   fmadd.d res, a1, a1, res
+   fmadd.d res, a2, a2, res
+   add.d X, X, INCX
+   blt $r0, I, .L998
+   .align 3
+
+.L999:
+   fsqrt.d res, res
+   move $r4, $r17
+   fcvt.s.d $f0, res
+   jirl $r0, $r1, 0x0
+
+   EPILOGUE
diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S
new file mode 100644
index 000000000..20950ba17
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lsx.S
@@ -0,0 +1,155 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define a1     $f15
+#define a2     $f16
+#define res    $f19
+#define VX0    $vr15
+#define VX1    $vr16
+#define VX2    $vr17
+#define VX3    $vr18
+#define VX4    $vr21
+#define res1   $vr19
+#define res2   $vr20
+
+   PROLOGUE
+
+#ifdef F_INTERFACE
+   LDINT   N,     0(N)
+   LDINT   INCX,  0(INCX)
+#endif
+
+   vxor.v res1, res1, res1
+   vxor.v res2, res2, res2
+   bge $r0, N, .L999
+   beq $r0, INCX, .L999
+   li.d  TEMP, 1
+   slli.d TEMP, TEMP, ZBASE_SHIFT
+   slli.d INCX, INCX, ZBASE_SHIFT
+   srai.d I, N, 2
+   bne INCX, TEMP, .L20
+   bge $r0,  I, .L997
+   .align 3
+
+.L10:
+   vld VX0, X, 0 * SIZE
+   vfcvtl.d.s VX1, VX0
+   vfcvth.d.s VX2, VX0
+   vfmadd.d res1, VX1, VX1, res1
+   vfmadd.d res2, VX2, VX2, res2
+   vld VX0, X, 4 * SIZE
+   vfcvtl.d.s VX3, VX0
+   vfcvth.d.s VX4, VX0
+   vfmadd.d res1, VX3, VX3, res1
+   vfmadd.d res2, VX4, VX4, res2
+   addi.d I, I, -1
+   addi.d X, X, 8 * SIZE
+   blt $r0, I, .L10
+   b .L996
+   .align 3
+
+.L20:
+   bge $r0, I, .L997
+   .align 3
+
+.L21:
+   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 1 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 1 * SIZE
+   vinsgr2vr.w VX0, t1, 0
+   vinsgr2vr.w VX0, t2, 1
+   vinsgr2vr.w VX0, t3, 2
+   vinsgr2vr.w VX0, t4, 3
+   add.d X, X, INCX
+   vfcvtl.d.s VX1, VX0
+   vfcvth.d.s VX2, VX0
+   vfmadd.d res1, VX1, VX1, res1
+   vfmadd.d res2, VX2, VX2, res2
+   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 1 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 1 * SIZE
+   vinsgr2vr.w VX0, t1, 0
+   vinsgr2vr.w VX0, t2, 1
+   vinsgr2vr.w VX0, t3, 2
+   vinsgr2vr.w VX0, t4, 3
+   add.d X, X, INCX
+   vfcvtl.d.s VX3, VX0
+   vfcvth.d.s VX4, VX0
+   vfmadd.d res1, VX3, VX3, res1
+   vfmadd.d res2, VX4, VX4, res2
+   addi.d  I, I, -1
+   blt $r0, I, .L21
+   b .L996
+   .align 3
+
+.L996:
+   vfadd.d res1, res1, res2
+   vreplvei.d VX1, res1, 1
+   vfadd.d res1, VX1, res1
+   .align 3
+
+.L997:
+   andi I, N, 3
+   bge $r0, I, .L999
+   .align 3
+
+.L998:
+    fld.s a1, X, 0 * SIZE
+    fld.s a2, X, 1 * SIZE
+    addi.d I, I, -1
+    fcvt.d.s a1, a1
+    fcvt.d.s a2, a2
+    fmadd.d res, a1, a1, res
+    fmadd.d res, a2, a2, res
+    add.d X, X, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d res, res
+    move $r4, $r17
+    fcvt.s.d $f0, $f19
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S
new file mode 100644
index 000000000..53f8a6e05
--- /dev/null
+++ b/kernel/loongarch64/znrm2_lasx.S
@@ -0,0 +1,252 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define XX     $r19
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define INF    $f23
+#define a1     $f22
+#define max    $f17
+#define ALPHA  $f12
+#define a3     $f15
+#define a2     $f16
+#define VX0    $xr15
+#define VX1    $xr16
+#define VM0    $xr17
+#define VM1    $xr18
+#define VM2    $xr13
+#define VM3    $xr14
+#define res1   $xr19
+#define res2   $xr20
+#define VALPHA $xr21
+
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    xvxor.v   res1, res1, res1
+    xvxor.v   res2, res2, res2
+    xvxor.v VM0, VM0, VM0
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    move  XX, X
+    // Init INF
+    addi.d  TEMP, $r0,  0x7FF
+    slli.d  TEMP, TEMP, 52
+    MTC  INF, TEMP
+    li.d  TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bge $r0,  I, .L97
+    .align 3
+
+.L10:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmaxa.d VM1, VX1, VX0
+    xvfmaxa.d VM0, VM0, VM1
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L97
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96:
+    xvpickve.d VX0, VM0, 1
+    xvpickve.d VX1, VM0, 2
+    xvpickve.d VM3, VM0, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM2, VM3, VM0
+    xvfmaxa.d VM0, VM1, VM2
+    .align 3
+
+.L97:
+    andi I, N, 3
+    bge $r0, I, .L99
+    .align 3
+
+.L98:
+    fld.d a3, X, 0 * SIZE
+    fld.d a2, X, 1 * SIZE
+    fmaxa.d a3, a2, a3
+    fmaxa.d max, a3, max
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:
+    fabs.d  max, max
+    lu12i.w  TEMP, 0x3f800 // 1
+    movgr2fr.d  a1, $r0
+    movgr2fr.w  ALPHA, TEMP
+    CMPEQ  $fcc0, max, a1
+    fcvt.d.s  ALPHA, ALPHA
+    bcnez  $fcc0, .L999
+    fdiv.d  ALPHA, ALPHA, max
+    CMPEQ  $fcc0, INF, ALPHA
+    bcnez  $fcc0, .L999
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA, TEMP
+
+.L100:
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    srai.d  I, N, 2
+    bne INCX, TEMP, .L120
+    bge $r0, I, .L997
+    .align 3
+
+.L110:
+    xvld VX0, XX, 0 * SIZE
+    xvld VX1, XX, 4 * SIZE
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    bge $r0, I, .L997
+    .align 3
+
+.L121:
+    ld.d t1, XX, 0 * SIZE
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, XX, 0 * SIZE
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996:
+    xvfadd.d res1, res1, res2
+    xvpickve.d VX0, res1, 1
+    xvpickve.d VX1, res1, 2
+    xvpickve.d VM2, res1, 3
+    xvfadd.d res1, VX0, res1
+    xvfadd.d VX1, VX1, VM2
+    xvfadd.d res1, VX1, res1
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.d a3, XX, 0 * SIZE
+    fld.d a2, XX, 1 * SIZE
+    addi.d I, I, -1
+    fmul.d a3, a3, ALPHA
+    fmadd.d $f19, a3, a3, $f19
+    fmul.d a2, a2, ALPHA
+    fmadd.d $f19, a2, a2, $f19
+    add.d XX, XX , INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S
new file mode 100644
index 000000000..14c59d504
--- /dev/null
+++ b/kernel/loongarch64/znrm2_lsx.S
@@ -0,0 +1,260 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define XX     $r19
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define INF    $f23
+#define a1     $f22
+#define max    $f17
+#define ALPHA  $f12
+#define a3     $f15
+#define a2     $f16
+#define VX0    $vr15
+#define VX1    $vr16
+#define VM0    $vr17
+#define VM1    $vr18
+#define VM2    $vr13
+#define VM3    $vr14
+#define res1   $vr19
+#define res2   $vr20
+#define VALPHA $vr21
+
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    vxor.v   res1, res1, res1
+    vxor.v   res2, res2, res2
+    vxor.v VM0, VM0, VM0
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    move  XX, X
+    // Init INF
+    addi.d  TEMP, $r0,  0x7FF
+    slli.d  TEMP, TEMP, 52
+    MTC  INF, TEMP
+    li.d  TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bge $r0,  I, .L97
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmaxa.d VM1, VX1, VX0
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmaxa.d VM2, VX1, VX0
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L97
+    .align 3
+
+.L21:
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96:
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    vfmaxa.d VM0, VX0, VX1
+    .align 3
+
+.L97:
+    andi I, N, 3
+    bge $r0, I, .L99
+    .align 3
+
+.L98:
+    fld.d a3, X, 0 * SIZE
+    fld.d a2, X, 1 * SIZE
+    fmaxa.d a3, a2, a3
+    fmaxa.d max, a3, max
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:
+    fabs.d  max, max
+    lu12i.w  TEMP, 0x3f800 // 1
+    movgr2fr.d  a1, $r0
+    movgr2fr.w  ALPHA, TEMP
+    CMPEQ  $fcc0, max, a1
+    fcvt.d.s  ALPHA, ALPHA
+    bcnez  $fcc0, .L999
+    fdiv.d  ALPHA, ALPHA, max
+    CMPEQ  $fcc0, INF, ALPHA
+    bcnez  $fcc0, .L999
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA, TEMP
+
+.L100:
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    srai.d  I, N, 2
+    bne INCX, TEMP, .L120
+    bge $r0, I, .L997
+    .align 3
+
+.L110:
+    vld VX0, XX, 0 * SIZE
+    vld VX1, XX, 2 * SIZE
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    vld VX0, XX, 4 * SIZE
+    vld VX1, XX, 6 * SIZE
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    bge $r0, I, .L997
+    .align 3
+
+.L121:
+    ld.d t1, XX, 0 * SIZE
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM2, VX0, VALPHA
+    ld.d t1, XX, 0 * SIZE
+    vfmul.d VM3, VX1, VALPHA
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    ld.d t3, XX, 0 * SIZE
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996:
+    vfadd.d res1, res1, res2
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    fld.d a3, XX, 0 * SIZE
+    fld.d a2, XX, 1 * SIZE
+    addi.d I, I, -1
+    fmul.d a3, a3, ALPHA
+    fmadd.d $f19, a3, a3, $f19
+    fmul.d a2, a2, ALPHA
+    fmadd.d $f19, a2, a2, $f19
+    add.d XX, XX , INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE

From 1ec5dded43b9d5e14875061f62d5edec480f9584 Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Thu, 28 Dec 2023 21:23:59 +0800
Subject: [PATCH 529/718] loongarch64: Add c/zrot optimization functions.

Signed-off-by: Hao Chen <chenhao@loongson.cn>
---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |    2 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |    2 +
 kernel/loongarch64/crot_lasx.S           | 1079 ++++++++++++++++++++++
 kernel/loongarch64/crot_lsx.S            |  907 ++++++++++++++++++
 4 files changed, 1990 insertions(+)
 create mode 100644 kernel/loongarch64/crot_lasx.S
 create mode 100644 kernel/loongarch64/crot_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 826588318..02ea4304e 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -67,6 +67,8 @@ ZASUMKERNEL =  casum_lsx.S
 
 SROTKERNEL  =  rot_lsx.S
 DROTKERNEL  =  rot_lsx.S
+CROTKERNEL  =  crot_lsx.S
+ZROTKERNEL  =  crot_lsx.S
 
 SNRM2KERNEL =  snrm2_lsx.S
 DNRM2KERNEL =  dnrm2_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index b61ecd427..462698f85 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -67,6 +67,8 @@ ZASUMKERNEL =  casum_lasx.S
 
 SROTKERNEL  =  rot_lasx.S
 DROTKERNEL  =  rot_lasx.S
+CROTKERNEL  =  crot_lasx.S
+ZROTKERNEL  =  crot_lasx.S
 
 SNRM2KERNEL =  snrm2_lasx.S
 DNRM2KERNEL =  dnrm2_lasx.S
diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S
new file mode 100644
index 000000000..d4ec1e22c
--- /dev/null
+++ b/kernel/loongarch64/crot_lasx.S
@@ -0,0 +1,1079 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define C      $f0
+#define S      $f1
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VT0    $xr10
+#define VT1    $xr18
+#define VXC    $xr23
+#define VXS    $xr9
+#define VXZ    $xr11
+#define x1     $xr12
+#define x2     $xr13
+#define x3     $xr14
+#define x4     $xr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT  a1, a1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, C
+    MTG  t2, S
+    MTG  t3, a1
+#ifdef DOUBLE
+    xvreplgr2vr.d VXC, t1
+    xvreplgr2vr.d VXS, t2
+    xvreplgr2vr.d VXZ, t3
+    srai.d I, N, 2
+#else
+    xvreplgr2vr.w VXC, t1
+    xvreplgr2vr.w VXS, t2
+    xvreplgr2vr.w VXZ, t3
+    srai.d I, N, 3
+#endif
+    beq INCX, $r0, .L996
+    beq INCY, $r0, .L996
+    bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1
+    bne INCY, TEMP, .L22
+
+.L11:
+    bge $r0, I, .L997
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L110
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmul.d VX0, x1, VXC
+    xvfmadd.d VX0, x3, VXS, VX0
+    xvfmul.d VX1, x1, VXS
+    xvfmsub.d VX1, x3, VXC, VX1
+    xvfmul.d VX2, x2, VXC
+    xvfmadd.d VX2, x4, VXS, VX2
+    xvfmul.d VX3, x2, VXS
+    xvfmsub.d VX3, x4, VXC, VX3
+    xvilvl.d x1, VX2 ,VX0
+    xvilvh.d x2, VX2, VX0
+    xvilvl.d x3, VX3 ,VX1
+    xvilvh.d x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 4 * SIZE
+    xvst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmul.s VX0, x1, VXC
+    xvfmadd.s VX0, x3, VXS, VX0
+    xvfmul.s VX1, x1, VXS
+    xvfmsub.s VX1, x3, VXC, VX1
+    xvfmul.s VX2, x2, VXC
+    xvfmadd.s VX2, x4, VXS, VX2
+    xvfmul.s VX3, x2, VXS
+    xvfmsub.s VX3, x4, VXC, VX3
+    xvilvl.w x1, VX2 ,VX0
+    xvilvh.w x2, VX2, VX0
+    xvilvl.w x3, VX3 ,VX1
+    xvilvh.w x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 8 * SIZE
+    xvst x4, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmul.d VX0, x1, VXC
+    xvfmul.d VX1, x3, VXC
+    xvfmul.d VX2, x2, VXC
+    xvfmul.d VX3, x4, VXC
+    xvilvl.d x1, VX2 ,VX0
+    xvilvh.d x2, VX2, VX0
+    xvilvl.d x3, VX3 ,VX1
+    xvilvh.d x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 4 * SIZE
+    xvst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmul.s VX0, x1, VXC
+    xvfmul.s VX1, x3, VXC
+    xvfmul.s VX2, x2, VXC
+    xvfmul.s VX3, x4, VXC
+    xvilvl.w x1, VX2 ,VX0
+    xvilvh.w x2, VX2, VX0
+    xvilvl.w x3, VX3 ,VX1
+    xvilvh.w x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 8 * SIZE
+    xvst x4, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmul.d VX0, x3, VXS
+    xvfmul.d VX1, x1, VXS
+    xvfsub.d VX1, VXZ, VX1
+    xvfmul.d VX2, x4, VXS
+    xvfmul.d VX3, x2, VXS
+    xvfsub.d VX3, VXZ, VX3
+    xvilvl.d x1, VX2 ,VX0
+    xvilvh.d x2, VX2, VX0
+    xvilvl.d x3, VX3 ,VX1
+    xvilvh.d x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 4 * SIZE
+    xvst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmul.s VX0, x3, VXS
+    xvfmul.s VX1, x1, VXS
+    xvfsub.s VX1, VXZ, VX1
+    xvfmul.s VX2, x4, VXS
+    xvfmul.s VX3, x2, VXS
+    xvfsub.s VX3, VXZ, VX3
+    xvilvl.w x1, VX2 ,VX0
+    xvilvh.w x2, VX2, VX0
+    xvilvl.w x3, VX3 ,VX1
+    xvilvh.w x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 8 * SIZE
+    xvst x4, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0
+    xvst VXZ, X, 0 * SIZE
+    xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, X, 4 * SIZE
+    xvst VXZ, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvst VXZ, X, 8 * SIZE
+    xvst VXZ, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L220
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+
+    xvfmul.d VX0, x1, VXC
+    xvfmadd.d VX0, x3, VXS, VX0
+    xvfmul.d VX1, x1, VXS
+    xvfmsub.d VX1, x3, VXC, VX1
+    xvfmul.d VX2, x2, VXC
+    xvfmadd.d VX2, x4, VXS, VX2
+    xvfmul.d VX3, x2, VXS
+    xvfmsub.d VX3, x4, VXC, VX3
+    xvstelm.d VX0, XX, 0, 0
+    xvstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 1
+    xvstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 2
+    xvstelm.d VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 3
+    xvstelm.d VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.d VX1, YY, 0, 0
+    xvstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 1
+    xvstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 2
+    xvstelm.d VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 3
+    xvstelm.d VX3, YY, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+
+    xvfmul.s VX0, x1, VXC
+    xvfmadd.s VX0, x3, VXS, VX0
+    xvfmul.s VX1, x1, VXS
+    xvfmsub.s VX1, x3, VXC, VX1
+    xvfmul.s VX2, x2, VXC
+    xvfmadd.s VX2, x4, VXS, VX2
+    xvfmul.s VX3, x2, VXS
+    xvfmsub.s VX3, x4, VXC, VX3
+    xvstelm.w VX0, XX, 0, 0
+    xvstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 1
+    xvstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 2
+    xvstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 3
+    xvstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 0
+    xvstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 1
+    xvstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 2
+    xvstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 3
+    xvstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, XX, 0, 4
+    xvstelm.w VX2, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 5
+    xvstelm.w VX2, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 6
+    xvstelm.w VX2, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 7
+    xvstelm.w VX2, XX, 1 * SIZE, 7
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 4
+    xvstelm.w VX3, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 5
+    xvstelm.w VX3, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 6
+    xvstelm.w VX3, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 7
+    xvstelm.w VX3, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // C!=0 S==0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, x1, VXC
+    xvfmul.d VX1, x3, VXC
+    xvfmul.d VX2, x2, VXC
+    xvfmul.d VX3, x4, VXC
+    xvstelm.d VX0, XX, 0, 0
+    xvstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 1
+    xvstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 2
+    xvstelm.d VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 3
+    xvstelm.d VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.d VX1, YY, 0, 0
+    xvstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 1
+    xvstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 2
+    xvstelm.d VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 3
+    xvstelm.d VX3, YY, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, x1, VXC
+    xvfmul.s VX1, x3, VXC
+    xvfmul.s VX2, x2, VXC
+    xvfmul.s VX3, x4, VXC
+    xvstelm.w VX0, XX, 0, 0
+    xvstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 1
+    xvstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 2
+    xvstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 3
+    xvstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 0
+    xvstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 1
+    xvstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 2
+    xvstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 3
+    xvstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, XX, 0, 4
+    xvstelm.w VX2, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 5
+    xvstelm.w VX2, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 6
+    xvstelm.w VX2, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 7
+    xvstelm.w VX2, XX, 1 * SIZE, 7
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 4
+    xvstelm.w VX3, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 5
+    xvstelm.w VX3, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 6
+    xvstelm.w VX3, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 7
+    xvstelm.w VX3, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // C==0 S!=0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, x3, VXS
+    xvfmul.d VX1, x1, VXS
+    xvfsub.d VX1, VXZ, VX1
+    xvfmul.d VX2, x4, VXS
+    xvfmul.d VX3, x2, VXS
+    xvfsub.d VX3, VXZ, VX3
+    xvstelm.d VX0, XX, 0, 0
+    xvstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 1
+    xvstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 2
+    xvstelm.d VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 3
+    xvstelm.d VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.d VX1, YY, 0, 0
+    xvstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 1
+    xvstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 2
+    xvstelm.d VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 3
+    xvstelm.d VX3, YY, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, x3, VXS
+    xvfmul.s VX1, x1, VXS
+    xvfsub.s VX1, VXZ, VX1
+    xvfmul.s VX2, x4, VXS
+    xvfmul.s VX3, x2, VXS
+    xvfsub.s VX3, VXZ, VX3
+    xvstelm.w VX0, XX, 0, 0
+    xvstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 1
+    xvstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 2
+    xvstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 3
+    xvstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 0
+    xvstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 1
+    xvstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 2
+    xvstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 3
+    xvstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, XX, 0, 4
+    xvstelm.w VX2, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 5
+    xvstelm.w VX2, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 6
+    xvstelm.w VX2, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 7
+    xvstelm.w VX2, XX, 1 * SIZE, 7
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 4
+    xvstelm.w VX3, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 5
+    xvstelm.w VX3, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 6
+    xvstelm.w VX3, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 7
+    xvstelm.w VX3, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // C==0 S==0
+#ifdef DOUBLE
+    xvstelm.d VXZ, XX, 0, 0
+    xvstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 0
+    xvstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 0
+    xvstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 0
+    xvstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, YY, 0, 0
+    xvstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    xvstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    xvstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    xvstelm.d VXZ, YY, 1 * SIZE, 0
+#else
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    move X, XX
+    move Y, YY
+    b .L997
+    .align 3
+
+.L996:
+    move I, N
+    b .L998
+    .align 3
+
+.L997:
+#ifdef DOUBLE
+    andi I, N, 3
+#else
+    andi I, N, 7
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MUL  s1, a1, C
+    MADD s1, a3, S, s1
+    MUL  s2, a1, S
+    MSUB s2, a3, C, s2
+    MUL  s3, a2, C
+    MADD s3, a4, S, s3
+    MUL  s4, a2, S
+    MSUB s4, a4, C, s4
+    addi.d I, I, -1
+    ST   s1, X, 0 * SIZE
+    ST   s2, Y, 0 * SIZE
+    ST   s3, X, 1 * SIZE
+    ST   s4, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S
new file mode 100644
index 000000000..126257edc
--- /dev/null
+++ b/kernel/loongarch64/crot_lsx.S
@@ -0,0 +1,907 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define C      $f0
+#define S      $f1
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VT0    $vr10
+#define VT1    $vr18
+#define VXC    $vr23
+#define VXS    $vr9
+#define VXZ    $vr11
+#define x1     $vr12
+#define x2     $vr13
+#define x3     $vr14
+#define x4     $vr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT a1, a1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, C
+    MTG  t2, S
+    MTG  t3, a1
+#ifdef DOUBLE
+    vreplgr2vr.d VXC, t1
+    vreplgr2vr.d VXS, t2
+    vreplgr2vr.d VXZ, t3
+#else
+    vreplgr2vr.w VXC, t1
+    vreplgr2vr.w VXS, t2
+    vreplgr2vr.w VXZ, t3
+    srai.d I, N, 2
+#endif
+    beq INCX, $r0, .L996
+    beq INCY, $r0, .L996
+    bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1
+    bne INCY, TEMP, .L22
+
+.L11:
+    bge $r0, I, .L997
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L110
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, x1, VXC
+    vfmadd.d VX0, x3, VXS, VX0
+    vfmul.d VX1, x1, VXS
+    vfmsub.d VX1, x3, VXC, VX1
+    vfmul.d VX2, x2, VXC
+    vfmadd.d VX2, x4, VXS, VX2
+    vfmul.d VX3, x2, VXS
+    vfmsub.d VX3, x4, VXC, VX3
+    vilvl.d x1, VX2 ,VX0
+    vilvh.d x2, VX2, VX0
+    vilvl.d x3, VX3 ,VX1
+    vilvh.d x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 2 * SIZE
+    vst x4, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmul.s VX0, x1, VXC
+    vfmadd.s VX0, x3, VXS, VX0
+    vfmul.s VX1, x1, VXS
+    vfmsub.s VX1, x3, VXC, VX1
+    vfmul.s VX2, x2, VXC
+    vfmadd.s VX2, x4, VXS, VX2
+    vfmul.s VX3, x2, VXS
+    vfmsub.s VX3, x4, VXC, VX3
+    vilvl.w x1, VX2 ,VX0
+    vilvh.w x2, VX2, VX0
+    vilvl.w x3, VX3 ,VX1
+    vilvh.w x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 4 * SIZE
+    vst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, x1, VXC
+    vfmul.d VX1, x3, VXC
+    vfmul.d VX2, x2, VXC
+    vfmul.d VX3, x4, VXC
+    vilvl.d x1, VX2 ,VX0
+    vilvh.d x2, VX2, VX0
+    vilvl.d x3, VX3 ,VX1
+    vilvh.d x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 2 * SIZE
+    vst x4, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmul.s VX0, x1, VXC
+    vfmul.s VX1, x3, VXC
+    vfmul.s VX2, x2, VXC
+    vfmul.s VX3, x4, VXC
+    vilvl.w x1, VX2 ,VX0
+    vilvh.w x2, VX2, VX0
+    vilvl.w x3, VX3 ,VX1
+    vilvh.w x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 4 * SIZE
+    vst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, x3, VXS
+    vfmul.d VX1, x1, VXS
+    vfsub.d VX1, VXZ, VX1
+    vfmul.d VX2, x4, VXS
+    vfmul.d VX3, x2, VXS
+    vfsub.d VX3, VXZ, VX3
+    vilvl.d x1, VX2 ,VX0
+    vilvh.d x2, VX2, VX0
+    vilvl.d x3, VX3 ,VX1
+    vilvh.d x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 2 * SIZE
+    vst x4, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmul.s VX0, x3, VXS
+    vfmul.s VX1, x1, VXS
+    vfsub.s VX1, VXZ, VX1
+    vfmul.s VX2, x4, VXS
+    vfmul.s VX3, x2, VXS
+    vfsub.s VX3, VXZ, VX3
+    vilvl.w x1, VX2 ,VX0
+    vilvh.w x2, VX2, VX0
+    vilvl.w x3, VX3 ,VX1
+    vilvh.w x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 4 * SIZE
+    vst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    vst VXZ, X, 2 * SIZE
+    vst VXZ, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vst VXZ, X, 4 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    srai.d I, N, 2
+#endif
+    bge $r0, I, .L997
+    move YY, Y
+    move XX, X
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L220
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220:
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x1, VXC
+    vfmadd.d VX0, x3, VXS, VX0
+    vfmul.d VX1, x1, VXS
+    vfmsub.d VX1, x3, VXC, VX1
+    vfmul.d VX2, x2, VXC
+    vfmadd.d VX2, x4, VXS, VX2
+    vfmul.d VX3, x2, VXS
+    vfmsub.d VX3, x4, VXC, VX3
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x1, VXC
+    vfmadd.d VX0, x3, VXS, VX0
+    vfmul.d VX1, x1, VXS
+    vfmsub.d VX1, x3, VXC, VX1
+    vfmul.d VX2, x2, VXC
+    vfmadd.d VX2, x4, VXS, VX2
+    vfmul.d VX3, x2, VXS
+    vfmsub.d VX3, x4, VXC, VX3
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L995
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    vfmul.s VX0, x1, VXC
+    vfmadd.s VX0, x3, VXS, VX0
+    vfmul.s VX1, x1, VXS
+    vfmsub.s VX1, x3, VXC, VX1
+    vfmul.s VX2, x2, VXC
+    vfmadd.s VX2, x4, VXS, VX2
+    vfmul.s VX3, x2, VXS
+    vfmsub.s VX3, x4, VXC, VX3
+    vstelm.w VX0, XX, 0, 0
+    vstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 1
+    vstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 2
+    vstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 3
+    vstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    vstelm.w VX1, YY, 0, 0
+    vstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    vstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    vstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+    vstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+#endif
+    .align 3
+
+.L222: // C!=0 S==0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x1, VXC
+    vfmul.d VX1, x3, VXC
+    vfmul.d VX2, x2, VXC
+    vfmul.d VX3, x4, VXC
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x1, VXC
+    vfmul.d VX1, x3, VXC
+    vfmul.d VX2, x2, VXC
+    vfmul.d VX3, x4, VXC
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L995
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, x1, VXC
+    vfmul.s VX1, x3, VXC
+    vfmul.s VX2, x2, VXC
+    vfmul.s VX3, x4, VXC
+    vstelm.w VX0, XX, 0, 0
+    vstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 1
+    vstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 2
+    vstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 3
+    vstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    vstelm.w VX1, YY, 0, 0
+    vstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    vstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    vstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+    vstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L997
+#endif
+    .align 3
+
+.L223: // C==0 S!=0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x3, VXS
+    vfmul.d VX1, x1, VXS
+    vfsub.d VX1, VXZ, VX1
+    vfmul.d VX2, x4, VXS
+    vfmul.d VX3, x2, VXS
+    vfsub.d VX3, VXZ, VX3
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x3, VXS
+    vfmul.d VX1, x1, VXS
+    vfsub.d VX1, VXZ, VX1
+    vfmul.d VX2, x4, VXS
+    vfmul.d VX3, x2, VXS
+    vfsub.d VX3, VXZ, VX3
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L995
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, x3, VXS
+    vfmul.s VX1, x1, VXS
+    vfsub.s VX1, VXZ, VX1
+    vfmul.s VX2, x4, VXS
+    vfmul.s VX3, x2, VXS
+    vfsub.s VX3, VXZ, VX3
+    vstelm.w VX0, XX, 0, 0
+    vstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 1
+    vstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 2
+    vstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 3
+    vstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    vstelm.w VX1, YY, 0, 0
+    vstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    vstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    vstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+    vstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L997
+#endif
+    .align 3
+
+.L224: // C==0 S==0
+#ifdef DOUBLE
+    vstelm.d VXZ, XX, 0, 0
+    vstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    vstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    vstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    vstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, YY, 0, 0
+    vstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    vstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    vstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    vstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    move X, XX
+    move Y, YY
+    b .L995
+#else
+    vstelm.w VXZ, XX, 0, 0
+    vstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 0
+    vstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 0
+    vstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 0
+    vstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, YY, 0, 0
+    vstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    vstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    vstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    vstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    move X, XX
+    move Y, YY
+    b .L997
+#endif
+    .align 3
+
+#ifdef DOUBLE
+    .L995:
+    andi I, N, 3
+    bge $r0, I, .L999
+    b .L998
+    .align 3
+
+#endif
+.L996:
+    move I, N
+    b .L998
+    .align 3
+
+.L997:
+#ifdef DOUBLE
+    andi I, N, 1
+#else
+    andi I, N, 3
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MUL  s1, a1, C
+    MADD s1, a3, S, s1
+    MUL  s2, a1, S
+    MSUB s2, a3, C, s2
+    MUL  s3, a2, C
+    MADD s3, a4, S, s3
+    MUL  s4, a2, S
+    MSUB s4, a4, C, s4
+    addi.d I, I, -1
+    ST   s1, X, 0 * SIZE
+    ST   s2, Y, 0 * SIZE
+    ST   s3, X, 1 * SIZE
+    ST   s4, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From edabb936681bff6fea0454aedf064c19c1db217f Mon Sep 17 00:00:00 2001
From: Hao Chen <chenhao@loongson.cn>
Date: Fri, 29 Dec 2023 15:08:10 +0800
Subject: [PATCH 530/718] loongarch64: Refine axpby optimization functions.

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000      |   4 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5         |   4 +-
 .../{daxpby_lasx.S => axpby_lasx.S}           | 475 ++++++++++++-
 .../loongarch64/{daxpby_lsx.S => axpby_lsx.S} | 503 +++++++++++++-
 kernel/loongarch64/saxpby_lasx.S              | 597 -----------------
 kernel/loongarch64/saxpby_lsx.S               | 629 ------------------
 6 files changed, 931 insertions(+), 1281 deletions(-)
 rename kernel/loongarch64/{daxpby_lasx.S => axpby_lasx.S} (55%)
 rename kernel/loongarch64/{daxpby_lsx.S => axpby_lsx.S} (56%)
 delete mode 100644 kernel/loongarch64/saxpby_lasx.S
 delete mode 100644 kernel/loongarch64/saxpby_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 02ea4304e..0fb0bb68f 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -54,8 +54,8 @@ DAXPYKERNEL =  axpy_lsx.S
 CAXPYKERNEL =  caxpy_lsx.S
 ZAXPYKERNEL =  caxpy_lsx.S
 
-SAXPBYKERNEL = saxpby_lsx.S
-DAXPBYKERNEL = daxpby_lsx.S
+SAXPBYKERNEL = axpby_lsx.S
+DAXPBYKERNEL = axpby_lsx.S
 
 SSUMKERNEL  =  sum_lsx.S
 DSUMKERNEL  =  sum_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 462698f85..1a6a04532 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -54,8 +54,8 @@ DAXPYKERNEL =  axpy_lasx.S
 CAXPYKERNEL =  caxpy_lasx.S
 ZAXPYKERNEL =  caxpy_lasx.S
 
-SAXPBYKERNEL = saxpby_lasx.S
-DAXPBYKERNEL = daxpby_lasx.S
+SAXPBYKERNEL = axpby_lasx.S
+DAXPBYKERNEL = axpby_lasx.S
 
 SSUMKERNEL  =  sum_lasx.S
 DSUMKERNEL  =  sum_lasx.S
diff --git a/kernel/loongarch64/daxpby_lasx.S b/kernel/loongarch64/axpby_lasx.S
similarity index 55%
rename from kernel/loongarch64/daxpby_lasx.S
rename to kernel/loongarch64/axpby_lasx.S
index 4b19703e7..f1d99cd3b 100644
--- a/kernel/loongarch64/daxpby_lasx.S
+++ b/kernel/loongarch64/axpby_lasx.S
@@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
 
+#define ASSEMBLER
 #include "common.h"
+
 #define N      $r4
 #define ALPHA  $f0
 #define X      $r5
@@ -32,16 +59,22 @@
     bge $r0, N, .L999
     li.d TEMP, 1
     movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    ffint.s.l a1, a1
     slli.d  TEMP, TEMP, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG  t1, ALPHA
+    MTG  t2, BETA
+    MTG  t3, a1
+#ifdef DOUBLE
     xvreplgr2vr.d VXA, t1
-    movfr2gr.d t2, BETA
     xvreplgr2vr.d VXB, t2
-    movfr2gr.d t3, a1
     xvreplgr2vr.d VXZ, t3
+#else
+    xvreplgr2vr.w VXA, t1
+    xvreplgr2vr.w VXB, t2
+    xvreplgr2vr.w VXZ, t3
+#endif
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
@@ -52,21 +85,22 @@
 
 .L11:
     bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L110
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
     b .L111 // ALPHA!=0 BETA!=0
     .align 3
 
 .L110:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L114 // ALPHA==0 BETA==0
     b .L113 // ALPHA==0 BETA!=0
     .align 3
 
 .L111: // ALPHA!=0 BETA!=0
     xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
     xvld VX2, Y, 0 * SIZE
     xvld VX1, X, 4 * SIZE
     xvld VX3, Y, 4 * SIZE
@@ -77,6 +111,13 @@
     addi.d  I, I, -1
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvfmadd.s VX2, VX2, VXB, VX0
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L111
@@ -85,34 +126,46 @@
 
 .L112: // ALPHA!=0 BETA==0
     xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
     xvld VX1, X, 4 * SIZE
     xvfmul.d VX0, VX0, VXA
     xvfmul.d VX1, VX1, VXA
     xvst VX0, Y, 0 * SIZE
     xvst VX1, Y, 4 * SIZE
+#else
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvst VX0, Y, 0 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
     blt $r0, I, .L112
     b .L997
     .align 3
 
 .L113: // ALPHA==0 BETA!=0
     xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     xvld VX3, Y, 4 * SIZE
     xvfmul.d VX2, VX2, VXB
     xvfmul.d VX3, VX3, VXB
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
+#else
+    xvfmul.s VX2, VX2, VXB
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d  I, I, -1
+    addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L113
     b .L997
     .align 3
 
 .L114: // ALPHA==0 BETA==0
     xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
     xvst VXZ, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L114
@@ -122,21 +175,22 @@
 .L12: // INCX==1 and INCY!=1
     bge $r0, I, .L997
     move YY, Y
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L120
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
     b .L121 // ALPHA!=0 BETA!=0
     .align 3
 
 .L120:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L124 // ALPHA==0 BETA==0
     b .L123 // ALPHA==0 BETA!=0
     .align 3
 
 .L121: // ALPHA!=0 BETA!=0
     xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
     ld.d t2, Y, 0 * SIZE
@@ -182,14 +236,59 @@
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, VX0, VXA
+    xvfmadd.s VX2, VX2, VXB, VX0
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
     blt $r0, I, .L121
     b .L997
     .align 3
 
 .L122: // ALPHA!=0 BETA==0
     xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
     xvld VX1, X, 4 * SIZE
     xvfmul.d VX0, VX0, VXA
     xvfmul.d VX1, VX1, VXA
@@ -208,14 +307,33 @@
     xvstelm.d VX1, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX1, YY, 0, 3
+#else
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
     blt $r0, I, .L122
     b .L997
     .align 3
 
 .L123: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
     ld.d t2, Y, 0 * SIZE
@@ -250,7 +368,6 @@
     xvstelm.d VX2, YY, 0, 3
     add.d YY, YY, INCY
     xvfmul.d VX3, VX3, VXB
-    addi.d  I, I, -1
     xvstelm.d VX3, YY, 0, 0
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 1
@@ -258,12 +375,56 @@
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX2, VX2, VXB
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
     add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
     blt $r0, I, .L123
     b .L997
     .align 3
 
 .L124: // ALPHA==0 BETA==0
+#ifdef DOUBLE
     xvstelm.d VXZ, YY, 0, 0
     add.d YY, YY, INCY
     xvstelm.d VXZ, YY, 0, 1
@@ -279,6 +440,23 @@
     xvstelm.d VXZ, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VXZ, YY, 0, 3
+#else
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d  I, I, -1
     blt $r0, I, .L124
@@ -287,21 +465,22 @@
 
 .L21:// INCX!=1 and INCY==1
     bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L210
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
     b .L211 // ALPHA!=0 BETA!=0
     .align 3
 
 .L210:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L214 // ALPHA==0 BETA==0
     b .L213 // ALPHA==0 BETA!=0
     .align 3
 
 .L211: // ALPHA!=0 BETA!=0
     xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -334,12 +513,43 @@
     xvfmadd.d VX3, VX3, VXB, VX1
     addi.d  I, I, -1
     xvst VX3, Y, 4 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VXA, VX0
+    xvfmadd.s VX2, VX2, VXB, VX0
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L211
     b .L997
     .align 3
 
 .L212: // ALPHA!=0 BETA==0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -369,6 +579,35 @@
     xvfmul.d VX1, VX1, VXA
     addi.d  I, I, -1
     xvst VX1, Y, 4 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VXA, VX0
+    addi.d  I, I, -1
+    xvst VX0, Y, 0 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L212
     b .L997
@@ -376,20 +615,27 @@
 
 .L213: // ALPHA==0 BETA!=0
     xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     xvld VX3, Y, 4 * SIZE
     xvfmul.d VX2, VX2, VXB
     xvfmul.d VX3, VX3, VXB
-    addi.d  I, I, -1
     xvst VX2, Y, 0 * SIZE
     xvst VX3, Y, 4 * SIZE
+#else
+    xvfmul.s VX2, VX2, VXB
+    xvst VX2, Y, 0 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
     blt $r0, I, .L213
     b .L997
     .align 3
 
 .L214: // ALPHA==0 BETA==0
     xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
     xvst VXZ, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L214
@@ -399,20 +645,21 @@
 .L22:
     bge $r0, I, .L997
     move YY, Y
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L220
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
     b .L221 // ALPHA!=0 BETA!=0
     .align 3
 
 .L220:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L224 // ALPHA==0 BETA==0
     b .L223 // ALPHA==0 BETA!=0
     .align 3
 
 .L221: // ALPHA!=0 BETA!=0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -481,12 +728,81 @@
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, VX0, VXA
+    xvfmadd.s VX2, VX2, VXB, VX0
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L221
     b .L997
     .align 3
 
 .L222: // ALPHA!=0 BETA==0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -529,12 +845,56 @@
     xvstelm.d VX1, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX1, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L222
     b .L997
     .align 3
 
 .L223: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
     ld.d t2, Y, 0 * SIZE
@@ -577,12 +937,56 @@
     xvstelm.d VX3, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX2, VX2, VXB
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L223
     b .L997
     .align 3
 
 .L224: // ALPHA==0 BETA==0
+#ifdef DOUBLE
     xvstelm.d VXZ, YY, 0, 0
     add.d YY, YY, INCY
     xvstelm.d VXZ, YY, 0, 1
@@ -598,6 +1002,23 @@
     xvstelm.d VXZ, YY, 0, 2
     add.d YY, YY, INCY
     xvstelm.d VXZ, YY, 0, 3
+#else
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+#endif
     add.d YY, YY, INCY
     addi.d  I, I, -1
     blt $r0, I, .L224
@@ -610,12 +1031,12 @@
     .align 3
 
 .L998:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f13, Y, 0 * SIZE
+    LD   $f12, X, 0 * SIZE
+    LD   $f13, Y, 0 * SIZE
     addi.d I, I, -1
-    fmul.d $f12, $f12, ALPHA
-    fmadd.d $f13, $f13, BETA, $f12
-    fst.d $f13, Y, 0 * SIZE
+    MUL  $f12, $f12, ALPHA
+    MADD $f13, $f13, BETA, $f12
+    ST   $f13, Y, 0 * SIZE
     add.d  X, X, INCX
     add.d  Y, Y, INCY
     blt $r0, I, .L998
diff --git a/kernel/loongarch64/daxpby_lsx.S b/kernel/loongarch64/axpby_lsx.S
similarity index 56%
rename from kernel/loongarch64/daxpby_lsx.S
rename to kernel/loongarch64/axpby_lsx.S
index 9aafbaf2a..45154c262 100644
--- a/kernel/loongarch64/daxpby_lsx.S
+++ b/kernel/loongarch64/axpby_lsx.S
@@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
 
+#define ASSEMBLER
 #include "common.h"
+
 #define N      $r4
 #define ALPHA  $f0
 #define X      $r5
@@ -32,16 +59,22 @@
     bge $r0, N, .L999
     li.d TEMP, 1
     movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    ffint.s.l a1, a1
     slli.d  TEMP, TEMP, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG  t1, ALPHA
+    MTG  t2, BETA
+    MTG  t3, a1
+#ifdef DOUBLE
     vreplgr2vr.d VXA, t1
-    movfr2gr.d t2, BETA
     vreplgr2vr.d VXB, t2
-    movfr2gr.d t3, a1
     vreplgr2vr.d VXZ, t3
+#else
+    vreplgr2vr.w VXA, t1
+    vreplgr2vr.w VXB, t2
+    vreplgr2vr.w VXZ, t3
+#endif
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
@@ -52,15 +85,15 @@
 
 .L11:
     bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L110
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
     b .L111 // ALPHA!=0 BETA!=0
     .align 3
 
 .L110:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L114 // ALPHA==0 BETA==0
     b .L113 // ALPHA==0 BETA!=0
     .align 3
@@ -68,6 +101,7 @@
 .L111: // ALPHA!=0 BETA!=0
     vld VX0, X, 0 * SIZE
     vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     vld VX1, X, 2 * SIZE
     vld VX3, Y, 2 * SIZE
     vfmul.d VX0, VX0, VXA
@@ -86,6 +120,16 @@
     vfmadd.d VX3, VX3, VXB, VX1
     vst VX2, Y, 4 * SIZE
     vst VX3, Y, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX2, VX2, VXB, VX0
+    vfmadd.s VX3, VX3, VXB, VX1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
@@ -95,6 +139,7 @@
 
 .L112: // ALPHA!=0 BETA==0
     vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
     vld VX1, X, 2 * SIZE
     vfmul.d VX0, VX0, VXA
     vfmul.d VX1, VX1, VXA
@@ -106,6 +151,13 @@
     vfmul.d VX3, VX3, VXA
     vst VX2, Y, 4 * SIZE
     vst VX3, Y, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+#endif
     addi.d X, X, 8 * SIZE
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
@@ -113,7 +165,8 @@
     b .L997
     .align 3
 
-.L113: // ALPHA==0 BETA!=0\
+.L113: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
     vld VX0, Y, 0 * SIZE
     vld VX1, Y, 2 * SIZE
     vfmul.d VX0, VX0, VXB
@@ -126,6 +179,14 @@
     vfmul.d VX3, VX3, VXB
     vst VX2, Y, 4 * SIZE
     vst VX3, Y, 6 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX2, VX2, VXB
+    vfmul.s VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L113
@@ -134,9 +195,13 @@
 
 .L114: // ALPHA==0 BETA==0
     vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
     vst VXZ, Y, 2 * SIZE
     vst VXZ, Y, 4 * SIZE
     vst VXZ, Y, 6 * SIZE
+#else
+    vst VXZ, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L114
@@ -146,21 +211,22 @@
 .L12: // INCX==1 and INCY!=1
     bge $r0, I, .L997
     move YY, Y
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L120
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
     b .L121 // ALPHA!=0 BETA!=0
     .align 3
 
 .L120:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L124 // ALPHA==0 BETA==0
     b .L123 // ALPHA==0 BETA!=0
     .align 3
 
 .L121: // ALPHA!=0 BETA!=0
     vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
     ld.d t2, Y, 0 * SIZE
@@ -212,6 +278,53 @@
     vstelm.d VX3, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VX0, VXA
+    vld VX1, X, 4 * SIZE
+    vfmadd.s VX2, VX2, VXB, VX0
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
     blt $r0, I, .L121
@@ -220,6 +333,7 @@
 
 .L122: // ALPHA!=0 BETA==0
     vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
     vld VX1, X, 2 * SIZE
     vfmul.d VX0, VX0, VXA
     vfmul.d VX1, VX1, VXA
@@ -242,6 +356,26 @@
     vstelm.d VX1, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VX1, YY, 0, 1
+#else
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     addi.d X, X, 8 * SIZE
     addi.d  I, I, -1
@@ -250,6 +384,7 @@
     .align 3
 
 .L123: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
     ld.d t2, Y, 0 * SIZE
@@ -294,12 +429,57 @@
     vstelm.d VX3, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX2, VX2, VXB
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L123
     b .L997
     .align 3
 
 .L124: // ALPHA==0 BETA==0
+#ifdef DOUBLE
     vstelm.d VXZ, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VXZ, YY, 0, 1
@@ -315,6 +495,23 @@
     vstelm.d VXZ, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VXZ, YY, 0, 1
+#else
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     addi.d  I, I, -1
     blt $r0, I, .L124
@@ -323,21 +520,22 @@
 
 .L21:// INCX!=1 and INCY==1
     bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L210
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
     b .L211 // ALPHA!=0 BETA!=0
     .align 3
 
 .L210:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L214 // ALPHA==0 BETA==0
     b .L213 // ALPHA==0 BETA!=0
     .align 3
 
 .L211: // ALPHA!=0 BETA!=0
     vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -378,12 +576,47 @@
     vfmadd.d VX3, VX3, VXB, VX1
     addi.d  I, I, -1
     vst VX3, Y, 6 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VXA, VX0
+    vld VX3, Y, 4 * SIZE
+    vfmadd.s VX2, VX2, VXB, VX0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX2, Y, 0 * SIZE
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L211
     b .L997
     .align 3
 
 .L212: // ALPHA!=0 BETA==0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -417,6 +650,37 @@
     vfmul.d VX1, VX1, VXA
     addi.d  I, I, -1
     vst VX1, Y, 6 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VXA, VX0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX0, Y, 0 * SIZE
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vst VX1, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     blt $r0, I, .L212
     b .L997
@@ -424,6 +688,7 @@
 
 .L213: // ALPHA==0 BETA!=0
     vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
     vld VX3, Y, 2 * SIZE
     vfmul.d VX2, VX2, VXB
     vfmul.d VX3, VX3, VXB
@@ -433,19 +698,30 @@
     vld VX3, Y, 6 * SIZE
     vfmul.d VX2, VX2, VXB
     vfmul.d VX3, VX3, VXB
-    addi.d  I, I, -1
     vst VX2, Y, 4 * SIZE
     vst VX3, Y, 6 * SIZE
+#else
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX2, VX2, VXB
+    vfmul.s VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
     blt $r0, I, .L213
     b .L997
     .align 3
 
 .L214: // ALPHA==0 BETA==0
     vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
     vst VXZ, Y, 2 * SIZE
     vst VXZ, Y, 4 * SIZE
     vst VXZ, Y, 6 * SIZE
+#else
+    vst VXZ, Y, 4 * SIZE
+#endif
     addi.d Y, Y, 8 * SIZE
     addi.d  I, I, -1
     blt $r0, I, .L214
@@ -455,20 +731,21 @@
 .L22:
     bge $r0, I, .L997
     move YY, Y
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
     bcnez $fcc0, .L220
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
     b .L221 // ALPHA!=0 BETA!=0
     .align 3
 
 .L220:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
     bcnez $fcc0, .L224 // ALPHA==0 BETA==0
     b .L223 // ALPHA==0 BETA!=0
     .align 3
 
 .L221: // ALPHA!=0 BETA!=0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -541,12 +818,83 @@
     vstelm.d VX3, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VX0, VXA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vfmadd.s VX2, VX2, VXB, VX0
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vfmadd.s VX3, VX3, VXB, VX1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L221
     b .L997
     .align 3
 
 .L222: // ALPHA!=0 BETA==0
+#ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     add.d X, X, INCX
     ld.d t2, X, 0 * SIZE
@@ -591,12 +939,57 @@
     vstelm.d VX1, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VX1, YY, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VX0, VXA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vstelm.w VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L222
     b .L997
     .align 3
 
 .L223: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
     ld.d t2, Y, 0 * SIZE
@@ -641,12 +1034,57 @@
     vstelm.d VX3, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX2, VX2, VXB
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     blt $r0, I, .L223
     b .L997
     .align 3
 
 .L224: // ALPHA==0 BETA==0
+#ifdef DOUBLE
     vstelm.d VXZ, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VXZ, YY, 0, 1
@@ -662,6 +1100,23 @@
     vstelm.d VXZ, YY, 0, 0
     add.d YY, YY, INCY
     vstelm.d VXZ, YY, 0, 1
+#else
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+#endif
     add.d YY, YY, INCY
     addi.d  I, I, -1
     blt $r0, I, .L224
@@ -674,12 +1129,12 @@
     .align 3
 
 .L998:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f13, Y, 0 * SIZE
+    LD   $f12, X, 0 * SIZE
+    LD   $f13, Y, 0 * SIZE
     addi.d I, I, -1
-    fmul.d $f12, $f12, ALPHA
-    fmadd.d $f13, $f13, BETA, $f12
-    fst.d $f13, Y, 0 * SIZE
+    MUL  $f12, $f12, ALPHA
+    MADD $f13, $f13, BETA, $f12
+    ST   $f13, Y, 0 * SIZE
     add.d  X, X, INCX
     add.d  Y, Y, INCY
     blt $r0, I, .L998
diff --git a/kernel/loongarch64/saxpby_lasx.S b/kernel/loongarch64/saxpby_lasx.S
deleted file mode 100644
index c5d1ff402..000000000
--- a/kernel/loongarch64/saxpby_lasx.S
+++ /dev/null
@@ -1,597 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define ALPHA  $f0
-#define X      $r5
-#define INCX   $r6
-#define BETA   $f1
-#define Y      $r7
-#define INCY   $r8
-
-#define I      $r12
-#define TEMP   $r13
-#define t1     $r14
-#define t2     $r16
-#define t3     $r15
-#define t4     $r17
-#define XX     $r18
-#define YY     $r19
-#define a1     $f12
-#define a2     $f13
-#define VX0    $xr8
-#define VX1    $xr20
-#define VX2    $xr21
-#define VX3    $xr22
-#define VXA    $xr23
-#define VXB    $xr9
-#define VXZ    $xr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.s.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.s t1, ALPHA
-    xvreplgr2vr.w VXA, t1
-    movfr2gr.s t2, BETA
-    xvreplgr2vr.w VXB, t2
-    movfr2gr.s t3, a1
-    xvreplgr2vr.w VXZ, t3
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L997
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L110
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
-    b .L111 // ALPHA!=0 BETA!=0
-    .align 3
-
-.L110:
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
-    b .L113 // ALPHA==0 BETA!=0
-    .align 3
-
-.L111: // ALPHA!=0 BETA!=0
-    xvld VX0, X, 0 * SIZE
-    xvld VX2, Y, 0 * SIZE
-    xvfmul.s VX0, VX0, VXA
-    addi.d  I, I, -1
-    xvfmadd.s VX2, VX2, VXB, VX0
-    xvst VX2, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L111
-    b .L997
-    .align 3
-
-.L112: // ALPHA!=0 BETA==0
-    xvld VX0, X, 0 * SIZE
-    xvfmul.s VX0, VX0, VXA
-    addi.d  I, I, -1
-    xvst VX0, Y, 0 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L112
-    b .L997
-    .align 3
-
-.L113: // ALPHA==0 BETA!=0
-    xvld VX2, Y, 0 * SIZE
-    xvfmul.s VX2, VX2, VXB
-    addi.d  I, I, -1
-    xvst VX2, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L113
-    b .L997
-    .align 3
-
-.L114: // ALPHA==0 BETA==0
-    xvst VXZ, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L114
-    b .L997
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L997
-    move YY, Y
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L120
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
-    b .L121 // ALPHA!=0 BETA!=0
-    .align 3
-
-.L120:
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
-    b .L123 // ALPHA==0 BETA!=0
-    .align 3
-
-.L121: // ALPHA!=0 BETA!=0
-    xvld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VX0, VX0, VXA
-    xvfmadd.s VX2, VX2, VXB, VX0
-    xvstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L121
-    b .L997
-    .align 3
-
-.L122: // ALPHA!=0 BETA==0
-    xvld VX0, X, 0 * SIZE
-    xvfmul.s VX0, VX0, VXA
-    addi.d  I, I, -1
-    xvstelm.w VX0, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L122
-    b .L997
-    .align 3
-
-.L123: // ALPHA==0 BETA!=0
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VX2, VX2, VXB
-    xvstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L123
-    b .L997
-    .align 3
-
-.L124: // ALPHA==0 BETA==0
-    xvstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L124
-    b .L997
-    .align 3
-
-.L21:// INCX!=1 and INCY==1
-    bge $r0, I, .L997
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L210
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
-    b .L211 // ALPHA!=0 BETA!=0
-    .align 3
-
-.L210:
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
-    b .L213 // ALPHA==0 BETA!=0
-    .align 3
-
-.L211: // ALPHA!=0 BETA!=0
-    xvld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    add.d X, X, INCX
-    xvfmul.s VX0, VXA, VX0
-    xvfmadd.s VX2, VX2, VXB, VX0
-    addi.d  I, I, -1
-    xvst VX2, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L211
-    b .L997
-    .align 3
-
-.L212: // ALPHA!=0 BETA==0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    add.d X, X, INCX
-    xvfmul.s VX0, VXA, VX0
-    addi.d  I, I, -1
-    xvst VX0, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L212
-    b .L997
-    .align 3
-
-.L213: // ALPHA==0 BETA!=0
-    xvld VX2, Y, 0 * SIZE
-    xvfmul.s VX2, VX2, VXB
-    xvst VX2, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L213
-    b .L997
-    .align 3
-
-.L214: // ALPHA==0 BETA==0
-    xvst VXZ, Y, 0 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L214
-    b .L997
-    .align 3
-
-.L22:
-    bge $r0, I, .L997
-    move YY, Y
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L220
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
-    b .L221 // ALPHA!=0 BETA!=0
-    .align 3
-
-.L220:
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
-    b .L223 // ALPHA==0 BETA!=0
-    .align 3
-
-.L221: // ALPHA!=0 BETA!=0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    add.d X, X, INCX
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VX0, VX0, VXA
-    xvfmadd.s VX2, VX2, VXB, VX0
-    addi.d  I, I, -1
-    xvstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 7
-    add.d YY, YY, INCY
-    blt $r0, I, .L221
-    b .L997
-    .align 3
-
-.L222: // ALPHA!=0 BETA==0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 0
-    xvinsgr2vr.w VX0, t2, 1
-    xvinsgr2vr.w VX0, t3, 2
-    xvinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    xvinsgr2vr.w VX0, t1, 4
-    xvinsgr2vr.w VX0, t2, 5
-    xvinsgr2vr.w VX0, t3, 6
-    xvinsgr2vr.w VX0, t4, 7
-    add.d X, X, INCX
-    xvfmul.s VX0, VX0, VXA
-    addi.d  I, I, -1
-    xvstelm.w VX0, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VX0, YY, 0, 7
-    add.d YY, YY, INCY
-    blt $r0, I, .L222
-    b .L997
-    .align 3
-
-.L223: // ALPHA==0 BETA!=0
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    xvinsgr2vr.w VX2, t1, 0
-    xvinsgr2vr.w VX2, t2, 1
-    xvinsgr2vr.w VX2, t3, 2
-    xvinsgr2vr.w VX2, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    xvinsgr2vr.w VX2, t1, 4
-    xvinsgr2vr.w VX2, t2, 5
-    xvinsgr2vr.w VX2, t3, 6
-    xvinsgr2vr.w VX2, t4, 7
-    add.d Y, Y, INCY
-    xvfmul.s VX2, VX2, VXB
-    addi.d  I, I, -1
-    xvstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VX2, YY, 0, 7
-    add.d YY, YY, INCY
-    blt $r0, I, .L223
-    b .L997
-    .align 3
-
-.L224: // ALPHA==0 BETA==0
-    xvstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 4
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 5
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 6
-    add.d YY, YY, INCY
-    xvstelm.w VXZ, YY, 0, 7
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L224
-    b .L997
-    .align 3
-
-.L997:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L998:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f13, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmul.s $f12, $f12, ALPHA
-    fmadd.s $f13, $f13, BETA, $f12
-    fst.s $f13, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L998
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE
diff --git a/kernel/loongarch64/saxpby_lsx.S b/kernel/loongarch64/saxpby_lsx.S
deleted file mode 100644
index 7f8cea2dd..000000000
--- a/kernel/loongarch64/saxpby_lsx.S
+++ /dev/null
@@ -1,629 +0,0 @@
-#define ASSEMBLER
-
-#include "common.h"
-#define N      $r4
-#define ALPHA  $f0
-#define X      $r5
-#define INCX   $r6
-#define BETA   $f1
-#define Y      $r7
-#define INCY   $r8
-
-#define I      $r12
-#define TEMP   $r13
-#define t1     $r14
-#define t2     $r16
-#define t3     $r15
-#define t4     $r17
-#define XX     $r18
-#define YY     $r19
-#define a1     $f12
-#define a2     $f13
-#define VX0    $vr8
-#define VX1    $vr20
-#define VX2    $vr21
-#define VX3    $vr22
-#define VXA    $vr23
-#define VXB    $vr9
-#define VXZ    $vr19
-
-    PROLOGUE
-
-    bge $r0, N, .L999
-    li.d TEMP, 1
-    movgr2fr.d a1, $r0
-    ffint.s.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
-    slli.d  INCX, INCX, BASE_SHIFT
-    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.s t1, ALPHA
-    vreplgr2vr.w VXA, t1
-    movfr2gr.s t2, BETA
-    vreplgr2vr.w VXB, t2
-    movfr2gr.s t3, a1
-    vreplgr2vr.w VXZ, t3
-    srai.d I, N, 3
-    bne INCX, TEMP, .L20
-    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
-    b .L11  // INCX==1 and INCY==1
-.L20:
-    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
-    b .L21 // INCX!=1 and INCY==1
-
-.L11:
-    bge $r0, I, .L997
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L110
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
-    b .L111 // ALPHA!=0 BETA!=0
-    .align 3
-
-.L110:
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
-    b .L113 // ALPHA==0 BETA!=0
-    .align 3
-
-.L111: // ALPHA!=0 BETA!=0
-    vld VX0, X, 0 * SIZE
-    vld VX2, Y, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vld VX3, Y, 4 * SIZE
-    vfmul.s VX0, VX0, VXA
-    vfmul.s VX1, VX1, VXA
-    vfmadd.s VX2, VX2, VXB, VX0
-    vfmadd.s VX3, VX3, VXB, VX1
-    vst VX2, Y, 0 * SIZE
-    vst VX3, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L111
-    b .L997
-    .align 3
-
-.L112: // ALPHA!=0 BETA==0
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vfmul.s VX0, VX0, VXA
-    vfmul.s VX1, VX1, VXA
-    vst VX0, Y, 0 * SIZE
-    vst VX1, Y, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L112
-    b .L997
-    .align 3
-
-.L113: // ALPHA==0 BETA!=0
-    vld VX2, Y, 0 * SIZE
-    vld VX3, Y, 4 * SIZE
-    vfmul.s VX2, VX2, VXB
-    vfmul.s VX3, VX3, VXB
-    vst VX2, Y, 0 * SIZE
-    vst VX3, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L113
-    b .L997
-    .align 3
-
-.L114: // ALPHA==0 BETA==0
-    vst VXZ, Y, 0 * SIZE
-    vst VXZ, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L114
-    b .L997
-    .align 3
-
-.L12: // INCX==1 and INCY!=1
-    bge $r0, I, .L997
-    move YY, Y
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L120
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
-    b .L121 // ALPHA!=0 BETA!=0
-    .align 3
-
-.L120:
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
-    b .L123 // ALPHA==0 BETA!=0
-    .align 3
-
-.L121: // ALPHA!=0 BETA!=0
-    vld VX0, X, 0 * SIZE
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VX0, VX0, VXA
-    vld VX1, X, 4 * SIZE
-    vfmadd.s VX2, VX2, VXB, VX0
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    vstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    vfmul.s VX1, VX1, VXA
-    vfmadd.s VX3, VX3, VXB, VX1
-    addi.d  I, I, -1
-    vstelm.w VX3, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    blt $r0, I, .L121
-    b .L997
-    .align 3
-
-.L122: // ALPHA!=0 BETA==0
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    vfmul.s VX0, VX0, VXA
-    vfmul.s VX1, VX1, VXA
-    vstelm.w VX0, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX0, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX0, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX0, YY, 0, 3
-    add.d YY, YY, INCY
-    vstelm.w VX1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX1, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L122
-    b .L997
-    .align 3
-
-.L123: // ALPHA==0 BETA!=0
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VX2, VX2, VXB
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    vstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    vfmul.s VX3, VX3, VXB
-    addi.d  I, I, -1
-    vstelm.w VX3, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 3
-    add.d YY, YY, INCY
-    blt $r0, I, .L123
-    b .L997
-    .align 3
-
-.L124: // ALPHA==0 BETA==0
-    vstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L124
-    b .L997
-    .align 3
-
-.L21:// INCX!=1 and INCY==1
-    bge $r0, I, .L997
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L210
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
-    b .L211 // ALPHA!=0 BETA!=0
-    .align 3
-
-.L210:
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
-    b .L213 // ALPHA==0 BETA!=0
-    .align 3
-
-.L211: // ALPHA!=0 BETA!=0
-    vld VX2, Y, 0 * SIZE
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    vfmul.s VX0, VXA, VX0
-    vld VX3, Y, 4 * SIZE
-    vfmadd.s VX2, VX2, VXB, VX0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vst VX2, Y, 0 * SIZE
-    vfmul.s VX1, VX1, VXA
-    vfmadd.s VX3, VX3, VXB, VX1
-    addi.d  I, I, -1
-    vst VX3, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L211
-    b .L997
-    .align 3
-
-.L212: // ALPHA!=0 BETA==0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    vfmul.s VX0, VXA, VX0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vst VX0, Y, 0 * SIZE
-    vfmul.s VX1, VX1, VXA
-    addi.d  I, I, -1
-    vst VX1, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    blt $r0, I, .L212
-    b .L997
-    .align 3
-
-.L213: // ALPHA==0 BETA!=0
-    vld VX2, Y, 0 * SIZE
-    vld VX3, Y, 4 * SIZE
-    vfmul.s VX2, VX2, VXB
-    vfmul.s VX3, VX3, VXB
-    vst VX2, Y, 0 * SIZE
-    vst VX3, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L213
-    b .L997
-    .align 3
-
-.L214: // ALPHA==0 BETA==0
-    vst VXZ, Y, 0 * SIZE
-    vst VXZ, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L214
-    b .L997
-    .align 3
-
-.L22:
-    bge $r0, I, .L997
-    move YY, Y
-    fcmp.ceq.s $fcc0, ALPHA, a1
-    bcnez $fcc0, .L220
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
-    b .L221 // ALPHA!=0 BETA!=0
-    .align 3
-
-.L220:
-    fcmp.ceq.s $fcc0, BETA, a1
-    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
-    b .L223 // ALPHA==0 BETA!=0
-    .align 3
-
-.L221: // ALPHA!=0 BETA!=0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VX0, VX0, VXA
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vfmadd.s VX2, VX2, VXB, VX0
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VX1, VX1, VXA
-    addi.d  I, I, -1
-    vfmadd.s VX3, VX3, VXB, VX1
-    vstelm.w VX3, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 3
-    add.d YY, YY, INCY
-    blt $r0, I, .L221
-    b .L997
-    .align 3
-
-.L222: // ALPHA!=0 BETA==0
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    vinsgr2vr.w VX0, t1, 0
-    vinsgr2vr.w VX0, t2, 1
-    vinsgr2vr.w VX0, t3, 2
-    vinsgr2vr.w VX0, t4, 3
-    add.d X, X, INCX
-    vfmul.s VX0, VX0, VXA
-    ld.w t1, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t2, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    add.d X, X, INCX
-    ld.w t4, X, 0 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w VX1, t1, 0
-    vinsgr2vr.w VX1, t2, 1
-    vinsgr2vr.w VX1, t3, 2
-    vinsgr2vr.w VX1, t4, 3
-    vstelm.w VX0, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX0, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX0, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX0, YY, 0, 3
-    add.d YY, YY, INCY
-    vfmul.s VX1, VX1, VXA
-    addi.d  I, I, -1
-    vstelm.w VX1, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX1, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX1, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX1, YY, 0, 3
-    add.d YY, YY, INCY
-    blt $r0, I, .L222
-    b .L997
-    .align 3
-
-.L223: // ALPHA==0 BETA!=0
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    vinsgr2vr.w VX2, t1, 0
-    vinsgr2vr.w VX2, t2, 1
-    vinsgr2vr.w VX2, t3, 2
-    vinsgr2vr.w VX2, t4, 3
-    add.d Y, Y, INCY
-    vfmul.s VX2, VX2, VXB
-    ld.w t1, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t2, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t3, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    ld.w t4, Y, 0 * SIZE
-    add.d Y, Y, INCY
-    vinsgr2vr.w VX3, t1, 0
-    vinsgr2vr.w VX3, t2, 1
-    vinsgr2vr.w VX3, t3, 2
-    vinsgr2vr.w VX3, t4, 3
-    vstelm.w VX2, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX2, YY, 0, 3
-    add.d YY, YY, INCY
-    vfmul.s VX3, VX3, VXB
-    addi.d  I, I, -1
-    vstelm.w VX3, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VX3, YY, 0, 3
-    add.d YY, YY, INCY
-    blt $r0, I, .L223
-    b .L997
-    .align 3
-
-.L224: // ALPHA==0 BETA==0
-    vstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 0
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 1
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 2
-    add.d YY, YY, INCY
-    vstelm.w VXZ, YY, 0, 3
-    add.d YY, YY, INCY
-    addi.d  I, I, -1
-    blt $r0, I, .L224
-    b .L997
-    .align 3
-
-.L997:
-    andi I, N, 7
-    bge $r0, I, .L999
-    .align 3
-
-.L998:
-    fld.s $f12, X, 0 * SIZE
-    fld.s $f13, Y, 0 * SIZE
-    addi.d I, I, -1
-    fmul.s $f12, $f12, ALPHA
-    fmadd.s $f13, $f13, BETA, $f12
-    fst.s $f13, Y, 0 * SIZE
-    add.d  X, X, INCX
-    add.d  Y, Y, INCY
-    blt $r0, I, .L998
-    .align 3
-
-.L999:
-    move $r4, $r12
-    jirl $r0, $r1, 0x0
-    .align 3
-
-    EPILOGUE

From 546f13558caad680fdd82778c106ebaa3bd7331c Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Fri, 29 Dec 2023 11:03:53 +0800
Subject: [PATCH 531/718] loongarch64: Add {c/z}swap and {c/z}sum optimization

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   6 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   6 +
 kernel/loongarch64/csum_lasx.S           | 274 +++++++++++++++
 kernel/loongarch64/csum_lsx.S            | 266 ++++++++++++++
 kernel/loongarch64/cswap_lasx.S          | 394 +++++++++++++++++++++
 kernel/loongarch64/cswap_lsx.S           | 421 +++++++++++++++++++++++
 6 files changed, 1367 insertions(+)
 create mode 100644 kernel/loongarch64/csum_lasx.S
 create mode 100644 kernel/loongarch64/csum_lsx.S
 create mode 100644 kernel/loongarch64/cswap_lasx.S
 create mode 100644 kernel/loongarch64/cswap_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 0fb0bb68f..01f8e4782 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -75,6 +75,12 @@ DNRM2KERNEL =  dnrm2_lsx.S
 CNRM2KERNEL =  cnrm2_lsx.S
 ZNRM2KERNEL =  znrm2_lsx.S
 
+CSWAPKERNEL = cswap_lsx.S
+ZSWAPKERNEL = cswap_lsx.S
+
+CSUMKERNEL = csum_lsx.S
+ZSUMKERNEL = csum_lsx.S
+
 DGEMMKERNEL    = dgemm_kernel_8x4.S
 DGEMMINCOPY    = dgemm_ncopy_8_lsx.S
 DGEMMITCOPY    = dgemm_tcopy_8_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 1a6a04532..a9e8abaf0 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -75,6 +75,12 @@ DNRM2KERNEL =  dnrm2_lasx.S
 CNRM2KERNEL =  cnrm2_lasx.S
 ZNRM2KERNEL =  znrm2_lasx.S
 
+CSWAPKERNEL = cswap_lasx.S
+ZSWAPKERNEL = cswap_lasx.S
+
+CSUMKERNEL = csum_lasx.S
+ZSUMKERNEL = csum_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S
new file mode 100644
index 000000000..3e65f2c15
--- /dev/null
+++ b/kernel/loongarch64/csum_lasx.S
@@ -0,0 +1,274 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define res1   $xr16
+#define res2   $xr17
+    PROLOGUE
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvfadd.d res2, VX2, VX3
+    xvfadd.d res1, res1, res2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvfadd.s res2, VX0, VX1
+    xvfadd.s res1, res2, res1
+#endif
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    ADD a1, a1, a2
+    ADD s1, a1, s1
+
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    xvfadd.s res2, VX0, VX1
+    xvfadd.s res1, res2, res1
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX2, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    ADD a1, a1, a2
+    ADD s1, a1, s1
+
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S
new file mode 100644
index 000000000..8de8e27ca
--- /dev/null
+++ b/kernel/loongarch64/csum_lsx.S
@@ -0,0 +1,266 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16
+#define res2   $vr17
+#define res3   $vr18
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 10 * SIZE
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfadd.s res2, VX0, VX1
+    vld VX2, X, 8 * SIZE
+    vld VX3, X, 12 * SIZE
+    vfadd.s res3, VX2, VX3
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+
+    addi.d  I, I, -1
+    addi.d X, X, 16 * SIZE
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    ADD a1, a1, a2
+    ADD s1, a1, s1
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfadd.s res2, VX0, VX1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vfadd.s res3, VX2, VX3
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    ADD a1, a1, a2
+    ADD s1, a1, s1
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S
new file mode 100644
index 000000000..d53773d5a
--- /dev/null
+++ b/kernel/loongarch64/cswap_lasx.S
@@ -0,0 +1,394 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 4 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvst VX2, X, 0 * SIZE
+    xvst VX0, Y, 0 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+.L113:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    xvstelm.d VX0, Y, 0 * SIZE, 0
+    ld.d t2, Y, 1 * SIZE
+    xvstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    xvstelm.d VX0, Y, 0 * SIZE, 2
+    ld.d t4, Y, 1 * SIZE
+    xvstelm.d VX0, Y, 1 * SIZE, 3
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvst VX2, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    xvstelm.d VX1, Y, 0 * SIZE, 0
+    ld.d t2, Y, 1 * SIZE
+    xvstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    xvstelm.d VX1, Y, 0 * SIZE, 2
+    ld.d t4, Y, 1 * SIZE
+    xvstelm.d VX1, Y, 1 * SIZE, 3
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvst VX3, X, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 0
+    ld.w t2, Y, 1 * SIZE
+    xvstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 2
+    ld.w t4, Y, 1 * SIZE
+    xvstelm.w VX0, Y, 1 * SIZE, 3
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 4
+    ld.w t2, Y, 1 * SIZE
+    xvstelm.w VX0, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 6
+    ld.w t4, Y, 1 * SIZE
+    xvstelm.w VX0, Y, 1 * SIZE, 7
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvst VX2, X, 0 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    xvstelm.d VX2, X, 0 * SIZE, 0
+    ld.d t2, X, 1 * SIZE
+    xvstelm.d VX2, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    xvstelm.d VX2, X, 0 * SIZE, 2
+    ld.d t4, X, 1 * SIZE
+    xvstelm.d VX2, X, 1 * SIZE, 3
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvst VX0, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    xvstelm.d VX3, X, 0 * SIZE, 0
+    ld.d t2, X, 1 * SIZE
+    xvstelm.d VX3, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    xvstelm.d VX3, X, 0 * SIZE, 2
+    ld.d t4, X, 1 * SIZE
+    xvstelm.d VX3, X, 1 * SIZE, 3
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvst VX1, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    xvstelm.w VX2, X, 0 * SIZE, 0
+    ld.w t2, X, 1 * SIZE
+    xvstelm.w VX2, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    xvstelm.w VX2, X, 0 * SIZE, 2
+    ld.w t4, X, 1 * SIZE
+    xvstelm.w VX2, X, 1 * SIZE, 3
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    xvstelm.w VX2, X, 0 * SIZE, 4
+    ld.w t2, X, 1 * SIZE
+    xvstelm.w VX2, X, 1 * SIZE, 5
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    xvstelm.w VX2, X, 0 * SIZE, 6
+    ld.w t4, X, 1 * SIZE
+    xvstelm.w VX2, X, 1 * SIZE, 7
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvst VX0, Y, 0 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD a3, X, 0 * SIZE
+    LD a4, X, 1 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    ST a1, Y, 0 * SIZE
+    LD b2, Y, 1 * SIZE
+    ST a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    LD b3, Y, 0 * SIZE
+    ST a3, Y, 0 * SIZE
+    LD b4, Y, 1 * SIZE
+    ST a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD a1, X, 0 * SIZE
+    ST b1, XX, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    add.d X, X, INCX
+    ST b2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    LD a3, X, 0 * SIZE
+    ST b3, XX, 0 * SIZE
+    LD a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST b4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+
+    LD b1, Y, 0 * SIZE
+    ST a1, Y, 0 * SIZE
+    LD b2, Y, 1 * SIZE
+    ST a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    LD b3, Y, 0 * SIZE
+    ST a3, Y, 0 * SIZE
+    LD b4, Y, 1 * SIZE
+    ST a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    ST b1, XX, 0 * SIZE
+    ST b2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ST b3, XX, 0 * SIZE
+    ST b4, XX, 1 * SIZE
+
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S
new file mode 100644
index 000000000..62a869066
--- /dev/null
+++ b/kernel/loongarch64/cswap_lsx.S
@@ -0,0 +1,421 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+    vst VX0, Y, 4 * SIZE
+    vst VX1, Y, 6 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+#endif
+    addi.d  I, I, -1
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0
+    ld.d t2, Y, 1 * SIZE
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    ld.d t4, Y, 1 * SIZE
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0
+    ld.d t2, Y, 1 * SIZE
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    ld.d t4, Y, 1 * SIZE
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 6 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    vstelm.w VX0, Y, 0 * SIZE, 0
+    ld.w t2, Y, 1 * SIZE
+    vstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    vstelm.w VX0, Y, 0 * SIZE, 2
+    ld.w t4, Y, 1 * SIZE
+    vstelm.w VX0, Y, 1 * SIZE, 3
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vst VX2, X, 0 * SIZE
+
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    vstelm.w VX1, Y, 0 * SIZE, 0
+    ld.w t2, Y, 1 * SIZE
+    vstelm.w VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    vstelm.w VX1, Y, 0 * SIZE, 2
+    ld.w t4, Y, 1 * SIZE
+    vstelm.w VX1, Y, 1 * SIZE, 3
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vst VX3, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    vstelm.d VX2, X, 0 * SIZE, 0
+    ld.d t2, X, 1 * SIZE
+    vstelm.d VX2, X, 1 * SIZE, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    vstelm.d VX3, X, 0 * SIZE, 0
+    ld.d t4, X, 1 * SIZE
+    vstelm.d VX3, X, 1 * SIZE, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    vstelm.d VX2, X, 0 * SIZE, 0
+    ld.d t2, X, 1 * SIZE
+    vstelm.d VX2, X, 1 * SIZE, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    vstelm.d VX3, X, 0 * SIZE, 0
+    ld.d t4, X, 1 * SIZE
+    vstelm.d VX3, X, 1 * SIZE, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 6 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    vstelm.w VX2, X, 0 * SIZE, 0
+    ld.w t2, X, 1 * SIZE
+    vstelm.w VX2, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    vstelm.w VX2, X, 0 * SIZE, 2
+    ld.w t4, X, 1 * SIZE
+    vstelm.w VX2, X, 1 * SIZE, 3
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vst VX0, Y, 0 * SIZE
+
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    vstelm.w VX3, X, 0 * SIZE, 0
+    ld.w t2, X, 1 * SIZE
+    vstelm.w VX3, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    vstelm.w VX3, X, 0 * SIZE, 2
+    ld.w t4, X, 1 * SIZE
+    vstelm.w VX3, X, 1 * SIZE, 3
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vst VX1, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    bge $r0, I, .L223
+    .align 3
+    move XX, X
+
+.L222:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD a3, X, 0 * SIZE
+    LD a4, X, 1 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE
+    ST a1, Y, 0 * SIZE
+    LD b2, Y, 1 * SIZE
+    ST a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    LD b3, Y, 0 * SIZE
+    ST a3, Y, 0 * SIZE
+    LD b4, Y, 1 * SIZE
+    ST a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD a1, X, 0 * SIZE
+    ST b1, XX, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    add.d X, X, INCX
+    ST b2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    LD a3, X, 0 * SIZE
+    ST b3, XX, 0 * SIZE
+    LD a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST b4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+
+    LD b1, Y, 0 * SIZE
+    ST a1, Y, 0 * SIZE
+    LD b2, Y, 1 * SIZE
+    ST a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    LD b3, Y, 0 * SIZE
+    ST a3, Y, 0 * SIZE
+    LD b4, Y, 1 * SIZE
+    ST a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    ST b1, XX, 0 * SIZE
+    ST b2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ST b3, XX, 0 * SIZE
+    ST b4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From a5d0d21378f6da778b18c8f9c5fd4ad33d9b52d5 Mon Sep 17 00:00:00 2001
From: pengxu <pengxu@loongson.cn>
Date: Fri, 29 Dec 2023 15:10:01 +0800
Subject: [PATCH 532/718] loongarch64: Add zgemm and cgemm optimization

---
 common_loongarch64.h                       |   4 +
 kernel/loongarch64/KERNEL.LOONGSON2K1000   |  12 +
 kernel/loongarch64/KERNEL.LOONGSON3R5      |  24 +-
 kernel/loongarch64/cgemm_kernel_2x2_lasx.S | 857 +++++++++++++++++++++
 kernel/loongarch64/cgemm_kernel_2x2_lsx.S  | 812 +++++++++++++++++++
 kernel/loongarch64/cgemm_ncopy_2_lasx.S    | 193 +++++
 kernel/loongarch64/cgemm_ncopy_2_lsx.S     | 202 +++++
 kernel/loongarch64/cgemm_tcopy_2_lasx.S    | 218 ++++++
 kernel/loongarch64/cgemm_tcopy_2_lsx.S     | 218 ++++++
 kernel/loongarch64/zgemm_kernel_2x2.S      | 848 ++++++++++++++++++++
 kernel/loongarch64/zgemm_kernel_2x2_lasx.S | 822 ++++++++++++++++++++
 kernel/loongarch64/zgemm_ncopy_2_lasx.S    | 196 +++++
 kernel/loongarch64/zgemm_tcopy_2_lasx.S    | 212 +++++
 param.h                                    |   8 +-
 14 files changed, 4621 insertions(+), 5 deletions(-)
 create mode 100644 kernel/loongarch64/cgemm_kernel_2x2_lasx.S
 create mode 100644 kernel/loongarch64/cgemm_kernel_2x2_lsx.S
 create mode 100644 kernel/loongarch64/cgemm_ncopy_2_lasx.S
 create mode 100644 kernel/loongarch64/cgemm_ncopy_2_lsx.S
 create mode 100644 kernel/loongarch64/cgemm_tcopy_2_lasx.S
 create mode 100644 kernel/loongarch64/cgemm_tcopy_2_lsx.S
 create mode 100644 kernel/loongarch64/zgemm_kernel_2x2.S
 create mode 100644 kernel/loongarch64/zgemm_kernel_2x2_lasx.S
 create mode 100644 kernel/loongarch64/zgemm_ncopy_2_lasx.S
 create mode 100644 kernel/loongarch64/zgemm_tcopy_2_lasx.S

diff --git a/common_loongarch64.h b/common_loongarch64.h
index e581e2e3e..b1426da79 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -144,6 +144,7 @@ static inline int WhereAmI(void){
 #define XVCMPLT xvfcmp.clt.d
 #define XVMUL   xvfmul.d
 #define XVMSUB  xvfmsub.d
+#define XVNMSUB xvfnmsub.d
 
 #define VFSUB  vfsub.d
 #define VFADD  vfadd.d
@@ -158,6 +159,7 @@ static inline int WhereAmI(void){
 #define VCMPLT vfcmp.clt.d
 #define VMUL   vfmul.d
 #define VMSUB  vfmsub.d
+#define VNMSUB vfnmsub.d
 
 #else
 
@@ -198,6 +200,7 @@ static inline int WhereAmI(void){
 #define XVCMPLT xvfcmp.clt.s
 #define XVMUL   xvfmul.s
 #define XVMSUB  xvfmsub.s
+#define XVNMSUB xvfnmsub.s
 
 #define VFSUB  vfsub.s
 #define VFADD  vfadd.s
@@ -212,6 +215,7 @@ static inline int WhereAmI(void){
 #define VCMPLT vfcmp.clt.s
 #define VMUL   vfmul.s
 #define VMSUB  vfmsub.s
+#define VNMSUB vfnmsub.s
 
 #endif /* defined(DOUBLE) */
 
diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index 01f8e4782..c365e9a75 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -95,4 +95,16 @@ DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+CGEMMKERNEL  = cgemm_kernel_2x2_lsx.S
+CGEMMONCOPY  = cgemm_ncopy_2_lsx.S
+CGEMMOTCOPY  = cgemm_tcopy_2_lsx.S
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index a9e8abaf0..68360faaf 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -107,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 SGEMVNKERNEL = sgemv_n_8_lasx.S
 SGEMVTKERNEL = sgemv_t_8_lasx.S
 
+CGEMMKERNEL  = cgemm_kernel_2x2_lsx.S
+CGEMMONCOPY  = cgemm_ncopy_2_lsx.S
+CGEMMOTCOPY  = cgemm_tcopy_2_lsx.S
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+ZGEMMKERNEL  = zgemm_kernel_2x2_lasx.S
+ZGEMMONCOPY  = zgemm_ncopy_2_lasx.S
+ZGEMMOTCOPY  = zgemm_tcopy_2_lasx.S
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
 DTRSMKERNEL_LN  = dtrsm_kernel_LN_16x4_lasx.S
 DTRSMKERNEL_LT  = dtrsm_kernel_LT_16x4_lasx.S
 DTRSMKERNEL_RN  = dtrsm_kernel_RN_16x4_lasx.S
 DTRSMKERNEL_RT  = dtrsm_kernel_RT_16x4_lasx.S
-endif
 
 STRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 STRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+endif
diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S
new file mode 100644
index 000000000..e07f7dc64
--- /dev/null
+++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S
@@ -0,0 +1,857 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $xr30
+#define U1     $xr31
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define VALPHAR    $xr28
+#define VALPHAI    $xr29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFADD
+#define    XVFADD3       XVFSUB
+#define    XVFADD4       XVFADD
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFADD
+#define    XVFADD3       XVFADD
+#define    XVFADD4       XVFSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFSUB
+#define    XVFADD3       XVFADD
+#define    XVFADD4       XVFADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFSUB
+#define    XVFADD3       XVFSUB
+#define    XVFADD4       XVFSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    xvldrepl.w  VALPHAR, $sp, 112
+    xvldrepl.w  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   2
+
+    move       J,      $r0
+    srai.d     T0,     N,     1
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/2; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L150
+
+.L11:  /* for(i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     C3,     OFF,   0x04
+    add.d      A0,     A0,    C3
+    add.d      B0,     B,     C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    srai.d     C2,     TL,     2
+    beq        L,      C2,    .L130
+    blt        C2,     L,     .L130
+
+.L12:  /* for(k=0; k<bk/4; k+=1) */
+    xvld       D0,     A0,    0x00  //a 0-7
+    xvld       D1,     A0,    0x20  //a 8-15
+    xvld       D2,     B0,    0x00  //b 0-7
+    xvld       D3,     B0,    0x20  //b 8-15
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02  //a 0 1 2 3 8 9 10 11
+    xvand.v    D5,     D4,    D4
+    xvshuf4i.w D4,     D4,    0x88  //a 0 2 0 2 8 10 8 10
+    xvshuf4i.w D5,     D5,    0xdd  //a 1 3 1 3 9 11 9 11
+
+    xvand.v    D6,     D1,    D1
+    xvpermi.q  D6,     D0,    0x31  //a 4 5 6 7 12 13 14 15
+    xvand.v    D7,     D6,    D6
+    xvshuf4i.w D6,     D6,    0x88  //a 4 6 4 6 12 14 12 14
+    xvshuf4i.w D7,     D7,    0xdd  //a 5 7 5 7 13 15 13 15
+
+    xvand.v    D8,     D2,    D2
+    xvpermi.q  D8,     D3,    0x02  //b 0 1 2 3 8 9 10 11
+    xvand.v    D9,     D8,    D8
+    xvshuf4i.w D8,     D8,    0xa0  //b 0 0 2 2 8 8 10 10
+    xvshuf4i.w D9,     D9,    0xf5  //a 1 1 3 3 9 9 11 11
+
+    xvand.v    D10,     D3,    D3
+    xvpermi.q  D10,     D2,    0x31  //b 4 5 6 7 12 13 14 15
+    xvand.v    D11,     D10,    D10
+    xvshuf4i.w D10,     D10,    0xa0  //b 4 4 6 6 12 12 14 14
+    xvshuf4i.w D11,     D11,    0xf5  //a 5 5 7 7 13 13 15 15
+
+    XVMADD1      U0,     D4,    D8,     U0  //res0 2 4 6 0 2 4 6
+    XVMADD2      U1,     D5,    D8,     U1  //res1 3 4 7 1 3 4 7
+
+    xvpermi.q    U0,     U0,    0x01
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD1      U0,     D4,    D8,     U0
+    XVMADD2      U1,     D5,    D8,     U1
+
+    XVMADD3      U0,     D5,    D9,     U0
+    XVMADD4      U1,     D4,    D9,     U1
+
+    xvpermi.q    U0,     U0,    0x01
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD3      U0,     D5,    D9,     U0
+    XVMADD4      U1,     D4,    D9,     U1
+
+    XVMADD1      U0,     D6,    D10,     U0  //res0 2 4 6 0 2 4 6
+    XVMADD2      U1,     D7,    D10,     U1  //res1 3 4 7 1 3 4 7
+
+    xvpermi.q    U0,     U0,    0x01
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD1      U0,     D6,    D10,     U0
+    XVMADD2      U1,     D7,    D10,     U1
+
+    XVMADD3      U0,     D7,    D11,     U0
+    XVMADD4      U1,     D6,    D11,     U1
+
+    xvpermi.q    U0,     U0,    0x01
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD3      U0,     D7,    D11,     U0
+    XVMADD4      U1,     D6,    D11,     U1
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:
+    move       L,      $r0
+    andi       C2,     TL,     3
+    beq        L,      C2,    .L14
+
+.L13:  /* for(k=0; k<(bk&3); k+=1) */
+    vld       $vr16,     A0,    0x00  //a0 a1 a2 a3
+    vld       $vr17,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w $vr20,     $vr17,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w $vr21,     $vr17,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  $vr18,     $vr16,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  $vr19,     $vr16,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      $vr30,     $vr18,    $vr20,     $vr30  //res0 2 4 6
+    VMADD2      $vr31,     $vr19,    $vr20,     $vr31  //res1 3 5 7
+    VMADD3      $vr30,     $vr19,    $vr21,     $vr30
+    VMADD4      $vr31,     $vr18,    $vr21,     $vr31
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:
+#if defined(TRMMKERNEL)
+    vld       $vr8,     C0,    0x00  //0 1 2 3
+    vld       $vr9,     C1,    0x00  //4 5 6 7
+
+    vpackev.w $vr10,    $vr9,    $vr8    //0 4 2 6
+    vpermi.w  $vr10,    $vr10,   0xd8    //0 2 4 6
+
+    vpackod.w $vr11,    $vr9,    $vr8    //1 5 3 7
+    vpermi.w  $vr11,    $vr11,   0xd8    //1 3 5 7
+
+    vfmul.s      $vr10,    $vr30,    $vr28
+    vfmul.s      $vr11,    $vr31,    $vr28
+    VNMSUB    $vr10,    $vr31,    $vr29, $vr10
+    VFMADD     $vr11,    $vr30,    $vr29, $vr11
+
+    vilvl.w   $vr8,     $vr11,   $vr10  //0 1 2 3
+
+    vilvh.w   $vr9,     $vr11,   $vr10  //4 5 6 7
+
+    vst       $vr8,     C0,    0x00
+    vst       $vr9,     C1,    0x00
+#else
+    vld       $vr8,     C0,    0x00  //0 1 2 3
+    vld       $vr9,     C1,    0x00  //4 5 6 7
+
+    vpackev.w $vr10,    $vr9,    $vr8    //0 4 2 6
+    vpermi.w  $vr10,    $vr10,   0xd8    //0 2 4 6
+
+    vpackod.w $vr11,    $vr9,    $vr8    //1 5 3 7
+    vpermi.w  $vr11,    $vr11,   0xd8    //1 3 5 7
+
+    VFMADD      $vr10,    $vr30,    $vr28, $vr10
+    VFMADD      $vr11,    $vr31,    $vr28, $vr11
+    VNMSUB     $vr10,    $vr31,    $vr29, $vr10
+    VFMADD      $vr11,    $vr30,    $vr29, $vr11
+
+    vilvl.w   $vr8,     $vr11,   $vr10  //0 1 2 3
+
+    vilvh.w   $vr9,     $vr11,   $vr10  //4 5 6 7
+
+    vst       $vr8,     C0,    0x00
+    vst       $vr9,     C1,    0x00
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L18
+
+.L15:  /* for(i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x03
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         b3,     B0,    0x08        //load4
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    LD         b4,     B0,    0x0c        //load5
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    MADD       b5,     c21,   ALPHA_R, b5
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x03
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+
+    slli.d     L,      K,     0x04
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L24
+
+.L21:  /* for (i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x03
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x08        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x0c        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x08
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x0c
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C1[2]
+    LD         a8,     C0,    0x0c    //C1[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x03
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28
+
+.L25:  /* for (i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,   0x03
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x03
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:
+    slli.d     L,      K,     3
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L20
+
+.L30:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lsx.S b/kernel/loongarch64/cgemm_kernel_2x2_lsx.S
new file mode 100644
index 000000000..ebacf4e93
--- /dev/null
+++ b/kernel/loongarch64/cgemm_kernel_2x2_lsx.S
@@ -0,0 +1,812 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $vr30
+#define U1     $vr31
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+#define VALPHAR    $vr28
+#define VALPHAI    $vr29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    vldrepl.w  VALPHAR, $sp, 112
+    vldrepl.w  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   2
+
+    move       J,      $r0
+    srai.d     T0,     N,     1
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/2; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L150
+
+.L11:  /* for(i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     C3,     OFF,   0x04
+    add.d      A0,     A0,    C3
+    add.d      B0,     B,     C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    srai.d     C2,     TL,     2
+    beq        L,      C2,    .L130
+    blt        C2,     L,     .L130
+
+.L12:  /* for(k=0; k<bk/4; k+=1) */
+    vld       D0,     A0,    0x00  //a0 a1 a2 a3
+    vld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 4 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x10  //a0 a1 a2 a3
+    vld       D1,     B0,    0x10  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 4 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x20  //a0 a1 a2 a3
+    vld       D1,     B0,    0x20  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 4 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x30  //a0 a1 a2 a3
+    vld       D1,     B0,    0x30  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 4 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:
+    move       L,      $r0
+    andi       C2,     TL,     3
+    beq        L,      C2,    .L14
+
+.L13:  /* for(k=0; k<(bk&3); k+=1) */
+    vld       D0,     A0,    0x00  //a0 a1 a2 a3
+    vld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:
+#if defined(TRMMKERNEL)
+    vld       U8,     C0,    0x00  //0 1 2 3
+    vld       U9,     C1,    0x00  //4 5 6 7
+
+    vpackev.w U10,    U9,    U8    //0 4 2 6
+    vpermi.w  U10,    U10,   0xd8  //0 2 4 6
+
+    vpackod.w U11,    U9,    U8    //1 5 3 7
+    vpermi.w  U11,    U11,   0xd8  //1 3 5 7
+
+    vfmul.s      U10,    U0,    VALPHAR
+    vfmul.s      U11,    U1,    VALPHAR
+    VNMSUB     U10,    U1,    VALPHAI, U10
+    VFMADD      U11,    U0,    VALPHAI, U11
+
+    vilvl.w   U8,     U11,   U10  //0 1 2 3
+
+    vilvh.w   U9,     U11,   U10  //4 5 6 7
+
+    vst       U8,     C0,    0x00
+    vst       U9,     C1,    0x00
+#else
+    vld       U8,     C0,    0x00  //0 1 2 3
+    vld       U9,     C1,    0x00  //4 5 6 7
+
+    vpackev.w U10,    U9,    U8    //0 4 2 6
+    vpermi.w  U10,    U10,   0xd8  //0 2 4 6
+
+    vpackod.w U11,    U9,    U8    //1 5 3 7
+    vpermi.w  U11,    U11,   0xd8  //1 3 5 7
+
+    VFMADD      U10,    U0,    VALPHAR, U10
+    VFMADD      U11,    U1,    VALPHAR, U11
+    VNMSUB     U10,    U1,    VALPHAI, U10
+    VFMADD      U11,    U0,    VALPHAI, U11
+
+    vilvl.w   U8,     U11,   U10  //0 1 2 3
+
+    vilvh.w   U9,     U11,   U10  //4 5 6 7
+
+    vst       U8,     C0,    0x00
+    vst       U9,     C1,    0x00
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L18
+
+.L15:  /* for(i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x03
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         b3,     B0,    0x08        //load4
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    LD         b4,     B0,    0x0c        //load5
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    MADD       b5,     c21,   ALPHA_R, b5
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x03
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+
+    slli.d     L,      K,     0x04
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L24
+
+.L21:  /* for (i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x03
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x08        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x0c        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x08
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x0c
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C1[2]
+    LD         a8,     C0,    0x0c    //C1[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x03
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28
+
+.L25:  /* for (i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,   0x03
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x03
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:
+    slli.d     L,      K,     3
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L20
+
+.L30:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_2_lasx.S b/kernel/loongarch64/cgemm_ncopy_2_lasx.S
new file mode 100644
index 000000000..10dc48c85
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_2_lasx.S
@@ -0,0 +1,193 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x02  //lda
+    slli.d     TL,   TL,   0x01
+    slli.d     T0,   TL,   0x01
+
+    srai.d     I,    N,    0x01
+    beq        I,    ZERO,  .L_N0
+
+.L_J1: /* if (i > 0) I-- */
+    move       S1,   TS         //a_offset1
+    add.d      S2,   TS,   TL   //a_offset2
+    srai.d     J,    M,    0x02
+    add.d      TS,   TS,   T0
+
+    beq        J,    ZERO,  .L_I3
+
+.L_I1: /* if (j > 0) J-- */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x00
+    xvld       U2,   S2,   0x00
+
+    xvpermi.q  U0,   U2,   0x02
+    xvpermi.q  U2,   U1,   0x31
+
+    xvpermi.d  U0,   U0,   0xd8
+    xvpermi.d  U2,   U2,   0xd8
+
+    xvst       U0,   TD,   0x00
+    xvst       U2,   TD,   0x20
+
+    addi.d     S1,   S1,   0x20   // a_offset1
+    addi.d     S2,   S2,   0x20
+    addi.d     TD,   TD,   0x40  // b_offset
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_I1
+
+.L_I3:
+    andi       J,    M,    0x03
+    beq        J,    ZERO, .L_II20
+
+.L_II1:  /* j = (m & 3)  if (j > 0) */
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_II1
+
+.L_II20:
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_J1
+
+.L_N0:  /* if(n&1)*/
+    andi       I,     N,   0x01
+    beq        ZERO,  I,   .L_N00
+
+.L_N1:
+    srai.d     J,     M,   0x02
+    beq        ZERO,  J,   .L_N10
+
+.L_N11: /* j = (m >> 2) if (j > 0) */
+    xvld       U0,    TS,   0x00
+
+    xvst       U0,    TD,   0x00
+
+    addi.d     TS,    TS,   0x20   // a_offset
+    addi.d     TD,    TD,   0x20   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N11
+
+.L_N10:
+    andi       J,     M,    0x03
+    beq        J,     ZERO, .L_N00
+
+.L_N12:  /* j = (m & 3)  if (j > 0) */
+    fld.s      F0,    TS,   0x00
+    fld.s      F1,    TS,   0x04
+
+    fst.s      F0,    TD,   0x00
+    fst.s      F1,    TD,   0x04
+
+    addi.d     TS,    TS,   0x08   // a_offset
+    addi.d     TD,    TD,   0x08   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N12
+
+.L_N00:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S
new file mode 100644
index 000000000..1cf4d87dc
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S
@@ -0,0 +1,202 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+#define D8     $vr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x02  //lda
+    slli.d     TL,   TL,   0x01
+    slli.d     T0,   TL,   0x01
+
+    srai.d     I,    N,    0x01
+    beq        I,    ZERO,  .L_N0
+
+.L_J1: /* if (i > 0) I-- */
+    move       S1,   TS         //a_offset1
+    add.d      S2,   TS,   TL   //a_offset2
+    srai.d     J,    M,    0x02
+    add.d      TS,   TS,   T0
+
+    beq        J,    ZERO,  .L_I3
+
+.L_I1: /* if (j > 0) J-- */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+
+    vand.v    D0,   U2,   U2
+    vand.v    D1,   U3,   U3
+    vand.v    D2,   U2,   U2
+    vand.v    D3,   U3,   U3
+
+    vpermi.w  D0,   U0,   0x44
+    vpermi.w  D2,   U0,   0xee
+    vpermi.w  D1,   U1,   0x44
+    vpermi.w  D3,   U1,   0xee
+
+    vst       D0,   TD,   0x00
+    vst       D2,   TD,   0x10
+    vst       D1,   TD,   0x20
+    vst       D3,   TD,   0x30
+
+    addi.d     S1,   S1,   0x20   // a_offset1
+    addi.d     S2,   S2,   0x20
+    addi.d     TD,   TD,   0x40  // b_offset
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_I1
+
+.L_I3:
+    andi       J,    M,    0x03
+    beq        J,    ZERO, .L_II20
+
+.L_II1:  /* j = (m & 3)  if (j > 0) */
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_II1
+
+.L_II20:
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_J1
+
+.L_N0:  /* if(n&1)*/
+    andi       I,     N,   0x01
+    beq        ZERO,  I,   .L_N00
+
+.L_N1:
+    srai.d     J,     M,   0x02
+    beq        ZERO,  J,   .L_N10
+
+.L_N11: /* j = (m >> 2) if (j > 0) */
+    vld       U0,    TS,   0x00
+    vld       U1,    TS,   0x10
+
+    vst       U0,    TD,   0x00
+    vst       U1,    TD,   0x10
+
+    addi.d     TS,    TS,   0x20   // a_offset
+    addi.d     TD,    TD,   0x20   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N11
+
+.L_N10:
+    andi       J,     M,    0x03
+    beq        J,     ZERO, .L_N00
+
+.L_N12:  /* j = (m & 3)  if (j > 0) */
+    fld.s      F0,    TS,   0x00
+    fld.s      F1,    TS,   0x04
+
+    fst.s      F0,    TD,   0x00
+    fst.s      F1,    TD,   0x04
+
+    addi.d     TS,    TS,   0x08   // a_offset
+    addi.d     TD,    TD,   0x08   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N12
+
+.L_N00:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S
new file mode 100644
index 000000000..e2245e412
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S
@@ -0,0 +1,218 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16
+#define TS     $r17
+#define TL     $r7
+#define T0     $r18
+#define S8     $r19
+#define S9     $r20
+#define S10    $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02
+    add.d      S9,     DST,   T0  //boffset2
+
+    srai.d     J,      M,     0x01  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01
+    add.d      S2,     S1,    TL   //aoffset2
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x20
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x00
+    xvld       U2,   S2,   0x00
+
+    xvpermi.q  U0,     U2,   0x02
+    xvpermi.q  U2,     U1,   0x31
+
+    xvst       U0,   S8,   0x00
+
+    slli.d     T0,   M,    0x04
+    add.d      S8,   S8,   T0
+
+    xvst       U2,   S8,   0x00
+
+    add.d      S8,   S8,   T0
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld        $vr0,   S1,   0x00
+    vld        $vr1,   S2,   0x00
+
+    vst        $vr0,   S8,   0x00
+    vst        $vr1,   S8,   0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   S9,   0x00
+    fst.s      F1,   S9,   0x04
+    fst.s      F2,   S9,   0x08
+    fst.s      F3,   S9,   0x0c
+
+    addi.d     S9,   S9,   0x10
+
+.L_J0:
+    addi.d     J,    J,    -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    vld        $vr0,     TS,    0x00
+    vld        $vr1,     TS,    0x10
+
+    vst       $vr0,     TD,    0x00
+
+    slli.d     T0,     M,     0x04
+    add.d      TD,     TD,    T0
+
+    vst       $vr1,     TD,    0x00
+
+    add.d      TD,     TD,    T0
+    addi.d     TS,     TS,    0x20
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld        $vr0,     TS,    0x00
+
+    vst        $vr0,     TD,    0x00
+
+    addi.d     TS,     TS,    0x10
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    fld.s      F0,     TS,    0x00
+    fld.s      F1,     TS,    0x04
+
+    fst.s      F0,     S9,    0x00
+    fst.s      F1,     S9,    0x04
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S
new file mode 100644
index 000000000..15c0fde8f
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S
@@ -0,0 +1,218 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16
+#define TS     $r17
+#define TL     $r7
+#define T0     $r18
+#define S8     $r19
+#define S9     $r20
+#define S10    $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02
+    add.d      S9,     DST,   T0  //boffset2
+
+    srai.d     J,      M,     0x01  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01
+    add.d      S2,     S1,    TL   //aoffset2
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x20
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+
+    vst       U0,   S8,   0x00
+    vst       U2,   S8,   0x10
+
+    slli.d     T0,   M,    0x04
+    add.d      S8,   S8,   T0
+
+    vst       U1,   S8,   0x00
+    vst       U3,   S8,   0x10
+
+    add.d      S8,   S8,   T0
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld       U0,   S1,   0x00
+    vld       U1,   S2,   0x00
+
+    vst       U0,   S8,   0x00
+    vst       U1,   S8,   0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   S9,   0x00
+    fst.s      F1,   S9,   0x04
+    fst.s      F2,   S9,   0x08
+    fst.s      F3,   S9,   0x0c
+
+    addi.d     S9,   S9,   0x10
+
+.L_J0:
+    addi.d     J,    J,    -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    vld       U0,     TS,    0x00
+    vld       U1,     TS,    0x10
+
+    vst       U0,     TD,    0x00
+
+    slli.d     T0,     M,     0x04
+    add.d      TD,     TD,    T0
+
+    vst       U1,     TD,    0x00
+
+    add.d      TD,     TD,    T0
+    addi.d     TS,     TS,    0x20
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld       U0,     TS,    0x00
+
+    vst       U0,     TD,    0x00
+
+    addi.d     TS,     TS,    0x10
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    fld.s      F0,     TS,    0x00
+    fld.s      F1,     TS,    0x04
+
+    fst.s      F0,     S9,    0x00
+    fst.s      F1,     S9,    0x04
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_kernel_2x2.S b/kernel/loongarch64/zgemm_kernel_2x2.S
new file mode 100644
index 000000000..589d170c5
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel_2x2.S
@@ -0,0 +1,848 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+#define c51    $f26
+#define c52    $f27
+#define c61    $f28
+#define c62    $f29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -88
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    ST         $f23,   $sp,   32
+    ST         $f24,   $sp,   40
+    ST         $f25,   $sp,   48
+    ST         $f26,   $sp,   56
+    ST         $f27,   $sp,   64
+    ST         $f28,   $sp,   72
+    ST         $f29,   $sp,   80
+
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   BASE_SHIFT
+
+    move       J,      $r0
+    srai.d     T0,     N,     1
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/2; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L150
+
+.L11:  /* for(i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     C3,     OFF,   0x05
+    add.d      A0,     A0,    C3
+    add.d      B0,     B,     C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MOV        c12,    c11
+    MOV        c21,    c11
+    MOV        c22,    c11
+    MOV        c31,    c11
+    MOV        c32,    c11
+    MOV        c41,    c11
+    MOV        c42,    c11
+
+    move       L,      $r0   //cycle param k
+    srai.d     C2,     TL,     2
+    beq        L,      C2,    .L130
+    blt        C2,     L,     .L130
+
+.L12:  /* for(k=0; k<bk/4; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x10        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x18        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+    LD         b3,     B0,    0x10        //load6
+    MADD1      c31,    a1,    b3,     c31  //res4
+    MADD2      c32,    a2,    b3,     c32  //res5
+    LD         b4,     B0,    0x18        //load7
+    MADD3      c31,    a2,    b4,     c31
+    MADD4      c32,    a1,    b4,     c32
+    MADD1      c41,    a3,    b3,     c41  //res6
+    MADD2      c42,    a4,    b3,     c42  //res7
+    MADD3      c41,    a4,    b4,     c41
+    MADD4      c42,    a3,    b4,     c42
+    LD         a5,     A0,    0x20        //load8
+    LD         b5,     B0,    0x20        //load9
+    MADD1      c11,    a5,    b5,     c11
+    LD         a6,     A0,    0x28        //load10
+    MADD2      c12,    a6,    b5,     c12
+    LD         b6,     B0,    0x28        //load11
+    MADD3      c11,    a6,    b6,     c11
+    MADD4      c12,    a5,    b6,     c12
+    LD         a7,     A0,    0x30        //load12
+    MADD1      c21,    a7,    b5,     c21
+    LD         a8,     A0,    0x38        //load13
+    MADD2      c22,    a8,    b5,     c22
+    MADD3      c21,    a8,    b6,     c21
+    MADD4      c22,    a7,    b6,     c22
+    LD         b7,     B0,    0x30        //load14
+    MADD1      c31,    a5,    b7,     c31
+    MADD2      c32,    a6,    b7,     c32
+    LD         b8,     B0,    0x38        //load15
+    MADD3      c31,    a6,    b8,     c31
+    MADD4      c32,    a5,    b8,     c32
+    MADD1      c41,    a7,    b7,     c41
+    MADD2      c42,    a8,    b7,     c42
+    MADD3      c41,    a8,    b8,     c41
+    MADD4      c42,    a7,    b8,     c42
+
+    LD         a1,     A0,    0x40        //load0
+    LD         b1,     B0,    0x40        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x48        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x48        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x50        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x58        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+    LD         b3,     B0,    0x50        //load6
+    MADD1      c31,    a1,    b3,     c31  //res4
+    MADD2      c32,    a2,    b3,     c32  //res5
+    LD         b4,     B0,    0x58        //load7
+    MADD3      c31,    a2,    b4,     c31
+    MADD4      c32,    a1,    b4,     c32
+    MADD1      c41,    a3,    b3,     c41  //res6
+    MADD2      c42,    a4,    b3,     c42  //res7
+    MADD3      c41,    a4,    b4,     c41
+    MADD4      c42,    a3,    b4,     c42
+    LD         a5,     A0,    0x60        //load8
+    LD         b5,     B0,    0x60        //load9
+    MADD1      c11,    a5,    b5,     c11
+    LD         a6,     A0,    0x68        //load10
+    MADD2      c12,    a6,    b5,     c12
+    LD         b6,     B0,    0x68        //load11
+    MADD3      c11,    a6,    b6,     c11
+    MADD4      c12,    a5,    b6,     c12
+    LD         a7,     A0,    0x70        //load12
+    MADD1      c21,    a7,    b5,     c21
+    LD         a8,     A0,    0x78        //load13
+    MADD2      c22,    a8,    b5,     c22
+    MADD3      c21,    a8,    b6,     c21
+    MADD4      c22,    a7,    b6,     c22
+    LD         b7,     B0,    0x70        //load14
+    MADD1      c31,    a5,    b7,     c31
+    MADD2      c32,    a6,    b7,     c32
+    LD         b8,     B0,    0x78        //load15
+    MADD3      c31,    a6,    b8,     c31
+    MADD4      c32,    a5,    b8,     c32
+    MADD1      c41,    a7,    b7,     c41
+    MADD2      c42,    a8,    b7,     c42
+    MADD3      c41,    a8,    b8,     c41
+    MADD4      c42,    a7,    b8,     c42
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x80
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:
+    move       L,      $r0
+    andi       C2,     TL,     3
+    beq        L,      C2,    .L14
+
+.L13:  /* for(k=0; k<(bk&3); k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x10        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x18        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+    LD         b3,     B0,    0x10        //load6
+    MADD1      c31,    a1,    b3,     c31  //res4
+    MADD2      c32,    a2,    b3,     c32  //res5
+    LD         b4,     B0,    0x18        //load7
+    MADD3      c31,    a2,    b4,     c31
+    MADD4      c32,    a1,    b4,     c32
+    MADD1      c41,    a3,    b3,     c41  //res6
+    MADD2      c42,    a4,    b3,     c42  //res7
+    MADD3      c41,    a4,    b4,     c41
+    MADD4      c42,    a3,    b4,     c42
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x08
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x10
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x18
+
+    MUL        b5,     c31,   ALPHA_R
+    MUL        b6,     c32,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c32,   ALPHA_R
+    MUL        b6,     c31,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x08
+
+    MUL        b7,     c41,   ALPHA_R
+    MUL        b8,     c42,   ALPHA_I
+    SUB        b7,     b7,    b8
+    ST         b7,     C1,    0x10
+
+    MUL        b7,     c42,   ALPHA_R
+    MUL        b8,     c41,   ALPHA_I
+    ADD        b8,     b7,    b8
+    ST         b8,     C1,    0x18
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         a7,     C0,    0x10    //C0[2]
+    LD         a8,     C0,    0x18    //C0[3]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x08    //C1[1]
+    LD         b7,     C1,    0x10    //C1[2]
+    LD         b8,     C1,    0x18    //C1[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+    ST         a7,     C0,    0x10
+    ST         a8,     C0,    0x18
+
+    MADD       b5,     c31,   ALPHA_R, b5
+    MADD       b6,     c32,   ALPHA_R, b6
+    NMSUB      b5,     c32,   ALPHA_I, b5
+    MADD       b6,     c31,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x08
+
+    MADD       b7,     c41,   ALPHA_R, b7
+    MADD       b8,     c42,   ALPHA_R, b8
+    NMSUB      b7,     c42,   ALPHA_I, b7
+    MADD       b8,     c41,   ALPHA_I, b8
+    ST         b7,     C1,    0x10
+    ST         b8,     C1,    0x18
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x05
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L18
+
+.L15:  /* for(i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x05
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         b3,     B0,    0x10        //load4
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    LD         b4,     B0,    0x18        //load5
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x08
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x08    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    MADD       b5,     c21,   ALPHA_R, b5
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x05
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+
+    slli.d     L,      K,     0x05
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L24
+
+.L21:  /* for (i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x05
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x10        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x18        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x08
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x10
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x18
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         a7,     C0,    0x10    //C1[2]
+    LD         a8,     C0,    0x18    //C1[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+    ST         a7,     C0,    0x10
+    ST         a8,     C0,    0x18
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x05
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x20
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28
+
+.L25:  /* for (i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,   0x04
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:
+    slli.d     L,      K,     4
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L20
+
+.L30:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LD         $f23,   $sp,   32
+    LD         $f24,   $sp,   40
+    LD         $f25,   $sp,   48
+    LD         $f26,   $sp,   56
+    LD         $f27,   $sp,   64
+    LD         $f28,   $sp,   72
+    LD         $f29,   $sp,   80
+
+    addi.d     $sp,    $sp,   88
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_kernel_2x2_lasx.S b/kernel/loongarch64/zgemm_kernel_2x2_lasx.S
new file mode 100644
index 000000000..152190d4f
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel_2x2_lasx.S
@@ -0,0 +1,822 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $xr30
+#define U1     $xr31
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define VALPHAR $xr28
+#define VALPHAI $xr29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVFMADD
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    xvldrepl.d  VALPHAR, $sp, 112
+    xvldrepl.d  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   BASE_SHIFT
+
+    move       J,      $r0
+    srai.d     T0,     N,     1
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/2; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L150
+
+.L11:  /* for(i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     C3,     OFF,   0x05
+    add.d      A0,     A0,    C3
+    add.d      B0,     B,     C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    move       L,      $r0   //cycle param k
+    srai.d     C2,     TL,     2
+    xvxor.v    U1,     U1,   U1
+    beq        L,      C2,    .L130
+    blt        C2,     L,     .L130
+
+.L12:  /* for(k=0; k<bk/4; k+=1) */
+    xvld       D0,     A0,    0x00  //a0 a1 a2 a3
+    xvld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 4 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    xvld       D0,     A0,    0x20  //a0 a1 a2 a3
+    xvld       D1,     B0,    0x20  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 4 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    xvld       D0,     A0,    0x40  //a0 a1 a2 a3
+    xvld       D1,     B0,    0x40  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 4 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    xvld       D0,     A0,    0x60  //a0 a1 a2 a3
+    xvld       D1,     B0,    0x60  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 4 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x80
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:
+    move       L,      $r0
+    andi       C2,     TL,     3
+    beq        L,      C2,    .L14
+
+.L13:  /* for(k=0; k<(bk&3); k+=1) */
+    xvld       D0,     A0,    0x00  //a0 a1 a2 a3
+    xvld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:
+#if defined(TRMMKERNEL)
+    xvld       U8,     C0,    0x00  //0 1 2 3
+    xvld       U9,     C1,    0x00  //4 5 6 7
+
+    xvpackev.d U10,    U9,    U8
+    xvpermi.d  U10,    U10,   0xd8  //0 2 4 6
+    xvpackod.d U11,    U9,    U8
+    xvpermi.d  U11,    U11,   0xd8  //1 3 5 7
+
+    xvfmul.d      U10,    U0,    VALPHAR
+    xvfmul.d      U11,    U1,    VALPHAR
+    XVNMSUB     U10,    U1,    VALPHAI, U10
+    XVFMADD      U11,    U0,    VALPHAI, U11
+
+    xvand.v    U8,     U10,   U10  //0 2 4 6
+    xvpermi.q  U8,     U11,   0x02 //0 2 1 3
+    xvpermi.d  U8,     U8,    0xd8 //0 1 2 3
+
+    xvand.v    U9,     U11,   U11  //1 3 5 7
+    xvpermi.q  U9,     U10,   0x31 //4 6 5 7
+    xvpermi.d  U9,     U9,    0xd8 //4 5 6 7
+
+    xvst       U8,     C0,    0x00
+    xvst       U9,     C1,    0x00
+#else
+    xvld       U8,     C0,    0x00  //0 1 2 3
+    xvld       U9,     C1,    0x00  //4 5 6 7
+
+    xvpackev.d U10,    U9,    U8
+    xvpermi.d  U10,    U10,   0xd8  //0 2 4 6
+    xvpackod.d U11,    U9,    U8
+    xvpermi.d  U11,    U11,   0xd8  //1 3 5 7
+
+    XVFMADD      U10,    U0,    VALPHAR, U10
+    XVFMADD      U11,    U1,    VALPHAR, U11
+    XVNMSUB     U10,    U1,    VALPHAI, U10
+    XVFMADD      U11,    U0,    VALPHAI, U11
+
+    xvand.v    U8,     U10,   U10  //0 2 4 6
+    xvpermi.q  U8,     U11,   0x02 //0 2 1 3
+    xvpermi.d  U8,     U8,    0xd8 //0 1 2 3
+
+    xvand.v    U9,     U11,   U11  //1 3 5 7
+    xvpermi.q  U9,     U10,   0x31 //4 6 5 7
+    xvpermi.d  U9,     U9,    0xd8 //4 5 6 7
+
+    xvst       U8,     C0,    0x00
+    xvst       U9,     C1,    0x00
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x05
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L18
+
+.L15:  /* for(i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x05
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    LD         a2,     A0,    0x08        //load2
+    LD         b2,     B0,    0x08        //load3
+    LD         b3,     B0,    0x10        //load4
+    LD         b4,     B0,    0x18        //load5
+
+    MADD1      c11,    a1,    b1,     c11  //res0
+    MADD2      c12,    a2,    b1,     c12  //res1
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x08
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x08    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    MADD       b5,     c21,   ALPHA_R, b5
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x05
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+
+    slli.d     L,      K,     0x05
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L24
+
+.L21:  /* for (i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x05
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    LD         a2,     A0,    0x08        //load2
+    LD         b2,     B0,    0x08        //load3
+    LD         a3,     A0,    0x10        //load4
+    LD         a4,     A0,    0x18        //load5
+
+    MADD1      c11,    a1,    b1,     c11  //res0
+    MADD2      c12,    a2,    b1,     c12  //res1
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a3,    b1,     c21  //res2
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x08
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x10
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x18
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         a7,     C0,    0x10    //C1[2]
+    LD         a8,     C0,    0x18    //C1[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    ST         a7,     C0,    0x10
+    ST         a8,     C0,    0x18
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x05
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x20
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28
+
+.L25:  /* for (i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,   0x04
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    LD         a2,     A0,    0x08        //load2
+    LD         b2,     B0,    0x08        //load3
+
+    MADD1      c11,    a1,    b1,     c11  //res0
+    MADD2      c12,    a2,    b1,     c12  //res1
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:
+    slli.d     L,      K,     4
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L20
+
+.L30:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_ncopy_2_lasx.S b/kernel/loongarch64/zgemm_ncopy_2_lasx.S
new file mode 100644
index 000000000..4bc1052a9
--- /dev/null
+++ b/kernel/loongarch64/zgemm_ncopy_2_lasx.S
@@ -0,0 +1,196 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x03  //lda
+    slli.d     TL,   TL,   0x01
+    slli.d     T0,   TL,   0x01
+
+    srai.d     I,    N,    0x01
+    beq        I,    ZERO,  .L_N0
+
+.L_J1: /* if (i > 0) I-- */
+    move       S1,   TS         //a_offset1
+    add.d      S2,   TS,   TL   //a_offset2
+    srai.d     J,    M,    0x02
+    add.d      TS,   TS,   T0
+
+    beq        J,    ZERO,  .L_I3
+
+.L_I1: /* if (j > 0) J-- */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x20
+    xvld       U2,   S2,   0x00
+    xvld       U3,   S2,   0x20
+
+    xvand.v    D0,   U0,   U0
+    xvand.v    D1,   U1,   U1
+    xvand.v    D2,   U2,   U2
+    xvand.v    D3,   U3,   U3
+
+    xvpermi.q  D0,   U2,   0x02
+    xvpermi.q  D2,   U0,   0x31
+    xvpermi.q  D1,   U3,   0x02
+    xvpermi.q  D3,   U1,   0x31
+
+    xvst       D0,   TD,   0x00
+    xvst       D2,   TD,   0x20
+    xvst       D1,   TD,   0x40
+    xvst       D3,   TD,   0x60
+
+    addi.d     S1,   S1,   0x40   // a_offset1
+    addi.d     S2,   S2,   0x40
+    addi.d     TD,   TD,   0x80  // b_offset
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_I1
+
+.L_I3:
+    andi       J,    M,    0x03
+    beq        J,    ZERO, .L_II20
+
+.L_II1:  /* j = (m & 3)  if (j > 0) */
+    vld        $vr0, S1,   0x00
+    vld        $vr1, S2,   0x00
+
+    vst        $vr0, TD,   0x00
+    vst        $vr1, TD,   0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_II1
+
+.L_II20:
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_J1
+
+.L_N0:  /* if(n&1)*/
+    andi       I,     N,   0x01
+    beq        ZERO,  I,   .L_N00
+
+.L_N1:
+    srai.d     J,     M,   0x02
+    beq        ZERO,  J,   .L_N10
+
+.L_N11: /* j = (m >> 2) if (j > 0) */
+    xvld       U0,    TS,   0x00
+    xvld       U1,    TS,   0x20
+
+    xvst       U0,    TD,   0x00
+    xvst       U1,    TD,   0x20
+
+    addi.d     TS,    TS,   0x40   // a_offset
+    addi.d     TD,    TD,   0x40   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N11
+
+.L_N10:
+    andi       J,     M,    0x03
+    beq        J,     ZERO, .L_N00
+
+.L_N12:  /* j = (m & 3)  if (j > 0) */
+    vld        $vr0,  TS,   0x00
+    vst        $vr0,  TD,   0x00
+
+
+    addi.d     TS,    TS,   0x10   // a_offset
+    addi.d     TD,    TD,   0x10   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N12
+
+.L_N00:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S
new file mode 100644
index 000000000..3fe17beef
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S
@@ -0,0 +1,212 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16
+#define TS     $r17
+#define TL     $r7
+#define T0     $r18
+#define S8     $r19
+#define S9     $r20
+#define S10    $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x03  //lda
+    slli.d     TL,     TL,    0x01
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x03
+    add.d      S9,     DST,   T0  //boffset2
+
+    srai.d     J,      M,     0x01  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01
+    add.d      S2,     S1,    TL   //aoffset2
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x40
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x20
+    xvld       U2,   S2,   0x00
+    xvld       U3,   S2,   0x20
+
+    xvst       U0,   S8,   0x00
+    xvst       U2,   S8,   0x20
+
+    slli.d     T0,   M,    0x05
+    add.d      S8,   S8,   T0
+
+    xvst       U1,   S8,   0x00
+    xvst       U3,   S8,   0x20
+
+    add.d      S8,   S8,   T0
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+
+    xvst       U0,   S8,   0x00
+    xvst       U1,   S8,   0x20
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    vld        $vr0, S1,   0x00
+    vld        $vr1, S2,   0x00
+
+    vst        $vr0, S9,   0x00
+    vst        $vr1, S9,   0x10
+
+    addi.d     S9,   S9,   0x20
+
+.L_J0:
+    addi.d     J,    J,    -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    xvld       U0,     TS,    0x00
+    xvld       U1,     TS,    0x20
+
+    xvst       U0,     TD,    0x00
+
+    slli.d     T0,     M,     0x05
+    add.d      TD,     TD,    T0
+
+    xvst       U1,     TD,    0x00
+
+    add.d      TD,     TD,    T0
+    addi.d     TS,     TS,    0x40
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    xvld       U0,     TS,    0x00
+
+    xvst       U0,     TD,    0x00
+
+    addi.d     TS,     TS,    0x20
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    vld        $vr0, TS,   0x00
+
+    vst        $vr0, S9,   0x00
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/param.h b/param.h
index e4e242d5d..54760a82e 100644
--- a/param.h
+++ b/param.h
@@ -2853,13 +2853,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #define QGEMM_DEFAULT_UNROLL_N 2
-#define CGEMM_DEFAULT_UNROLL_N 4
-#define ZGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 2
 #define XGEMM_DEFAULT_UNROLL_N 1
 
 #define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 1
-#define ZGEMM_DEFAULT_UNROLL_M 1
+#define CGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 2
 #define XGEMM_DEFAULT_UNROLL_M 1
 
 #define SGEMM_DEFAULT_P 256

From 0f648ebcd15d4a577eae9584e95b804acc25cc3f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Dec 2023 20:31:32 +0100
Subject: [PATCH 533/718] use alternate download for the CLFS cross-compiler
 package

---
 .github/workflows/loongarch64.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
index 42393eb0c..f1bf8064c 100644
--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@@ -40,7 +40,8 @@ jobs:
 
       - name: Download and install loongarch64-toolchain
         run: |
-          wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+          wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+          #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
           tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt
 
       - name: Set env

From 44b5b9e39fa9bc98eb880d3f781ee4c337459ec6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Dec 2023 22:50:40 +0100
Subject: [PATCH 534/718] Update C/ZGEMM MN for Loongson2k1000

---
 param.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/param.h b/param.h
index 03cea76c9..03d6fa11d 100644
--- a/param.h
+++ b/param.h
@@ -2891,11 +2891,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_UNROLL_M  8
 #define DGEMM_DEFAULT_UNROLL_N  4
 
-#define CGEMM_DEFAULT_UNROLL_M  1
-#define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  2
 
-#define ZGEMM_DEFAULT_UNROLL_M  1
-#define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  2
 
 #define SGEMM_DEFAULT_P	128
 #define DGEMM_DEFAULT_P	128

From 2802478449b7ea9bc7d1517c055635e8037908dc Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 30 Dec 2023 23:35:51 +0100
Subject: [PATCH 535/718] revert change to Loongson2k1000 zgemm

---
 param.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/param.h b/param.h
index 03d6fa11d..f3731c491 100644
--- a/param.h
+++ b/param.h
@@ -2894,8 +2894,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CGEMM_DEFAULT_UNROLL_M  2
 #define CGEMM_DEFAULT_UNROLL_N  2
 
-#define ZGEMM_DEFAULT_UNROLL_M  2
-#define ZGEMM_DEFAULT_UNROLL_N  2
+#define ZGEMM_DEFAULT_UNROLL_M  1
+#define ZGEMM_DEFAULT_UNROLL_N  4
 
 #define SGEMM_DEFAULT_P	128
 #define DGEMM_DEFAULT_P	128

From d3451af03f97a5087cbfcdc1ca83cdf07616d7b8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 31 Dec 2023 12:35:37 +0100
Subject: [PATCH 536/718] Fix uninitialized read/wrong variable
 (Reference-LAPACK PR 967)

---
 lapack-netlib/SRC/cbbcsd.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/cbbcsd.f b/lapack-netlib/SRC/cbbcsd.f
index 1cd1ffbf1..4d0c45efe 100644
--- a/lapack-netlib/SRC/cbbcsd.f
+++ b/lapack-netlib/SRC/cbbcsd.f
@@ -805,7 +805,7 @@
                CALL SLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1),
      $                       RWORK(IU2CS+I-1), R )
             ELSE IF( NU .LT. MU ) THEN
-               CALL SLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1),
+               CALL SLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1),
      $                       RWORK(IU2SN+I-1) )
             ELSE
                CALL SLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1),

From bd787c8a1a7039fe4c0e6df5c1d9ff70738342dd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 31 Dec 2023 12:36:47 +0100
Subject: [PATCH 537/718] Fix uninitialized read/wrong variable
 (Reference-LAPACK PR 967)

---
 lapack-netlib/SRC/dbbcsd.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/dbbcsd.f b/lapack-netlib/SRC/dbbcsd.f
index 4fcf9ab5d..913f96a73 100644
--- a/lapack-netlib/SRC/dbbcsd.f
+++ b/lapack-netlib/SRC/dbbcsd.f
@@ -805,7 +805,7 @@
                CALL DLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1),
      $                       WORK(IU2CS+I-1), R )
             ELSE IF( NU .LT. MU ) THEN
-               CALL DLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1),
+               CALL DLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1),
      $                       WORK(IU2SN+I-1) )
             ELSE
                CALL DLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1),

From 1b668479deb9e1ef229c8f40b96d6f56b60402c4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 31 Dec 2023 12:37:52 +0100
Subject: [PATCH 538/718] Fix uninitialized read/wrong variable
 (Reference-LAPACK PR 967)

---
 lapack-netlib/SRC/sbbcsd.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/sbbcsd.f b/lapack-netlib/SRC/sbbcsd.f
index d5720cb33..2a619cb71 100644
--- a/lapack-netlib/SRC/sbbcsd.f
+++ b/lapack-netlib/SRC/sbbcsd.f
@@ -805,7 +805,7 @@
                CALL SLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1),
      $                       WORK(IU2CS+I-1), R )
             ELSE IF( NU .LT. MU ) THEN
-               CALL SLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1),
+               CALL SLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1),
      $                       WORK(IU2SN+I-1) )
             ELSE
                CALL SLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1),

From 00d7476b4beba9cbd0cceac89e483e14664b9037 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 31 Dec 2023 12:39:21 +0100
Subject: [PATCH 539/718] Fix uninitialized read/wrong variable
 (Reference-LAPACK PR 967)

---
 lapack-netlib/SRC/zbbcsd.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/SRC/zbbcsd.f b/lapack-netlib/SRC/zbbcsd.f
index 757e98c71..6601f4a06 100644
--- a/lapack-netlib/SRC/zbbcsd.f
+++ b/lapack-netlib/SRC/zbbcsd.f
@@ -804,7 +804,7 @@
                CALL DLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1),
      $                       RWORK(IU2CS+I-1), R )
             ELSE IF( NU .LT. MU ) THEN
-               CALL DLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1),
+               CALL DLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1),
      $                       RWORK(IU2SN+I-1) )
             ELSE
                CALL DLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1),

From 4a15d7242037862053a6332844f3838cd1ec64e3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 31 Dec 2023 16:30:57 +0100
Subject: [PATCH 540/718] AzureCI: Update alpine-chroot-install (#4403)

* Update alpine-chroot-install
---
 azure-pipelines.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 317bc504a..3ae8615a7 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -288,9 +288,9 @@ jobs:
      vmImage: 'ubuntu-latest'
   steps:
   - script: |
-        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
-          && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc  alpine-chroot-install' | sha1sum -c \
-          || exit 1
+        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \
+        && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06  alpine-chroot-install' | sha1sum -c \
+        || exit 1
         alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
         sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
         alpine make DYNAMIC_ARCH=1 BINARY=64

From 504f9b0c5e4d7a9f6d87639d18dfb193e5b0b980 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 2 Jan 2024 18:46:21 +0100
Subject: [PATCH 541/718] Increase S/D GEMM PQ to match typical L2 size as
 forNeoverseV1

---
 param.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/param.h b/param.h
index f3731c491..469c38ce3 100644
--- a/param.h
+++ b/param.h
@@ -3359,13 +3359,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define ZGEMM_DEFAULT_UNROLL_M  4
 #define ZGEMM_DEFAULT_UNROLL_N  4
 
-#define SGEMM_DEFAULT_P	128
-#define DGEMM_DEFAULT_P	160
+#define SGEMM_DEFAULT_P	240
+#define DGEMM_DEFAULT_P	240
 #define CGEMM_DEFAULT_P 128
 #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_Q 352
-#define DGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 640
+#define DGEMM_DEFAULT_Q 320
 #define CGEMM_DEFAULT_Q 224
 #define ZGEMM_DEFAULT_Q 112
 

From 03713bc464bf20a57ad777a41820c608fb5d0e8c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 2 Jan 2024 22:08:49 +0100
Subject: [PATCH 542/718] Update Changelog for 0.3.26

---
 Changelog.txt | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index e0fe0ca5a..b6139d6b7 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,49 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.26
+ 2-Jan-2024
+
+general:
+- improved the version of openblas.pc that is created by the CMAKE build
+- fixed a CMAKE-specific build problem on older versions of MacOS
+- worked around linking problems on old versions of MacOS
+- corrected installation location of the lapacke_mangling header in CMAKE builds
+- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
+- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
+- imported additions and corrections from the Reference-LAPACK project:
+  - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
+  - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
+  - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
+  - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
+  - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
+  - fixed several memory leaks in the LAPACK testsuite (PR 953)
+  - fixed counting of testsuite results by the Python script (PR 954)
+   
+x86-64:
+- fixed computation of CASUM on SkylakeX and newer targets in the special
+  case that AVX512 is not supported by the compiler or operating environment
+- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
+- worked around a problem in the pre-AVX kernels for GEMV
+- sped up the thread management code on MS Windows
+
+arm64:
+- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
+- sped up the thread management code on MS Windows
+- sped up SGEMM and DGEMM on Neoverse V1 and N1
+- sped up ?DOT on SVE-capable targets
+- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
+- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
+
+power:
+- improved the SGEMM kernel for POWER10
+- fixed compilation with (very) old versions of gcc
+- fixed detection of old 32bit PPC targets in CMAKE-based builds
+- added autodetection of the POWERPC 7400 subtype
+- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
+
+loongarch64:
+- added and improved optimized kernels for almost all BLAS functions
+
 ====================================================================
 Version 0.3.25
  12-Nov-2023

From fde8bb990372980b840758bcef714bf565cdfb68 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 2 Jan 2024 22:24:33 +0100
Subject: [PATCH 543/718] Update version to 0.3.26

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3eac2edf..6d09f5658 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 25.dev)
+set(OpenBLAS_PATCH_VERSION 26)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

From 6c77e5e314474773a7749357b153caba4ec3817d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 2 Jan 2024 22:25:05 +0100
Subject: [PATCH 544/718] Update Makefile.rule

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index 58f02358e..daf2d958d 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.25.dev
+VERSION = 0.3.26
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From 4f5da84e2f447b466db2a3d468dbee8881cc23a0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 2 Jan 2024 22:32:27 +0100
Subject: [PATCH 545/718] Update version to 0.3.26.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d09f5658..5a1e4b271 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 26)
+set(OpenBLAS_PATCH_VERSION 26.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 

From 1412d2deeb32cfc1d80150eba520a5bba915f1c6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 2 Jan 2024 22:33:01 +0100
Subject: [PATCH 546/718] Update version to 0.3.26.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index daf2d958d..8dbf5eab6 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.26
+VERSION = 0.3.26.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From e48627c999e3aadfbcdb2b1f1589d4e75840dfca Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 6 Jan 2024 23:55:52 +0100
Subject: [PATCH 547/718] Add tests for ZSCAL with NaN and Inf arguments

---
 utest/CMakeLists.txt |  1 +
 utest/Makefile       |  2 +-
 utest/test_zscal.c   | 49 +++++++++++++++++++++++++++++++++++++++++++
 utest/utest_main2.c  | 50 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 utest/test_zscal.c

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 2e32827d3..c47954ce4 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -15,6 +15,7 @@ else ()
     test_dsdot.c
     test_dnrm2.c
     test_swap.c
+    test_zscal.c
   )
 endif ()
 
diff --git a/utest/Makefile b/utest/Makefile
index f99035440..d0715c754 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -11,7 +11,7 @@ UTESTBIN=openblas_utest
 
 include $(TOPDIR)/Makefile.system
 
-OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o
+OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o
 #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o
 
 ifneq ($(NO_LAPACK), 1)
diff --git a/utest/test_zscal.c b/utest/test_zscal.c
new file mode 100644
index 000000000..b6310439f
--- /dev/null
+++ b/utest/test_zscal.c
@@ -0,0 +1,49 @@
+#include "openblas_utest.h"
+#include <cblas.h>
+#ifdef BUILD_COMPLEX16
+
+CTEST(zscal, i_nan)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+    cblas_zscal(9, i, &nan, 1);
+    ASSERT_TRUE(isnan(nan[0]));
+    ASSERT_TRUE(isnan(nan[1]));
+    ASSERT_TRUE(isnan(nan[16]));
+    ASSERT_TRUE(isnan(nan[17]));
+}
+
+CTEST(zscal, nan_i)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+    cblas_zscal(9, &nan, &i, 1);
+    ASSERT_TRUE(isnan(i[0]));
+    ASSERT_TRUE(isnan(i[1]));
+    ASSERT_TRUE(isnan(i[16]));
+    ASSERT_TRUE(isnan(i[17]));
+}
+	    
+CTEST(zscal, i_inf)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+    cblas_zscal(9, i, &inf, 1);
+    ASSERT_TRUE(isnan(inf[0]));
+    ASSERT_TRUE(isinf(inf[1]));
+    ASSERT_TRUE(isnan(inf[16]));
+    ASSERT_TRUE(isinf(inf[17]));
+}    
+
+CTEST(zscal, inf_i)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+    cblas_zscal(9, &inf, &i, 1);
+    ASSERT_TRUE(isnan(i[0]));
+    ASSERT_TRUE(isinf(i[1]));
+    ASSERT_TRUE(isnan(i[16]));
+    ASSERT_TRUE(isinf(i[17]));
+}
+
+#endif
diff --git a/utest/utest_main2.c b/utest/utest_main2.c
index 4382bf159..8cb663190 100644
--- a/utest/utest_main2.c
+++ b/utest/utest_main2.c
@@ -617,6 +617,51 @@ CTEST(max, smax_zero){
   ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
 }
 
+
+CTEST(zscal, i_nan)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+    cblas_zscal(9, i, &nan, 1);
+    ASSERT(isnan(nan[0]);
+    ASSERT(isnan(nan[1]);
+    ASSERT(isnan(nan[16]);
+    ASSERT(isnan(nan[17]);
+}
+
+CTEST(zscal, nan_i)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+    cblas_zscal(9, &nan, &i, 1);
+    ASSERT(isnan(i[0]);
+    ASSERT(isnan(i[1]);
+    ASSERT(isnan(i[16]);
+    ASSERT(isnan(i[17]);
+	    }    
+	    
+CTEST(zscal, i_inf)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+    cblas_zscal(9, i, &inf, 1);
+    ASSERT(isnan(inf[0]);
+    ASSERT(isinf(inf[1]);
+    ASSERT(isnan(inf[16]);
+    ASSERT(isinf(inf[17]);
+}    
+
+CTEST(zscal, inf_i)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+    cblas_zscal(9, &inf, &i, 1);
+    ASSERT(isnan(i[0]);
+    ASSERT(isinf(i[1]);
+    ASSERT(isnan(i[16]);
+    ASSERT(isinf(i[17]);
+}
+
 int main(int argc, const char ** argv){
 
   CTEST_ADD (amax, samax);
@@ -648,7 +693,10 @@ int main(int argc, const char ** argv){
   CTEST_ADD (swap,zswap_inc_0);
   CTEST_ADD (swap,sswap_inc_0);
   CTEST_ADD (swap,cswap_inc_0);
-
+  CTEST_ADD (zscal, i_nan);
+  CTEST_ADD (zscal, nan_i);
+  CTEST_ADD (zscal, i_inf);
+  CTEST_ADD (zscal, inf_i);
   int num_fail=0;
 
   num_fail=ctest_main(argc, argv);

From def4996170815ced665af2d1d8bbc08490feb809 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 15:29:42 +0100
Subject: [PATCH 548/718] Fix handling of NAN and INF arguments

---
 kernel/x86_64/zscal.c      | 61 +++++++++++++++++++-------------------
 kernel/x86_64/zscal_sse2.S |  1 +
 2 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c
index 45e3531b8..b8d2a8754 100644
--- a/kernel/x86_64/zscal.c
+++ b/kernel/x86_64/zscal.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #include "common.h"
-
+#include <float.h>
 
 #if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
 #include "zscal_microk_skylakex-2.c"
@@ -222,12 +222,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 		if ( da_r == 0.0 )
 		{
-
 			BLASLONG n1 = n & -2;
 
 			if ( da_i == 0.0 )
 			{
-
 				while(j < n1)
 				{
 			
@@ -253,7 +251,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 			}
 			else
 			{
-
 				while(j < n1)
 				{
 			
@@ -356,49 +353,59 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 		alpha[0] = da_r;
 		alpha[1] = da_i;
-	
+
 		if ( da_r == 0.0 )
 			if ( da_i == 0 )
 				zscal_kernel_8_zero(n1 , alpha , x);
 			else
-				zscal_kernel_8_zero_r(n1 , alpha , x);
+//				zscal_kernel_8_zero_r(n1 , alpha , x);
+				zscal_kernel_8(n1 , alpha , x);
 		else
-			if ( da_i == 0 )
+			if ( da_i == 0 && da_r == da_r)
 				zscal_kernel_8_zero_i(n1 , alpha , x);
 			else
 				zscal_kernel_8(n1 , alpha , x);
-
+		}
 		i = n1 << 1;
 		j = n1;
-	}
-
-
-	if ( da_r == 0.0 )
+	
+	if ( da_r == 0.0 || da_r != da_r )
 	{
-
 		if ( da_i == 0.0 )
 		{
-
+		FLOAT res=0.0;
+		if (da_r != da_r) res= da_r;
 			while(j < n)
 			{
-		
-					x[i]=0.0;
-					x[i+1]=0.0;
+					x[i]=res;
+					x[i+1]=res;
 					i += 2 ;
 					j++;
 
 			}
 
 		}
-		else
+		else if (da_r < -FLT_MAX || da_r > FLT_MAX) { 
+			while(j < n)
+			{
+					x[i]=SNAN;
+					x[i+1]=da_r;
+					i += 2 ;
+					j++;
+
+			}
+
+		} else
 		{
 
 			while(j < n)
 			{
-			
 				temp0        = -da_i * x[i+1];
+				if (x[i] < -FLT_MAX || x[i] > FLT_MAX)
+					temp0 = SNAN;
 				x[i+1]       =  da_i * x[i];
-				x[i]         =  temp0;
+				if ( x[i] == x[i]) //preserve NaN 
+				  x[i]         =  temp0;
 				i += 2 ;
 				j++;
 
@@ -409,12 +416,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 	}
 	else
 	{
-
-		if ( da_i == 0.0 )
+		if (da_i == 0.0)
 		{
-
-			while(j < n)
-			{
+				while(j < n)
+				{
 			
 					temp0        =  da_r * x[i];
 					x[i+1]       =  da_r * x[i+1];
@@ -422,15 +427,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 					i += 2 ;
 					j++;
 
-			}
-
+				}
 		}
 		else
 		{
 
 			while(j < n)
 			{
-
 				temp0        =  da_r * x[i]   - da_i * x[i+1];
 				x[i+1]       =  da_r * x[i+1] + da_i * x[i];
 				x[i]         =  temp0;
@@ -445,5 +448,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 	return(0);
 }
-
-
diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S
index 223b1e439..d6a49136d 100644
--- a/kernel/x86_64/zscal_sse2.S
+++ b/kernel/x86_64/zscal_sse2.S
@@ -82,6 +82,7 @@
 	pxor	%xmm15, %xmm15
 	comisd	%xmm0, %xmm15
 	jne	.L100
+	jp	.L100
 
 	comisd	%xmm1, %xmm15
 	jne	.L100

From c9df62e88375fae17adf555d73a757e2e8f65cab Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 17:49:40 +0100
Subject: [PATCH 549/718] Fix handling of NAN

---
 kernel/arm64/zscal.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S
index 929455975..4bd43320d 100644
--- a/kernel/arm64/zscal.S
+++ b/kernel/arm64/zscal.S
@@ -223,7 +223,7 @@ zscal_begin:
 	fcmp	DA_I, #0.0
 	beq	.Lzscal_kernel_RI_zero
 
-	b	.Lzscal_kernel_R_zero
+//	b	.Lzscal_kernel_R_zero
 
 .Lzscal_kernel_R_non_zero:
 

From d3d99c34f24a426bd90dfd97f76683dcd0eb2006 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 17:56:51 +0100
Subject: [PATCH 550/718] Fix handling of NAN and INF

---
 kernel/mips/zscal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c
index bca1155c1..d8cbffcf9 100644
--- a/kernel/mips/zscal.c
+++ b/kernel/mips/zscal.c
@@ -63,6 +63,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 				x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
 			}
 		}
+		if (!isnan(x_ip] && !isinf(x[ip]))
 		x[ip]   = temp;
 
 		ip += inc_x2;

From 711433fcf0257cd07cf728c64e175960d0da38a2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 18:01:58 +0100
Subject: [PATCH 551/718] Update zscal.c

---
 kernel/mips/zscal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c
index d8cbffcf9..127f4f444 100644
--- a/kernel/mips/zscal.c
+++ b/kernel/mips/zscal.c
@@ -63,9 +63,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 				x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
 			}
 		}
-		if (!isnan(x_ip] && !isinf(x[ip]))
+//		if (!isnan(x_ip] && !isinf(x[ip]))
 		x[ip]   = temp;
-
+		if ( da_r != da_r ) x[ip] = da_r;
 		ip += inc_x2;
 	}
 

From 903589f84b97b3975fa29a0f2a2fcef418040721 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 18:37:00 +0100
Subject: [PATCH 552/718] Update zscal.c

---
 kernel/mips/zscal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c
index 127f4f444..e76fb1ff1 100644
--- a/kernel/mips/zscal.c
+++ b/kernel/mips/zscal.c
@@ -63,7 +63,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 				x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
 			}
 		}
-//		if (!isnan(x_ip] && !isinf(x[ip]))
+//		if (!isnan(x[ip]) && !isinf(x[ip]))
 		x[ip]   = temp;
 		if ( da_r != da_r ) x[ip] = da_r;
 		ip += inc_x2;

From 0c33b57f5fb3721b2c6aa6fb5c7ddd09e17d6542 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 18:40:19 +0100
Subject: [PATCH 553/718] Handle NAN in input

---
 kernel/riscv64/zscal_vector.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index d275b75f8..24b44952d 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -59,7 +59,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 
         unsigned int gvl = 0;
         FLOAT_V_T vt, v0, v1;
-        if(da_r == 0.0 && da_i == 0.0){
+        if (isnan(da_r)) {
+                gvl = VSETVL(n);
+                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+                BLASLONG inc_xv = inc_x * 2 * gvl;
+                vt = VFMVVF_FLOAT(da_r, gvl);
+                for(i=0,j=0; i < n/(gvl*2); i++){
+                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl);
+
+                        j += gvl*2;
+                        ix += inc_xv*2;
+                }
+                for(; j < n; ){
+                        gvl = VSETVL(n-j);
+                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
+                        VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
+                        j += gvl;
+                        ix += inc_x * 2 * gvl;
+                }  
+        else if(da_r == 0.0 && da_i == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * 2 * gvl;

From b08a208365f0bfbc8c85b85b4401d3a05f57a222 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 19:14:41 +0100
Subject: [PATCH 554/718] Update zscal_vector.c

---
 kernel/riscv64/zscal_vector.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 24b44952d..33cb5d7ea 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -80,7 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
                         j += gvl;
                         ix += inc_x * 2 * gvl;
                 }  
-        else if(da_r == 0.0 && da_i == 0.0){
+        } else if(da_r == 0.0 && da_i == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * 2 * gvl;

From 2173356d5b63ea5e5f861bfebea387bc41d9e3b0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 19:45:28 +0100
Subject: [PATCH 555/718] Update zscal_vector.c

---
 kernel/riscv64/zscal_vector.c | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 33cb5d7ea..a943135df 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -59,28 +59,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 
         unsigned int gvl = 0;
         FLOAT_V_T vt, v0, v1;
-        if (isnan(da_r)) {
-                gvl = VSETVL(n);
-                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-                BLASLONG inc_xv = inc_x * 2 * gvl;
-                vt = VFMVVF_FLOAT(da_r, gvl);
-                for(i=0,j=0; i < n/(gvl*2); i++){
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl);
 
-                        j += gvl*2;
-                        ix += inc_xv*2;
-                }
-                for(; j < n; ){
-                        gvl = VSETVL(n-j);
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
-                        j += gvl;
-                        ix += inc_x * 2 * gvl;
-                }  
-        } else if(da_r == 0.0 && da_i == 0.0){
+        if(da_r == 0.0 && da_i == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * 2 * gvl;
@@ -101,6 +81,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
                         j += gvl;
                         ix += inc_x * 2 * gvl;
                 }
+#if 0
         }else if(da_r == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
@@ -129,6 +110,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
                         VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
                         VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
                 }
+    #endif
         }else if(da_i == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);

From 91bbde7f64ef9d176e69416897ce5e255c351ec8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 20:21:05 +0100
Subject: [PATCH 556/718] revert accidental direct commit to develop

---
 kernel/mips/zscal.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c
index e76fb1ff1..bca1155c1 100644
--- a/kernel/mips/zscal.c
+++ b/kernel/mips/zscal.c
@@ -63,9 +63,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 				x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
 			}
 		}
-//		if (!isnan(x[ip]) && !isinf(x[ip]))
 		x[ip]   = temp;
-		if ( da_r != da_r ) x[ip] = da_r;
+
 		ip += inc_x2;
 	}
 

From f052bd4705210ef6c7293447b236f3a634799ba1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 20:21:48 +0100
Subject: [PATCH 557/718] revert accidental direct commit to develop

---
 kernel/riscv64/zscal_vector.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index a943135df..d275b75f8 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -59,7 +59,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 
         unsigned int gvl = 0;
         FLOAT_V_T vt, v0, v1;
-
         if(da_r == 0.0 && da_i == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
@@ -81,7 +80,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
                         j += gvl;
                         ix += inc_x * 2 * gvl;
                 }
-#if 0
         }else if(da_r == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
@@ -110,7 +108,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
                         VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
                         VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
                 }
-    #endif
         }else if(da_i == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);

From acf17a825dadf070d595b901ae10d7f21db5aa25 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 20:26:16 +0100
Subject: [PATCH 558/718] Handle NAN in input

---
 kernel/mips/zscal.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c
index bca1155c1..d69477f97 100644
--- a/kernel/mips/zscal.c
+++ b/kernel/mips/zscal.c
@@ -63,8 +63,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 				x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
 			}
 		}
-		x[ip]   = temp;
-
+		if ( da_r != da_r ) 
+			x[ip] = da_r;
+		else
+			x[ip]   = temp;
+		
 		ip += inc_x2;
 	}
 

From f0808d856b69d750facf288460b65f0e9b0dde65 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 20:27:29 +0100
Subject: [PATCH 559/718] Handle NAN in input

---
 kernel/riscv64/zscal_vector.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index d275b75f8..77f4fc312 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -80,6 +80,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
                         j += gvl;
                         ix += inc_x * 2 * gvl;
                 }
+#if 0
         }else if(da_r == 0.0){
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
@@ -97,6 +98,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
                         j += gvl;
                         ix += inc_xv;
                 }
+#endif
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);

From cf8b03ae8b59d41ea74fd7f8f52efe9fed897c23 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 7 Jan 2024 23:09:57 +0100
Subject: [PATCH 560/718] Use NAN rather than SNAN for portability

---
 kernel/x86_64/zscal.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c
index b8d2a8754..66c8a0d2b 100644
--- a/kernel/x86_64/zscal.c
+++ b/kernel/x86_64/zscal.c
@@ -388,8 +388,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 		else if (da_r < -FLT_MAX || da_r > FLT_MAX) { 
 			while(j < n)
 			{
-					x[i]=SNAN;
-					x[i+1]=da_r;
+					x[i]= NAN;
+					x[i+1] = da_r;
 					i += 2 ;
 					j++;
 
@@ -402,7 +402,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 			{
 				temp0        = -da_i * x[i+1];
 				if (x[i] < -FLT_MAX || x[i] > FLT_MAX)
-					temp0 = SNAN;
+					temp0 = NAN;
 				x[i+1]       =  da_i * x[i];
 				if ( x[i] == x[i]) //preserve NaN 
 				  x[i]         =  temp0;

From 5e7f714e93848c72faa2d2e658b8584aa395f603 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 8 Jan 2024 08:17:40 +0100
Subject: [PATCH 561/718] Update zscal.c

---
 kernel/mips/zscal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c
index d69477f97..b117db1dd 100644
--- a/kernel/mips/zscal.c
+++ b/kernel/mips/zscal.c
@@ -47,6 +47,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 			else
 			{
 				temp = - da_i * x[ip+1] ;
+				if (isnan(x[ip]) || isinf(x[ip])) temp = x[ip];
 				x[ip+1] = da_i * x[ip]  ;
 			}
 		}

From 25b0c48082a34bb2e5e9beebfdb65810e2e5affc Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 8 Jan 2024 09:49:18 +0100
Subject: [PATCH 562/718] Update zscal.c

---
 kernel/mips/zscal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c
index b117db1dd..7bb261941 100644
--- a/kernel/mips/zscal.c
+++ b/kernel/mips/zscal.c
@@ -47,7 +47,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 			else
 			{
 				temp = - da_i * x[ip+1] ;
-				if (isnan(x[ip]) || isinf(x[ip])) temp = x[ip];
+				if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
 				x[ip+1] = da_i * x[ip]  ;
 			}
 		}

From f637e1271391765b4e9fc87b958ecb4fd7541460 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 8 Jan 2024 09:52:38 +0100
Subject: [PATCH 563/718] Handle INF and NAN

---
 kernel/riscv64/zscal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c
index 0521aaa0b..b2d537d04 100644
--- a/kernel/riscv64/zscal.c
+++ b/kernel/riscv64/zscal.c
@@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 			else
 			{
 				temp = - da_i * x[ip+1] ;
+				if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
 				x[ip+1] = da_i * x[ip]  ;
 			}
 		}

From 7ee1ee38e22904837afd3e51dd67f91cd499185f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 8 Jan 2024 14:20:07 +0100
Subject: [PATCH 564/718] Handle NaN in input

---
 kernel/x86/zscal_sse.S  | 1 +
 kernel/x86/zscal_sse2.S | 1 +
 2 files changed, 2 insertions(+)

diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S
index e011c98f5..89e36251e 100644
--- a/kernel/x86/zscal_sse.S
+++ b/kernel/x86/zscal_sse.S
@@ -87,6 +87,7 @@
 	xorps	%xmm7, %xmm7
 	comiss	%xmm0, %xmm7
 	jne	.L100		# Alpha_r != ZERO
+	jp	.L100		# Alpha_r NaN
 
 	comiss	%xmm1, %xmm7
 	jne	.L100		# Alpha_i != ZERO
diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S
index cc7ab6686..0bc61b209 100644
--- a/kernel/x86/zscal_sse2.S
+++ b/kernel/x86/zscal_sse2.S
@@ -98,6 +98,7 @@
 	xorps	%xmm7, %xmm7
 	comisd	%xmm0, %xmm7
 	jne	.L100
+	jp	.L100
 
 	comisd	%xmm1, %xmm7
 	jne	.L100

From 1c31f56e5a23f6dd7ebfd35014fc5a604e4702cd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 8 Jan 2024 16:11:25 +0100
Subject: [PATCH 565/718] Handle NAN

---
 kernel/x86/zscal.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S
index 1eb518563..adb56edb9 100644
--- a/kernel/x86/zscal.S
+++ b/kernel/x86/zscal.S
@@ -98,7 +98,7 @@
 	fcomip	%st(1), %st
 	ffreep	%st(0)
 	jne	.L30
-
+jp .L30
 	EMMS
 
 	pxor	%mm0, %mm0

From 058dd2a4cb4a6139796e29629191db5f2899fa1e Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Mon, 8 Jan 2024 14:16:09 -0600
Subject: [PATCH 566/718] Replace two vector loads with one vector pair load
 and fix endianess of stores - DGEMM versions.

---
 kernel/power/dgemm_tcopy_16_power8.S         |   3 +
 kernel/power/dgemm_tcopy_macros_16_power10.S | 655 +++++++++++++++++++
 kernel/power/dgemm_tcopy_macros_16_power8.S  |   6 +
 3 files changed, 664 insertions(+)
 create mode 100644 kernel/power/dgemm_tcopy_macros_16_power10.S

diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S
index fc52e0202..7a3c04e8e 100644
--- a/kernel/power/dgemm_tcopy_16_power8.S
+++ b/kernel/power/dgemm_tcopy_16_power8.S
@@ -107,6 +107,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define o0	0
 
+#ifdef POWER10
+#include "dgemm_tcopy_macros_16_power10.S"
+#endif
 #include "dgemm_tcopy_macros_16_power8.S"
 
 #define STACKSIZE 144
diff --git a/kernel/power/dgemm_tcopy_macros_16_power10.S b/kernel/power/dgemm_tcopy_macros_16_power10.S
new file mode 100644
index 000000000..3cb416e09
--- /dev/null
+++ b/kernel/power/dgemm_tcopy_macros_16_power10.S
@@ -0,0 +1,655 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*	 LAPACK-TEST		: OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
+.macro COPY_4x16
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+    lxvpx       vs40,   o0,     A1
+    lxvpx       vs42,   o32,    A1
+	addi		A1,	A1,	64
+
+    lxvpx       vs48,   o0,     A2
+    lxvpx       vs50,   o32,    A2
+	addi		A2,	A2,	64
+
+    lxvpx       vs4,    o0,     A3
+    lxvpx       vs6,    o32,    A3
+	addi		A3,	A3,	64
+
+    lxvpx       vs36,   o0,     A0
+    lxvpx       vs38,   o32,    A0
+	addi		A0,	A0,	64
+
+    lxvpx       vs44,   o0,     A1
+    lxvpx       vs46,   o32,    A1
+	addi		A1,	A1,	64
+
+    lxvpx       vs12,   o0,     A2
+    lxvpx       vs2,    o32,    A2
+	addi		A2,	A2,	64
+
+    lxvpx       vs8,    o0,     A3
+    lxvpx       vs10,   o32,    A3
+	addi		A3,	A3,	64
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+#else
+    stxvd2x     vs41,   o0, T1
+    stxvd2x     vs40,   o16,    T1
+    stxvd2x     vs43,   o32,    T1
+    stxvd2x     vs42,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+#else
+    stxvd2x     vs45,   o0, T1
+    stxvd2x     vs44,   o16,    T1
+    stxvd2x     vs47,   o32,    T1
+    stxvd2x     vs46,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs48,	o0,	T1
+	stxvd2x		vs49,	o16,	T1
+	stxvd2x		vs50,	o32,	T1
+	stxvd2x		vs51,	o48,	T1
+#else
+    stxvd2x     vs49,   o0, T1
+    stxvd2x     vs48,   o16,    T1
+    stxvd2x     vs51,   o32,    T1
+    stxvd2x     vs50,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs12,	o0,	T1
+	stxvd2x		vs13,	o16,	T1
+	stxvd2x		vs2,	o32,	T1
+	stxvd2x		vs3,	o48,	T1
+#else
+    stxvd2x     vs13,   o0, T1
+    stxvd2x     vs12,   o16,    T1
+    stxvd2x     vs3,    o32,    T1
+    stxvd2x     vs2,    o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs4,	o0,	T1
+	stxvd2x		vs5,	o16,	T1
+	stxvd2x		vs6,	o32,	T1
+	stxvd2x		vs7,	o48,	T1
+#else
+    stxvd2x     vs5,    o0, T1
+    stxvd2x     vs4,    o16,    T1
+    stxvd2x     vs7,    o32,    T1
+    stxvd2x     vs6,    o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs8,	o0,	T1
+	stxvd2x		vs9,	o16,	T1
+	stxvd2x		vs10,	o32,	T1
+	stxvd2x		vs11,	o48,	T1
+#else
+    stxvd2x     vs9,    o0, T1
+    stxvd2x     vs8,    o16,    T1
+    stxvd2x     vs11,   o32,    T1
+    stxvd2x     vs10,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
+.macro COPY_4x8
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+
+    lxvpx       vs36,   o0,     A1
+    lxvpx       vs38,   o32,    A1
+	addi		A1,	A1,	64
+
+
+    lxvpx       vs40,   o0,     A2
+    lxvpx       vs42,   o32,    A2
+	addi		A2,	A2,	64
+
+
+    lxvpx       vs44,   o0,     A3
+    lxvpx       vs46,   o32,    A3
+	addi		A3,	A3,	64
+
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+#else
+    stxvd2x     vs41,   o0, T1
+    stxvd2x     vs40,   o16,    T1
+    stxvd2x     vs43,   o32,    T1
+    stxvd2x     vs42,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+#else
+    stxvd2x     vs45,   o0, T1
+    stxvd2x     vs44,   o16,    T1
+    stxvd2x     vs47,   o32,    T1
+    stxvd2x     vs46,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x4', `
+#else
+.macro COPY_4x4
+#endif
+
+    lxvpx       vs32,   o0,     A0
+	addi		A0,	A0,	32
+
+
+    lxvpx       vs34,   o0,     A1
+	addi		A1,	A1,	32
+
+
+    lxvpx       vs36,   o0,     A2
+	addi		A2,	A2,	32
+
+
+    lxvpx       vs38,   o0,     A3
+	addi		A3,	A3,	32
+
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
+.macro COPY_2x16
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+    lxvpx       vs36,   o0,     A0
+    lxvpx       vs38,   o32,    A0
+	addi		A0,	A0,	64
+
+
+    lxvpx       vs40,   o0,     A1
+    lxvpx       vs42,   o32,    A1
+	addi		A1,	A1,	64
+
+    lxvpx       vs44,   o0,     A1
+    lxvpx       vs46,   o32,    A1
+	addi		A1,	A1,	64
+
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+#else
+    stxvd2x     vs41,   o0, T1
+    stxvd2x     vs40,   o16,    T1
+    stxvd2x     vs43,   o32,    T1
+    stxvd2x     vs42,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+#else
+    stxvd2x     vs45,   o0, T1
+    stxvd2x     vs44,   o16,    T1
+    stxvd2x     vs47,   o32,    T1
+    stxvd2x     vs46,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
+.macro COPY_2x8
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+
+    lxvpx       vs36,   o0,     A1
+    lxvpx       vs38,   o0,     A1
+	addi		A1,	A1,	64
+
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x4', `
+#else
+.macro COPY_2x4
+#endif
+
+    lxvpx       vs32,   o0,     A0
+	addi		A0,	A0,	32
+
+
+    lxvpx       vs34,   o0,     A1
+	addi		A1,	A1,	32
+
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
+.macro COPY_1x16
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+    lxvpx       vs36,   o0,     A0
+    lxvpx       vs38,   o0,     A0
+	addi		A0,	A0,	64
+
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
+.macro COPY_1x8
+#endif
+
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x4', `
+#else
+.macro COPY_1x4
+#endif
+
+    lxvpx       vs32,   o0,     A0
+	addi		A0,	A0,	32
+
+
+	mr		T1,	BO
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S
index 6c5b8ed62..93b488fd0 100644
--- a/kernel/power/dgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/dgemm_tcopy_macros_16_power8.S
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_4x16', `
 #else
@@ -275,6 +276,7 @@ define(`COPY_4x4', `
 #else
 .endm
 #endif
+#endif
 
 
 /**********************************************************************************************
@@ -369,6 +371,7 @@ define(`COPY_4x1', `
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_2x16', `
 #else
@@ -512,6 +515,7 @@ define(`COPY_2x4', `
 #else
 .endm
 #endif
+#endif
 
 
 /**********************************************************************************************
@@ -580,6 +584,7 @@ define(`COPY_2x1', `
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_1x16', `
 #else
@@ -675,6 +680,7 @@ define(`COPY_1x4', `
 #else
 .endm
 #endif
+#endif
 
 
 /**********************************************************************************************

From 7df363e1e2daef0916faae7d0b399fc998e5433e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 12 Jan 2024 00:08:52 +0100
Subject: [PATCH 567/718] temporarily disable the MSA C/ZSCAL kernels

---
 kernel/mips/KERNEL.P5600 | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600
index 9a6e06d67..f0fb5e087 100644
--- a/kernel/mips/KERNEL.P5600
+++ b/kernel/mips/KERNEL.P5600
@@ -103,8 +103,10 @@ endif
 ifdef HAVE_MSA
 SSCALKERNEL  = ../mips/sscal_msa.c
 DSCALKERNEL  = ../mips/dscal_msa.c
-CSCALKERNEL  = ../mips/cscal_msa.c
-ZSCALKERNEL  = ../mips/zscal_msa.c
+#CSCALKERNEL  = ../mips/cscal_msa.c
+#ZSCALKERNEL  = ../mips/zscal_msa.c
+CSCALKERNEL = ../mips/zscal.c
+ZSCALKERNEL = ../mips/zscal.c
 else
 SSCALKERNEL  = ../mips/scal.c
 DSCALKERNEL  = ../mips/scal.c

From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 12 Jan 2024 00:10:56 +0100
Subject: [PATCH 568/718] Add compiler test and flag for AVX512BF16 capability

---
 c_check | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/c_check b/c_check
index b5e4a9ad0..3e507be81 100755
--- a/c_check
+++ b/c_check
@@ -244,6 +244,7 @@ case "$data" in
 esac
 
 no_avx512=0
+no_avx512bf=0
 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
     tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
@@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
     }
 
     rm -rf "$tmpd"
+    if [ "$no_avx512" -eq 0 ]; then
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
+    tmpf="$tmpd/a.c"
+    code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
+    printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
+    if [ "$compiler" = "PGI" ]; then
+        args=" -tp cooperlake -c -o $tmpf.o $tmpf"
+    else
+        args=" -march=cooperlake -c -o $tmpf.o $tmpf"
+    fi
+    no_avx512bf=0
+    {
+        $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        no_avx512bf=1
+    }
+
+    rm -rf "$tmpd"
+  fi
 fi
 
 no_rv64gv=0
@@ -409,6 +429,7 @@ done
  [ "$makefile" = "-" ] && {
     [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
     [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+    [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
     [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
     [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
     exit 0
@@ -437,6 +458,7 @@ done
     [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
     [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
     [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+    [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
     [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
     [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
     [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"

From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 12 Jan 2024 00:12:46 +0100
Subject: [PATCH 569/718] Make AVX512 BFLOAT16 kernels conditional on compiler
 capability

---
 kernel/x86_64/KERNEL.COOPERLAKE     | 3 ++-
 kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE
index dba94aea8..22b042029 100644
--- a/kernel/x86_64/KERNEL.COOPERLAKE
+++ b/kernel/x86_64/KERNEL.COOPERLAKE
@@ -1,5 +1,5 @@
 include $(KERNELDIR)/KERNEL.SKYLAKEX
-
+ifneq ($(NO_AVX512BF16), 1)
 SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c
 SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c
 SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c
@@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ =  sbgemm_incopy$(TSUFFIX).$(SUFFIX)
 SBGEMMITCOPYOBJ =  sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SBGEMMONCOPYOBJ =  sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
index 3a832e917..0ab2b4ddc 100644
--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
@@ -1,5 +1,6 @@
 include $(KERNELDIR)/KERNEL.COOPERLAKE
 
+ifneq ($(NO_AVX512BF16), 1)
 SBGEMM_SMALL_M_PERMIT =
 SBGEMM_SMALL_K_NN     =
 SBGEMM_SMALL_K_B0_NN  =
@@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ =  sbgemm_incopy$(TSUFFIX).$(SUFFIX)
 SBGEMMITCOPYOBJ =  sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SBGEMMONCOPYOBJ =  sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif

From d1ead06bd8707723e643099aa1072aefa914f660 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 12 Jan 2024 09:29:13 +0100
Subject: [PATCH 570/718] define NAN and INFINITY if needed

---
 utest/test_zscal.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/utest/test_zscal.c b/utest/test_zscal.c
index b6310439f..8992eee90 100644
--- a/utest/test_zscal.c
+++ b/utest/test_zscal.c
@@ -2,6 +2,13 @@
 #include <cblas.h>
 #ifdef BUILD_COMPLEX16
 
+#ifndef NAN
+#define NAN 0.0/0.0
+#endif
+#ifndef INFINITY
+#define INFINITY 1.0/0.0
+#endif
+
 CTEST(zscal, i_nan)
 {
     double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };

From b57627c27f027692a4cc9725784e1b0e2e51bc49 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 12 Jan 2024 12:03:08 +0100
Subject: [PATCH 571/718] Handle NAN and INF

---
 kernel/zarch/zscal.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c
index d39b8447e..d019d66df 100644
--- a/kernel/zarch/zscal.c
+++ b/kernel/zarch/zscal.c
@@ -233,9 +233,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
         while (j < n1) {
 
-          temp0 = -da_i * x[i + 1];
+	  if (isnan(x[i]) || isinf(x[i]))
+		temp0 = NAN;
+	  else
+		temp0 = -da_i * x[i + 1];
           x[i + 1] = da_i * x[i];
           x[i] = temp0;
+	  if (isnan(x[i + inc_x]) || isinf(x[i + inc_x]))
+		temp1 = NAN;
+	  else
           temp1 = -da_i * x[i + 1 + inc_x];
           x[i + 1 + inc_x] = da_i * x[i + inc_x];
           x[i + inc_x] = temp1;
@@ -246,7 +252,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
         while (j < n) {
 
-          temp0 = -da_i * x[i + 1];
+	  if (isnan(x[i]) || isinf(x[i]))
+		temp0 = NAN;
+	  else
+          	temp0 = -da_i * x[i + 1];
           x[i + 1] = da_i * x[i];
           x[i] = temp0;
           i += inc_x;
@@ -347,7 +356,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
       while (j < n) {
 
-        temp0 = -da_i * x[i + 1];
+	if (isnan(x[i]) || isinf(x[i]))
+		temp0 = NAN;
+	  else
+        	temp0 = -da_i * x[i + 1];
         x[i + 1] = da_i * x[i];
         x[i] = temp0;
         i += 2;

From 20413ee6ecb05b4746e107ec61614178e498509b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 12 Jan 2024 13:11:13 +0100
Subject: [PATCH 572/718] Update zscal.c

---
 kernel/zarch/zscal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c
index d019d66df..4160a1a76 100644
--- a/kernel/zarch/zscal.c
+++ b/kernel/zarch/zscal.c
@@ -329,7 +329,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
       if (da_i == 0)
         zscal_kernel_8_zero(n1, x);
       else
-        zscal_kernel_8_zero_r(n1, alpha, x);
+        zscal_kernel_8(n1, da_r, da_i, x);
     else if (da_i == 0)
       zscal_kernel_8_zero_i(n1, alpha, x);
     else

From 2e2e538b7c493b727ed9ff160e18f086f5e17384 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 13 Jan 2024 20:02:43 +0100
Subject: [PATCH 573/718] Add openblas_set_num_threads_local() and use of
 blas_omp_threads_local in OMP parallel regions

---
 cblas.h         | 1 +
 common_thread.h | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cblas.h b/cblas.h
index 8a5055cf8..ade2fca3a 100644
--- a/cblas.h
+++ b/cblas.h
@@ -12,6 +12,7 @@ extern "C" {
 /*Set the number of threads on runtime.*/
 void openblas_set_num_threads(int num_threads);
 void goto_set_num_threads(int num_threads);
+int openblas_set_num_threads_local(int num_threads);
 
 /*Get the number of threads on runtime.*/
 int openblas_get_num_threads(void);
diff --git a/common_thread.h b/common_thread.h
index 9e7dae74a..d37fcb189 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -137,19 +137,20 @@ typedef struct blas_queue {
 
 extern int blas_server_avail;
 extern int blas_omp_number_max;
+extern int blas_omp_threads_local;
 
 static __inline int num_cpu_avail(int level) {
 
 #ifdef USE_OPENMP
 int openmp_nthreads;
 	openmp_nthreads=omp_get_max_threads();
+	if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
 #endif
 
 #ifndef USE_OPENMP 
   if (blas_cpu_number == 1
-#endif
-#ifdef USE_OPENMP
-     if (openmp_nthreads == 1 || omp_in_parallel()
+#else
+     if (openmp_nthreads == 1 
 #endif
       ) return 1;        
 

From 87d31af2ae8ef42c924d33633968af6262ab4344 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 13 Jan 2024 20:06:24 +0100
Subject: [PATCH 574/718] Add openblas_set_num_threads_local()

---
 driver/others/blas_server_omp.c          |  1 +
 driver/others/openblas_set_num_threads.c | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 213531057..4d3d1e0de 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -69,6 +69,7 @@
 
 int blas_server_avail = 0;
 int blas_omp_number_max = 0;
+int blas_omp_threads_local = 1;
 
 extern int openblas_omp_adaptive_env(void);
 
diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c
index 0b57867b0..c938fde69 100644
--- a/driver/others/openblas_set_num_threads.c
+++ b/driver/others/openblas_set_num_threads.c
@@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef SMP_SERVER
 
 extern  void openblas_set_num_threads(int num_threads) ;
+extern  int openblas_get_num_threads(void) ;
 
 void openblas_set_num_threads_(int* num_threads){
 	openblas_set_num_threads(*num_threads);
 }
 
+int openblas_set_num_threads_local(int num_threads){
+	int ret = openblas_get_num_threads();
+	openblas_set_num_threads(num_threads);
+	blas_omp_threads_local=num_threads;
+	return ret;
+}
+
+
 #else
 //Single thread
 
@@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) {
 void openblas_set_num_threads_(int* num_threads){
 
 }
+
+int openblas_set_num_threads_local(int num_threads){
+	return 1;
+}
 #endif

From bf66af3dc0060af9a7b001470a5300efb8f8313b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 13 Jan 2024 20:37:36 +0100
Subject: [PATCH 575/718] remove matrix dimension 6 from DGS to avoid spurious
 errors from FMA

---
 lapack-netlib/TESTING/dgd.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/TESTING/dgd.in b/lapack-netlib/TESTING/dgd.in
index ca24dd2a0..127b63789 100644
--- a/lapack-netlib/TESTING/dgd.in
+++ b/lapack-netlib/TESTING/dgd.in
@@ -1,6 +1,6 @@
 DGS               Data for the Real Nonsymmetric Schur Form Driver
 5                 Number of matrix dimensions
-2 6 10 12 20 30   Matrix dimensions
+2 10 12 20 30     Matrix dimensions
 1 1 1 2 1         Parameters NB, NBMIN, NXOVER, NS, NBCOL
 10                Threshold for test ratios
 .TRUE.            Put T to test the error exits

From 9fab60d32f595350d599d5fe901dfaae2ce56592 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 13 Jan 2024 20:39:05 +0100
Subject: [PATCH 576/718] Remove matrix dimension 6 from SGS to avoid spurious
 errors from FMA

---
 lapack-netlib/TESTING/sgd.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/TESTING/sgd.in b/lapack-netlib/TESTING/sgd.in
index 164b3cc35..efab6c257 100644
--- a/lapack-netlib/TESTING/sgd.in
+++ b/lapack-netlib/TESTING/sgd.in
@@ -1,6 +1,6 @@
 SGS               Data for the Real Nonsymmetric Schur Form Driver
 5                 Number of matrix dimensions
-2 6 10 12 20 30   Matrix dimensions
+2 10 12 20 30     Matrix dimensions
 1 1 1 2 1         Parameters NB, NBMIN, NXOVER, NS, NBCOL
 10                Threshold for test ratios
 .TRUE.            Put T to test the error exits

From 8a9d492af7b26f6c94965301783f5895cae9cc3b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 14 Jan 2024 19:58:49 +0100
Subject: [PATCH 577/718] Add default for blas_omp_threads_local

---
 driver/others/blas_server.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 2fcb37192..2531c57e9 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void);
 /* We need this global for checking if initialization is finished.  */
 int blas_server_avail   __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
 
+int blas_omp_threads_local = 1;
+
 /* Local Variables */
 #if   defined(USE_PTHREAD_LOCK)
 static pthread_mutex_t  server_lock    = PTHREAD_MUTEX_INITIALIZER;

From 152a6c43b6c61607cc91d5285f94dbef2575ff4f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 14 Jan 2024 19:59:55 +0100
Subject: [PATCH 578/718] Add blas_omp_threads_local

---
 driver/others/blas_server_win32.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 40ff85abc..2d41af228 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -59,6 +59,8 @@ static CRITICAL_SECTION queue_lock;
 /* We need this global for checking if initialization is finished.   */
 int blas_server_avail = 0;
 
+int blas_omp_threads_local = 1;
+
 /* Local Variables */
 static BLASULONG server_lock       = 0;
 

From 0d2e486edfb6919afea1bba62f5ac3ebc460cee0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 15 Jan 2024 11:18:59 +0100
Subject: [PATCH 579/718] Handle NAN and INF

---
 kernel/arm/zscal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c
index 0521aaa0b..b2d537d04 100644
--- a/kernel/arm/zscal.c
+++ b/kernel/arm/zscal.c
@@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 			else
 			{
 				temp = - da_i * x[ip+1] ;
+				if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
 				x[ip+1] = da_i * x[ip]  ;
 			}
 		}

From 025a1b2c7b02c9984c331fde39eb27e492aeb8e1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 15 Jan 2024 22:40:21 +0100
Subject: [PATCH 580/718] Only use mtune=native when not cross-compiling

---
 Makefile.arm64 | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/Makefile.arm64 b/Makefile.arm64
index 1b10446f7..ed52a9424 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -104,9 +104,15 @@ ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
 endif
 else
-CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
+CCOMMON_OPT += -march=armv8.4-a+sve 
+ifneq ($(CROSS), 1)
+CCOMMON_OPT += -mtune=native
+endif
 ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=armv8.4-a -mtune=native
+FCOMMON_OPT += -march=armv8.4-a 
+ifneq ($(CROSS), 1)
+FCOMMON_OPT += -mtune=native
+endif
 endif
 endif
 else
@@ -138,9 +144,15 @@ ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 endif
 else
-CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
+CCOMMON_OPT += -march=armv8.5-a+sve
+ifneq ($(CROSS), 1)
+CCOMMON_OPT += -mtune=native
+endif
 ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=armv8.5-a -mtune=native
+FCOMMON_OPT += -march=armv8.5-a 
+ifneq ($(CROSS), 1)
+FCOMMON_OPT += -mtune=native
+endif
 endif
 endif
 else

From 8f4e325ea82f5d5f3791b147208cbaae8dde8f64 Mon Sep 17 00:00:00 2001
From: Ian McInerney <i.mcinerney17@imperial.ac.uk>
Date: Mon, 15 Jan 2024 23:42:03 +0000
Subject: [PATCH 581/718] Fix Clang sapphire rapids march flag

---
 Makefile.x86_64 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 702447ace..c0dbe84c8 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC)
   endif
  endif
 else ifeq ($(C_COMPILER), CLANG)
- # cooperlake support was added in clang 12
+ # sapphire rapids support was added in clang 12
  ifeq ($(CLANGVERSIONGTEQ12), 1)
-  CCOMMON_OPT += -march=cooperlake
+  CCOMMON_OPT += -march=sapphirerapids
   ifneq ($(F_COMPILER), NAG)
-   FCOMMON_OPT += -march=cooperlake
+   FCOMMON_OPT += -march=sapphirerapids
   endif
  else  # not supported in clang, fallback to avx512
   CCOMMON_OPT += -march=skylake-avx512

From 9edb805e645d3530e907864e242a3f64a881b28a Mon Sep 17 00:00:00 2001
From: Sergei Lewis <slewis@rivosinc.com>
Date: Tue, 16 Jan 2024 14:24:18 +0000
Subject: [PATCH 582/718] fix builds with t-head toolchains that use old
 versions of the intrinsics spec

---
 common_riscv64.h                 | 17 +++++-
 cpuid_riscv64.c                  |  4 +-
 kernel/riscv64/amax_vector.c     | 18 +++---
 kernel/riscv64/amin_vector.c     | 18 +++---
 kernel/riscv64/asum_vector.c     | 20 ++++---
 kernel/riscv64/axpby_vector.c    | 16 +++---
 kernel/riscv64/axpy_vector.c     | 12 ++--
 kernel/riscv64/copy_vector.c     | 14 ++---
 kernel/riscv64/dot_vector.c      | 44 +++++++++------
 kernel/riscv64/gemv_n_vector.c   | 24 ++++----
 kernel/riscv64/gemv_t_vector.c   | 42 ++++++++------
 kernel/riscv64/iamax_vector.c    | 80 +++++++++++++++------------
 kernel/riscv64/iamin_vector.c    | 82 ++++++++++++++++------------
 kernel/riscv64/imax_vector.c     | 82 ++++++++++++++++------------
 kernel/riscv64/imin_vector.c     | 80 ++++++++++++++++-----------
 kernel/riscv64/izamax_vector.c   | 94 ++++++++++++++++++--------------
 kernel/riscv64/izamin_vector.c   | 92 ++++++++++++++++++-------------
 kernel/riscv64/max_vector.c      | 20 ++++---
 kernel/riscv64/min_vector.c      | 20 ++++---
 kernel/riscv64/nrm2_vector.c     | 64 ++++++++++++----------
 kernel/riscv64/rot_vector.c      | 36 ++++++------
 kernel/riscv64/scal_vector.c     | 14 ++---
 kernel/riscv64/sum_vector.c      | 32 +++++------
 kernel/riscv64/swap_vector.c     | 10 ++--
 kernel/riscv64/symv_L_vector.c   | 56 +++++++++++--------
 kernel/riscv64/symv_U_vector.c   | 60 +++++++++++---------
 kernel/riscv64/zamax_vector.c    | 25 +++++----
 kernel/riscv64/zamin_vector.c    | 25 +++++----
 kernel/riscv64/zasum_vector.c    | 22 +++++---
 kernel/riscv64/zaxpby_vector.c   | 32 +++++------
 kernel/riscv64/zaxpy_vector.c    | 20 +++----
 kernel/riscv64/zcopy_vector.c    | 12 ++--
 kernel/riscv64/zdot_vector.c     | 60 +++++++++++---------
 kernel/riscv64/zgemv_n_vector.c  | 28 +++++-----
 kernel/riscv64/zgemv_t_vector.c  | 56 +++++++++++--------
 kernel/riscv64/zhemv_LM_vector.c | 60 +++++++++++---------
 kernel/riscv64/zhemv_UV_vector.c | 60 +++++++++++---------
 kernel/riscv64/znrm2_vector.c    | 51 +++++++++--------
 kernel/riscv64/zrot_vector.c     | 36 ++++++------
 kernel/riscv64/zscal_vector.c    | 32 +++++------
 kernel/riscv64/zsum_vector.c     | 16 +++---
 kernel/riscv64/zswap_vector.c    | 10 ++--
 42 files changed, 900 insertions(+), 696 deletions(-)

diff --git a/common_riscv64.h b/common_riscv64.h
index de79c8cab..f11e8b75d 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -91,12 +91,23 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS
 
-#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(__riscv_v)
+#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC)))
 # include <riscv_vector.h>
+#endif
+
+#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
+// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
+#define RISCV_0p10_INTRINSICS
+#define RISCV_RVV(x) x
+#else
+#define RISCV_RVV(x) __riscv_ ## x
+#endif
+
+#if defined(C910V) || defined(RISCV64_ZVL256B)
 # if !defined(DOUBLE)
-#  define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f32m1_f32(v)
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
 # else
-#  define EXTRACT_FLOAT(v) __riscv_vfmv_f_s_f64m1_f64(v)
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
 # endif
 #else
 # define EXTRACT_FLOAT(v) (v[0])
diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 1b6b62f21..928b5ba92 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -72,11 +72,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define CPU_GENERIC         0
 #define CPU_C910V           1
-#define CPU_RISCV64_ZVL256B 2
+#define CPU_x280            2
+#define CPU_RISCV64_ZVL256B 3
 
 static char *cpuname[] = {
   "RISCV64_GENERIC",
   "C910V",
+  "x280",
   "CPU_RISCV64_ZVL256B"
 };
 
diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c
index 81a39af32..b66d4871e 100644
--- a/kernel/riscv64/amax_vector.c
+++ b/kernel/riscv64/amax_vector.c
@@ -49,15 +49,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
-#define VFABS_FLOAT     JOIN(__riscv_vfabs,      _v_f,  ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define VFABS_FLOAT     JOIN(RISCV_RVV(vfabs),      _v_f,  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
index c8ba75f4a..1c541f0fd 100644
--- a/kernel/riscv64/amin_vector.c
+++ b/kernel/riscv64/amin_vector.c
@@ -48,15 +48,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
-#define VFABS_FLOAT     JOIN(__riscv_vfabs,      _v_f,  ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f  ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define VFABS_FLOAT     JOIN(RISCV_RVV(vfabs),      _v_f,  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c
index d10bf99e6..995dbf9a1 100644
--- a/kernel/riscv64/asum_vector.c
+++ b/kernel/riscv64/asum_vector.c
@@ -49,16 +49,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
-#define VFABS_FLOAT     JOIN(__riscv_vfabs,     _v_f,   ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
-#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define VFABS_FLOAT     JOIN(RISCV_RVV(vfabs),     _v_f,   ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c
index b77cb58fb..386c4a5f1 100644
--- a/kernel/riscv64/axpby_vector.c
+++ b/kernel/riscv64/axpby_vector.c
@@ -48,15 +48,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFMACCVF_FLOAT  JOIN(__riscv_vfmacc,    _vf_f,  ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMULVF_FLOAT   JOIN(__riscv_vfmul,     _vf_f,  ELEN,   LMUL,   _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMACCVF_FLOAT  JOIN(RISCV_RVV(vfmacc),    _vf_f,  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMULVF_FLOAT   JOIN(RISCV_RVV(vfmul),     _vf_f,  ELEN,   LMUL,   _)
 
 int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
 {
diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c
index 3447107a6..e99ca8542 100644
--- a/kernel/riscv64/axpy_vector.c
+++ b/kernel/riscv64/axpy_vector.c
@@ -49,13 +49,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,    	ELEN,   LMUL,   _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFMACCVF_FLOAT  JOIN(__riscv_vfmacc,    _vf_f, 	ELEN,   LMUL,   _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMACCVF_FLOAT  JOIN(RISCV_RVV(vfmacc),    _vf_f, 	ELEN,   LMUL,   _)
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c
index 710e8670a..ccbd6e482 100644
--- a/kernel/riscv64/copy_vector.c
+++ b/kernel/riscv64/copy_vector.c
@@ -47,12 +47,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
@@ -71,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/4){
                         BLASLONG inc_xv = inc_x * gvl;
-                        BLASLONG gvl3 = gvl * 3;
+                        unsigned int gvl3 = gvl * 3;
                         BLASLONG inc_xv3 = inc_xv * 3;
                         for(i=0,j=0; i<n/(4*gvl); i++){
                                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
@@ -99,7 +99,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 if(gvl <= n/4){
                         BLASLONG inc_yv = inc_y * gvl;
                         BLASLONG inc_yv3 = inc_yv * 3;
-                        BLASLONG gvl3 = gvl * 3;
+                        unsigned int gvl3 = gvl * 3;
                         for(i=0,j=0; i<n/(4*gvl); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
                                 VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c
index a1841edb7..38ccc4778 100644
--- a/kernel/riscv64/dot_vector.c
+++ b/kernel/riscv64/dot_vector.c
@@ -27,29 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
 #endif
 
 #if defined(DSDOT)
diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c
index 34e21ccda..aa13fc87d 100644
--- a/kernel/riscv64/gemv_n_vector.c
+++ b/kernel/riscv64/gemv_n_vector.c
@@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSEV_FLOAT __riscv_vse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSEV_FLOAT __riscv_vse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c
index 3764c3b64..62b85164c 100644
--- a/kernel/riscv64/gemv_t_vector.c
+++ b/kernel/riscv64/gemv_t_vector.c
@@ -27,28 +27,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m2(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
 #define FLOAT_V_T vfloat32m2_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m2
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2)
 #define xint_t int
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m2(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
 #define FLOAT_V_T vfloat64m2_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m2
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2)
 #define xint_t long long
 #endif
 
@@ -60,7 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
         FLOAT temp;
 
         FLOAT_V_T va, vr, vx;
-        BLASLONG gvl = 0;
+        unsigned int gvl = 0;
         FLOAT_V_T_M1 v_res;
 
 
diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c
index 92880fbcf..800312400 100644
--- a/kernel/riscv64/iamax_vector.c
+++ b/kernel/riscv64/iamax_vector.c
@@ -31,50 +31,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m4_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m4)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m4_f64m1)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m4)
+#endif
 #define MASK_T vbool16_t
-#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m4_b16
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m4
-#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m4_b16
-#define VMFIRSTM __riscv_vfirst_m_b16
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m4_b16)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m4)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m4_b16)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b16)
 #define UINT_V_T vuint64m4_t
-#define VIDV_UINT __riscv_vid_v_u64m4
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m4_mu
-#define VADDVX_UINT __riscv_vadd_vx_u64m4
-#define VMVVX_UINT __riscv_vmv_v_x_u64m4
-#define VFABS_FLOAT __riscv_vfabs_v_f64m4
-#define VCOMPRESS __riscv_vcompress_vm_u64m4
-#define VMV_X __riscv_vmv_x_s_u64m4_u64
+#define VIDV_UINT RISCV_RVV(vid_v_u64m4)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m4)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m4)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m4)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m4_u64)
 #else
 
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m4_f32m1
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m4_f32m1(v_res, va, vb, gvl)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m4)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m4_f32m1)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m4)
+#endif
 #define MASK_T vbool8_t
-#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m4_b8
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m4
-#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m4_b8
-#define VMFIRSTM __riscv_vfirst_m_b8
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m4_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m4)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m4_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint32m4_t
-#define VIDV_UINT __riscv_vid_v_u32m4
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m4_mu
-#define VADDVX_UINT __riscv_vadd_vx_u32m4
-#define VMVVX_UINT __riscv_vmv_v_x_u32m4
-#define VFABS_FLOAT __riscv_vfabs_v_f32m4
-#define VCOMPRESS __riscv_vcompress_vm_u32m4
-#define VMV_X __riscv_vmv_x_s_u32m4_u32
+#define VIDV_UINT RISCV_RVV(vid_v_u32m4)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m4)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m4)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m4)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m4_u32)
 #endif
 
 
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
index 0503f9948..a58872960 100644
--- a/kernel/riscv64/iamin_vector.c
+++ b/kernel/riscv64/iamin_vector.c
@@ -31,52 +31,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m8
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT vid_v_u64m8_m
+#define VADDVX_MASK_UINT vadd_vx_u64m8_m
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else
 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
-#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
-#define VMFIRSTM __riscv_vfirst_m_b8
-#define UINT_V_T vuint64m8_t
 #define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
-#define VIDV_UINT __riscv_vid_v_u64m8
 #define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
-#define VADDVX_UINT __riscv_vadd_vx_u64m8
-#define VMVVX_UINT __riscv_vmv_v_x_u64m8
-#define VFABS_FLOAT __riscv_vfabs_v_f64m8
-#define VCOMPRESS __riscv_vcompress_vm_u64m8
-#define VMV_X __riscv_vmv_x_s_u64m8_u64
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif
+#define MASK_T vbool8_t
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
+#define UINT_V_T vuint64m8_t
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m8
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT vid_v_u32m8_m
+#define VADDVX_MASK_UINT vadd_vx_u32m8_m
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else
 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
-#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
-#define VMFIRSTM __riscv_vfirst_m_b4
-#define UINT_V_T vuint32m8_t
 #define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
-#define VIDV_UINT __riscv_vid_v_u32m8
 #define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
-#define VADDVX_UINT __riscv_vadd_vx_u32m8
-#define VMVVX_UINT __riscv_vmv_v_x_u32m8
-#define VFABS_FLOAT __riscv_vfabs_v_f32m8
-#define VCOMPRESS __riscv_vcompress_vm_u32m8
-#define VMV_X __riscv_vmv_x_s_u32m8_u32
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif
+#define MASK_T vbool4_t
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
+#define UINT_V_T vuint32m8_t
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c
index e24f9fd48..308fa15a4 100644
--- a/kernel/riscv64/imax_vector.c
+++ b/kernel/riscv64/imax_vector.c
@@ -31,50 +31,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m8
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m8_f64m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif
 #define MASK_T vbool8_t
-#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8
-#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8
-#define VMFIRSTM __riscv_vfirst_m_b8
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
-#define VIDV_UINT __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
-#define VADDVX_UINT __riscv_vadd_vx_u64m8
-#define VMVVX_UINT __riscv_vmv_v_x_u64m8
-#define VCOMPRESS __riscv_vcompress_vm_u64m8
-#define VMV_X __riscv_vmv_x_s_u64m8_u64
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m8
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m8_f32m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif
 #define MASK_T vbool4_t
-#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8
-#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4
-#define VMFIRSTM __riscv_vfirst_m_b4
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
 #define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
-#define VIDV_UINT __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
-#define VADDVX_UINT __riscv_vadd_vx_u32m8
-#define VMVVX_UINT __riscv_vmv_v_x_u32m8
-#define VCOMPRESS __riscv_vcompress_vm_u32m8
-#define VMV_X __riscv_vmv_x_s_u32m8_u32
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c
index a60bd3d07..ec36b8eb9 100644
--- a/kernel/riscv64/imin_vector.c
+++ b/kernel/riscv64/imin_vector.c
@@ -31,50 +31,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m8
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
-#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u64m8_m)(mask, v_min_index, gvl)
+#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u64m8_m)(mask, a, a, b, gvl)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif
 #define MASK_T vbool8_t
-#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
-#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
-#define VMFIRSTM __riscv_vfirst_m_b8
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT __riscv_vid_v_u64m8_m
-#define VIDV_UINT __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_m
-#define VADDVX_UINT __riscv_vadd_vx_u64m8
-#define VMVVX_UINT __riscv_vmv_v_x_u64m8
-#define VCOMPRESS __riscv_vcompress_vm_u64m8
-#define VMV_X __riscv_vmv_x_s_u64m8_u64
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m8
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u32m8_m)(mask, v_min_index, gvl)
+#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u32m8_m)(mask, a, a, b, gvl)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else
 #define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
-#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
-#define VMFIRSTM __riscv_vfirst_m_b4
-#define UINT_V_T vuint32m8_t
 #define VIDV_MASK_UINT __riscv_vid_v_u32m8_m
-#define VIDV_UINT __riscv_vid_v_u32m8
 #define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_m
-#define VADDVX_UINT __riscv_vadd_vx_u32m8
-#define VMVVX_UINT __riscv_vmv_v_x_u32m8
-#define VCOMPRESS __riscv_vcompress_vm_u32m8
-#define VMV_X __riscv_vmv_x_s_u32m8_u32
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif
+#define MASK_T vbool4_t
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
+#define UINT_V_T vuint32m8_t
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c
index 89cd510c1..d33e89c00 100644
--- a/kernel/riscv64/izamax_vector.c
+++ b/kernel/riscv64/izamax_vector.c
@@ -31,58 +31,72 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m8
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
-#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f64m8_f64m1)(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif
 #define MASK_T vbool8_t
-#define VMFLTVV_FLOAT __riscv_vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT __riscv_vfmax_vv_f64m8
-#define VMFGEVF_FLOAT __riscv_vmfge_vf_f64m8_b8
-#define VMFIRSTM __riscv_vfirst_m_b8
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint64m8_t
-#define VSEVU_UINT __riscv_vse64_v_u64m8
+#define VSEVU_UINT RISCV_RVV(vse64_v_u64m8)
 #define UINT_T long unsigned int
-#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
-#define VIDV_UINT __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
-#define VADDVX_UINT __riscv_vadd_vx_u64m8
-#define VMVVX_UINT __riscv_vmv_v_x_u64m8
-#define VFABS_FLOAT __riscv_vfabs_v_f64m8
-#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
-#define VCOMPRESS __riscv_vcompress_vm_u64m8
-#define VMV_X __riscv_vmv_x_s_u64m8_u64
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m8
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
-#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f32m8_f32m1)(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif
 #define MASK_T vbool4_t
-#define VMFLTVV_FLOAT __riscv_vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT __riscv_vfmax_vv_f32m8
-#define VMFGEVF_FLOAT __riscv_vmfge_vf_f32m8_b4
-#define VMFIRSTM __riscv_vfirst_m_b4
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
 #define UINT_V_T vuint32m8_t
 #define UINT_T unsigned int
-#define VSEVU_UINT __riscv_vse32_v_u32m8
-#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
-#define VIDV_UINT __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
-#define VADDVX_UINT __riscv_vadd_vx_u32m8
-#define VMVVX_UINT __riscv_vmv_v_x_u32m8
-#define VFABS_FLOAT __riscv_vfabs_v_f32m8
-#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
-#define VCOMPRESS __riscv_vcompress_vm_u32m8
-#define VMV_X __riscv_vmv_x_s_u32m8_u32
+#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8)
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c
index 74daf32b8..a3877a46c 100644
--- a/kernel/riscv64/izamin_vector.c
+++ b/kernel/riscv64/izamin_vector.c
@@ -31,58 +31,72 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m8
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
-#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f64m8_f64m1)(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif
 #define MASK_T vbool8_t
-#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f64m8_b8
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMINVV_FLOAT __riscv_vfmin_vv_f64m8
-#define VMFLEVF_FLOAT __riscv_vmfle_vf_f64m8_b8
-#define VMFIRSTM __riscv_vfirst_m_b8
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint64m8_t
 #define VSEVU_UINT vse64_v_u64m8
 #define UINT_T long unsigned int
-#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
-#define VIDV_UINT __riscv_vid_v_u64m8
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
-#define VADDVX_UINT __riscv_vadd_vx_u64m8
-#define VMVVX_UINT __riscv_vmv_v_x_u64m8
-#define VFABS_FLOAT __riscv_vfabs_v_f64m8
-#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
-#define VCOMPRESS __riscv_vcompress_vm_u64m8
-#define VMV_X __riscv_vmv_x_s_u64m8_u64
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m8
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
-#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f32m8_f32m1)(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f32m8_f32m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif
 #define MASK_T vbool4_t
-#define VMFGTVV_FLOAT __riscv_vmfgt_vv_f32m8_b4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMINVV_FLOAT __riscv_vfmin_vv_f32m8
-#define VMFLEVF_FLOAT __riscv_vmfle_vf_f32m8_b4
-#define VMFIRSTM __riscv_vfirst_m_b4
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
 #define UINT_V_T vuint32m8_t
 #define UINT_T unsigned int
-#define VSEVU_UINT __riscv_vse32_v_u32m8
-#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
-#define VIDV_UINT __riscv_vid_v_u32m8
-#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
-#define VADDVX_UINT __riscv_vadd_vx_u32m8
-#define VMVVX_UINT __riscv_vmv_v_x_u32m8
-#define VFABS_FLOAT __riscv_vfabs_v_f32m8
-#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
-#define VCOMPRESS __riscv_vcompress_vm_u32m8
-#define VMV_X __riscv_vmv_x_s_u32m8_u32
+#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8)
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c
index 97f602e51..ee9920cd2 100644
--- a/kernel/riscv64/max_vector.c
+++ b/kernel/riscv64/max_vector.c
@@ -54,17 +54,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
 #define MASK_T          JOIN(vbool,     MLEN,   _t,     _,      _)
-#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt_vf_f, ELEN,  LMUL,   _b,     MLEN)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
-#define VFMAXVV_FLOAT   JOIN(__riscv_vfmax,     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt_vf_f), ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFMAXVV_FLOAT   JOIN(RISCV_RVV(vfmax),     _vv_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c
index 77bf19b9d..2001840bb 100644
--- a/kernel/riscv64/min_vector.c
+++ b/kernel/riscv64/min_vector.c
@@ -54,17 +54,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,    ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,    ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
 #define MASK_T          JOIN(vbool,     MLEN,   _t,     _,      _)
-#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt_vf_f, ELEN,  LMUL,   _b,     MLEN)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
-#define VFMINVV_FLOAT   JOIN(__riscv_vfmin,     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt_vf_f), ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFMINVV_FLOAT   JOIN(RISCV_RVV(vfmin),     _vv_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c
index 205f07eb7..141dffebf 100644
--- a/kernel/riscv64/nrm2_vector.c
+++ b/kernel/riscv64/nrm2_vector.c
@@ -52,38 +52,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVSF_FLOAT    JOIN(__riscv_vfmv,      _s_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVSF_FLOAT    JOIN(RISCV_RVV(vfmv),      _s_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
 #define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
-#define VFABS           JOIN(__riscv_vfabs,     _v_f,   ELEN,   LMUL,   _)
-#define VMFNE           JOIN(__riscv_vmfne_vf_f,ELEN,   LMUL,   _b,     MLEN)
-#define VMFGT           JOIN(__riscv_vmfgt_vv_f,ELEN,   LMUL,   _b,     MLEN)
-#define VMFEQ           JOIN(__riscv_vmfeq_vf_f,ELEN,   LMUL,   _b,     MLEN)
-#define VCPOP           JOIN(__riscv_vcpop,     _m_b,   MLEN,   _,      _)
-#define VFREDMAX        JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL,   JOIN2(_f, ELEN), m1)
-#define VFREDMIN        JOIN(__riscv_vfredmin_vs_f,ELEN,LMUL,   JOIN2(_f, ELEN), m1)
-#define VFIRST          JOIN(__riscv_vfirst,    _m_b,   MLEN,   _,      _)
-#define VRGATHER        JOIN(__riscv_vrgather,  _vx_f,  ELEN,   LMUL,   _)
-#define VFDIV           JOIN(__riscv_vfdiv,     _vv_f,  ELEN,   LMUL,   _)
-#define VFDIV_M         JOIN(__riscv_vfdiv,     _vv_f,  ELEN,   LMUL,   _mu)
-#define VFMUL           JOIN(__riscv_vfmul,     _vv_f,  ELEN,   LMUL,   _)
-#define VFMUL_M         JOIN(__riscv_vfmul,     _vv_f,  ELEN,   LMUL,   _mu)
-#define VFMACC          JOIN(__riscv_vfmacc,    _vv_f,  ELEN,   LMUL,   _)
-#define VFMACC_M        JOIN(__riscv_vfmacc,    _vv_f,  ELEN,   LMUL,   _mu)
-#define VMSBF           JOIN(__riscv_vmsbf,     _m_b,   MLEN,   _,      _)
-#define VMSOF           JOIN(__riscv_vmsof,     _m_b,   MLEN,   _,      _)
-#define VMAND           JOIN(__riscv_vmand,     _mm_b,  MLEN,   _,      _)
-#define VMANDN          JOIN(__riscv_vmandn,    _mm_b,  MLEN,   _,      _)
-#define VFREDSUM        JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL,  JOIN2(_f, ELEN), m1)
-#define VMERGE          JOIN(__riscv_vmerge,    _vvm_f, ELEN,   LMUL,   _)
-
-#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
+#define VFABS           JOIN(RISCV_RVV(vfabs),     _v_f,   ELEN,   LMUL,   _)
+#define VMFNE           JOIN(RISCV_RVV(vmfne_vf_f),ELEN,   LMUL,   _b,     MLEN)
+#define VMFGT           JOIN(RISCV_RVV(vmfgt_vv_f),ELEN,   LMUL,   _b,     MLEN)
+#define VMFEQ           JOIN(RISCV_RVV(vmfeq_vf_f),ELEN,   LMUL,   _b,     MLEN)
+#define VCPOP           JOIN(RISCV_RVV(vcpop),     _m_b,   MLEN,   _,      _)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFDIV_M         JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _m)
+#define VFMUL_M         JOIN(RISCV_RVV(vfmul),     _vv_f,  ELEN,   LMUL,   _m)
+#define VFMACC_M        JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _m)
+#define VMERGE(a, b, mask, gvl)       JOIN(RISCV_RVV(vmerge),    _vvm_f, ELEN,   LMUL,   _)(mask, a, b, gvl)
+#else
+#define VFDIV_M         JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMUL_M         JOIN(RISCV_RVV(vfmul),     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMACC_M        JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _mu)
+#define VMERGE          JOIN(RISCV_RVV(vmerge),    _vvm_f, ELEN,   LMUL,   _)
+#endif
+#define VFIRST          JOIN(RISCV_RVV(vfirst),    _m_b,   MLEN,   _,      _)
+#define VRGATHER        JOIN(RISCV_RVV(vrgather),  _vx_f,  ELEN,   LMUL,   _)
+#define VFDIV           JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _)
+#define VFMUL           JOIN(RISCV_RVV(vfmul),     _vv_f,  ELEN,   LMUL,   _)
+#define VFMACC          JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _)
+#define VMSBF           JOIN(RISCV_RVV(vmsbf),     _m_b,   MLEN,   _,      _)
+#define VMSOF           JOIN(RISCV_RVV(vmsof),     _m_b,   MLEN,   _,      _)
+#define VMAND           JOIN(RISCV_RVV(vmand),     _mm_b,  MLEN,   _,      _)
+#define VMANDN          JOIN(RISCV_RVV(vmandn),    _mm_b,  MLEN,   _,      _)
+
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
 
 #if defined(DOUBLE)
 #define ABS fabs
@@ -91,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ABS fabsf
 #endif
 
-#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v)
+#define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v)
 
 //#define DUMP( label, v0, gvl )
 #define DUMP( label, v0, gvl ) do{ FLOAT x[16]; VSEV_FLOAT( x, v0, gvl ); printf ("%s(%d): %s [ ", __FILE__, __LINE__, label); for( int xxx = 0; xxx < gvl; ++xxx ) { printf("%f, ", x[xxx]); } printf(" ]\n"); } while(0)
diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c
index f7aa6c3c2..649d9bb94 100644
--- a/kernel/riscv64/rot_vector.c
+++ b/kernel/riscv64/rot_vector.c
@@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSEV_FLOAT __riscv_vse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
-#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
+#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSEV_FLOAT __riscv_vse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
-#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c
index f6b87d4f9..8fa9315f6 100644
--- a/kernel/riscv64/scal_vector.c
+++ b/kernel/riscv64/scal_vector.c
@@ -52,14 +52,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMULVF_FLOAT   JOIN(__riscv_vfmul,     _vf_f,  ELEN,   LMUL,   _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMULVF_FLOAT   JOIN(RISCV_RVV(vfmul),     _vf_f,  ELEN,   LMUL,   _)
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
diff --git a/kernel/riscv64/sum_vector.c b/kernel/riscv64/sum_vector.c
index 3fd217375..cf734faab 100644
--- a/kernel/riscv64/sum_vector.c
+++ b/kernel/riscv64/sum_vector.c
@@ -29,27 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <math.h>
 
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m8(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m8
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
-#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFADDVV_FLOAT __riscv_vfadd_vv_f32m8
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m8(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m8
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
-#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFADDVV_FLOAT __riscv_vfadd_vv_f64m8
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
 #endif
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c
index baf3d8f69..3b467a586 100644
--- a/kernel/riscv64/swap_vector.c
+++ b/kernel/riscv64/swap_vector.c
@@ -53,12 +53,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c
index f3b922195..cd89c63ec 100644
--- a/kernel/riscv64/symv_L_vector.c
+++ b/kernel/riscv64/symv_L_vector.c
@@ -27,35 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSEV_FLOAT __riscv_vse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSEV_FLOAT __riscv_vse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c
index 9977e2741..894c6a643 100644
--- a/kernel/riscv64/symv_U_vector.c
+++ b/kernel/riscv64/symv_U_vector.c
@@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSEV_FLOAT __riscv_vse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSEV_FLOAT __riscv_vse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c
index 4301528bd..2dee5ab29 100644
--- a/kernel/riscv64/zamax_vector.c
+++ b/kernel/riscv64/zamax_vector.c
@@ -53,19 +53,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDMAXVS_FLOAT JOIN(__riscv_vfredmax_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1)) (v_res, va, vb, gvl)
+#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f,  ELEN,   LMUL,   _m) (va, vb, vb, c, gvl)
+#else
+#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f,  ELEN,   LMUL,   _m)
+#endif
 #define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
-#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt_vf_f, ELEN,  LMUL,   _b,     MLEN)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
-#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f,  ELEN,   LMUL,   _m)
-#define VFMAXVV_FLOAT   JOIN(__riscv_vfmax,     _vv_f,  ELEN,   LMUL,   _)
-#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt_vf_f), ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFMAXVV_FLOAT   JOIN(RISCV_RVV(vfmax),     _vv_f,  ELEN,   LMUL,   _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c
index 095b1c3df..df9a7a7e1 100644
--- a/kernel/riscv64/zamin_vector.c
+++ b/kernel/riscv64/zamin_vector.c
@@ -55,19 +55,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDMINVS_FLOAT JOIN(__riscv_vfredmin_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1)) (v_res, va, vb, gvl)
+#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f,  ELEN,   LMUL,   _m) (va, vb, vb, c, gvl)
+#else
+#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f,  ELEN,   LMUL,   _m)
+#endif
 #define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
-#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt_vf_f, ELEN,  LMUL,   _b,     MLEN)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
-#define VFRSUBVF_MASK_FLOAT JOIN(__riscv_vfrsub,_vf_f,  ELEN,   LMUL,   _m)
-#define VFMINVV_FLOAT   JOIN(__riscv_vfmin,     _vv_f,  ELEN,   LMUL,   _)
-#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt_vf_f), ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFMINVV_FLOAT   JOIN(RISCV_RVV(vfmin),     _vv_f,  ELEN,   LMUL,   _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c
index 9136f0037..fca904d6a 100644
--- a/kernel/riscv64/zasum_vector.c
+++ b/kernel/riscv64/zasum_vector.c
@@ -53,17 +53,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
-#define VFABS_FLOAT     JOIN(__riscv_vfabs,     _v_f,   ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
-#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
-#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt,     _vf_f,  ELEN,   LMUL,   MLEN)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define VFABS_FLOAT     JOIN(RISCV_RVV(vfabs),     _v_f,   ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt),     _vf_f,  ELEN,   LMUL,   MLEN)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c
index 404f51fb3..d5ad974cf 100644
--- a/kernel/riscv64/zaxpby_vector.c
+++ b/kernel/riscv64/zaxpby_vector.c
@@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
-#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
+#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
-#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
index 20bfe74ec..d19e51118 100644
--- a/kernel/riscv64/zaxpy_vector.c
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -28,19 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c
index 9da60acb0..9e4a67b71 100644
--- a/kernel/riscv64/zcopy_vector.c
+++ b/kernel/riscv64/zcopy_vector.c
@@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
 #endif
 
 
diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c
index 57542714a..13b8fe378 100644
--- a/kernel/riscv64/zdot_vector.c
+++ b/kernel/riscv64/zdot_vector.c
@@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT __riscv_vfdot_vv_f32m4
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
-#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f32m4
-#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32)
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f32m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT __riscv_vfdot_vv_f64m4
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
-#define VFMSACVV_FLOAT __riscv_vfmsac_vv_f64m4
-#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64)
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
+#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f64m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
 #endif
 
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c
index f4acad770..104d3865d 100644
--- a/kernel/riscv64/zgemv_n_vector.c
+++ b/kernel/riscv64/zgemv_n_vector.c
@@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSEV_FLOAT __riscv_vse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSEV_FLOAT __riscv_vse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c
index 179454094..5d85ab3a4 100644
--- a/kernel/riscv64/zgemv_t_vector.c
+++ b/kernel/riscv64/zgemv_t_vector.c
@@ -27,31 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m2(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m2_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m2
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m2_f32m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2
-#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m2
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(vr, va, vb, gvl)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m2(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(va, vb, gvl)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m2)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m2_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m2
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m2_f64m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2
-#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m2
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(vr, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(va, vb, gvl)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m2)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -93,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                         vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
 #endif
-                        v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl);
-                        v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl);
+                        v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl);
+                        v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl);
 
                         j += inc_av;
                         ix += inc_xv;
@@ -117,8 +125,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                         vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
 
 #endif
-                        v_res_r = VFREDSUM_FLOAT(vr, v_res_r, gvl);
-                        v_res_i = VFREDSUM_FLOAT(vi, v_res_i, gvl);
+                        v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl);
+                        v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl);
                 }
 
                 temp_r = VFMVFS_FLOAT(v_res_r);
diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c
index e025120e5..117db7d84 100644
--- a/kernel/riscv64/zhemv_LM_vector.c
+++ b/kernel/riscv64/zhemv_LM_vector.c
@@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
-#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
-#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c
index 0e1ea5436..7c6b63bf3 100644
--- a/kernel/riscv64/zhemv_UV_vector.c
+++ b/kernel/riscv64/zhemv_UV_vector.c
@@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
-#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
-#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
-#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
index 437bf4246..8614f7539 100644
--- a/kernel/riscv64/znrm2_vector.c
+++ b/kernel/riscv64/znrm2_vector.c
@@ -52,37 +52,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
 #define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
-#define VFABS           JOIN(__riscv_vfabs,     _v_f,   ELEN,   LMUL,   _)
-#define VMFNE           JOIN(__riscv_vmfne_vf_f,ELEN,   LMUL,   _b,     MLEN)
-#define VMFGT           JOIN(__riscv_vmfgt_vv_f,ELEN,   LMUL,   _b,     MLEN)
-#define VMFEQ           JOIN(__riscv_vmfeq_vv_f,ELEN,   LMUL,   _b,     MLEN)
-#define VCPOP           JOIN(__riscv_vcpop,     _m_b,   MLEN,   _,      _)
-#define VFREDMAX        JOIN(__riscv_vfredmax_vs_f,ELEN,LMUL,   JOIN2(_f, ELEN), m1)
-#define VFIRST          JOIN(__riscv_vfirst,    _m_b,   MLEN,   _,      _)
-#define VRGATHER        JOIN(__riscv_vrgather,  _vx_f,  ELEN,   LMUL,   _)
-#define VFDIV           JOIN(__riscv_vfdiv,     _vf_f,  ELEN,   LMUL,   _)
-#define VFDIV_M         JOIN(__riscv_vfdiv,     _vv_f,  ELEN,   LMUL,   _mu)
-#define VFMUL           JOIN(__riscv_vfmul,     _vv_f,  ELEN,   LMUL,   _)
-#define VFMACC          JOIN(__riscv_vfmacc,    _vv_f,  ELEN,   LMUL,   _)
-#define VFMACC_M        JOIN(__riscv_vfmacc,    _vv_f,  ELEN,   LMUL,   _mu)
-#define VMSOF           JOIN(__riscv_vmsof,     _m_b,   MLEN,   _,      _)
-#define VMANDN          JOIN(__riscv_vmandn,    _mm_b,  MLEN,   _,      _)
-#define VFREDUSUM       JOIN(__riscv_vfredusum_vs_f,ELEN,LMUL,  JOIN2(_f, ELEN), m1)
+#define VFABS           JOIN(RISCV_RVV(vfabs),     _v_f,   ELEN,   LMUL,   _)
+#define VMFNE           JOIN(RISCV_RVV(vmfne_vf_f),ELEN,   LMUL,   _b,     MLEN)
+#define VMFGT           JOIN(RISCV_RVV(vmfgt_vv_f),ELEN,   LMUL,   _b,     MLEN)
+#define VMFEQ           JOIN(RISCV_RVV(vmfeq_vv_f),ELEN,   LMUL,   _b,     MLEN)
+#define VCPOP           JOIN(RISCV_RVV(vcpop),     _m_b,   MLEN,   _,      _)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAX(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL,   JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl)
+#define VFREDUSUM(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL,  JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl)
+#define VFDIV_M         JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _m)
+#define VFMACC_M        JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _m)
+#else
+#define VFREDMAX        JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL,   JOIN2(_f, ELEN), m1)
+#define VFREDUSUM       JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL,  JOIN2(_f, ELEN), m1)
+#define VFDIV_M         JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMACC_M        JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _mu)
+#endif
+#define VFIRST          JOIN(RISCV_RVV(vfirst),    _m_b,   MLEN,   _,      _)
+#define VRGATHER        JOIN(RISCV_RVV(vrgather),  _vx_f,  ELEN,   LMUL,   _)
+#define VFDIV           JOIN(RISCV_RVV(vfdiv),     _vf_f,  ELEN,   LMUL,   _)
+#define VFMUL           JOIN(RISCV_RVV(vfmul),     _vv_f,  ELEN,   LMUL,   _)
+#define VFMACC          JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _)
+#define VMSOF           JOIN(RISCV_RVV(vmsof),     _m_b,   MLEN,   _,      _)
+#define VMANDN          JOIN(RISCV_RVV(vmandn),    _mm_b,  MLEN,   _,      _)
 #if defined(DOUBLE)
 #define ABS fabs
 #else
 #define ABS fabsf
 #endif
 
-#define EXTRACT_FLOAT0_V(v) JOIN(__riscv_vfmv_f_s_f, ELEN, LMUL, _f, ELEN)(v)
+#define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v)
 
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c
index c3afbc7cc..50751b343 100644
--- a/kernel/riscv64/zrot_vector.c
+++ b/kernel/riscv64/zrot_vector.c
@@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT __riscv_vle32_v_f32m4
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSEV_FLOAT __riscv_vse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT __riscv_vle64_v_f64m4
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSEV_FLOAT __riscv_vse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 5d9ab7b28..2034aafaa 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
-#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
-#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
 #else
-#define VSETVL(n) __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
-#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
-#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
-#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4
-#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
-#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c
index 7aab15105..ca0b02b5c 100644
--- a/kernel/riscv64/zsum_vector.c
+++ b/kernel/riscv64/zsum_vector.c
@@ -53,16 +53,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
 #define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VFREDSUMVS_FLOAT JOIN(__riscv_vfredusum_vs_f,  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
-#define VFMVVF_FLOAT    JOIN(__riscv_vfmv,      _v_f_f, ELEN,   LMUL,   _)
-#define VFMVVF_FLOAT_M1 JOIN(__riscv_vfmv,      _v_f_f, ELEN,   m1,     _)
-#define VFADDVV_FLOAT   JOIN(__riscv_vfadd,     _vv_f,  ELEN,   LMUL,   _)
-#define VMFLTVF_FLOAT   JOIN(__riscv_vmflt,     _vf_f,  ELEN,   LMUL,   MLEN)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt),     _vf_f,  ELEN,   LMUL,   MLEN)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
index d8980602d..02c98b588 100644
--- a/kernel/riscv64/zswap_vector.c
+++ b/kernel/riscv64/zswap_vector.c
@@ -53,12 +53,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define JOIN2(x, y) JOIN2_X(x, y)
 #define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
 
-#define VSETVL          JOIN(__riscv_vsetvl,    _e,     ELEN,   LMUL,   _)
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
-#define VLEV_FLOAT      JOIN(__riscv_vle,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VLSEV_FLOAT     JOIN(__riscv_vlse,      ELEN,   _v_f,   ELEN,   LMUL)
-#define VSEV_FLOAT      JOIN(__riscv_vse,       ELEN,   _v_f,   ELEN,   LMUL)
-#define VSSEV_FLOAT     JOIN(__riscv_vsse,      ELEN,   _v_f,   ELEN,   LMUL)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {

From ec89466e14d3dff51128682b3975f8554439a2f1 Mon Sep 17 00:00:00 2001
From: Dirreke <mingyang_ge@163.com>
Date: Tue, 16 Jan 2024 23:45:06 +0800
Subject: [PATCH 583/718] Add CSKY support

---
 CONTRIBUTORS.md            |   5 +-
 Makefile.csky              |   4 +
 Makefile.prebuild          |   4 +
 Makefile.system            |   5 ++
 TargetList.txt             |   4 +
 c_check                    |   2 +
 c_check.pl                 |   7 ++
 common.h                   |   4 +
 common_csky.h              |  56 ++++++++++++++
 ctest.c                    |   4 +
 getarch.c                  |  34 ++++++++-
 kernel/csky/KERNEL         | 149 +++++++++++++++++++++++++++++++++++++
 kernel/csky/Makefile       |   1 +
 lapack/laswp/csky/Makefile |  13 ++++
 param.h                    |  37 +++++++++
 15 files changed, 325 insertions(+), 4 deletions(-)
 create mode 100644 Makefile.csky
 create mode 100644 common_csky.h
 create mode 100644 kernel/csky/KERNEL
 create mode 100644 kernel/csky/Makefile
 create mode 100644 lapack/laswp/csky/Makefile

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 493747052..419f90dab 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -218,4 +218,7 @@ In chronological order:
   * [2022-08] Fix building from sources for QNX
 
 * Mark Seminatore <https://github.com/mseminatore>
-  * [2023-11-09] Improve Windows threading performance scaling
\ No newline at end of file
+  * [2023-11-09] Improve Windows threading performance scaling
+
+* Dirreke <https://github.com/mseminatore>
+  * [2024-01-16] Add basic support for the CSKY architecture
diff --git a/Makefile.csky b/Makefile.csky
new file mode 100644
index 000000000..36162af2a
--- /dev/null
+++ b/Makefile.csky
@@ -0,0 +1,4 @@
+ifeq ($(CORE), CK860FV)
+CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
+FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
+endif
diff --git a/Makefile.prebuild b/Makefile.prebuild
index 0be4f1274..83da8e2ce 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -55,6 +55,10 @@ ifeq ($(TARGET), C910V)
 TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
 endif
 
+ifeq ($(TARGET), CK860FV)
+TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
+endif
+
 all: getarch_2nd
 	./getarch_2nd  0 >> $(TARGET_MAKE)
 	./getarch_2nd  1 >> $(TARGET_CONF)
diff --git a/Makefile.system b/Makefile.system
index e602eaf05..0088eaff5 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -873,6 +873,11 @@ endif
 endif
 endif
 
+ifeq ($(ARCH), csky)
+NO_BINARY_MODE	= 1
+BINARY_DEFINED	= 1
+endif
+
 #
 #  C Compiler dependent settings
 #
diff --git a/TargetList.txt b/TargetList.txt
index deef75819..c11b94fa5 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -133,3 +133,7 @@ E2K
 EV4
 EV5
 EV6
+
+14.CSKY
+CSKY
+CK860FV
diff --git a/c_check b/c_check
index 3e507be81..59ab9bb13 100755
--- a/c_check
+++ b/c_check
@@ -91,6 +91,7 @@ case "$data" in
     *ARCH_ZARCH*) architecture=zarch ;;
     *ARCH_RISCV64*) architecture=riscv64 ;;
     *ARCH_LOONGARCH64*) architecture=loongarch64 ;;
+    *ARCH_CSKY*) architecture=csky ;;
 esac
 
 defined=0
@@ -236,6 +237,7 @@ case "$data" in
     *ARCH_ARM*) architecture=arm ;;
     *ARCH_ZARCH*) architecture=zarch ;;
     *ARCH_LOONGARCH64*) architecture=loongarch64 ;;
+    *ARCH_CSKY*) architecture=csky ;;
 esac
 
 binformat='bin32'
diff --git a/c_check.pl b/c_check.pl
index d9c36793c..6b89f06eb 100644
--- a/c_check.pl
+++ b/c_check.pl
@@ -97,6 +97,7 @@ $architecture = arm64        if ($data =~ /ARCH_ARM64/);
 $architecture = zarch        if ($data =~ /ARCH_ZARCH/);
 $architecture = riscv64      if ($data =~ /ARCH_RISCV64/);
 $architecture = loongarch64  if ($data =~ /ARCH_LOONGARCH64/);
+$architecture = csky         if ($data =~ /ARCH_CSKY/);
 
 $defined = 0;
 
@@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") {
     $binary = 64;
 }
 
+if ($architecture eq "csky") {
+    $defined = 1;
+    $binary = 32;
+}
+
 if ($compiler eq "PGI") {
     $compiler_name .= " -tp p7"    if ($binary eq "32");
     $compiler_name .= " -tp p7-64" if ($binary eq "64");
@@ -284,6 +290,7 @@ $architecture = arm          if ($data =~ /ARCH_ARM/);
 $architecture = arm64        if ($data =~ /ARCH_ARM64/);
 $architecture = zarch        if ($data =~ /ARCH_ZARCH/);
 $architecture = loongarch64  if ($data =~ /ARCH_LOONGARCH64/);
+$architecture = csky         if ($data =~ /ARCH_CSKY/);
 
 $binformat    = bin32;
 $binformat    = bin64  if ($data =~ /BINARY_64/);
diff --git a/common.h b/common.h
index 462c1d428..1f0b9e533 100644
--- a/common.h
+++ b/common.h
@@ -482,6 +482,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_e2k.h"
 #endif
 
+#ifdef ARCH_CSKY
+#include "common_csky.h"
+#endif
+
 #ifndef ASSEMBLER
 #ifdef OS_WINDOWSSTORE
 typedef char env_var_t[MAX_PATH];
diff --git a/common_csky.h b/common_csky.h
new file mode 100644
index 000000000..3095dc781
--- /dev/null
+++ b/common_csky.h
@@ -0,0 +1,56 @@
+/*****************************************************************************
+Copyright (c) 2011-2015, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of 
+      its contributors may be used to endorse or promote products 
+      derived from this software without specific prior written 
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#ifndef COMMON_CSKY
+#define COMMON_CSKY
+
+#define MB  __sync_synchronize()
+#define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
+
+#define INLINE inline
+
+#ifndef ASSEMBLER
+
+
+static inline int blas_quickdivide(blasint x, blasint y){
+  return x / y;
+}
+
+#endif
+
+
+
+#define BUFFER_SIZE     ( 32 << 20)
+#define SEEK_ADDRESS
+
+#endif
\ No newline at end of file
diff --git a/ctest.c b/ctest.c
index 2ccae8dcc..cbc15326f 100644
--- a/ctest.c
+++ b/ctest.c
@@ -173,6 +173,10 @@ HAVE_C11
 ARCH_E2K
 #endif
 
+#if defined(__csky__)
+ARCH_CSKY
+#endif
+
 #if defined(__EMSCRIPTEN__)
 ARCH_RISCV64
 OS_WINDOWS
diff --git a/getarch.c b/getarch.c
index 87384c084..2d26da079 100644
--- a/getarch.c
+++ b/getarch.c
@@ -150,6 +150,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* #define FORCE_EV4		*/
 /* #define FORCE_EV5		*/
 /* #define FORCE_EV6		*/
+/* #define FORCE_CSKY		*/
+/* #define FORCE_CK860FV		*/
 /* #define FORCE_GENERIC	*/
 
 #ifdef FORCE_P2
@@ -1692,6 +1694,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "generic"
 #endif
 
+#ifdef FORCE_CSKY
+#define FORCE
+#define ARCHITECTURE    "CSKY"
+#define SUBARCHITECTURE "CSKY"
+#define SUBDIRNAME      "csky"
+#define ARCHCONFIG   "-DCSKY" \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME   "csky"
+#define CORENAME  "CSKY"
+#endif
+
+#ifdef FORCE_CK860FV
+#define FORCE
+#define ARCHITECTURE    "CSKY"
+#define SUBARCHITECTURE "CK860V"
+#define SUBDIRNAME      "csky"
+#define ARCHCONFIG   "-DCK860FV " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME   "ck860fv"
+#define CORENAME  "CK860FV"
+#endif
+
+
 #ifndef FORCE
 
 #ifdef USER_TARGET
@@ -1766,7 +1795,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 
-
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
 #endif
@@ -1831,7 +1859,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("CORE=%s\n", CORENAME);
 #else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__)
     printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -1979,7 +2007,7 @@ printf("ELF_VERSION=2\n");
 #ifdef FORCE
     printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
 #else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__)
     printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
 #endif
 #endif
diff --git a/kernel/csky/KERNEL b/kernel/csky/KERNEL
new file mode 100644
index 000000000..afa8a0881
--- /dev/null
+++ b/kernel/csky/KERNEL
@@ -0,0 +1,149 @@
+SAMAXKERNEL  = ../arm/amax.c
+DAMAXKERNEL  = ../arm/amax.c
+CAMAXKERNEL  = ../arm/zamax.c
+ZAMAXKERNEL  = ../arm/zamax.c
+
+SAMINKERNEL  = ../arm/amin.c
+DAMINKERNEL  = ../arm/amin.c
+CAMINKERNEL  = ../arm/zamin.c
+ZAMINKERNEL  = ../arm/zamin.c
+
+SMAXKERNEL   = ../arm/max.c
+DMAXKERNEL   = ../arm/max.c
+
+SMINKERNEL   = ../arm/min.c
+DMINKERNEL   = ../arm/min.c
+
+ISAMAXKERNEL = ../arm/iamax.c
+IDAMAXKERNEL = ../arm/iamax.c
+ICAMAXKERNEL = ../arm/izamax.c
+IZAMAXKERNEL = ../arm/izamax.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL  = ../arm/imax.c
+IDMAXKERNEL  = ../arm/imax.c
+
+ISMINKERNEL  = ../arm/imin.c
+IDMINKERNEL  = ../arm/imin.c
+
+SASUMKERNEL  = ../arm/asum.c
+DASUMKERNEL  = ../arm/asum.c
+CASUMKERNEL  = ../arm/zasum.c
+ZASUMKERNEL  = ../arm/zasum.c
+
+SSUMKERNEL  = ../arm/sum.c
+DSUMKERNEL  = ../arm/sum.c
+CSUMKERNEL  = ../arm/zsum.c
+ZSUMKERNEL  = ../arm/zsum.c
+
+SAXPYKERNEL  = ../arm/axpy.c
+DAXPYKERNEL  = ../arm/axpy.c
+CAXPYKERNEL  = ../arm/zaxpy.c
+ZAXPYKERNEL  = ../arm/zaxpy.c
+
+SCOPYKERNEL  = ../arm/copy.c
+DCOPYKERNEL  = ../arm/copy.c
+CCOPYKERNEL  = ../arm/zcopy.c
+ZCOPYKERNEL  = ../arm/zcopy.c
+
+SDOTKERNEL   = ../arm/dot.c
+DDOTKERNEL   = ../arm/dot.c
+CDOTKERNEL   = ../arm/zdot.c
+ZDOTKERNEL   = ../arm/zdot.c
+DSDOTKERNEL  = ../generic/dot.c
+
+SNRM2KERNEL  = ../arm/nrm2.c
+DNRM2KERNEL  = ../arm/nrm2.c
+CNRM2KERNEL  = ../arm/znrm2.c
+ZNRM2KERNEL  = ../arm/znrm2.c
+
+SROTKERNEL   = ../arm/rot.c
+DROTKERNEL   = ../arm/rot.c
+CROTKERNEL   = ../arm/zrot.c
+ZROTKERNEL   = ../arm/zrot.c
+
+SSCALKERNEL  = ../arm/scal.c
+DSCALKERNEL  = ../arm/scal.c
+CSCALKERNEL  = ../arm/zscal.c
+ZSCALKERNEL  = ../arm/zscal.c
+
+SSWAPKERNEL  = ../arm/swap.c
+DSWAPKERNEL  = ../arm/swap.c
+CSWAPKERNEL  = ../arm/zswap.c
+ZSWAPKERNEL  = ../arm/zswap.c
+
+SGEMVNKERNEL = ../arm/gemv_n.c
+DGEMVNKERNEL = ../arm/gemv_n.c
+CGEMVNKERNEL = ../arm/zgemv_n.c
+ZGEMVNKERNEL = ../arm/zgemv_n.c
+
+SGEMVTKERNEL = ../arm/gemv_t.c
+DGEMVTKERNEL = ../arm/gemv_t.c
+CGEMVTKERNEL = ../arm/zgemv_t.c
+ZGEMVTKERNEL = ../arm/zgemv_t.c
+
+STRMMKERNEL	= ../generic/trmmkernel_2x2.c
+DTRMMKERNEL	= ../generic/trmmkernel_2x2.c
+CTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+
+SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+
+SCABS_KERNEL	= ../generic/cabs.c
+DCABS_KERNEL	= ../generic/cabs.c
+QCABS_KERNEL	= ../generic/cabs.c
+LSAME_KERNEL	= ../generic/lsame.c
+
+SGEMM_BETA = ../generic/gemm_beta.c
+DGEMM_BETA = ../generic/gemm_beta.c
+CGEMM_BETA = ../generic/zgemm_beta.c
+ZGEMM_BETA = ../generic/zgemm_beta.c
+
+
diff --git a/kernel/csky/Makefile b/kernel/csky/Makefile
new file mode 100644
index 000000000..520349bd6
--- /dev/null
+++ b/kernel/csky/Makefile
@@ -0,0 +1 @@
+clean ::
diff --git a/lapack/laswp/csky/Makefile b/lapack/laswp/csky/Makefile
new file mode 100644
index 000000000..75411deb5
--- /dev/null
+++ b/lapack/laswp/csky/Makefile
@@ -0,0 +1,13 @@
+TOPDIR	= ../../..
+include ../../../Makefile.system
+
+ifndef LASWP
+LASWP	= ../generic/laswp_k.c
+endif
+
+ifndef ZLASWP
+ZLASWP	= ../generic/zlaswp_k.c
+endif
+
+include ../generic/Makefile
+
diff --git a/param.h b/param.h
index 469c38ce3..e048dabe7 100644
--- a/param.h
+++ b/param.h
@@ -3807,7 +3807,44 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 #define SYMV_P	16
 #endif
 
+#if defined(CSKY) || defined(CK860FV)
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  2
+#define SGEMM_DEFAULT_UNROLL_N  2
+
+#define DGEMM_DEFAULT_UNROLL_M  2
+#define DGEMM_DEFAULT_UNROLL_N  2
+
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  2
+
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  2
+
+#define SGEMM_DEFAULT_P	128
+#define DGEMM_DEFAULT_P	128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P	16
 
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
 
 #ifdef GENERIC
 

From ec46ca7a4344dac05f16ce29e3be86368f595d0c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 17 Jan 2024 07:33:10 +0100
Subject: [PATCH 584/718] Support Arm Compiler for Linux as classic flang
 (#4436)

* Support ArmCompilerforLinux as classic flang
---
 f_check | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/f_check b/f_check
index dac34edee..81f598ffa 100755
--- a/f_check
+++ b/f_check
@@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then
 	      pathf90 pathf95
 	      pgf95 pgf90 pgf77 pgfortran nvfortran
 	      flang egfortran
-              ifort nagfor ifx ftn crayftn"
+              ifort nagfor ifx ftn crayftn armflang"
 
     for list in $lists; do
         for p in $path; do
@@ -85,7 +85,11 @@ else
 	    *Hewlett*)
 		vendor=CRAY
 		openmp='-fopenmp'
-		;;		
+		;;
+   	    *Arm\ F90*)
+		vendor=FLANG
+		openmp='-fopenmp'
+		;;	
             *GNU*|*GCC*)
 
                 v="${data#*GCC: *\) }"
@@ -108,7 +112,7 @@ else
                 	    if [ "$major" -ge 17 ]; then
                         	vendor=FLANGNEW
 			    fi	
-			;;
+			    ;;
                         *ifort*|*ifx*)
                             vendor=INTEL
                             openmp='-fopenmp'

From d2fc4f3b4d7f41527bc7dc8f62e9aa6229cfac89 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 17 Jan 2024 20:59:24 +0100
Subject: [PATCH 585/718] Increase multithreading threshold by a factor of 50

---
 interface/gemv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/gemv.c b/interface/gemv.c
index 1f0763579..2c121f130 100644
--- a/interface/gemv.c
+++ b/interface/gemv.c
@@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
 
 #ifdef SMP
 
-  if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
+  if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
     nthreads = 1;
   else
     nthreads = num_cpu_avail(2);

From a4992e09bc75cf95ea829c8ce3a8e2ea09d3d1fc Mon Sep 17 00:00:00 2001
From: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Date: Thu, 18 Jan 2024 18:20:37 +0400
Subject: [PATCH 586/718] Fix utest compilation

Introduced recently when adding new test cases for ZSCAL

- include cblas is needed for cblas_zscal
- ASSERT macro does not exist
- missing closing )
---
 utest/utest_main2.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/utest/utest_main2.c b/utest/utest_main2.c
index 8cb663190..46a3b067d 100644
--- a/utest/utest_main2.c
+++ b/utest/utest_main2.c
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CTEST_SEGFAULT
 #define CTEST_ADD_TESTS_MANUALLY
 
+#include "cblas.h"
 #include "openblas_utest.h"
 
 CTEST(amax, samax){
@@ -623,10 +624,10 @@ CTEST(zscal, i_nan)
     double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
     double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
     cblas_zscal(9, i, &nan, 1);
-    ASSERT(isnan(nan[0]);
-    ASSERT(isnan(nan[1]);
-    ASSERT(isnan(nan[16]);
-    ASSERT(isnan(nan[17]);
+    ASSERT_TRUE(isnan(nan[0]));
+    ASSERT_TRUE(isnan(nan[1]));
+    ASSERT_TRUE(isnan(nan[16]));
+    ASSERT_TRUE(isnan(nan[17]));
 }
 
 CTEST(zscal, nan_i)
@@ -634,10 +635,10 @@ CTEST(zscal, nan_i)
     double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
     double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
     cblas_zscal(9, &nan, &i, 1);
-    ASSERT(isnan(i[0]);
-    ASSERT(isnan(i[1]);
-    ASSERT(isnan(i[16]);
-    ASSERT(isnan(i[17]);
+    ASSERT_TRUE(isnan(i[0]));
+    ASSERT_TRUE(isnan(i[1]));
+    ASSERT_TRUE(isnan(i[16]));
+    ASSERT_TRUE(isnan(i[17]));
 	    }    
 	    
 CTEST(zscal, i_inf)
@@ -645,10 +646,10 @@ CTEST(zscal, i_inf)
     double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
     double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
     cblas_zscal(9, i, &inf, 1);
-    ASSERT(isnan(inf[0]);
-    ASSERT(isinf(inf[1]);
-    ASSERT(isnan(inf[16]);
-    ASSERT(isinf(inf[17]);
+    ASSERT_TRUE(isnan(inf[0]));
+    ASSERT_TRUE(isinf(inf[1]));
+    ASSERT_TRUE(isnan(inf[16]));
+    ASSERT_TRUE(isinf(inf[17]));
 }    
 
 CTEST(zscal, inf_i)
@@ -656,10 +657,10 @@ CTEST(zscal, inf_i)
     double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
     double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
     cblas_zscal(9, &inf, &i, 1);
-    ASSERT(isnan(i[0]);
-    ASSERT(isinf(i[1]);
-    ASSERT(isnan(i[16]);
-    ASSERT(isinf(i[17]);
+    ASSERT_TRUE(isnan(i[0]));
+    ASSERT_TRUE(isinf(i[1]));
+    ASSERT_TRUE(isnan(i[16]));
+    ASSERT_TRUE(isinf(i[17]));
 }
 
 int main(int argc, const char ** argv){

From b193ea3d7b6ae9950d6c7c73937661318a719c26 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Thu, 18 Jan 2024 22:11:12 +0300
Subject: [PATCH 587/718] Fix BLAS and LAPACK tests for RVV 1.0 target, update
 to 0.12.0 intrincics

* Update intrincics API to 0.12.0 version (Stride Segment Loads/Stores)
* Fixed nrm2, axpby, ncopy, zgemv and scal kernels
* Added zero size checks
---
 kernel/riscv64/axpby_rvv.c             |   8 +-
 kernel/riscv64/copy_rvv.c              |   2 +-
 kernel/riscv64/gemm_ncopy_8_rvv.c      |  51 ++++-
 kernel/riscv64/gemm_tcopy_8_rvv.c      |  71 +++---
 kernel/riscv64/izamax_rvv.c            |  23 +-
 kernel/riscv64/izamin_rvv.c            |  23 +-
 kernel/riscv64/nrm2_rvv.c              | 237 +++++++++++++------
 kernel/riscv64/scal_rvv.c              |  49 ++--
 kernel/riscv64/symv_U_rvv.c            |   2 +-
 kernel/riscv64/trsm_kernel_LN_rvv_v1.c |  39 ++--
 kernel/riscv64/trsm_kernel_LT_rvv_v1.c |  39 ++--
 kernel/riscv64/trsm_kernel_RN_rvv_v1.c |  37 +--
 kernel/riscv64/trsm_kernel_RT_rvv_v1.c |  33 ++-
 kernel/riscv64/zamax_rvv.c             |  23 +-
 kernel/riscv64/zamin_rvv.c             |  23 +-
 kernel/riscv64/zaxpby_rvv.c            |  63 ++++--
 kernel/riscv64/zaxpy_rvv.c             |  76 +++++--
 kernel/riscv64/zcopy_rvv.c             |  40 ++--
 kernel/riscv64/zdot_rvv.c              |  49 +++-
 kernel/riscv64/zgemm_beta_rvv.c        |  27 ++-
 kernel/riscv64/zgemm_ncopy_4_rvv.c     |  72 ++++--
 kernel/riscv64/zgemm_ncopy_rvv_v1.c    |  18 +-
 kernel/riscv64/zgemm_tcopy_4_rvv.c     |  60 +++--
 kernel/riscv64/zgemm_tcopy_rvv_v1.c    |  18 +-
 kernel/riscv64/zgemmkernel_rvv_v1x4.c  | 144 +++++++++---
 kernel/riscv64/zgemv_n_rvv.c           |  50 ++--
 kernel/riscv64/zgemv_t_rvv.c           |  44 ++--
 kernel/riscv64/zhemm_ltcopy_rvv_v1.c   |  33 ++-
 kernel/riscv64/zhemm_utcopy_rvv_v1.c   |  33 ++-
 kernel/riscv64/znrm2_rvv.c             | 301 +++++++++++++++++++------
 kernel/riscv64/zrot_rvv.c              |  95 ++++++--
 kernel/riscv64/zscal_rvv.c             |  66 ++++--
 kernel/riscv64/zsum_rvv.c              |  23 +-
 kernel/riscv64/zswap_rvv.c             |  62 ++---
 kernel/riscv64/zsymm_lcopy_rvv_v1.c    |  33 ++-
 kernel/riscv64/zsymm_ucopy_rvv_v1.c    |  33 ++-
 kernel/riscv64/ztrmm_lncopy_rvv_v1.c   |  30 ++-
 kernel/riscv64/ztrmm_ltcopy_rvv_v1.c   |  30 ++-
 kernel/riscv64/ztrmm_uncopy_rvv_v1.c   |  30 ++-
 kernel/riscv64/ztrmm_utcopy_rvv_v1.c   |  31 ++-
 kernel/riscv64/ztrmmkernel_rvv_v1x4.c  | 110 ++++++---
 kernel/riscv64/ztrsm_lncopy_rvv_v1.c   |  26 +--
 kernel/riscv64/ztrsm_ltcopy_rvv_v1.c   |  26 +--
 kernel/riscv64/ztrsm_uncopy_rvv_v1.c   |  26 +--
 kernel/riscv64/ztrsm_utcopy_rvv_v1.c   |  26 +--
 param.h                                |   2 +-
 46 files changed, 1628 insertions(+), 709 deletions(-)

diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c
index a1dbdb0e4..d7fb86eab 100644
--- a/kernel/riscv64/axpby_rvv.c
+++ b/kernel/riscv64/axpby_rvv.c
@@ -53,7 +53,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
 {
     FLOAT_V_T vx, vy;
 
-    if ( n < 0     )  return(0);
+    if ( n <= 0     )  return(0);
 
     if ( beta == 0.0 ) {
         if ( alpha == 0.0 ) {
@@ -63,7 +63,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
                 BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 size_t vl = VSETVL(n);
                 vy = VFMVVF_FLOAT(0.0, vl);
-                for ( ; n > 0; n -= vl, y += vl*stride_y) {
+                for ( ; n > 0; n -= vl, y += vl*inc_y) {
                     vl = VSETVL(n);
                     VSSEV_FLOAT(y, stride_y, vy, vl);
                 }
@@ -126,10 +126,12 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
 
         } else {
             if ((1 == inc_x) && (1 == inc_y)) {
-                for (size_t vl; n > 0; n -= vl, y += vl) {
+                for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
                     vl = VSETVL(n);
+                    vx = VLEV_FLOAT(x, vl);
                     vy = VLEV_FLOAT(y, vl);
                     vy = VFMULVF_FLOAT(vy, beta, vl);
+                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
                     VSEV_FLOAT (y, vy, vl);
                 }
             } else if (1 == inc_x) {
diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c
index 041fd2dae..9d4b84095 100644
--- a/kernel/riscv64/copy_rvv.c
+++ b/kernel/riscv64/copy_rvv.c
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
-    if(n < 0)  return(0);
+    if(n <= 0)  return(0);
 
     FLOAT_V_T v0;
 
diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c
index 3030d67fb..c652ab0c0 100644
--- a/kernel/riscv64/gemm_ncopy_8_rvv.c
+++ b/kernel/riscv64/gemm_ncopy_8_rvv.c
@@ -30,19 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m1(n)
 #define FLOAT_V_T               vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m1x2_t
+#define FLOAT_VX4_T             vfloat32m1x4_t
+#define FLOAT_VX8_T             vfloat32m1x8_t
+#define VSET_VX2                __riscv_vset_v_f32m1_f32m1x2
+#define VSET_VX4                __riscv_vset_v_f32m1_f32m1x4
+#define VSET_VX8                __riscv_vset_v_f32m1_f32m1x8
 #define VLEV_FLOAT              __riscv_vle32_v_f32m1
 #define VSEV_FLOAT              __riscv_vse32_v_f32m1
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1
-#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1
-#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m1(n)
 #define FLOAT_V_T               vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m1x2_t
+#define FLOAT_VX4_T             vfloat64m1x4_t
+#define FLOAT_VX8_T             vfloat64m1x8_t
+#define VSET_VX2                __riscv_vset_v_f64m1_f64m1x2
+#define VSET_VX4                __riscv_vset_v_f64m1_f64m1x4
+#define VSET_VX8                __riscv_vset_v_f64m1_f64m1x8
 #define VLEV_FLOAT              __riscv_vle64_v_f64m1
 #define VSEV_FLOAT              __riscv_vse64_v_f64m1
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1
-#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1
-#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
 #endif
 
 // Optimizes the implementation in ../generic/gemm_ncopy_8.c
@@ -57,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
     FLOAT *b_offset;
 
     FLOAT_V_T v1, v2, v3, v4, v5, v6, v7, v8;
+    FLOAT_VX2_T vx2;
+    FLOAT_VX4_T vx4;
+    FLOAT_VX8_T vx8;
+
     size_t vl;
 
     //fprintf(stderr, "gemm_ncopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
@@ -87,7 +103,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
             v7 = VLEV_FLOAT(a_offset7, vl);
             v8 = VLEV_FLOAT(a_offset8, vl);
 
-            VSSEG8_FLOAT(b_offset, v1, v2, v3, v4, v5, v6, v7, v8, vl);
+            vx8 = VSET_VX8(vx8, 0, v1);
+            vx8 = VSET_VX8(vx8, 1, v2);
+            vx8 = VSET_VX8(vx8, 2, v3);
+            vx8 = VSET_VX8(vx8, 3, v4);
+            vx8 = VSET_VX8(vx8, 4, v5);
+            vx8 = VSET_VX8(vx8, 5, v6);
+            vx8 = VSET_VX8(vx8, 6, v7);
+            vx8 = VSET_VX8(vx8, 7, v8);
+
+            VSSEG8_FLOAT(b_offset, vx8, vl);
 
             a_offset1 += vl;
             a_offset2 += vl;
@@ -116,7 +141,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
             v3 = VLEV_FLOAT(a_offset3, vl);
             v4 = VLEV_FLOAT(a_offset4, vl);
 
-            VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);
+            vx4 = VSET_VX4(vx4, 0, v1);
+            vx4 = VSET_VX4(vx4, 1, v2);
+            vx4 = VSET_VX4(vx4, 2, v3);
+            vx4 = VSET_VX4(vx4, 3, v4);
+
+            VSSEG4_FLOAT(b_offset, vx4, vl);
 
             a_offset1 += vl;
             a_offset2 += vl;
@@ -137,7 +167,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
             v1 = VLEV_FLOAT(a_offset1, vl);
             v2 = VLEV_FLOAT(a_offset2, vl);
 
-            VSSEG2_FLOAT(b_offset, v1, v2, vl);
+            vx2 = VSET_VX2(vx2, 0, v1);
+            vx2 = VSET_VX2(vx2, 1, v2);
+
+            VSSEG2_FLOAT(b_offset, vx2, vl);
 
             a_offset1 += vl;
             a_offset2 += vl;
diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c
index 080a87312..4742ae6a7 100644
--- a/kernel/riscv64/gemm_tcopy_8_rvv.c
+++ b/kernel/riscv64/gemm_tcopy_8_rvv.c
@@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m1(n)
 #define FLOAT_V_T               vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m1x2_t
+#define FLOAT_VX4_T             vfloat32m1x4_t
+#define FLOAT_VX8_T             vfloat32m1x8_t
 #define VLEV_FLOAT              __riscv_vle32_v_f32m1
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m1
 #define VSEV_FLOAT              __riscv_vse32_v_f32m1
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1
-#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1
-#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1
-#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1
-#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1x4
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1x8
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m1(n)
 #define FLOAT_V_T               vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m1x2_t
+#define FLOAT_VX4_T             vfloat64m1x4_t
+#define FLOAT_VX8_T             vfloat64m1x8_t
 #define VLEV_FLOAT              __riscv_vle64_v_f64m1
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m1
 #define VSEV_FLOAT              __riscv_vse64_v_f64m1
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1
-#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1
-#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1
-#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1
-#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1x4
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1x8
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
@@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
 
     IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
 
-    FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
+    FLOAT_V_T v0;
+    FLOAT_VX2_T vx2;
+    FLOAT_VX4_T vx4;
+    FLOAT_VX8_T vx8;
 
     // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
 
@@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         for(i = (n >> 3); i > 0; i--) {
             size_t vl = 8;
 
-            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
 
             aoffset1 += 8;
             boffset1 += m * 8;
@@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         if (n & 4) {
             size_t vl = 8;
 
-            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
 
             aoffset1 += 4;
             boffset2 += 32;
@@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         if (n & 2) {
             size_t vl = 8;
 
-            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
 
             aoffset1 += 2;
             boffset3 += 16;
@@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         for(i = (n >> 3); i > 0; i--) {
             size_t vl = 4;
 
-            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
 
             aoffset1 += 8;
             boffset1 += m * 8;
@@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         if (n & 4) {
             size_t vl = 4;
 
-            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
 
             aoffset1 += 4;
             boffset2 += 16;
@@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         if (n & 2) {
             size_t vl = 4;
 
-            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
 
             aoffset1 += 2;
             boffset3 += 8;
@@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         for(i = (n >> 3); i > 0; i--) {
             size_t vl = 2;
 
-            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
 
             aoffset1 += 8;
             boffset1 += m * 8;
@@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         if (n & 4) {
             size_t vl = 2;
 
-            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
 
             aoffset1 += 4;
             boffset2 += 8;
@@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         if (n & 2) {
             size_t vl = 2;
 
-            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
 
             aoffset1 += 2;
             boffset3 += 4;
diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c
index e93f0056c..32f66a7a7 100644
--- a/kernel/riscv64/izamax_rvv.c
+++ b/kernel/riscv64/izamax_rvv.c
@@ -32,10 +32,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
 #define FLOAT_V_T               vfloat64m4_t
 #define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
 #define VLEV_FLOAT              __riscv_vle64_v_f64m4
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
 #define MASK_T                  vbool16_t
 #define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m4_b16
@@ -61,10 +63,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
 #define FLOAT_V_T               vfloat32m4_t
 #define FLOAT_V_T_M1            vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
 #define VLEV_FLOAT              __riscv_vle32_v_f32m4
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1
 #define MASK_T                  vbool8_t
 #define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m4_b8
@@ -93,6 +97,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     if (n <= 0 || inc_x <= 0) return(max_index);
 
     FLOAT_V_T vx0, vx1, v_max;
+    FLOAT_VX2_T vxx2;
     UINT_V_T v_max_index;
     MASK_T mask;
   
@@ -107,7 +112,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
 
             vx0 = VFABSV_FLOAT(vx0, vl);
             vx1 = VFABSV_FLOAT(vx1, vl);
@@ -129,7 +137,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
             vl = VSETVL(n);
         
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
         
             vx0 = VFABSV_FLOAT(vx0, vl);
             vx1 = VFABSV_FLOAT(vx1, vl);
diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c
index b5bc27404..d34b220fa 100644
--- a/kernel/riscv64/izamin_rvv.c
+++ b/kernel/riscv64/izamin_rvv.c
@@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
 #define FLOAT_V_T               vfloat64m4_t
 #define FLOAT_V_T_M1            vfloat64m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m4_f64m1
 #define MASK_T                  vbool16_t
 #define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m4_b16
@@ -60,8 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
 #define FLOAT_V_T               vfloat32m4_t
 #define FLOAT_V_T_M1            vfloat32m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m4_f32m1
 #define MASK_T                  vbool8_t
 #define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m4_b8
@@ -90,6 +94,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     if (n <= 0 || inc_x <= 0) return(min_index);
 
     FLOAT_V_T vx0, vx1, v_min;
+    FLOAT_VX2_T vxx2;
     UINT_V_T v_min_index;
     MASK_T mask;
   
@@ -104,7 +109,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
 
             vx0 = VFABSV_FLOAT(vx0, vl);
             vx1 = VFABSV_FLOAT(vx1, vl);
@@ -127,7 +135,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
 
             vx0 = VFABSV_FLOAT(vx0, vl);
             vx1 = VFABSV_FLOAT(vx1, vl);
diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c
index 994fadb70..3eb423849 100644
--- a/kernel/riscv64/nrm2_rvv.c
+++ b/kernel/riscv64/nrm2_rvv.c
@@ -26,78 +26,187 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#include <math.h>
-
-#if !defined(DOUBLE)
-#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
-#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
-#define FLOAT_V_T               vfloat32m8_t
-#define FLOAT_V_T_M1            vfloat32m1_t
-#define VLEV_FLOAT              __riscv_vle32_v_f32m8
-#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
-#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1
-#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m8_tu
-#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
-#define ABS fabsf
-#else
-#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
-#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
-#define FLOAT_V_T               vfloat64m8_t
-#define FLOAT_V_T_M1            vfloat64m1_t
-#define VLEV_FLOAT              __riscv_vle64_v_f64m8
-#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
-#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
-#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m8_tu
-#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+
+#if defined(DOUBLE)
+#define VSETVL             __riscv_vsetvl_e64m4
+#define FLOAT_V_T           vfloat64m4_t
+#define FLOAT_V_T_M1        vfloat64m1_t
+#define VLEV_FLOAT          __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT         __riscv_vlse64_v_f64m4
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f64m4
+#define VFMVSF_FLOAT        __riscv_vfmv_s_f_f64m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f64m1
+#define MASK_T              vbool16_t
+#define VFABS               __riscv_vfabs_v_f64m4
+#define VMFNE               __riscv_vmfne_vf_f64m4_b16
+#define VMFGT               __riscv_vmfgt_vv_f64m4_b16
+#define VMFEQ               __riscv_vmfeq_vf_f64m4_b16
+#define VCPOP               __riscv_vcpop_m_b16
+#define VFREDMAX            __riscv_vfredmax_vs_f64m4_f64m1
+#define VFREDMIN            __riscv_vfredmin_vs_f64m4_f64m1
+#define VFIRST              __riscv_vfirst_m_b16
+#define VRGATHER            __riscv_vrgather_vx_f64m4
+#define VFDIV               __riscv_vfdiv_vv_f64m4
+#define VFDIV_M             __riscv_vfdiv_vv_f64m4_mu
+#define VFMUL               __riscv_vfmul_vv_f64m4
+#define VFMUL_M             __riscv_vfmul_vv_f64m4_mu
+#define VFMACC              __riscv_vfmacc_vv_f64m4
+#define VFMACC_M            __riscv_vfmacc_vv_f64m4_mu
+#define VMSBF               __riscv_vmsbf_m_b16
+#define VMSOF               __riscv_vmsof_m_b16
+#define VMAND               __riscv_vmand_mm_b16
+#define VMANDN              __riscv_vmand_mm_b16
+#define VFREDSUM            __riscv_vfredusum_vs_f64m4_f64m1
+#define VMERGE              __riscv_vmerge_vvm_f64m4
+#define VSEV_FLOAT          __riscv_vse64_v_f64m4
+#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v)
 #define ABS fabs
+#else
+#define VSETVL              __riscv_vsetvl_e32m4
+#define FLOAT_V_T           vfloat32m4_t
+#define FLOAT_V_T_M1        vfloat32m1_t
+#define VLEV_FLOAT          __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT         __riscv_vlse32_v_f32m4
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f32m4
+#define VFMVSF_FLOAT        __riscv_vfmv_s_f_f32m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f32m1
+#define MASK_T              vbool8_t
+#define VFABS               __riscv_vfabs_v_f32m4
+#define VMFNE               __riscv_vmfne_vf_f32m4_b8
+#define VMFGT               __riscv_vmfgt_vv_f32m4_b8
+#define VMFEQ               __riscv_vmfeq_vf_f32m4_b8
+#define VCPOP               __riscv_vcpop_m_b8
+#define VFREDMAX            __riscv_vfredmax_vs_f32m4_f32m1
+#define VFREDMIN            __riscv_vfredmin_vs_f32m4_f32m1
+#define VFIRST              __riscv_vfirst_m_b8
+#define VRGATHER            __riscv_vrgather_vx_f32m4
+#define VFDIV               __riscv_vfdiv_vv_f32m4
+#define VFDIV_M             __riscv_vfdiv_vv_f32m4_mu
+#define VFMUL               __riscv_vfmul_vv_f32m4
+#define VFMUL_M             __riscv_vfmul_vv_f32m4_mu
+#define VFMACC              __riscv_vfmacc_vv_f32m4
+#define VFMACC_M            __riscv_vfmacc_vv_f32m4_mu
+#define VMSBF               __riscv_vmsbf_m_b8
+#define VMSOF               __riscv_vmsof_m_b8
+#define VMAND               __riscv_vmand_mm_b8
+#define VMANDN              __riscv_vmand_mm_b8
+#define VFREDSUM            __riscv_vfredusum_vs_f32m4_f32m1
+#define VMERGE              __riscv_vmerge_vvm_f32m4
+#define VSEV_FLOAT          __riscv_vse32_v_f32m4
+#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v)
+#define ABS fabsf
 #endif
 
-
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-
-    if( n <= 0 ) return(0.0);
-    if(n == 1) return (ABS(x[0]));
-
-    FLOAT_V_T vr, v0;
-    FLOAT_V_T_M1 v_res;
-    FLOAT ssq = 0.0;
-
-    size_t vlmax = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax);
-
-    vr = VFMVVF_FLOAT(0, vlmax);
- 
-    if(inc_x == 1) {
-
-        for (size_t vl; n > 0; n -= vl, x += vl) {
-            vl = VSETVL(n);
-
-            v0 = VLEV_FLOAT(x, vl);
-
-            vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl);
+	BLASLONG i=0;
+
+	if (n <= 0 || inc_x <= 0) return(0.0);
+        if(n == 1) return (ABS(x[0]));
+
+        unsigned int gvl = 0;
+
+        MASK_T nonzero_mask;
+        MASK_T scale_mask;
+
+        gvl = VSETVL(n);
+        FLOAT_V_T v0;
+        FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
+        FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
+
+        FLOAT scale = 0;
+        FLOAT ssq = 0;
+        unsigned int stride_x = inc_x * sizeof(FLOAT);
+        int idx = 0;
+
+        if( n >= gvl ) // don't pay overheads if we're not doing useful work
+        {
+                for(i=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
+                        nonzero_mask = VMFNE( v0, 0, gvl );
+                        v0 = VFABS( v0, gvl );
+                        scale_mask = VMFGT( v0, v_scale, gvl );
+
+                        // assume scale changes are relatively infrequent
+
+                        // unclear if the vcpop+branch is actually a win
+                        // since the operations being skipped are predicated anyway
+                        // need profiling to confirm
+                        if( VCPOP(scale_mask, gvl) ) 
+                        {
+                                v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl );
+                                v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl );
+                                v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl );
+                                v_scale = VMERGE( v_scale, v0, scale_mask, gvl );
+                        }
+                        v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl );
+                        v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
+                        idx += inc_x * gvl;
+                }
+
+                // we have gvl elements which we accumulated independently, with independent scales
+                // we need to combine these
+                // naive sort so we process small values first to avoid losing information
+                // could use vector sort extensions where available, but we're dealing with gvl elts at most
+
+                FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
+                FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
+                VSEV_FLOAT( out_ssq, v_ssq, gvl );
+                VSEV_FLOAT( out_scale, v_scale, gvl );
+                for( int a = 0; a < (gvl-1); ++a )
+                {
+                        int smallest = a;
+                        for( size_t b = a+1; b < gvl; ++b )
+                                if( out_scale[b] < out_scale[smallest] )
+                                        smallest = b;
+                        if( smallest != a )
+                        {
+                                FLOAT tmp1 = out_ssq[a];
+                                FLOAT tmp2 = out_scale[a];
+                                out_ssq[a] = out_ssq[smallest];
+                                out_scale[a] = out_scale[smallest];
+                                out_ssq[smallest] = tmp1;
+                                out_scale[smallest] = tmp2;
+                        }
+                }
+
+                int a = 0;
+                while( a<gvl && out_scale[a] == 0 )
+                        ++a;
+
+                if( a < gvl ) 
+                {
+                        ssq = out_ssq[a];
+                        scale = out_scale[a];
+                        ++a;
+                        for( ; a < gvl; ++a ) 
+                        {
+                                ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
+                                scale = out_scale[a];
+                        }
+                }
         }
 
-    } else {
- 
-        BLASLONG stride_x = inc_x * sizeof(FLOAT);
-
-        for (size_t vl; n > 0; n -= vl, x += vl * inc_x) {
-            vl = VSETVL(n);
-
-            v0 = VLSEV_FLOAT(x, stride_x, vl);
-
-            vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl);
+        //finish any tail using scalar ops
+        i*=gvl*inc_x;
+        n*=inc_x;
+        while(i < n){
+                if ( x[i] != 0.0 ){
+                        FLOAT absxi = ABS( x[i] );
+                        if ( scale < absxi ){
+                                ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
+                                scale = absxi ;
+                        }
+                        else{
+                                ssq += ( absxi/scale ) * ( absxi/scale );
+                        }
+
+                }
+
+                i += inc_x;
         }
-    }
 
-    v_res = VFREDSUM_FLOAT(vr, v_res, vlmax);
+	return(scale * sqrt(ssq));
+}
 
-    ssq = VFMVFS_FLOAT_M1(v_res);
 
-    return sqrt(ssq);
-}
diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c
index 2e2cfd31e..2c273fb63 100644
--- a/kernel/riscv64/scal_rvv.c
+++ b/kernel/riscv64/scal_rvv.c
@@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
 #define FLOAT_V_T               vfloat32m8_t
 #define VLEV_FLOAT              __riscv_vle32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
@@ -38,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
 #define FLOAT_V_T               vfloat64m8_t
 #define VLEV_FLOAT              __riscv_vle64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
@@ -54,26 +56,41 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
     FLOAT_V_T v0;
  
     if(inc_x == 1) {
-
-        for (size_t vl; n > 0; n -= vl, x += vl) {
-            vl = VSETVL(n);
-
-            v0 = VLEV_FLOAT(x, vl);
-            v0 = VFMULVF_FLOAT(v0, da, vl);
-            VSEV_FLOAT(x, v0, vl);
+        if(da == 0.0) {
+            int gvl = VSETVL_MAX;
+            v0 = VFMVVF_FLOAT(0.0, gvl);
+            for (size_t vl; n > 0; n -= vl, x += vl) {
+                vl = VSETVL(n);
+                VSEV_FLOAT(x, v0, vl);
+            }
         }
-
-    } else { 
+        else {
+            for (size_t vl; n > 0; n -= vl, x += vl) {
+                vl = VSETVL(n);
+                v0 = VLEV_FLOAT(x, vl);
+                v0 = VFMULVF_FLOAT(v0, da, vl);
+                VSEV_FLOAT(x, v0, vl);
+            }
+        }
+    } else {
         BLASLONG stride_x = inc_x * sizeof(FLOAT);
 
-        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
-            vl = VSETVL(n);
-
-            v0 = VLSEV_FLOAT(x, stride_x, vl);
-            v0 = VFMULVF_FLOAT(v0, da, vl);
-            VSSEV_FLOAT(x, stride_x, v0, vl);
+        if(da == 0.0) {
+            int gvl = VSETVL_MAX;
+            v0 = VFMVVF_FLOAT(0.0, gvl);
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+                vl = VSETVL(n);
+                VSSEV_FLOAT(x, stride_x, v0, vl);
+            }
+        }
+        else {
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+                vl = VSETVL(n);
+                v0 = VLSEV_FLOAT(x, stride_x, vl);
+                v0 = VFMULVF_FLOAT(v0, da, vl);
+                VSSEV_FLOAT(x, stride_x, v0, vl);
+            }
         }
-
     }
 
     return 0;
diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c
index 3cfd3ee4c..bcd2f6981 100644
--- a/kernel/riscv64/symv_U_rvv.c
+++ b/kernel/riscv64/symv_U_rvv.c
@@ -82,7 +82,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
         FLOAT_V_T va, vx, vy, vr;
         BLASLONG stride_x, stride_y, inc_xv, inc_yv;
-
+        
         BLASLONG m1 = m - offset;
         if(inc_x == 1 && inc_y == 1)
         {
diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
index 886af0c3b..869561fb3 100644
--- a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
@@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
@@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
@@ -140,6 +144,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
     BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;
 
+    FLOAT_VX2_T vbx2, vsx2, vcx2;
     FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
     size_t vl;
     a += (m - 1) * m * 2;
@@ -153,7 +158,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
         for (j = n; j > 0; j -= vl) {
             vl = VSETVL(j);
-            VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl);
+            vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl);
+            vb1 = VGET_VX2(vbx2, 0);
+            vb2 = VGET_VX2(vbx2, 1);
 #ifndef CONJ
             vs1 =   VFMULVF_FLOAT(vb1, aa1, vl);
             vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
@@ -165,12 +172,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
             vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
             vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
 #endif
-            VSSEG2_FLOAT(b, vs1, vs2, vl);
-            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl);
+            vsx2 = VSET_VX2(vsx2, 0, vs1);
+            vsx2 = VSET_VX2(vsx2, 1, vs2);
+            VSSEG2_FLOAT(b, vsx2, vl);
+            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl);
             b   += vl * 2;
 
             for (k = 0; k < i; k ++) {
-                VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl);
+                vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl);
+                vc1 = VGET_VX2(vcx2, 0);
+                vc2 = VGET_VX2(vcx2, 1);
 #ifndef CONJ
                 vc1 =  VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
                 vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
@@ -182,7 +193,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
                 vc2 =  VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
                 vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
 #endif
-                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl);
+                vcx2 = VSET_VX2(vcx2, 0, vc1);
+                vcx2 = VSET_VX2(vcx2, 1, vc2);
+                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl);
             }
             pc  += vl * ldc * 2;
         }
diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
index ddeef966c..da443cfba 100644
--- a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
@@ -31,13 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
@@ -45,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
@@ -137,6 +141,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
     BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;
 
+    FLOAT_VX2_T vbx2, vsx2, vcx2;
     FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
     size_t vl;
 
@@ -149,7 +154,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
         for (j = n; j > 0; j -= vl) {
             vl = VSETVL(j);
-            VLSSEG2_FLOAT(&vb1, &vb2, pc + i * 2, stride_ldc, vl);
+            vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl);
+            vb1 = VGET_VX2(vbx2, 0);
+            vb2 = VGET_VX2(vbx2, 1);
 #ifndef CONJ
             vs1 =   VFMULVF_FLOAT(vb1, aa1, vl);
             vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
@@ -161,12 +168,16 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
             vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
             vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
 #endif
-            VSSEG2_FLOAT(b, vs1, vs2, vl);
-            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vs1, vs2, vl);
+            vsx2 = VSET_VX2(vsx2, 0, vs1);
+            vsx2 = VSET_VX2(vsx2, 1, vs2);
+            VSSEG2_FLOAT(b, vsx2, vl);
+            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl);
             b   += vl * 2;
 
             for (k = i + 1; k < m; k++) {
-                VLSSEG2_FLOAT(&vc1, &vc2, pc + k * 2, stride_ldc, vl);
+                vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl);
+                vc1 = VGET_VX2(vcx2, 0);
+                vc2 = VGET_VX2(vcx2, 1);
 #ifndef CONJ
                 vc1 =  VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
                 vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
@@ -178,7 +189,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
                 vc2 =  VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
                 vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
 #endif
-                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vc1, vc2, vl);
+                vcx2 = VSET_VX2(vcx2, 0, vc1);
+                vcx2 = VSET_VX2(vcx2, 1, vc2);
+                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl);
             }
             pc  += vl * ldc * 2;
         }
diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
index 4c83bbaa3..32e481036 100644
--- a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
@@ -31,13 +31,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
@@ -45,13 +46,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
@@ -133,6 +135,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
     int i, j, k;
 
+    FLOAT_VX2_T vax2, vsx2, vcx2;
     FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
 
     size_t vl;
@@ -147,7 +150,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
         for (j = m; j > 0; j -= vl) {
             vl = VSETVL(j);
-            VLSEG2_FLOAT(&va1, &va2, pci, vl);
+            vax2 = VLSEG2_FLOAT(pci, vl);
+            va1 = VGET_VX2(vax2, 0);
+            va2 = VGET_VX2(vax2, 1);
 #ifndef CONJ
             vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
             vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
@@ -159,13 +164,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
             vs2 =   VFMULVF_FLOAT(va2, bb1, vl);
             vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
 #endif
-            VSSEG2_FLOAT(a, vs1, vs2, vl);
-            VSSEG2_FLOAT(pci, vs1, vs2, vl);
+            vsx2 = VSET_VX2(vsx2, 0, vs1);
+            vsx2 = VSET_VX2(vsx2, 1, vs2);
+            VSSEG2_FLOAT(a, vsx2, vl);
+            VSSEG2_FLOAT(pci, vsx2, vl);
             a += vl * 2;
             pci += vl * 2;
 
             for (k = i + 1; k < n; k ++){
-                VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl);
+                vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl);
+                vc1 = VGET_VX2(vcx2, 0);
+                vc2 = VGET_VX2(vcx2, 1);
 #ifndef CONJ
                 vc1 =  VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
                 vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
@@ -177,7 +186,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
                 vc2 =  VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
                 vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
 #endif
-                VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl);
+                vcx2 = VSET_VX2(vcx2, 0, vc1);
+                vcx2 = VSET_VX2(vcx2, 1, vc2);
+                VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl);
             }
             pcj += vl * 2;
         }
diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
index b368eefb9..81cc41818 100644
--- a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
+++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
@@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
@@ -42,10 +45,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
@@ -133,6 +139,7 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
 
     int i, j, k;
 
+    FLOAT_VX2_T vax2, vsx2, vcx2;
     FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
 
     size_t vl;
@@ -149,7 +156,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
         pcj = c;
         for (j = m; j > 0; j -= vl) {
             vl = VSETVL(j);
-            VLSEG2_FLOAT(&va1, &va2, pci, vl);
+            vax2 = VLSEG2_FLOAT(pci, vl);
+            va1 = VGET_VX2(vax2, 0);
+            va2 = VGET_VX2(vax2, 1);
 #ifndef CONJ
             vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
             vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
@@ -161,13 +170,17 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
             vs2 =   VFMULVF_FLOAT(va2, bb1, vl);
             vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
 #endif
-            VSSEG2_FLOAT(a, vs1, vs2, vl);
-            VSSEG2_FLOAT(pci, vs1, vs2, vl);
+            vsx2 = VSET_VX2(vsx2, 0, vs1);
+            vsx2 = VSET_VX2(vsx2, 1, vs2);
+            VSSEG2_FLOAT(a, vsx2, vl);
+            VSSEG2_FLOAT(pci, vsx2, vl);
             a += vl * 2;
             pci += vl * 2;
 
             for (k = 0; k < i; k ++){
-                VLSEG2_FLOAT(&vc1, &vc2, pcj + k * ldc * 2, vl);
+                vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl);
+                vc1 = VGET_VX2(vcx2, 0);
+                vc2 = VGET_VX2(vcx2, 1);
 #ifndef CONJ
                 vc1 =  VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
                 vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
@@ -179,7 +192,9 @@ static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, B
                 vc2 =  VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
                 vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
 #endif
-                VSSEG2_FLOAT(pcj + k * ldc * 2, vc1, vc2, vl);
+                vcx2 = VSET_VX2(vcx2, 0, vc1);
+                vcx2 = VSET_VX2(vcx2, 1, vc2);
+                VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl);
             }
             pcj += vl * 2;
         }
diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c
index bbb1e876b..180cf059a 100644
--- a/kernel/riscv64/zamax_rvv.c
+++ b/kernel/riscv64/zamax_rvv.c
@@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T               vfloat32m4_t
 #define FLOAT_V_T_M1            vfloat32m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
@@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T               vfloat64m4_t
 #define FLOAT_V_T_M1            vfloat64m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
 #define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
@@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     FLOAT_V_T v0, v1, vmax;
     FLOAT_V_T_M1 v_res;
+    FLOAT_VX2_T vx2;
 
     v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
     size_t vlmax = VSETVL_MAX;
@@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&v0, &v1, x, vl);
+            vx2 = VLSEG_FLOAT(x, vl);
+            
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
 
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
@@ -95,7 +103,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
 
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c
index c5453121b..56a467502 100644
--- a/kernel/riscv64/zamin_rvv.c
+++ b/kernel/riscv64/zamin_rvv.c
@@ -34,8 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T               vfloat32m4_t
 #define FLOAT_V_T_M1            vfloat32m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m4_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
@@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T               vfloat64m4_t
 #define FLOAT_V_T_M1            vfloat64m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
 #define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m4_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
@@ -68,6 +72,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     FLOAT_V_T v0, v1, vmin;
     FLOAT_V_T_M1 v_res;
+    FLOAT_VX2_T vx2;
 
     v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
     size_t vlmax = VSETVL_MAX;
@@ -78,7 +83,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&v0, &v1, x, vl);
+            vx2 = VLSEG_FLOAT(x, vl);
+            
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
 
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
@@ -94,7 +102,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
 
             v0 = VFABSV_FLOAT(v0, vl);
             v1 = VFABSV_FLOAT(v1, vl);
diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c
index e0da55311..66e38c1e4 100644
--- a/kernel/riscv64/zaxpby_rvv.c
+++ b/kernel/riscv64/zaxpby_rvv.c
@@ -35,6 +35,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m4
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
@@ -42,13 +45,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
 #define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m4
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m4
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
@@ -56,10 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
 #define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m4
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
 #endif
 
 int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y)
@@ -74,6 +80,7 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
     BLASLONG stride_x = inc_x2 * sizeof(FLOAT);
     BLASLONG stride_y = inc_y2 * sizeof(FLOAT);
     FLOAT_V_T vx0, vx1, vy0, vy1;
+    FLOAT_VX2_T vxx2, vyx2;
 
     if ( beta_r == 0.0 && beta_i == 0.0)
     {
@@ -81,10 +88,12 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
         {
             size_t vl = VSETVL(n);
             FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl);
-            for ( ; n > 0; n -= vl, y += vl*stride_y)
+            vxx2 = VSET_VX2(vxx2, 0, temp);
+            vxx2 = VSET_VX2(vxx2, 1, temp);
+            for ( ; n > 0; n -= vl, y += vl*inc_y2)
             {
                 vl = VSETVL(n);
-                VSSSEG_FLOAT(y, stride_y, temp, temp, vl);
+                VSSSEG_FLOAT(y, stride_y, vxx2, vl);
             }
         }
         else
@@ -92,7 +101,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
             for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) 
             {
                 vl = VSETVL(n);
-                VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
+
+                vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+                vx0 = VGET_VX2(vxx2, 0);
+                vx1 = VGET_VX2(vxx2, 1);
                 
                 vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl);
                 vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl);
@@ -100,20 +112,26 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
                 vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
                 vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl);
 
-                VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+                vyx2 = VSET_VX2(vyx2, 0, vy0);
+                vyx2 = VSET_VX2(vyx2, 1, vy1);
+                VSSSEG_FLOAT(y, stride_y, vyx2, vl);
             }
         }
     }
     else
     {
         FLOAT_V_T v0, v1;
+        FLOAT_VX2_T v_x2;
 
         if ( alpha_r == 0.0 && alpha_i == 0.0 )
         {
             for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) 
             {
                 vl = VSETVL(n);
-                VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+                vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+                vy0 = VGET_VX2(vyx2, 0);
+                vy1 = VGET_VX2(vyx2, 1);
                 
                 v0 = VFMULVF_FLOAT(vy1, beta_i, vl);
                 v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl);
@@ -121,7 +139,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
                 v1 = VFMULVF_FLOAT(vy1, beta_r, vl);
                 v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
 
-                VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
+                v_x2 = VSET_VX2(v_x2, 0, v0);
+                v_x2 = VSET_VX2(v_x2, 1, v1);
+                VSSSEG_FLOAT(y, stride_y, v_x2, vl);
             }
         }
         else
@@ -129,8 +149,14 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
             for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) 
             {
                 vl = VSETVL(n);
-                VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-                VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+
+                vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+                vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+                vx0 = VGET_VX2(vxx2, 0);
+                vx1 = VGET_VX2(vxx2, 1);
+                vy0 = VGET_VX2(vyx2, 0);
+                vy1 = VGET_VX2(vyx2, 1);
 
                 v0 = VFMULVF_FLOAT(vx0, alpha_r, vl);
                 v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl);
@@ -142,7 +168,10 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
                 v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl);
                 v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
 
-                VSSSEG_FLOAT(y, stride_y, v0, v1, vl);
+                v_x2 = VSET_VX2(v_x2, 0, v0);
+                v_x2 = VSET_VX2(v_x2, 1, v1);
+
+                VSSSEG_FLOAT(y, stride_y, v_x2, vl);
             }
         }
     }
diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c
index 3f75898e0..0db32df10 100644
--- a/kernel/riscv64/zaxpy_rvv.c
+++ b/kernel/riscv64/zaxpy_rvv.c
@@ -30,19 +30,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T               vfloat32m4_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
-#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T               vfloat64m4_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
-#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
 #endif
@@ -53,14 +59,21 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
     if(da_r == 0.0 && da_i == 0.0) return(0);
  
     FLOAT_V_T vx0, vx1, vy0, vy1;
+    FLOAT_VX2_T vxx2, vyx2;
 
     if(inc_x == 1 && inc_y == 1) {
 
         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
         #if !defined(CONJ)
             vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
             vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
@@ -72,7 +85,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
             vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
             vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
         #endif
-            VSSEG_FLOAT(y, vy0, vy1, vl);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSEG_FLOAT(y, vyx2, vl);
         }
 
     } else if (inc_x == 1) {
@@ -82,8 +97,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
         #if !defined(CONJ)
             vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
@@ -96,7 +116,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
             vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
             vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
         #endif
-            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
         }
 
     } else if (inc_y == 1) {
@@ -106,8 +128,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
         #if !defined(CONJ)
             vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
@@ -120,7 +147,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
             vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
             vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
         #endif
-            VSSEG_FLOAT(y, vy0, vy1, vl);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSEG_FLOAT(y, vyx2, vl);
         }
 
     } else {
@@ -131,8 +160,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
         #if !defined(CONJ)
             vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
@@ -145,7 +179,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
             vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
             vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
         #endif
-            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
         }
 
     }
diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c
index bd94810ce..13879f03b 100644
--- a/kernel/riscv64/zcopy_rvv.c
+++ b/kernel/riscv64/zcopy_rvv.c
@@ -34,11 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSEV_FLOAT_M8           __riscv_vse32_v_f32m8
 
 #define VSETVL_M4(n)            __riscv_vsetvl_e32m4(n)
-#define FLOAT_V_T_M4            vfloat32m4_t
-#define VLSEG_FLOAT_M4          __riscv_vlseg2e32_v_f32m4
-#define VSSEG_FLOAT_M4          __riscv_vsseg2e32_v_f32m4
-#define VLSSEG_FLOAT_M4         __riscv_vlsseg2e32_v_f32m4
-#define VSSSEG_FLOAT_M4         __riscv_vssseg2e32_v_f32m4
+#define FLOAT_VX2_T_M4          vfloat32m4x2_t
+#define VLSEG_FLOAT_M4          __riscv_vlseg2e32_v_f32m4x2
+#define VSSEG_FLOAT_M4          __riscv_vsseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT_M4         __riscv_vlsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT_M4         __riscv_vssseg2e32_v_f32m4x2
 #else
 #define VSETVL_M8(n)            __riscv_vsetvl_e64m8(n)
 #define FLOAT_V_T_M8            vfloat64m8_t
@@ -46,16 +46,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSEV_FLOAT_M8           __riscv_vse64_v_f64m8
 
 #define VSETVL_M4(n)            __riscv_vsetvl_e64m4(n)
-#define FLOAT_V_T_M4            vfloat64m4_t
-#define VLSEG_FLOAT_M4          __riscv_vlseg2e64_v_f64m4
-#define VSSEG_FLOAT_M4          __riscv_vsseg2e64_v_f64m4
-#define VLSSEG_FLOAT_M4         __riscv_vlsseg2e64_v_f64m4
-#define VSSSEG_FLOAT_M4         __riscv_vssseg2e64_v_f64m4
+#define FLOAT_VX2_T_M4          vfloat64m4x2_t
+#define VLSEG_FLOAT_M4          __riscv_vlseg2e64_v_f64m4x2
+#define VSSEG_FLOAT_M4          __riscv_vsseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT_M4         __riscv_vlsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT_M4         __riscv_vssseg2e64_v_f64m4x2
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
-    if(n < 0) return(0);
+    if(n <= 0) return(0);
 
     if(inc_x == 1 && inc_y == 1) {
 
@@ -70,34 +70,34 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 
     }else if (1 == inc_x) {
 
-        FLOAT_V_T_M4 vr, vi;
+        FLOAT_VX2_T_M4 vx2;
         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
 
         for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
             vl = VSETVL_M4(n);
-            VLSEG_FLOAT_M4(&vr, &vi, x, vl);
-            VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl);
+            vx2 = VLSEG_FLOAT_M4(x, vl);
+            VSSSEG_FLOAT_M4(y, stride_y, vx2, vl);
         }
     } else if (1 == inc_y) {
 
-        FLOAT_V_T_M4 vr, vi;
+        FLOAT_VX2_T_M4 vx2;
         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
 
         for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
             vl = VSETVL_M4(n);
-            VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl);
-            VSSEG_FLOAT_M4(y, vr, vi, vl);
+            vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl);
+            VSSEG_FLOAT_M4(y, vx2, vl);
         }
     } else {
 
-        FLOAT_V_T_M4 vr, vi;
+        FLOAT_VX2_T_M4 vx2;
         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
 
         for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
             vl = VSETVL_M4(n);
-            VLSSEG_FLOAT_M4(&vr, &vi, x, stride_x, vl);
-            VSSSEG_FLOAT_M4(y, stride_y, vr, vi, vl);
+            vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl);
+            VSSSEG_FLOAT_M4(y, stride_y, vx2, vl);
         }
     }
 
diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c
index fa0e89353..13bc2ee39 100644
--- a/kernel/riscv64/zdot_rvv.c
+++ b/kernel/riscv64/zdot_rvv.c
@@ -33,8 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T               vfloat32m4_t
 #define FLOAT_V_T_M1            vfloat32m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
@@ -49,8 +51,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T               vfloat64m4_t
 #define FLOAT_V_T_M1            vfloat64m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
 #define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
@@ -71,6 +75,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 
     FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1;
     FLOAT_V_T_M1 v_res, v_z0;
+    FLOAT_VX2_T vxx2, vyx2;
     size_t vlmax_m1 = VSETVL_MAX_M1;
     v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1);
 
@@ -83,8 +88,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
             vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
@@ -104,8 +114,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
             vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
@@ -124,8 +139,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
             vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
@@ -145,8 +165,13 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
             vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c
index b94b5f4bf..ee334801b 100644
--- a/kernel/riscv64/zgemm_beta_rvv.c
+++ b/kernel/riscv64/zgemm_beta_rvv.c
@@ -41,8 +41,11 @@
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T               vfloat32m4_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
@@ -50,8 +53,11 @@
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T               vfloat64m4_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
 #define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
@@ -68,6 +74,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
     FLOAT *c_offset;
 	size_t vl;
     FLOAT_V_T vr, vi, v1, v2, v3, v4;
+    FLOAT_VX2_T vx2;
 
     ldc *= 2;
     c_offset = c;
@@ -77,6 +84,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
         vl = VSETVL(m);
         vr = VFMVVF_FLOAT(0.0, vl);
         vi = VFMVVF_FLOAT(0.0, vl);
+        vx2 = VSET_VX2(vx2, 0, vr);
+        vx2 = VSET_VX2(vx2, 1, vi);
 
         for( ; n > 0; n--, c += ldc) {
             c_offset = c;
@@ -84,7 +93,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
             for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) {
                 vl = VSETVL(chunk);
 
-                VSSEG_FLOAT(c_offset, vr, vi, vl);
+                VSSEG_FLOAT(c_offset, vx2, vl);
 			}
 		}
 
@@ -96,7 +105,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
             for(chunk=m; chunk > 0; chunk -= vl, c_offset += vl*2) {
                 vl = VSETVL(chunk);
 
-                VLSEG_FLOAT(&vr, &vi, c_offset, vl);
+                vx2 = VLSEG_FLOAT(c_offset, vl);
+                vr = VGET_VX2(vx2, 0);
+                vi = VGET_VX2(vx2, 1);
 
                 v1 = VFMULVF_FLOAT(vr, beta_r, vl);
                 v2 = VFMULVF_FLOAT(vi, beta_i, vl);
@@ -107,7 +118,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
 				vr = VFSUBVV_FLOAT(v1, v2, vl);
 				vi = VFADDVV_FLOAT(v3, v4, vl);
 
-                VSSEG_FLOAT(c_offset, vr, vi, vl);
+                vx2 = VSET_VX2(vx2, 0, vr);
+                vx2 = VSET_VX2(vx2, 1, vi);
+                VSSEG_FLOAT(c_offset, vx2, vl);
 			}
 		}
 
diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c
index d50a4b8d5..dce98752e 100644
--- a/kernel/riscv64/zgemm_ncopy_4_rvv.c
+++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c
@@ -29,18 +29,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m1(n)
-#define FLOAT_V_T               vfloat32m1_t
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m1
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1
-#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1
-#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1
+#define FLOAT_VX2_T             vfloat32m1x2_t
+#define FLOAT_VX4_T             vfloat32m1x4_t
+#define FLOAT_VX8_T             vfloat32m1x8_t
+#define VGET_VX2                __riscv_vget_v_f32m1x2_f32m1
+#define VSET_VX2                __riscv_vset_v_f32m1_f32m1x2
+#define VSET_VX4                __riscv_vset_v_f32m1_f32m1x4
+#define VSET_VX8                __riscv_vset_v_f32m1_f32m1x8
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m1x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m1(n)
-#define FLOAT_V_T               vfloat64m1_t
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m1
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1
-#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1
-#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1
+#define FLOAT_VX2_T             vfloat64m1x2_t
+#define FLOAT_VX4_T             vfloat64m1x4_t
+#define FLOAT_VX8_T             vfloat64m1x8_t
+#define VGET_VX2                __riscv_vget_v_f64m1x2_f64m1
+#define VSET_VX2                __riscv_vset_v_f64m1_f64m1x2
+#define VSET_VX4                __riscv_vset_v_f64m1_f64m1x4
+#define VSET_VX8                __riscv_vset_v_f64m1_f64m1x8
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m1x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
 #endif
 
 // Optimizes the implementation in ../generic/zgemm_ncopy_4.c
@@ -53,7 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
 
     FLOAT *boffset;
 
-    FLOAT_V_T v11, v12, v21, v22, v31, v32, v41, v42;
+    FLOAT_VX2_T v1x2, v2x2, v3x2, v4x2;
+    FLOAT_VX4_T vxx4;
+    FLOAT_VX8_T vxx8;
     size_t vl;
 
     aoffset = a;
@@ -69,12 +83,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
 
         for (i = m; i > 0; i -= vl) {
             vl = VSETVL(i);
-            VLSEG2_FLOAT(&v11, &v12, aoffset1, vl);
-            VLSEG2_FLOAT(&v21, &v22, aoffset2, vl);
-            VLSEG2_FLOAT(&v31, &v32, aoffset3, vl);
-            VLSEG2_FLOAT(&v41, &v42, aoffset4, vl);
-
-            VSSEG8_FLOAT(boffset, v11, v12, v21, v22, v31, v32, v41, v42, vl);
+            v1x2 = VLSEG2_FLOAT(aoffset1, vl);
+            v2x2 = VLSEG2_FLOAT(aoffset2, vl);
+            v3x2 = VLSEG2_FLOAT(aoffset3, vl);
+            v4x2 = VLSEG2_FLOAT(aoffset4, vl);
+            
+            vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0));
+            vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1));
+            vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0));
+            vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1));
+            vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0));
+            vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1));
+            vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0));
+            vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1));
+
+            VSSEG8_FLOAT(boffset, vxx8, vl);
 
             aoffset1 += vl * 2;
             aoffset2 += vl * 2;
@@ -91,10 +114,15 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
         
         for (i = m; i > 0; i -= vl) {
             vl = VSETVL(i);
-            VLSEG2_FLOAT(&v11, &v12, aoffset1, vl);
-            VLSEG2_FLOAT(&v21, &v22, aoffset2, vl);
+            v1x2 = VLSEG2_FLOAT(aoffset1, vl);
+            v2x2 = VLSEG2_FLOAT(aoffset2, vl);
+
+            vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0));
+            vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1));
+            vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0));
+            vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1));
         
-            VSSEG4_FLOAT(boffset, v11, v12, v21, v22, vl);
+            VSSEG4_FLOAT(boffset, vxx4, vl);
         
             aoffset1 += vl * 2;
             aoffset2 += vl * 2;
@@ -108,9 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
 
         for (i = m; i > 0; i -= vl) {
             vl = VSETVL(i);
-            VLSEG2_FLOAT(&v11, &v12, aoffset1, vl);
+            v1x2 = VLSEG2_FLOAT(aoffset1, vl);
 
-            VSSEG2_FLOAT(boffset, v11, v12, vl);
+            VSSEG2_FLOAT(boffset, v1x2, vl);
 
             aoffset1 += vl * 2;
             boffset  += vl * 2;
diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c
index 1d3b8d3b7..275daa5f2 100644
--- a/kernel/riscv64/zgemm_ncopy_rvv_v1.c
+++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c
@@ -30,14 +30,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
-#define FLOAT_V_T               vfloat32m2_t
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
-#define FLOAT_V_T               vfloat64m2_t
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
@@ -48,7 +48,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     FLOAT *a_offset1;
     FLOAT *b_offset;
 
-    FLOAT_V_T v0, v1;
+    FLOAT_VX2_T vx2;
     size_t vl;
 
     //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
@@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         a_offset += vl * lda * 2;
 
         for(i = m; i > 0; i--) {
-            VLSSEG2_FLOAT(&v0, &v1, a_offset1, lda * sizeof(FLOAT) * 2, vl);
-            VSSEG2_FLOAT(b_offset, v0, v1, vl);
+            vx2 = VLSSEG2_FLOAT(a_offset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG2_FLOAT(b_offset, vx2, vl);
 
             a_offset1 += 2;
             b_offset += vl * 2;
diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c
index 8c35b5616..cfafbf0dc 100644
--- a/kernel/riscv64/zgemm_tcopy_4_rvv.c
+++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c
@@ -30,25 +30,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m1(n)
 #define FLOAT_V_T               vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m1x2_t
+#define FLOAT_VX4_T             vfloat32m1x4_t
+#define FLOAT_VX8_T             vfloat32m1x8_t
 #define VLEV_FLOAT              __riscv_vle32_v_f32m1
 #define VSEV_FLOAT              __riscv_vse32_v_f32m1
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1
-#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1
-#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1
-#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1
-#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1x2
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1x4
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1x8
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m1(n)
 #define FLOAT_V_T               vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m1x2_t
+#define FLOAT_VX4_T             vfloat64m1x4_t
+#define FLOAT_VX8_T             vfloat64m1x8_t
 #define VLEV_FLOAT              __riscv_vle64_v_f64m1
 #define VSEV_FLOAT              __riscv_vse64_v_f64m1
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1
-#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1
-#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1
-#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1
-#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1x2
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1x4
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1x8
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
@@ -60,7 +66,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
 
     IFLOAT *boffset, *boffset1, *boffset2, *boffset3;
 
-    FLOAT_V_T v0, v1, v2, v3, v4, v5, v6, v7;
+    FLOAT_V_T v0;
+    FLOAT_VX2_T vx2;
+    FLOAT_VX4_T vx4;
+    FLOAT_VX8_T vx8;
+
     size_t vl;
 
     //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
@@ -81,8 +91,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
         for(i = (n >> 2); i > 0; i--) {
             vl = 4;
 
-            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl);
-            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
 
             aoffset1 += 8;
             boffset1 += m * 8;
@@ -91,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
         if (n & 2) {
             vl = 4;
 
-            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl);
-            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
 
             aoffset1 += 4;
             boffset2 += 16;
@@ -101,8 +111,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
         if (n & 1) {
             vl = 4;
 
-            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl);
-            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
 
             aoffset1 += 2;
             boffset3 += 8;
@@ -119,8 +129,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
         for(i = (n >> 2); i > 0; i--) {
             vl = 2;
 
-            VLSSEG8_FLOAT(&v0, &v1, &v2, &v3, &v4, &v5, &v6, &v7, aoffset1, lda * sizeof(FLOAT) * 2, vl);
-            VSSEG8_FLOAT(boffset1, v0, v1, v2, v3, v4, v5, v6, v7, vl);
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
 
             aoffset1 += 8;
             boffset1 += m * 8;
@@ -129,8 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
         if (n & 2) {
             vl = 2;
 
-            VLSSEG4_FLOAT(&v0, &v1, &v2, &v3, aoffset1, lda * sizeof(FLOAT) * 2, vl);
-            VSSEG4_FLOAT(boffset2, v0, v1, v2, v3, vl);
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
 
             aoffset1 += 4;
             boffset2 += 8;
@@ -139,8 +149,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
         if (n & 1) {
             vl = 2;
 
-            VLSSEG2_FLOAT(&v0, &v1, aoffset1, lda * sizeof(FLOAT) * 2, vl);
-            VSSEG2_FLOAT(boffset3, v0, v1, vl);
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
 
             //aoffset1 += 2;
             boffset3 += 4;
diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c
index 7a085269c..96e986502 100644
--- a/kernel/riscv64/zgemm_tcopy_rvv_v1.c
+++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c
@@ -29,14 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
-#define FLOAT_V_T               vfloat32m2_t
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
-#define FLOAT_V_T               vfloat64m2_t
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
@@ -47,7 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
     IFLOAT *aoffset1;
     IFLOAT *boffset;
 
-    FLOAT_V_T v0, v1;
+    FLOAT_VX2_T vx2;
     size_t vl;
 
     //fprintf(stderr, "%s, m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
@@ -62,8 +62,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         aoffset += vl * 2;
 
         for(i = m; i > 0; i--) {
-            VLSEG2_FLOAT(&v0, &v1, aoffset1, vl);
-            VSSEG2_FLOAT(boffset, v0, v1, vl);
+            vx2 = VLSEG2_FLOAT(aoffset1, vl);
+            VSSEG2_FLOAT(boffset, vx2, vl);
 
             aoffset1 += lda * 2;
             boffset += vl * 2;
diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c
index 41399cf79..77e012ff5 100644
--- a/kernel/riscv64/zgemmkernel_rvv_v1x4.c
+++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c
@@ -30,20 +30,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
@@ -80,6 +86,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
     BLASLONG i,j,k;
     FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb;
 
+    FLOAT_VX2_T vax2;
     FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
     FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
 
@@ -109,10 +116,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = bk/4; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
-                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -137,7 +148,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
                 ptrbb += 8;
 
-                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
@@ -162,7 +175,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
                 
                 ptrbb += 8;
 
-                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
@@ -211,7 +226,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = (bk & 3); k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -237,35 +254,57 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
                 ptrbb += 8;
             }
 
-            VLSEG2_FLOAT(&va0, &va1, C0, vl);
-            VLSEG2_FLOAT(&va2, &va3, C1, vl);
+            vax2 = VLSEG2_FLOAT(C0, vl);
+            va0 = VGET_VX2(vax2, 0);
+            va1 = VGET_VX2(vax2, 1);
+
+            vax2 = VLSEG2_FLOAT(C1, vl);
+            va2 = VGET_VX2(vax2, 0);
+            va3 = VGET_VX2(vax2, 1);
 
             va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
             va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
-            VSSEG2_FLOAT(C0, va0, va1, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
 
             va2 =  VFMACCVF_FLOAT(va2, alphar, vres2, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphar, vres3, vl);
             va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
-            VSSEG2_FLOAT(C1, va2, va3, vl);
 
-            VLSEG2_FLOAT(&va0, &va1, C2, vl);
-            VLSEG2_FLOAT(&va2, &va3, C3, vl);
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C1, vax2, vl);
+
+            vax2 = VLSEG2_FLOAT(C2, vl);
+            va0 = VGET_VX2(vax2, 0);
+            va1 = VGET_VX2(vax2, 1);
+
+            vax2 = VLSEG2_FLOAT(C3, vl);
+            va2 = VGET_VX2(vax2, 0);
+            va3 = VGET_VX2(vax2, 1);
 
             va0 =  VFMACCVF_FLOAT(va0, alphar, vres4, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphar, vres5, vl);
             va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphai, vres4, vl);
-            VSSEG2_FLOAT(C2, va0, va1, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C2, vax2, vl);
 
             va2 =  VFMACCVF_FLOAT(va2, alphar, vres6, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphar, vres7, vl);
             va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphai, vres6, vl);
-            VSSEG2_FLOAT(C3, va2, va3, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C3, vax2, vl);
 
             C0 += vl * 2;
             C1 += vl * 2;
@@ -294,9 +333,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = bk/4; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
-                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -311,7 +355,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
                 ptrbb += 4;
 
-                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
@@ -326,7 +372,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
                 
                 ptrbb += 4;
 
-                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
@@ -356,7 +404,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = (bk & 3); k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -372,20 +422,31 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
                 ptrbb += 4;
             }
 
-            VLSEG2_FLOAT(&va0, &va1, C0, vl);
-            VLSEG2_FLOAT(&va2, &va3, C1, vl);
+            vax2 = VLSEG2_FLOAT(C0, vl);
+            va0 = VGET_VX2(vax2, 0);
+            va1 = VGET_VX2(vax2, 1);
+
+            vax2 = VLSEG2_FLOAT(C1, vl);
+            va2 = VGET_VX2(vax2, 0);
+            va3 = VGET_VX2(vax2, 1);
 
             va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
             va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
-            VSSEG2_FLOAT(C0, va0, va1, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
 
             va2 =  VFMACCVF_FLOAT(va2, alphar, vres2, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphar, vres3, vl);
             va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
-            VSSEG2_FLOAT(C1, va2, va3, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C1, vax2, vl);
 
             C0 += vl * 2;
             C1 += vl * 2;
@@ -409,9 +470,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = bk/4; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
-                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -420,7 +486,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
                 vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
                 ptrbb += 2;
 
-                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
@@ -430,7 +498,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
                 ptrbb += 2;
 
-                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
@@ -448,7 +518,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = (bk & 3); k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -458,12 +530,18 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
                 ptrbb += 2;
             }
             
-            VLSEG2_FLOAT(&va0, &va1, C0, vl);
+            vax2 = VLSEG2_FLOAT(C0, vl);
+            va0 = VGET_VX2(vax2, 0);
+            va1 = VGET_VX2(vax2, 1);
+
             va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
             va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
-            VSSEG2_FLOAT(C0, va0, va1, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
             C0 += vl * 2;
         }
 
diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c
index 4a40c30a7..f14ef5ba8 100644
--- a/kernel/riscv64/zgemv_n_rvv.c
+++ b/kernel/riscv64/zgemv_n_rvv.c
@@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m4
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
 #define VSEV_FLOAT              __riscv_vse32_v_f32m4
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m4
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m4
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
 #define VSEV_FLOAT              __riscv_vse64_v_f64m4
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m4
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
 #endif
@@ -62,6 +68,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
     FLOAT *a_ptr;
     FLOAT temp_r, temp_i;
     FLOAT_V_T va0, va1, vy0, vy1;
+    FLOAT_VX2_T vax2, vyx2;
 
     BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2;
 
@@ -73,7 +80,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
             vl = VSETVL(m);
             a_ptr = a;
             ix = 0;
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             for(i = 0; i < n; i++){
 #if !defined(XCONJ)
@@ -84,7 +94,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
 #endif
 
-                VLSEG_FLOAT(&va0, &va1, a_ptr, vl);
+                vax2 = VLSEG_FLOAT(a_ptr, vl);
+
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
 #if !defined(CONJ)
 #if !defined(XCONJ)
                 vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
@@ -113,7 +126,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 a_ptr += lda2;
                 ix += inc_x2;
             }
-            VSSEG_FLOAT(y, vy0, vy1, vl);
+
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSEG_FLOAT(y, vyx2, vl);
         }
 
     }
@@ -123,7 +139,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
             vl = VSETVL(m);
             a_ptr = a;
             ix = 0;
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             for(i = 0; i < n; i++){
 #if !defined(XCONJ)
@@ -134,7 +152,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
 #endif
 
-                VLSEG_FLOAT(&va0, &va1, a_ptr, vl);
+                vax2 = VLSEG_FLOAT(a_ptr, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
 #if !defined(CONJ)
 #if !defined(XCONJ)
                 vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
@@ -163,7 +183,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 a_ptr += lda2;
                 ix += inc_x2;
             }
-            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
         }
     }
     return(0);
diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c
index 2f0380530..1c89a9f72 100644
--- a/kernel/riscv64/zgemv_t_rvv.c
+++ b/kernel/riscv64/zgemv_t_rvv.c
@@ -32,9 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
 #define FLOAT_V_T               vfloat32m4_t
 #define FLOAT_V_T_M1            vfloat32m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
-#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VFREDSUM_FLOAT_TU       __riscv_vfredusum_vs_f32m4_f32m1_tu
 #define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m4_tu
 #define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f32m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
@@ -46,9 +48,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
 #define FLOAT_V_T               vfloat64m4_t
 #define FLOAT_V_T_M1            vfloat64m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
-#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VFREDSUM_FLOAT_TU       __riscv_vfredusum_vs_f64m4_f64m1_tu
 #define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m4_tu
 #define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f64m4_tu
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
@@ -66,6 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 
     FLOAT_V_T va0, va1, vx0, vx1, vr, vi; 
     FLOAT_V_T_M1 v_res, v_z0;
+    FLOAT_VX2_T vxx2, vax2;
 
     BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
     //BLASLONG stride_a = sizeof(FLOAT) * 2;
@@ -73,6 +78,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
     BLASLONG lda2 = lda * 2;
 
     size_t vlmax = VSETVL_MAX_M1;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
     v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
     vlmax = VSETVL(m);
 
@@ -86,8 +92,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
             for(size_t vl, k = m; k > 0; k -= vl) {
                 vl = VSETVL(k);
 
-                VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl);
-                VLSEG_FLOAT(&vx0, &vx1, &x[ix], vl);
+                vax2 = VLSEG_FLOAT(&a_ptr[j], vl);
+                vxx2 = VLSEG_FLOAT(&x[ix], vl);
+
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                vx0 = VGET_VX2(vxx2, 0);
+                vx1 = VGET_VX2(vxx2, 1);
 
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                 vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
@@ -104,9 +115,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 ix += vl * inc_x * 2;
             }
             
-            v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax);
             temp_r = VFMVFS_FLOAT_M1(v_res);
-            v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax);
             temp_i = VFMVFS_FLOAT_M1(v_res);
 
 #if !defined(XCONJ)
@@ -130,8 +141,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
             for(size_t vl, k = m; k > 0; k -= vl) {
                 vl = VSETVL(k);
     
-                VLSEG_FLOAT(&va0, &va1, &a_ptr[j], vl);
-                VLSSEG_FLOAT(&vx0, &vx1, &x[ix], stride_x, vl);
+                vax2 = VLSEG_FLOAT(&a_ptr[j], vl);
+                vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl);
+
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                vx0 = VGET_VX2(vxx2, 0);
+                vx1 = VGET_VX2(vxx2, 1);
     
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                 vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
@@ -148,9 +164,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                 ix += vl * inc_x * 2;
             }
             
-            v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax);
             temp_r = VFMVFS_FLOAT_M1(v_res);
-            v_res = VFREDSUM_FLOAT(vi, v_z0, vlmax);
+            v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax);
             temp_i = VFMVFS_FLOAT_M1(v_res);
     
 #if !defined(XCONJ)
diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
index 79b20a646..97013895a 100644
--- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
+++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
@@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define INT_V_T                 vint32m2_t
 #define VID_V_INT               __riscv_vid_v_i32m2
 #define VADD_VX_INT             __riscv_vadd_vx_i32m2
@@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define INT_V_T                 vint64m2_t
 #define VID_V_INT               __riscv_vid_v_i64m2
 #define VADD_VX_INT             __riscv_vadd_vx_i64m2
@@ -81,6 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     BLASLONG stride_lda = sizeof(FLOAT) * lda * 2;
 
     FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero;
+    FLOAT_VX2_T va1x2, va2x2, vbx2;
     VBOOL_T vbool_gt0, vbool_lt0, vbool_eq0;
     INT_V_T vindex_max, vindex;
 
@@ -96,8 +103,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         ao2 = a + posY * 2 + posX * lda * 2;
 
         for (i = m; i > 0; i--, offset--) {
-            VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl);
-            VLSEG2_FLOAT(&va10, &va11, ao1, vl);
+            va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl);
+            va1x2 = VLSEG2_FLOAT(ao1, vl);
+
+            va20 = VGET_VX2(va2x2, 0);
+            va21 = VGET_VX2(va2x2, 1);
+            va10 = VGET_VX2(va1x2, 0);
+            va11 = VGET_VX2(va1x2, 1);
 
             vindex = VADD_VX_INT(vindex_max, offset, vl);
             vbool_gt0  = VMSGT_VX_INT(vindex, 0, vl);
@@ -111,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
             vb1 =  VMERGE_VVM_FLOAT(vb1, vb2, vbool_lt0, vl);
             vb1 =  VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl);
-            VSSEG2_FLOAT(b, vb0, vb1, vl);
+
+            vbx2 = VSET_VX2(vbx2, 0, vb0);
+            vbx2 = VSET_VX2(vbx2, 1, vb1);
+            VSSEG2_FLOAT(b, vbx2, vl);
 
             b   += vl * 2;
             ao1 += lda * 2;
diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c
index a86815275..59029e9e5 100644
--- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c
+++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c
@@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define INT_V_T                 vint32m2_t
 #define VID_V_INT               __riscv_vid_v_i32m2
 #define VADD_VX_INT             __riscv_vadd_vx_i32m2
@@ -51,12 +54,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define INT_V_T                 vint64m2_t
 #define VID_V_INT               __riscv_vid_v_i64m2
 #define VADD_VX_INT             __riscv_vadd_vx_i64m2
@@ -79,6 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     BLASLONG stride_lda = sizeof(FLOAT) * lda * 2;
     
     FLOAT_V_T vb0, vb1, vb2, va10, va11, va20, va21, vzero;
+    FLOAT_VX2_T va1x2, va2x2, vbx2;
     VBOOL_T vbool_gt0, vbool_eq0;
     INT_V_T vindex_max, vindex;
 
@@ -94,8 +101,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         ao2 = a + posX * 2 + posY * lda * 2;
 
         for (i = m; i > 0; i--, offset--) {
-            VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl);
-            VLSEG2_FLOAT(&va20, &va21, ao2, vl);
+            va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl);
+            va2x2 = VLSEG2_FLOAT(ao2, vl);
+
+            va20 = VGET_VX2(va2x2, 0);
+            va21 = VGET_VX2(va2x2, 1);
+            va10 = VGET_VX2(va1x2, 0);
+            va11 = VGET_VX2(va1x2, 1);
 
             vindex = VADD_VX_INT(vindex_max, offset, vl);
             vbool_gt0  = VMSGT_VX_INT(vindex, 0, vl);
@@ -108,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
             vb1 =  VMERGE_VVM_FLOAT(vb1, vb2, vbool_gt0, vl);
             vb1 =  VMERGE_VVM_FLOAT(vb1, vzero, vbool_eq0, vl);
-            VSSEG2_FLOAT(b, vb0, vb1, vl);
+
+            vbx2 = VSET_VX2(vbx2, 0, vb0);
+            vbx2 = VSET_VX2(vbx2, 1, vb1);
+            VSSEG2_FLOAT(b, vbx2, vl);
 
             b   += vl * 2;
             ao1 += 2;
diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c
index d2b27aa8d..32f67758a 100644
--- a/kernel/riscv64/znrm2_rvv.c
+++ b/kernel/riscv64/znrm2_rvv.c
@@ -28,95 +28,248 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
-#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
-#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
-#define FLOAT_V_T               vfloat32m4_t
-#define FLOAT_V_T_M1            vfloat32m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
-#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m4_tu
-#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
-#define VFREDMAXVS_FLOAT_TU     __riscv_vfredmax_vs_f32m4_f32m1_tu
-#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
-#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VSETVL(n)           __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX          __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T           vfloat32m4_t
+#define FLOAT_V_T_M1        vfloat32m1_t
+#define MASK_T              vbool8_t
+#define VLEV_FLOAT          __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT         __riscv_vlse32_v_f32m4
+#define VFREDSUM_FLOAT      __riscv_vfredusum_vs_f32m4_f32m1_tu
+#define VFMACCVV_FLOAT_TU   __riscv_vfmacc_vv_f32m4_tu
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f32m1
+#define VMFIRSTM            __riscv_vfirst_m_b8
+#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu
+#define VFMVFS_FLOAT        __riscv_vfmv_f_s_f32m1_f32
+#define VMFGTVF_FLOAT       __riscv_vmfgt_vf_f32m4_b8
+#define VFDIVVF_FLOAT       __riscv_vfdiv_vf_f32m4
+#define VFABSV_FLOAT        __riscv_vfabs_v_f32m4
 #else
-#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
-#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
-#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
-#define FLOAT_V_T               vfloat64m4_t
-#define FLOAT_V_T_M1            vfloat64m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
-#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m4_tu
-#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
-#define VFREDMAXVS_FLOAT_TU     __riscv_vfredmax_vs_f64m4_f64m1_tu
-#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
-#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VSETVL(n)           __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX          __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T           vfloat64m4_t
+#define FLOAT_V_T_M1        vfloat64m1_t
+#define MASK_T              vbool16_t
+#define VLEV_FLOAT          __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT         __riscv_vlse64_v_f64m4
+#define VFREDSUM_FLOAT      __riscv_vfredusum_vs_f64m4_f64m1_tu
+#define VFMACCVV_FLOAT_TU   __riscv_vfmacc_vv_f64m4_tu
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f64m1
+#define VMFIRSTM            __riscv_vfirst_m_b16
+#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu
+#define VFMVFS_FLOAT        __riscv_vfmv_f_s_f64m1_f64
+#define VMFGTVF_FLOAT       __riscv_vmfgt_vf_f64m4_b16
+#define VFDIVVF_FLOAT       __riscv_vfdiv_vf_f64m4
+#define VFABSV_FLOAT        __riscv_vfabs_v_f64m4
 #endif
 
-// TODO: Should single precision use the widening MAC, or perhaps all should be double? 
-
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
+	BLASLONG i=0, j=0;
 
-    if ( n <= 0 ) return(0.0);
-
-    FLOAT_V_T vr, v0, v1;
-    FLOAT_V_T_M1 v_max, v_res;
-    FLOAT scale = 0.0, ssq = 0.0;
-
-    size_t vlmax = VSETVL_MAX;
-    v_res = VFMVVF_FLOAT_M1(0, vlmax);
-    v_max = VFMVVF_FLOAT_M1(0, vlmax);
+	if (n <= 0 || inc_x <= 0) return(0.0);
 
-    vr = VFMVVF_FLOAT(0, vlmax);
+    FLOAT_V_T vr, v0, v_zero;
+    unsigned int gvl = 0;
+    FLOAT_V_T_M1 v_res, v_z0;
+    gvl = VSETVL_MAX;
+    v_res = VFMVVF_FLOAT_M1(0, gvl);
+    v_z0 = VFMVVF_FLOAT_M1(0, gvl);
 
+    FLOAT scale = 0.0, ssq = 0.0;
+    MASK_T mask;
+    BLASLONG index = 0;
     if (inc_x == 1) {
-
-        for (size_t vl; n > 0; n -= vl, x += vl*2) {
-            vl = VSETVL(n);
-
-            VLSEG_FLOAT(&v0, &v1, x, vl);
-            v0 = VFABSV_FLOAT(v0, vl);
-            v1 = VFABSV_FLOAT(v1, vl);
-
-            v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl);
-            vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl);
-
-            v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl);
-            vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl);
+        BLASLONG n2 = n * 2;
+        gvl = VSETVL(n2);
+        vr = VFMVVF_FLOAT(0, gvl);
+        v_zero = VFMVVF_FLOAT(0, gvl);
+        for (i=0,j=0; i<n2/gvl; i++) {
+            v0 = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //if scale change
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) { //no elements greater than scale
+                if (scale != 0.0) {
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                }
+            } else { // found greater element
+                //ssq in vector vr: vr[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                //total ssq before current vector
+                ssq += VFMVFS_FLOAT(v_res);
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //update ssq before max_index
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                //ssq in vector vr
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
+            j += gvl;
         }
+        //ssq in vector vr: vr[0]
+        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+        //total ssq now
+        ssq += VFMVFS_FLOAT(v_res);
 
+        //tail
+        if(j < n2){
+            gvl = VSETVL(n2-j);
+            v0 = VLEV_FLOAT(&x[j], gvl);
+            // fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            // if scale change
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) {//no elements greater than scale
+                if(scale != 0.0)
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+            } else { //found greater element
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //update ssq before max_index
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+            }
+            vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            //ssq in vector vr: vr[0]
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+            //total ssq now
+            ssq += VFMVFS_FLOAT(v_res);
+        }
     } else {
+        gvl = VSETVL(n);
+        vr = VFMVVF_FLOAT(0, gvl);
+        v_zero = VFMVVF_FLOAT(0, gvl);
+        unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
+        int idx = 0, inc_v = inc_x * gvl * 2;
+        for (i=0,j=0; i<n/gvl; i++) {
+            v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+            // fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //if scale change
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) { // no elements greater than scale
+                if(scale != 0.0){
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                }
+            } else {//found greater element
+                //ssq in vector vr: vr[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                //total ssq before current vector
+                ssq += VFMVFS_FLOAT(v_res);
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //update ssq before max_index
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                //ssq in vector vr
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
 
-        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
-
-        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
-            vl = VSETVL(n);
- 
-            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
-            v0 = VFABSV_FLOAT(v0, vl);
-            v1 = VFABSV_FLOAT(v1, vl);
+            v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+            //fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //if scale change
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) { // no elements greater than scale
+                if(scale != 0.0) {
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                }
+            } else { // found greater element
+                //ssq in vector vr: vr[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                //total ssq before current vector
+                ssq += VFMVFS_FLOAT(v_res);
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //update ssq before max_index
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                //ssq in vector vr
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
+            j += gvl;
+            idx += inc_v;
+        }
+        //ssq in vector vr: vr[0]
+        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+        //total ssq now
+        ssq += VFMVFS_FLOAT(v_res);
 
-            v_max = VFREDMAXVS_FLOAT_TU(v_max, v0, v_max, vl);
-            vr = VFMACCVV_FLOAT_TU(vr, v0, v0, vl);
+        //tail
+        if (j < n) {
+            gvl = VSETVL(n-j);
+            v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+            //fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //if scale change
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if(index == -1) { // no elements greater than scale
+                if(scale != 0.0) {
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+                }
+            } else { // found greater element
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //update ssq before max_index
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
 
-            v_max = VFREDMAXVS_FLOAT_TU(v_max, v1, v_max, vl);
-            vr = VFMACCVV_FLOAT_TU(vr, v1, v1, vl);
+            v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+            //fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //if scale change
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) {//no elements greater than scale
+                if(scale != 0.0) {
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                }
+            } else { // found greater element
+                //ssq in vector vr: vr[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                //total ssq before current vector
+                ssq += VFMVFS_FLOAT(v_res);
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //update ssq before max_index
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
+            //ssq in vector vr: vr[0]
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+            //total ssq now
+            ssq += VFMVFS_FLOAT(v_res);
         }
-
     }
-
-    v_res = VFREDSUM_FLOAT(vr, v_res, vlmax);
-
-    ssq = VFMVFS_FLOAT_M1(v_res);
-    scale = VFMVFS_FLOAT_M1(v_max);
-    ssq = ssq / (scale*scale);
-
-    return(scale * sqrt(ssq));
+	return(scale * sqrt(ssq));
 }
diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c
index ee81bfe91..1d5390684 100644
--- a/kernel/riscv64/zrot_rvv.c
+++ b/kernel/riscv64/zrot_rvv.c
@@ -30,28 +30,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m4(n)
 #define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m4
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
 #define VSEV_FLOAT              __riscv_vse32_v_f32m4
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m4
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
 #define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m4
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
 #define VSEV_FLOAT              __riscv_vse64_v_f64m4
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m4
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
@@ -63,6 +69,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
     if (n <= 0) return(0);
 
     FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
+    FLOAT_VX2_T vxx2, vyx2, vtx2;
 
     if (inc_x == 0 && inc_y == 0) {
         BLASLONG i=0;
@@ -93,8 +100,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             vt0 = VFMULVF_FLOAT(vx0, c, vl);
             vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
@@ -105,8 +117,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
             vy1 = VFMULVF_FLOAT(vy1, c, vl);
             vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
 
-            VSSEG_FLOAT(x, vt0, vt1, vl);
-            VSSEG_FLOAT(y, vy0, vy1, vl);
+            vtx2 = VSET_VX2(vtx2, 0, vt0);
+            vtx2 = VSET_VX2(vtx2, 1, vt1);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+
+            VSSEG_FLOAT(x, vtx2, vl);
+            VSSEG_FLOAT(y, vyx2, vl);
         }
 
     } else if (inc_x == 1){
@@ -115,8 +132,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             vt0 = VFMULVF_FLOAT(vx0, c, vl);
             vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
@@ -127,8 +149,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
             vy1 = VFMULVF_FLOAT(vy1, c, vl);
             vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
 
-            VSSEG_FLOAT(x, vt0, vt1, vl);
-            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+            vtx2 = VSET_VX2(vtx2, 0, vt0);
+            vtx2 = VSET_VX2(vtx2, 1, vt1);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+
+            VSSEG_FLOAT(x, vtx2, vl);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
         }
 
     } else if (inc_y == 1){
@@ -137,8 +164,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             vt0 = VFMULVF_FLOAT(vx0, c, vl);
             vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
@@ -149,8 +181,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
             vy1 = VFMULVF_FLOAT(vy1, c, vl);
             vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
 
-            VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl);
-            VSSEG_FLOAT(y, vy0, vy1, vl);
+            vtx2 = VSET_VX2(vtx2, 0, vt0);
+            vtx2 = VSET_VX2(vtx2, 1, vt1);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+
+            VSSSEG_FLOAT(x, stride_x, vtx2, vl);
+            VSSEG_FLOAT(y, vyx2, vl);
         }
 
     } else {
@@ -160,8 +197,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
 
             vt0 = VFMULVF_FLOAT(vx0, c, vl);
             vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
@@ -172,8 +214,13 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
             vy1 = VFMULVF_FLOAT(vy1, c, vl);
             vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
 
-            VSSSEG_FLOAT(x, stride_x, vt0, vt1, vl);
-            VSSSEG_FLOAT(y, stride_y, vy0, vy1, vl);
+            vtx2 = VSET_VX2(vtx2, 0, vt0);
+            vtx2 = VSET_VX2(vtx2, 1, vt1);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+
+            VSSSEG_FLOAT(x, stride_x, vtx2, vl);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
         }
     }
 
diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c
index 779fab68c..2586c6036 100644
--- a/kernel/riscv64/zscal_rvv.c
+++ b/kernel/riscv64/zscal_rvv.c
@@ -31,10 +31,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m4(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
 #define FLOAT_V_T               vfloat32m4_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
-#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
@@ -43,10 +46,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
 #define FLOAT_V_T               vfloat64m4_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
-#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
 #define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
@@ -61,6 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
     FLOAT_V_T vt, vr, vi;
     BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
     size_t vlmax = VSETVL_MAX;
+    FLOAT_VX2_T vx2;
 
     if(da_r == 0.0 && da_i == 0.0) {
 
@@ -71,16 +78,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 
             for (size_t vl; n > 0; n -= vl, x += vl*2) {
                 vl = VSETVL(n);
-
-                VSSEG_FLOAT(x, vr, vi, vl);
+                vx2 = VSET_VX2(vx2, 0, vr);
+                vx2 = VSET_VX2(vx2, 1, vi);
+                VSSEG_FLOAT(x, vx2, vl);
             }
 
         } else {
 
             for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
                 vl = VSETVL(n);
-
-                VSSSEG_FLOAT(x, stride_x, vr, vi, vl);
+                vx2 = VSET_VX2(vx2, 0, vr);
+                vx2 = VSET_VX2(vx2, 1, vi);
+                VSSSEG_FLOAT(x, stride_x, vx2, vl);
             }
         }
 
@@ -89,12 +98,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
             vl = VSETVL(n);
             
-            VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vr = VGET_VX2(vx2, 0);
+            vi = VGET_VX2(vx2, 1);
 
             vt = VFMULVF_FLOAT(vi, -da_i, vl);
             vi = VFMULVF_FLOAT(vr, da_i, vl);
 
-            VSSSEG_FLOAT(x, stride_x, vt, vi, vl);
+            vx2 = VSET_VX2(vx2, 0, vt);
+            vx2 = VSET_VX2(vx2, 1, vi);
+
+            VSSSEG_FLOAT(x, stride_x, vx2, vl);
         }
 
     } else if(da_i == 0.0) {
@@ -102,12 +116,16 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vr = VGET_VX2(vx2, 0);
+            vi = VGET_VX2(vx2, 1);
 
             vr = VFMULVF_FLOAT(vr, da_r, vl);
             vi = VFMULVF_FLOAT(vi, da_r, vl);
 
-            VSSSEG_FLOAT(x, stride_x, vr, vi, vl);
+            vx2 = VSET_VX2(vx2, 0, vr);
+            vx2 = VSET_VX2(vx2, 1, vi);
+            VSSSEG_FLOAT(x, stride_x, vx2, vl);
         }
 
     } else {
@@ -117,14 +135,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
             for (size_t vl; n > 0; n -= vl, x += vl*2) {
                 vl = VSETVL(n);
 
-                VLSEG_FLOAT(&vr, &vi, x, vl);
+                vx2 = VLSEG_FLOAT(x, vl);
+                vr = VGET_VX2(vx2, 0);
+                vi = VGET_VX2(vx2, 1);
 
                 vt = VFMULVF_FLOAT(vr, da_r, vl);
                 vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
                 vi = VFMULVF_FLOAT(vi, da_r, vl);
                 vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
 
-                VSSEG_FLOAT(x, vt, vi, vl);
+                vx2 = VSET_VX2(vx2, 0, vt);
+                vx2 = VSET_VX2(vx2, 1, vi);
+                VSSEG_FLOAT(x, vx2, vl);
             }
 
         } else {
@@ -132,14 +154,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
             for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
                 vl = VSETVL(n);
 
-                VLSSEG_FLOAT(&vr, &vi, x, stride_x, vl);
+                vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+                vr = VGET_VX2(vx2, 0);
+                vi = VGET_VX2(vx2, 1);
 
                 vt = VFMULVF_FLOAT(vr, da_r, vl);
                 vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
                 vi = VFMULVF_FLOAT(vi, da_r, vl);
                 vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
 
-                VSSSEG_FLOAT(x, stride_x, vt, vi, vl);
+                vx2 = VSET_VX2(vx2, 0, vt);
+                vx2 = VSET_VX2(vx2, 1, vi);
+                VSSSEG_FLOAT(x, stride_x, vx2, vl);
             }
         }
     }
diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c
index b41f70eb5..489188bd5 100644
--- a/kernel/riscv64/zsum_rvv.c
+++ b/kernel/riscv64/zsum_rvv.c
@@ -32,8 +32,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
 #define FLOAT_V_T               vfloat32m4_t
 #define FLOAT_V_T_M1            vfloat32m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
 #define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m4_f32m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
@@ -44,8 +46,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
 #define FLOAT_V_T               vfloat64m4_t
 #define FLOAT_V_T_M1            vfloat64m1_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
 #define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m4_f64m1
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
@@ -59,6 +63,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     if (n <= 0 || inc_x <= 0) return(sumf);
 
     FLOAT_V_T v0, v1;
+    FLOAT_VX2_T vx2;
     size_t vlmax = VSETVL_MAX; 
     FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);
 
@@ -67,7 +72,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&v0, &v1, x, vl);
+            vx2 = VLSEG_FLOAT(x, vl);
+
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
 
             v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
             v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
@@ -80,7 +88,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&v0, &v1, x, stride_x, vl);
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
 
             v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
             v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c
index 17b7b9f43..c2adf5e05 100644
--- a/kernel/riscv64/zswap_rvv.c
+++ b/kernel/riscv64/zswap_rvv.c
@@ -29,18 +29,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m4(n)
-#define FLOAT_V_T               vfloat32m4_t
-#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4
-#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m4(n)
-#define FLOAT_V_T               vfloat64m4_t
-#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4
-#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4
-#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4
-#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@@ -48,7 +48,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
 
     if (n <= 0) return(0);
 
-    FLOAT_V_T vx0, vx1, vy0, vy1;
+    FLOAT_VX2_T vxx2, vyx2;
 
     if (inc_x == 0 && inc_y == 0) {
         if (n & 1) {
@@ -75,8 +75,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
         BLASLONG m = n - 1;
         for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_y * 2) {
             vl = VSETVL(m);
-            VLSSEG_FLOAT(&vy0, &vy1, ptr - 2, stride_y, vl);
-            VSSSEG_FLOAT(ptr, stride_y, vy0, vy1, vl);
+            vyx2 = VLSSEG_FLOAT(ptr - 2, stride_y, vl);
+            VSSSEG_FLOAT(ptr, stride_y, vyx2, vl);
         }
         y[0] = temp[0];
         y[1] = temp[1];
@@ -92,8 +92,8 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
         BLASLONG m = n - 1;
         for (size_t vl; m > 0; m -= vl * 2, ptr -= vl*inc_x * 2) {
             vl = VSETVL(m);
-            VLSSEG_FLOAT(&vx0, &vx1, ptr - 2, stride_x, vl);
-            VSSSEG_FLOAT(ptr, stride_x, vx0, vx1, vl);
+            vxx2 = VLSSEG_FLOAT(ptr - 2, stride_x, vl);
+            VSSSEG_FLOAT(ptr, stride_x, vxx2, vl);
         }
         x[0] = temp[0];
         x[1] = temp[1];
@@ -103,11 +103,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
 
-            VSSEG_FLOAT(y, vx0, vx1, vl);
-            VSSEG_FLOAT(x, vy0, vy1, vl);
+            VSSEG_FLOAT(y, vxx2, vl);
+            VSSEG_FLOAT(x, vyx2, vl);
         }
 
     } else if (inc_x == 1){
@@ -116,11 +116,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
         for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
             vl = VSETVL(n);
 
-            VLSEG_FLOAT(&vx0, &vx1, x, vl);
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
 
-            VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl);
-            VSSEG_FLOAT(x, vy0, vy1, vl);
+            VSSSEG_FLOAT(y, stride_y, vxx2, vl);
+            VSSEG_FLOAT(x, vyx2, vl);
         }
 
     } else if (inc_y == 1){
@@ -129,11 +129,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-            VLSEG_FLOAT(&vy0, &vy1, y, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
 
-            VSSEG_FLOAT(y, vx0, vx1, vl);
-            VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl);
+            VSSEG_FLOAT(y, vxx2, vl);
+            VSSSEG_FLOAT(x, stride_x, vyx2, vl);
         }
 
     } else {
@@ -143,11 +143,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
             vl = VSETVL(n);
 
-            VLSSEG_FLOAT(&vx0, &vx1, x, stride_x, vl);
-            VLSSEG_FLOAT(&vy0, &vy1, y, stride_y, vl);
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
 
-            VSSSEG_FLOAT(y, stride_y, vx0, vx1, vl);
-            VSSSEG_FLOAT(x, stride_x, vy0, vy1, vl);
+            VSSSEG_FLOAT(y, stride_y, vxx2, vl);
+            VSSSEG_FLOAT(x, stride_x, vyx2, vl);
         }
 
     }
diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c
index 0f9e04869..f4d806190 100644
--- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c
+++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c
@@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define INT_V_T                 vint32m2_t
 #define VID_V_INT               __riscv_vid_v_i32m2
 #define VADD_VX_INT             __riscv_vadd_vx_i32m2
@@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define INT_V_T                 vint64m2_t
 #define VID_V_INT               __riscv_vid_v_i64m2
 #define VADD_VX_INT             __riscv_vadd_vx_i64m2
@@ -70,6 +76,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
 
     FLOAT_V_T vb0, vb1, va10, va11, va20, va21;
+    FLOAT_VX2_T va1x2, va2x2, vbx2;
     VBOOL_T vbool;
     INT_V_T vindex_max, vindex;
 
@@ -85,15 +92,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
         for (i = m; i > 0; i--, offset--) {
 
-            VLSSEG2_FLOAT(&va20, &va21, ao2, stride_lda, vl);
-            VLSEG2_FLOAT(&va10, &va11, ao1, vl);
+            va2x2 = VLSSEG2_FLOAT(ao2, stride_lda, vl);
+            va1x2 = VLSEG2_FLOAT(ao1, vl);
+
+            va20 = VGET_VX2(va2x2, 0);
+            va21 = VGET_VX2(va2x2, 1);
+            va10 = VGET_VX2(va1x2, 0);
+            va11 = VGET_VX2(va1x2, 1);
 
             vindex = VADD_VX_INT(vindex_max, offset, vl);
             vbool  = VMSGT_VX_INT(vindex, 0, vl);
 
             vb0 =  VMERGE_VVM_FLOAT(va20, va10, vbool, vl);
             vb1 =  VMERGE_VVM_FLOAT(va21, va11, vbool, vl);
-            VSSEG2_FLOAT(b, vb0, vb1, vl);
+
+            vbx2 = VSET_VX2(vbx2, 0, vb0);
+            vbx2 = VSET_VX2(vbx2, 1, vb1);
+            VSSEG2_FLOAT(b, vbx2, vl);
 
             b   += vl * 2;
             ao1 += lda * 2;
diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c
index fdc693700..069551bb0 100644
--- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c
+++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c
@@ -31,12 +31,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define INT_V_T                 vint32m2_t
 #define VID_V_INT               __riscv_vid_v_i32m2
 #define VADD_VX_INT             __riscv_vadd_vx_i32m2
@@ -47,12 +50,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define INT_V_T                 vint64m2_t
 #define VID_V_INT               __riscv_vid_v_i64m2
 #define VADD_VX_INT             __riscv_vadd_vx_i64m2
@@ -71,6 +77,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     BLASLONG stride_lda = sizeof(FLOAT)*lda * 2;
     
     FLOAT_V_T vb0, vb1, va10, va11, va20, va21;
+    FLOAT_VX2_T va1x2, va2x2, vbx2;
     VBOOL_T vbool;
     INT_V_T vindex_max, vindex;
 
@@ -86,15 +93,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         ao2 = a + posX * 2 + 0 + posY * lda * 2;
 
         for (i = m; i > 0; i--, offset--) {
-            VLSSEG2_FLOAT(&va10, &va11, ao1, stride_lda, vl);
-            VLSEG2_FLOAT(&va20, &va21, ao2, vl);
+            va1x2 = VLSSEG2_FLOAT(ao1, stride_lda, vl);
+            va2x2 = VLSEG2_FLOAT(ao2, vl);
+
+            va20 = VGET_VX2(va2x2, 0);
+            va21 = VGET_VX2(va2x2, 1);
+            va10 = VGET_VX2(va1x2, 0);
+            va11 = VGET_VX2(va1x2, 1);
 
             vindex = VADD_VX_INT(vindex_max, offset, vl);
             vbool  = VMSGT_VX_INT(vindex, 0, vl);
 
             vb0 =  VMERGE_VVM_FLOAT(va20, va10, vbool, vl);
             vb1 =  VMERGE_VVM_FLOAT(va21, va11, vbool, vl);
-            VSSEG2_FLOAT(b, vb0, vb1, vl);
+
+            vbx2 = VSET_VX2(vbx2, 0, vb0);
+            vbx2 = VSET_VX2(vbx2, 1, vb1);
+            VSSEG2_FLOAT(b, vbx2, vl);
 
             b   += vl * 2;
             ao1 += 2;
diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
index 7276618c5..ae664561b 100644
--- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
+++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
@@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define VBOOL_T                 vbool16_t
 #define UINT_V_T                vint32m2_t
 #define VID_V_UINT              __riscv_vid_v_i32m2
@@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define VBOOL_T                 vbool32_t
 #define UINT_V_T                vuint64m2_t
 #define VID_V_UINT              __riscv_vid_v_u64m2
@@ -69,6 +73,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
     BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
     
+    FLOAT_VX2_T vax2;
     FLOAT_V_T va0, va1;
 
     size_t vl;
@@ -98,8 +103,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         {
             if (X > posY) 
             {
-                VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
-                VSSEG2_FLOAT(b, va0, va1, vl);
+                vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                VSSEG2_FLOAT(b, vax2, vl);
 
                 ao  += 2;
                 b   += vl * 2;
@@ -119,7 +124,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 vindex  = VID_V_UINT(vl);
                 for (unsigned int j = 0; j < vl; j++) 
                 {
-                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    va0 = VGET_VX2(vax2, 0);
+                    va1 = VGET_VX2(vax2, 1);
+
                     vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
                     va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
                     va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
@@ -128,7 +136,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                     va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);
                     va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);
 #endif
-                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    vax2 = VSET_VX2(vax2, 0, va0);
+                    vax2 = VSET_VX2(vax2, 1, va1);
+                    VSSEG2_FLOAT(b, vax2, vl);
                     ao  += 2;
                     b   += vl * 2;
                 }
diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
index 72e8f2ce2..ab8d34337 100644
--- a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
+++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
@@ -32,11 +32,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define VBOOL_T                 vbool16_t
 #define UINT_V_T                vuint32m2_t
 #define VID_V_UINT              __riscv_vid_v_u32m2
@@ -46,11 +48,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define VBOOL_T                 vbool32_t
 #define UINT_V_T                vuint64m2_t
 #define VID_V_UINT              __riscv_vid_v_u64m2
@@ -65,6 +69,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
     FLOAT *ao;
     
+    FLOAT_VX2_T vax2;
     FLOAT_V_T va0, va1;
     size_t vl;
 #ifdef UNIT
@@ -101,8 +106,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
             else if (X < posY) 
             {
                 //va1 = VLEV_FLOAT(ao, vl);
-                VLSEG2_FLOAT(&va0, &va1, ao, vl);
-                VSSEG2_FLOAT(b, va0, va1, vl);
+                vax2 = VLSEG2_FLOAT(ao, vl);
+                VSSEG2_FLOAT(b, vax2, vl);
 
                 ao  += lda * 2;
                 b   += vl * 2;
@@ -115,7 +120,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 for (unsigned int j = 0; j < vl; j++) 
                 {
                     //va1 = VLEV_FLOAT(ao, vl);
-                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    vax2 = VLSEG2_FLOAT(ao, vl);
+                    va0 = VGET_VX2(vax2, 0);
+                    va1 = VGET_VX2(vax2, 1);
+
                     vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
                     va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
                     va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
@@ -124,7 +132,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                     va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);
                     va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);
 #endif
-                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    vax2 = VSET_VX2(vax2, 0, va0);
+                    vax2 = VSET_VX2(vax2, 1, va1);
+                    VSSEG2_FLOAT(b, vax2, vl);
                     ao  += lda * 2;
                     b   += vl * 2;
                 }
diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
index e6d36c86d..ba6e63b96 100644
--- a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
+++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
@@ -32,12 +32,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define VBOOL_T                 vbool16_t
 #define UINT_V_T                vuint32m2_t
 #define VID_V_UINT              __riscv_vid_v_u32m2
@@ -47,12 +49,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define VBOOL_T                 vbool32_t
 #define UINT_V_T                vuint64m2_t
 #define VID_V_UINT              __riscv_vid_v_u64m2
@@ -67,6 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     BLASLONG stride_lda = sizeof(FLOAT) * lda * 2;
     FLOAT *ao;
 
+    FLOAT_VX2_T vax2;
     FLOAT_V_T va0, va1;
     size_t vl;
 
@@ -96,8 +101,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         {
             if (X < posY) 
             {
-                VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
-                VSSEG2_FLOAT(b, va0, va1, vl);
+                vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                VSSEG2_FLOAT(b, vax2, vl);
 
                 ao  += 2;
                 b   += vl * 2;
@@ -118,7 +123,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 vindex  = VID_V_UINT(vl);
                 for (unsigned int j = 0; j < vl; j++) 
                 {
-                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    va0 = VGET_VX2(vax2, 0);
+                    va1 = VGET_VX2(vax2, 1);
+
                     vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
                     va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
                     va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
@@ -127,7 +135,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                     va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);
                     va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);
 #endif
-                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    vax2 = VSET_VX2(vax2, 0, va0);
+                    vax2 = VSET_VX2(vax2, 1, va1);
+                    VSSEG2_FLOAT(b, vax2, vl);
                     ao  += 2;
                     b   += vl * 2;
                 }
diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
index 7085cfc37..a624fff54 100644
--- a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
+++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
@@ -34,11 +34,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define VBOOL_T                 vbool16_t
 #define UINT_V_T                vuint32m2_t
 #define VID_V_UINT              __riscv_vid_v_u32m2
@@ -48,11 +50,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define VBOOL_T                 vbool32_t
 #define UINT_V_T                vuint64m2_t
 #define VID_V_UINT              __riscv_vid_v_u64m2
@@ -66,6 +70,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     BLASLONG i, j, js, X;
 
     FLOAT *ao;
+
+    FLOAT_VX2_T vax2;
     FLOAT_V_T va0, va1;
 #ifdef UNIT
     VBOOL_T vbool_eq;
@@ -103,8 +109,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
             }
             else if (X > posY)
             {
-                VLSEG2_FLOAT(&va0, &va1, ao, vl);
-                VSSEG2_FLOAT(b, va0, va1, vl);
+                vax2 = VLSEG2_FLOAT(ao, vl);
+                VSSEG2_FLOAT(b, vax2, vl);
                 ao  += lda * 2;
                 b   += vl * 2;
                 X++;
@@ -115,7 +121,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                 vindex  = VID_V_UINT(vl);
                 for (j = 0; j < vl; j++) 
                 {
-                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    vax2 = VLSEG2_FLOAT(ao, vl);
+                    va0 = VGET_VX2(vax2, 0);
+                    va1 = VGET_VX2(vax2, 1);
+
                     vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
                     va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
                     va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
@@ -124,7 +133,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
                     va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);
                     va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);
 #endif
-                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    vax2 = VSET_VX2(vax2, 0, va0);
+                    vax2 = VSET_VX2(vax2, 1, va1);
+                    VSSEG2_FLOAT(b, vax2, vl);
                     ao += lda * 2;
                     b += vl * 2;
                 }
diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
index 92b4b855b..db5f06af8 100644
--- a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
+++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
@@ -30,10 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
 #define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
 #define VLEV_FLOAT              __riscv_vle32_v_f32m2
 #define VSEV_FLOAT              __riscv_vse32_v_f32m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
@@ -41,10 +44,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
 #define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
 #define VLEV_FLOAT              __riscv_vle64_v_f64m2
 #define VSEV_FLOAT              __riscv_vse64_v_f64m2
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
 #define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
 #define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
@@ -85,6 +91,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 	off = 0;
 #endif
 
+    FLOAT_VX2_T vax2;
     FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
     FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
 
@@ -130,10 +137,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = temp/4; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
-                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -158,7 +169,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
                 ptrbb += 8;
 
-                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
@@ -183,7 +196,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
                 
                 ptrbb += 8;
 
-                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
@@ -233,7 +248,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = temp & 3; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -262,25 +279,37 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
             va1 =  VFMULVF_FLOAT(vres1, alphar, vl);
             va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
-            VSSEG2_FLOAT(C0, va0, va1, vl);
+            
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
 
             va2 =  VFMULVF_FLOAT(vres2, alphar, vl);
             va3 =  VFMULVF_FLOAT(vres3, alphar, vl);
             va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
-            VSSEG2_FLOAT(C1, va2, va3, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C1, vax2, vl);
 
             va0 =  VFMULVF_FLOAT(vres4, alphar, vl);
             va1 =  VFMULVF_FLOAT(vres5, alphar, vl);
             va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphai, vres4, vl);
-            VSSEG2_FLOAT(C2, va0, va1, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C2, vax2, vl);
 
             va2 =  VFMULVF_FLOAT(vres6, alphar, vl);
             va3 =  VFMULVF_FLOAT(vres7, alphar, vl);
             va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphai, vres6, vl);
-            VSSEG2_FLOAT(C3, va2, va3, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C3, vax2, vl);
 
 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             temp = bk - off;
@@ -342,10 +371,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 #endif
             for (k = temp/4; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
-                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -360,7 +393,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
                 ptrbb += 4;
 
-                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
@@ -375,7 +410,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
                 ptrbb += 4;
 
-                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
@@ -405,7 +442,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = temp & 3; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -425,13 +464,19 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
             va1 =  VFMULVF_FLOAT(vres1, alphar, vl);
             va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
-            VSSEG2_FLOAT(C0, va0, va1, vl);
+            
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
 
             va2 =  VFMULVF_FLOAT(vres2, alphar, vl);
             va3 =  VFMULVF_FLOAT(vres3, alphar, vl);
             va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
             va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
-            VSSEG2_FLOAT(C1, va2, va3, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C1, vax2, vl);
 
 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             temp = bk - off;
@@ -487,10 +532,14 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 #endif
             for (k = temp/4; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
-                VLSEG2_FLOAT(&va2, &va3, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -500,7 +549,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
                 ptrbb += 2;
 
-                VLSEG2_FLOAT(&va4, &va5, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
@@ -510,7 +561,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
                 ptrbb += 2;
 
-                VLSEG2_FLOAT(&va6, &va7, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
                 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
@@ -530,7 +583,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
 
             for (k = temp & 3; k > 0; k--)
             {
-                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
                 ptrba += vl*2;
 
                 vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
@@ -545,7 +600,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b
             va1 =  VFMULVF_FLOAT(vres1, alphar, vl);
             va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
             va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
-            VSSEG2_FLOAT(C0, va0, va1, vl);
+            
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
 
 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
             temp = bk - off;
diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
index 383cb883f..36cec711d 100644
--- a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
+++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
@@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
-#define FLOAT_V_T               vfloat32m2_t
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
-#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2_m
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2x2_m
 #define VBOOL_T                 vbool16_t
 #define UINT_V_T                vuint32m2_t
 #define VID_V_UINT              __riscv_vid_v_u32m2
 #define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
-#define FLOAT_V_T               vfloat64m2_t
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
-#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2_m
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2x2_m
 #define VBOOL_T                 vbool32_t
 #define UINT_V_T                vuint64m2_t
 #define VID_V_UINT              __riscv_vid_v_u64m2
@@ -64,7 +64,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
 
-    FLOAT_V_T va0, va1;
+    FLOAT_VX2_T vax2;
     VBOOL_T vbool_cmp;
     UINT_V_T vindex;
     size_t vl;
@@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
                 vindex  = VID_V_UINT(vl);
                 for (unsigned int j = 0; j < vl; j++) 
                 {
-                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
                     vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
-                    VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl);
+                    VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl);
 
                     compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1));
                     ao  += 2;
@@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
             {
                 if (ii > jj)
                 {
-                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
-                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    VSSEG2_FLOAT(b, vax2, vl);
                 }
                 ao  += 2;
                 b   += vl * 2;
diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
index f57e9f1de..3a7bdb522 100644
--- a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
+++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
@@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
-#define FLOAT_V_T               vfloat32m2_t
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
-#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2_m
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2x2_m
 #define VBOOL_T                 vbool16_t
 #define UINT_V_T                vuint32m2_t
 #define VID_V_UINT              __riscv_vid_v_u32m2
 #define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
-#define FLOAT_V_T               vfloat64m2_t
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
-#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2_m
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2x2_m
 #define VBOOL_T                 vbool32_t
 #define UINT_V_T                vuint64m2_t
 #define VID_V_UINT              __riscv_vid_v_u64m2
@@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     jj = offset;
 
-    FLOAT_V_T va0, va1;
+    FLOAT_VX2_T vax2;
     VBOOL_T vbool_cmp;
     UINT_V_T vindex;
 
@@ -82,9 +82,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
                 {
                     compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1));
 
-                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    vax2 = VLSEG2_FLOAT(ao, vl);
                     vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
-                    VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl);
+                    VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl);
 
                     b   += vl * 2;
                     ao  += lda * 2;
@@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
             {
                 if (ii < jj) 
                 {
-                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
-                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    vax2 = VLSEG2_FLOAT(ao, vl);
+                    VSSEG2_FLOAT(b, vax2, vl);
                 }
                 ao  += lda * 2;
                 b   += vl * 2;
diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
index be3613429..2a158d4de 100644
--- a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
+++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
@@ -31,20 +31,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
-#define FLOAT_V_T               vfloat32m2_t
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
-#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2_m
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2x2_m
 #define VBOOL_T                 vbool16_t
 #define UINT_V_T                vuint32m2_t
 #define VID_V_UINT              __riscv_vid_v_u32m2
 #define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
-#define FLOAT_V_T               vfloat64m2_t
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
-#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2_m
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2x2_m
 #define VBOOL_T                 vbool32_t
 #define UINT_V_T                vuint64m2_t
 #define VID_V_UINT              __riscv_vid_v_u64m2
@@ -62,7 +62,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
     FLOAT *ao;
     jj = offset;
 
-    FLOAT_V_T va0, va1;
+    FLOAT_VX2_T vax2;
     VBOOL_T vbool_cmp;
     UINT_V_T vindex;
 
@@ -83,9 +83,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
                 for (unsigned int j = 0; j < vl; j++) 
                 {
                     compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1));
-                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
                     vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
-                    VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl);
+                    VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl);
                     ao  += 2;
                     b   += vl * 2;
                 }
@@ -96,8 +96,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
             {
                 if (ii < jj) 
                 {
-                    VLSSEG2_FLOAT(&va0, &va1, ao, stride_lda, vl);
-                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    VSSEG2_FLOAT(b, vax2, vl);
                 }
                 ao  += 2;
                 b   += vl * 2;
diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
index b1f5ef8f0..4b3319588 100644
--- a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
+++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
@@ -30,20 +30,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m2(n)
-#define FLOAT_V_T               vfloat32m2_t
-#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2
-#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2_m
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2x2_m
 #define VBOOL_T                 vbool16_t
 #define UINT_V_T                vuint32m2_t
 #define VID_V_UINT              __riscv_vid_v_u32m2
 #define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m2(n)
-#define FLOAT_V_T               vfloat64m2_t
-#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2
-#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2_m
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2x2_m
 #define VBOOL_T                 vbool32_t
 #define UINT_V_T                vuint64m2_t
 #define VID_V_UINT              __riscv_vid_v_u64m2
@@ -60,7 +60,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
     FLOAT *ao;
 
     jj = offset;
-    FLOAT_V_T va0, va1;
+    FLOAT_VX2_T vax2;
 
     VBOOL_T vbool_cmp;
     UINT_V_T vindex;
@@ -81,9 +81,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
                 vindex  = VID_V_UINT(vl);
                 for (unsigned int j = 0; j < vl; j++) 
                 {
-                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
+                    vax2 = VLSEG2_FLOAT(ao, vl);
                     vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);
-                    VSSEG2_FLOAT_M(vbool_cmp, b, va0, va1, vl);
+                    VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl);
 
                     compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1));
 
@@ -97,8 +97,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
             {
                 if (ii > jj) 
                 {
-                    VLSEG2_FLOAT(&va0, &va1, ao, vl);
-                    VSSEG2_FLOAT(b, va0, va1, vl);
+                    vax2 = VLSEG2_FLOAT(ao, vl);
+                    VSSEG2_FLOAT(b, vax2, vl);
                 }
                 ao  += lda * 2;
                 b   += vl * 2;
diff --git a/param.h b/param.h
index c5c70b78e..d93221d28 100644
--- a/param.h
+++ b/param.h
@@ -3057,7 +3057,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define CGEMM_DEFAULT_UNROLL_M  8
 #define CGEMM_DEFAULT_UNROLL_N  4
-#define CGEMM_DEFAULT_UNROLL_MN 16
+#define CGEMM_DEFAULT_UNROLL_MN 32
 
 #define ZGEMM_DEFAULT_UNROLL_M  8
 #define ZGEMM_DEFAULT_UNROLL_N  4

From ff41cf5c49bda6bf84950812e737b2e7bcddf139 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Fri, 17 Mar 2023 14:28:26 +0300
Subject: [PATCH 588/718] Fix BLAS, BLAS-like functions and Generic RISC-V
 kernels

* Fixed gemmt, imatcopy, zimatcopy_cnc functions
* Fixed cblas_cscal testing in ctest
* Removed rotmg unreacheble code
* Added zero size checks
---
 cblas.h                        |   8 ++
 common_interface.h             |   9 ++
 ctest/c_cblat1.f               |  10 +-
 ctest/c_cblat1c.c              |   6 +-
 interface/gemmt.c              | 233 ++++++++++++++++++++++-----------
 interface/imatcopy.c           |   8 +-
 interface/rotmg.c              |  28 +---
 interface/zimatcopy.c          |   6 +-
 kernel/generic/zimatcopy_cnc.c |   1 -
 kernel/riscv64/axpby.c         |   2 +-
 kernel/riscv64/axpy.c          |   2 +-
 kernel/riscv64/copy.c          |   2 +-
 kernel/riscv64/dot.c           |   2 +-
 kernel/riscv64/swap.c          |   2 +-
 kernel/riscv64/zaxpy.c         |   2 +-
 kernel/riscv64/zcopy.c         |   2 +-
 kernel/riscv64/zswap.c         |   2 +-
 17 files changed, 201 insertions(+), 124 deletions(-)

diff --git a/cblas.h b/cblas.h
index c2bdd27fa..f7d36788d 100644
--- a/cblas.h
+++ b/cblas.h
@@ -289,6 +289,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
 void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 
+void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
+void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 
 void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                  OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
diff --git a/common_interface.h b/common_interface.h
index 318827920..61a82c306 100644
--- a/common_interface.h
+++ b/common_interface.h
@@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *
 void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 
+void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
+	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
+void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
+	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
+void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
+	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
+void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
+	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
+
 int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
 		     float *, float  *, blasint *, float  *, blasint *,
 		     float *, float  *, blasint *);
diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f
index 1a123d74d..cad7c7fa7 100644
--- a/ctest/c_cblat1.f
+++ b/ctest/c_cblat1.f
@@ -96,7 +96,7 @@
       INTEGER           ICAMAXTEST
       EXTERNAL          SCASUMTEST, SCNRM2TEST, ICAMAXTEST
 *     .. External Subroutines ..
-      EXTERNAL          CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1
+      EXTERNAL          CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
 *     .. Intrinsic Functions ..
       INTRINSIC         MAX
 *     .. Common blocks ..
@@ -214,8 +214,8 @@
                CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
      +                     STRUE4(NP1),SFAC)
             ELSE IF (ICASE.EQ.8) THEN
-*              .. CSCAL ..
-               CALL CSCAL(N,CA,CX,INCX)
+*              .. CSCALTEST ..
+               CALL CSCALTEST(N,CA,CX,INCX)
                CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
      +                    SFAC)
             ELSE IF (ICASE.EQ.9) THEN
@@ -236,14 +236,14 @@
 *
       INCX = 1
       IF (ICASE.EQ.8) THEN
-*        CSCAL
+*        CSCALTEST
 *        Add a test for alpha equal to zero.
          CA = (0.0E0,0.0E0)
          DO 80 I = 1, 5
             MWPCT(I) = (0.0E0,0.0E0)
             MWPCS(I) = (1.0E0,1.0E0)
    80    CONTINUE
-         CALL CSCAL(5,CA,CX,INCX)
+         CALL CSCALTEST(5,CA,CX,INCX)
          CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
       ELSE IF (ICASE.EQ.9) THEN
 *        CSSCALTEST
diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c
index 8c0dd140c..af29301af 100644
--- a/ctest/c_cblat1c.c
+++ b/ctest/c_cblat1c.c
@@ -685,7 +685,7 @@ real *sfac;
     static integer i__;
     extern /* Subroutine */ int ctest_();
     static complex mwpcs[5], mwpct[5];
-    extern /* Subroutine */ int itest1_(), stest1_();
+    extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
     static complex cx[8];
     extern real scnrm2test_();
     static integer np1;
@@ -727,7 +727,7 @@ real *sfac;
 		stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
 	    } else if (combla_1.icase == 8) {
 /*              .. CSCAL .. */
-		cscal_(&combla_1.n, &ca, cx, &combla_1.incx);
+		cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
 		ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
 			 &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
 	    } else if (combla_1.icase == 9) {
@@ -761,7 +761,7 @@ real *sfac;
 	    mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
 /* L80: */
 	}
-	cscal_(&c__5, &ca, cx, &combla_1.incx);
+	cscaltest_(&c__5, &ca, cx, &combla_1.incx);
 	ctest_(&c__5, cx, mwpct, mwpcs, sfac);
     } else if (combla_1.icase == 9) {
 /*        CSSCALTEST */
diff --git a/interface/gemmt.c b/interface/gemmt.c
index 3eed1dfe4..a4530721c 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -35,29 +35,26 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "common.h"
-#ifdef FUNCTION_PROFILE
-#include "functable.h"
-#endif
 
 #ifndef COMPLEX
 #define SMP_THRESHOLD_MIN 65536.0
 #ifdef XDOUBLE
-#define ERROR_NAME "QGEMT "
+#define ERROR_NAME "QGEMMT "
 #elif defined(DOUBLE)
-#define ERROR_NAME "DGEMT "
+#define ERROR_NAME "DGEMMT "
 #elif defined(BFLOAT16)
-#define ERROR_NAME "SBGEMT "
+#define ERROR_NAME "SBGEMMT "
 #else
-#define ERROR_NAME "SGEMT "
+#define ERROR_NAME "SGEMMT "
 #endif
 #else
 #define SMP_THRESHOLD_MIN 8192.0
 #ifdef XDOUBLE
-#define ERROR_NAME "XGEMT "
+#define ERROR_NAME "XGEMMT "
 #elif defined(DOUBLE)
-#define ERROR_NAME "ZGEMT "
+#define ERROR_NAME "ZGEMMT "
 #else
-#define ERROR_NAME "CGEMT "
+#define ERROR_NAME "CGEMMT "
 #endif
 #endif
 
@@ -68,18 +65,22 @@
 #ifndef CBLAS
 
 void NAME(char *UPLO, char *TRANSA, char *TRANSB,
-	  blasint * M, blasint * N, blasint * K,
+	  blasint * M, blasint * K,
 	  FLOAT * Alpha,
 	  IFLOAT * a, blasint * ldA,
 	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
 {
 
-	blasint m, n, k;
+	blasint m, k;
 	blasint lda, ldb, ldc;
 	int transa, transb, uplo;
 	blasint info;
 
 	char transA, transB, Uplo;
+	blasint nrowa, nrowb;
+#if defined(COMPLEX)
+	blasint ncolb;
+#endif
 	IFLOAT *buffer;
 	IFLOAT *aa, *bb;
 	FLOAT *cc;
@@ -92,7 +93,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 	PRINT_DEBUG_NAME;
 
 	m = *M;
-	n = *N;
 	k = *K;
 
 #if defined(COMPLEX)
@@ -159,32 +159,47 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 	if (Uplo == 'L')
 		uplo = 1;
 
+	nrowa = m;
+	if (transa & 1) nrowa = k;
+	nrowb = k;
+#if defined(COMPLEX)
+	ncolb = m;
+#endif
+	if (transb & 1) {
+		nrowb = m;
+#if defined(COMPLEX)
+		ncolb = k;
+#endif
+	}
+
 	info = 0;
 
-	if (uplo < 0)
-		info = 14;
-	if (ldc < m)
+	if (ldc < MAX(1, m))
 		info = 13;
+	if (ldb < MAX(1, nrowb))
+		info = 10;
+	if (lda < MAX(1, nrowa))
+		info = 8;
 	if (k < 0)
 		info = 5;
-	if (n < 0)
-		info = 4;
 	if (m < 0)
-		info = 3;
+		info = 4;
 	if (transb < 0)
-		info = 2;
+		info = 3;
 	if (transa < 0)
+		info = 2;
+	if (uplo < 0)
 		info = 1;
 
-	if (info) {
+	if (info != 0) {
 		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
 		return;
 	}
 #else
 
 void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
-	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
-	   blasint N, blasint k,
+	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
+	   blasint k,
 #ifndef COMPLEX
 	   FLOAT alpha,
 	   IFLOAT * A, blasint LDA,
@@ -205,17 +220,23 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 	int transa, transb, uplo;
 	blasint info;
-	blasint m, n, lda, ldb;
+	blasint lda, ldb;
 	FLOAT *a, *b;
+#if defined(COMPLEX)
+	blasint nrowb, ncolb;
+#endif
 	XFLOAT *buffer;
 
 	PRINT_DEBUG_CNAME;
 
+	uplo = -1;
 	transa = -1;
 	transb = -1;
 	info = 0;
 
 	if (order == CblasColMajor) {
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
 
 		if (TransA == CblasNoTrans)
 			transa = 0;
@@ -248,9 +269,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 			transb = 3;
 #endif
 
-		m = M;
-		n = N;
-
 		a = (void *)A;
 		b = (void *)B;
 		lda = LDA;
@@ -258,23 +276,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 		info = -1;
 
-		if (ldc < m)
+		blasint nrowa;
+#if !defined(COMPLEX)
+		blasint nrowb;
+#endif
+		nrowa = m;
+		if (transa & 1) nrowa = k;
+		nrowb = k;
+#if defined(COMPLEX)
+		ncolb = m;
+#endif
+		if (transb & 1) {
+			nrowb = m;
+#if defined(COMPLEX)
+			ncolb = k;
+#endif
+		}
+
+		if (ldc < MAX(1, m))
 			info = 13;
+		if (ldb < MAX(1, nrowb))
+			info = 10;
+		if (lda < MAX(1, nrowa))
+			info = 8;
 		if (k < 0)
 			info = 5;
-		if (n < 0)
-			info = 4;
 		if (m < 0)
-			info = 3;
+			info = 4;
 		if (transb < 0)
-			info = 2;
+			info = 3;
 		if (transa < 0)
+			info = 2;
+		if (uplo < 0)
 			info = 1;
 	}
 
 	if (order == CblasRowMajor) {
-		m = N;
-		n = M;
 
 		a = (void *)B;
 		b = (void *)A;
@@ -282,6 +319,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 		lda = LDB;
 		ldb = LDA;
 
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
 		if (TransB == CblasNoTrans)
 			transa = 0;
 		if (TransB == CblasTrans)
@@ -315,29 +355,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 		info = -1;
 
-		if (ldc < m)
+		blasint ncola; 
+#if !defined(COMPLEX)
+		blasint ncolb;
+#endif
+		ncola = m;
+		if (transa & 1) ncola = k;
+		ncolb = k;
+#if defined(COMPLEX)
+		nrowb = m;
+#endif
+
+		if (transb & 1) {
+#if defined(COMPLEX)
+			nrowb = k;
+#endif
+			ncolb = m;
+		}
+
+		if (ldc < MAX(1,m))
 			info = 13;
+		if (ldb < MAX(1, ncolb))
+			info = 8;
+		if (lda < MAX(1, ncola))
+			info = 10;
 		if (k < 0)
 			info = 5;
-		if (n < 0)
-			info = 4;
 		if (m < 0)
-			info = 3;
+			info = 4;
 		if (transb < 0)
 			info = 2;
 		if (transa < 0)
+			info = 3;
+		if (uplo < 0)
 			info = 1;
-
 	}
 
-	uplo = -1;
-	if (Uplo == CblasUpper)
-		uplo = 0;
-	if (Uplo == CblasLower)
-		uplo = 1;
-	if (uplo < 0)
-		info = 14;
-
 	if (info >= 0) {
 		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
 		return;
@@ -407,37 +460,48 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 #endif
 
-	if ((m == 0) || (n == 0))
+	if (m == 0)
 		return;
 
 	IDEBUG_START;
 
-	FUNCTION_PROFILE_START();
+#if defined(COMPLEX)
+	if (transb > 1){
+#ifndef CBLAS
+		IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+#else
+		if (order == CblasColMajor)
+			IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+		if (order == CblasRowMajor)
+			IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+#endif
+	}
+#endif
 
-	const blasint incb = (transb == 0) ? 1 : ldb;
+	const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
 
 	if (uplo == 1) {
-		for (i = 0; i < n; i++) {
-			j = n - i;
+		for (i = 0; i < m; i++) {
+			j = m - i;
 
 			l = j;
 #if defined(COMPLEX)
 			aa = a + i * 2;
 			bb = b + i * ldb * 2;
-			if (transa) {
-				l = k;
+			if (transa & 1) {
 				aa = a + lda * i * 2;
-				bb = b + i * 2;
 			}
+			if (transb & 1)
+				bb = b + i * 2;
 			cc = c + i * 2 * ldc + i * 2;
 #else
 			aa = a + i;
 			bb = b + i * ldb;
-			if (transa) {
-				l = k;
+			if (transa & 1) {
 				aa = a + lda * i;
-				bb = b + i;
 			}
+			if (transb & 1)
+				bb = b + i;
 			cc = c + i * ldc + i;
 #endif
 
@@ -447,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 				       NULL, 0);
 
 			if (alpha_r == ZERO && alpha_i == ZERO)
-				return;
+				continue;
 #else
 			if (beta != ONE)
 				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@@ -458,8 +522,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 			IDEBUG_START;
 
-			FUNCTION_PROFILE_START();
-
 			buffer_size = j + k + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
 			buffer_size += 160 / sizeof(FLOAT);
@@ -479,20 +541,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 
 #if defined(COMPLEX)
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 						     aa, lda, bb, incb, cc, 1,
 						     buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
+						     aa, lda, bb, incb, cc, 1,
+						     buffer);
 #else
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
 						     bb, incb, cc, 1, buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
+						     bb, incb, cc, 1, buffer);
 #endif
 #ifdef SMP
 			} else {
-
+				if (!(transa & 1))
 				(gemv_thread[(int)transa]) (j, k, alpha, aa,
 							    lda, bb, incb, cc,
 							    1, buffer,
 							    nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, aa,
+							    lda, bb, incb, cc,
+							    1, buffer,
+							    nthreads);
 
 			}
 #endif
@@ -501,21 +577,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 		}
 	} else {
 
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < m; i++) {
 			j = i + 1;
 
 			l = j;
 #if defined COMPLEX
 			bb = b + i * ldb * 2;
-			if (transa) {
-				l = k;
+			if (transb & 1) {
 				bb = b + i * 2;
 			}
 			cc = c + i * 2 * ldc;
 #else
 			bb = b + i * ldb;
-			if (transa) {
-				l = k;
+			if (transb & 1) {
 				bb = b + i;
 			}
 			cc = c + i * ldc;
@@ -527,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 				       NULL, 0);
 
 			if (alpha_r == ZERO && alpha_i == ZERO)
-				return;
+				continue;
 #else
 			if (beta != ONE)
 				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@@ -537,8 +611,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 			IDEBUG_START;
 
-			FUNCTION_PROFILE_START();
-
 			buffer_size = j + k + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
 			buffer_size += 160 / sizeof(FLOAT);
@@ -558,32 +630,41 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 
 #if defined(COMPLEX)
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 						     a, lda, bb, incb, cc, 1,
 						     buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
+						     a, lda, bb, incb, cc, 1,
+						     buffer);
 #else
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
 						     incb, cc, 1, buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
+						     incb, cc, 1, buffer);
 #endif
 
 #ifdef SMP
 			} else {
-
+				if (!(transa & 1))
 				(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
 							    bb, incb, cc, 1,
 							    buffer, nthreads);
-
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
+							    bb, incb, cc, 1,
+							    buffer, nthreads);
 			}
 #endif
 
 			STACK_FREE(buffer);
 		}
 	}
-	FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
-			     args.m * args.k + args.k * args.n +
-			     args.m * args.n, 2 * args.m * args.n * args.k);
 
 	IDEBUG_END;
 
 	return;
-}
+}
\ No newline at end of file
diff --git a/interface/imatcopy.c b/interface/imatcopy.c
index 91975f7f4..109280fe6 100644
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@@ -149,10 +149,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 
 #endif
 
-	if ( *lda >  *ldb )
-		msize = (size_t)(*lda) * (*ldb)  * sizeof(FLOAT);
-	else
-		msize = (size_t)(*ldb) * (*ldb)  * sizeof(FLOAT);
+	if ( *rows >  *cols )
+            msize = (size_t)(*rows) * (*ldb)  * sizeof(FLOAT);
+    else
+            msize = (size_t)(*cols) * (*ldb)  * sizeof(FLOAT);
 
 	b = malloc(msize);
 	if ( b == NULL )
diff --git a/interface/rotmg.c b/interface/rotmg.c
index 3a5ca8f95..b8f627221 100644
--- a/interface/rotmg.c
+++ b/interface/rotmg.c
@@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
 	else
 	{
 		dp2 = *dd2 * dy1;
-		if(dp2 == ZERO)
-		{
-			dflag = -TWO;
-			dparam[0] = dflag;
-			return;
-		}
 		dp1 = *dd1 * *dx1;
 		dq2 =  dp2 * dy1;
 		dq1 =  dp1 * *dx1;
@@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
 			dh12 =    dp2 /  dp1;
 
 			du   = ONE - dh12 * dh21;
-			if(du > ZERO)
-			{
-				dflag = ZERO;
-				*dd1  = *dd1 / du;
-				*dd2  = *dd2 / du;
-				*dx1  = *dx1 * du;
-			} else {
-				dflag = -ONE;
-
-				dh11  = ZERO;
-				dh12  = ZERO;
-				dh21  = ZERO;
-				dh22  = ZERO;
-
-				*dd1  = ZERO;
-				*dd2  = ZERO;
-				*dx1  = ZERO;
-			}
+			dflag = ZERO;
+			*dd1  = *dd1 / du;
+			*dd2  = *dd2 / du;
+			*dx1  = *dx1 * du;
 			
 		}
 		else
diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c
index ecda5ef4e..7d73ba572 100644
--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@@ -171,10 +171,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
     }
 #endif
 
-	if ( *lda >  *ldb )
-                msize = (size_t)(*lda) * (*ldb)  * sizeof(FLOAT) * 2;
+		if ( *rows >  *cols )
+                msize = (size_t)(*rows) * (*ldb)  * sizeof(FLOAT) * 2;
         else
-                msize = (size_t)(*ldb) * (*ldb)  * sizeof(FLOAT) * 2;
+                msize = (size_t)(*cols) * (*ldb)  * sizeof(FLOAT) * 2;
 
         b = malloc(msize);
         if ( b == NULL )
diff --git a/kernel/generic/zimatcopy_cnc.c b/kernel/generic/zimatcopy_cnc.c
index 8e772bd8a..6426cffc0 100644
--- a/kernel/generic/zimatcopy_cnc.c
+++ b/kernel/generic/zimatcopy_cnc.c
@@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a,
 
 	if ( rows <= 0     )  return(0);
 	if ( cols <= 0     )  return(0);
-    if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); 
 
 	aptr = a;
 	lda *= 2;
diff --git a/kernel/riscv64/axpby.c b/kernel/riscv64/axpby.c
index 278747f75..04f9518d3 100644
--- a/kernel/riscv64/axpby.c
+++ b/kernel/riscv64/axpby.c
@@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
 	BLASLONG i=0;
 	BLASLONG ix,iy;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	ix = 0;
 	iy = 0;
diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c
index fb1094dd9..19d12ad3f 100644
--- a/kernel/riscv64/axpy.c
+++ b/kernel/riscv64/axpy.c
@@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 	BLASLONG i=0;
 	BLASLONG ix,iy;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 	if ( da == 0.0 ) return(0);
 
 	ix = 0;
diff --git a/kernel/riscv64/copy.c b/kernel/riscv64/copy.c
index 7b4f04f30..e79ca59af 100644
--- a/kernel/riscv64/copy.c
+++ b/kernel/riscv64/copy.c
@@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	while(i < n)
 	{
diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c
index 46a84ad18..bf55998ca 100644
--- a/kernel/riscv64/dot.c
+++ b/kernel/riscv64/dot.c
@@ -46,7 +46,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG ix=0,iy=0;
 	double dot = 0.0 ;
 
-	if ( n < 0 )  return(dot);
+	if ( n < 1 )  return(dot);
 
 	while(i < n)
 	{
diff --git a/kernel/riscv64/swap.c b/kernel/riscv64/swap.c
index eac621fb2..33bbeeb6a 100644
--- a/kernel/riscv64/swap.c
+++ b/kernel/riscv64/swap.c
@@ -41,7 +41,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
 	BLASLONG ix=0,iy=0;
 	FLOAT temp;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	while(i < n)
 	{
diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c
index 1dcaeac27..18b6315cb 100644
--- a/kernel/riscv64/zaxpy.c
+++ b/kernel/riscv64/zaxpy.c
@@ -44,7 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 	BLASLONG inc_x2;
 	BLASLONG inc_y2;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 	if ( da_r == 0.0 && da_i == 0.0 ) return(0);
 
 	ix = 0;
diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c
index 07fe584c5..b0f19efd5 100644
--- a/kernel/riscv64/zcopy.c
+++ b/kernel/riscv64/zcopy.c
@@ -43,7 +43,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG inc_x2;
 	BLASLONG inc_y2;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	inc_x2 = 2 * inc_x;
 	inc_y2 = 2 * inc_y;
diff --git a/kernel/riscv64/zswap.c b/kernel/riscv64/zswap.c
index ae4760ae0..df1402b94 100644
--- a/kernel/riscv64/zswap.c
+++ b/kernel/riscv64/zswap.c
@@ -45,7 +45,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
 	BLASLONG inc_x2;
 	BLASLONG inc_y2;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	inc_x2 = 2 * inc_x;
 	inc_y2 = 2 * inc_y;

From 5b4df851d7581145f0aee4336f11127a3a7acc8a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 21 Mar 2023 08:29:05 +0100
Subject: [PATCH 589/718] fix stray blank on continuation line

---
 interface/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index 6f320d8f7..a4d3f710a 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -92,8 +92,9 @@ CBLAS2OBJS    = \
 		cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \
 		ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \
 		csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \
-		csbmv.$(SUFFIX) \
-		cspr2.$(SUFFIX) \
+		csbmv.$(SUFFIX) cspmv.$(SUFFIX) \
+		cspr.$(SUFFIX)  cspr2.$(SUFFIX) \
+		csymv.$(SUFFIX) csyr.$(SUFFIX)  \
 		ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \
 		ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \
 		chemv.$(SUFFIX) chbmv.$(SUFFIX) \

From 1c04df20bd9c845160b3eb2e51adaceb6f93cf8a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 20 Mar 2023 23:04:12 +0100
Subject: [PATCH 590/718] Re-enable overriding the LAPACK SYMV,SYR,SPMV and SPR
 implementations

---
 lapack-netlib/SRC/Makefile | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 49798b0c5..5f22789bd 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -572,22 +572,26 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o
 SLAPACKOBJS     = \
         sgetrf.o sgetrs.o spotrf.o sgetf2.o \
         spotf2.o slaswp.o sgesv.o slauu2.o  \
-        slauum.o strti2.o strtri.o strtrs.o
+        slauum.o strti2.o strtri.o strtrs.o \
+	ssymv.o ssyr.o sspmv.o sspr.o
 
 DLAPACKOBJS     = \
         dgetrf.o dgetrs.o dpotrf.o dgetf2.o \
         dpotf2.o dlaswp.o dgesv.o dlauu2.o  \
-        dlauum.o dtrti2.o dtrtri.o dtrtrs.o
+        dlauum.o dtrti2.o dtrtri.o dtrtrs.o \
+	dsymv.o dsyr.o dspmv.o dspr.o
 
 CLAPACKOBJS     = \
         cgetrf.o cgetrs.o cpotrf.o cgetf2.o \
         cpotf2.o claswp.o cgesv.o clauu2.o \
-        clauum.o ctrti2.o ctrtri.o ctrtrs.o 
+        clauum.o ctrti2.o ctrtri.o ctrtrs.o \
+	csymv.o csyr.o cspmv.o cspr.o
 
 ZLAPACKOBJS     = \
         zgetrf.o zgetrs.o zpotrf.o zgetf2.o \
         zpotf2.o zlaswp.o zgesv.o  zlauu2.o \
-        zlauum.o ztrti2.o ztrtri.o ztrtrs.o
+        zlauum.o ztrti2.o ztrtri.o ztrtrs.o \
+	zsymv.o zsyr.o zspmv.o zspr.o
 
 ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O))
 SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O))

From 5222b5fc18829265be7ffc77e77271a18f17c005 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Thu, 12 Oct 2023 22:06:00 +0300
Subject: [PATCH 591/718] Added axpby kernels for GENERIC RISC-V target

---
 kernel/riscv64/KERNEL.RISCV64_GENERIC | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC
index 61a8a2b91..15bcd2289 100644
--- a/kernel/riscv64/KERNEL.RISCV64_GENERIC
+++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC
@@ -45,6 +45,11 @@ DAXPYKERNEL  = ../riscv64/axpy.c
 CAXPYKERNEL  = ../riscv64/zaxpy.c
 ZAXPYKERNEL  = ../riscv64/zaxpy.c
 
+SAXPBYKERNEL  = ../riscv64/axpby.c
+DAXPBYKERNEL  = ../riscv64/axpby.c
+CAXPBYKERNEL  = ../riscv64/zaxpby.c
+ZAXPBYKERNEL  = ../riscv64/zaxpby.c
+
 SCOPYKERNEL  = ../riscv64/copy.c
 DCOPYKERNEL  = ../riscv64/copy.c
 CCOPYKERNEL  = ../riscv64/zcopy.c

From f1291614536d7d1bec6508fda9b0c56dd7286bb3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 21 Mar 2023 07:43:03 +0100
Subject: [PATCH 592/718] restore C/Z SPMV, SPR, SYR,SYMV

---
 interface/Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index a4d3f710a..3db4b2b6d 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -122,8 +122,9 @@ ZBLAS2OBJS    = \
 		zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \
 		ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \
 		zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \
-		zsbmv.$(SUFFIX) \
-		zspr2.$(SUFFIX) \
+		zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \
+		zspr.$(SUFFIX)  zspr2.$(SUFFIX) \
+		zsymv.$(SUFFIX) zsyr.$(SUFFIX)  \
 		ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \
 		ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \
 		zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \

From 85548e66ca25228a73ec08c257d5d92108b94b62 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 28 Mar 2023 16:33:09 +0200
Subject: [PATCH 593/718] Fix build failures seen with the NO_LAPACK option -
 cspr/csymv/csyr belong on the LAPACK list

---
 interface/Makefile | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index 3db4b2b6d..6f320d8f7 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -92,9 +92,8 @@ CBLAS2OBJS    = \
 		cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \
 		ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \
 		csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \
-		csbmv.$(SUFFIX) cspmv.$(SUFFIX) \
-		cspr.$(SUFFIX)  cspr2.$(SUFFIX) \
-		csymv.$(SUFFIX) csyr.$(SUFFIX)  \
+		csbmv.$(SUFFIX) \
+		cspr2.$(SUFFIX) \
 		ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \
 		ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \
 		chemv.$(SUFFIX) chbmv.$(SUFFIX) \
@@ -122,9 +121,8 @@ ZBLAS2OBJS    = \
 		zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \
 		ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \
 		zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \
-		zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \
-		zspr.$(SUFFIX)  zspr2.$(SUFFIX) \
-		zsymv.$(SUFFIX) zsyr.$(SUFFIX)  \
+		zsbmv.$(SUFFIX) \
+		zspr2.$(SUFFIX) \
 		ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \
 		ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \
 		zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \

From f7cf637d7aad0990625f41f83db74446a5908509 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 28 Mar 2023 18:31:04 +0200
Subject: [PATCH 594/718] redo lost edit

---
 interface/Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index 6f320d8f7..275b71a1c 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -445,7 +445,8 @@ QLAPACKOBJS	= \
 CLAPACKOBJS	= \
 	cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
 	cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
-	clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
+	clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \
+	cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX)
 
 #ZLAPACKOBJS	= \
 #	zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
@@ -456,8 +457,8 @@ CLAPACKOBJS	= \
 ZLAPACKOBJS	= \
 	zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
 	zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX)  zlauu2.$(SUFFIX) \
-	zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
-
+	zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \
+	zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX)
 
 XLAPACKOBJS	= \
 	xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \

From f89e0034a479016ab5d9e1681abf07dab7f8cf38 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Wed, 20 Dec 2023 21:20:30 +0300
Subject: [PATCH 595/718] Fix LAPACK usage from BLAS

---
 interface/Makefile         |  7 +++----
 lapack-netlib/SRC/Makefile | 12 ++++--------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index 275b71a1c..6f320d8f7 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -445,8 +445,7 @@ QLAPACKOBJS	= \
 CLAPACKOBJS	= \
 	cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
 	cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
-	clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) \
-	cspr.$(SUFFIX) cspmv.$(SUFFIX) csymv.$(SUFFIX) csyr.$(SUFFIX)
+	clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
 
 #ZLAPACKOBJS	= \
 #	zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
@@ -457,8 +456,8 @@ CLAPACKOBJS	= \
 ZLAPACKOBJS	= \
 	zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
 	zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX)  zlauu2.$(SUFFIX) \
-	zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX) \
-	zspr.$(SUFFIX) zspmv.$(SUFFIX) zsymv.$(SUFFIX) zsyr.$(SUFFIX)
+	zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
+
 
 XLAPACKOBJS	= \
 	xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 5f22789bd..49798b0c5 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -572,26 +572,22 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o
 SLAPACKOBJS     = \
         sgetrf.o sgetrs.o spotrf.o sgetf2.o \
         spotf2.o slaswp.o sgesv.o slauu2.o  \
-        slauum.o strti2.o strtri.o strtrs.o \
-	ssymv.o ssyr.o sspmv.o sspr.o
+        slauum.o strti2.o strtri.o strtrs.o
 
 DLAPACKOBJS     = \
         dgetrf.o dgetrs.o dpotrf.o dgetf2.o \
         dpotf2.o dlaswp.o dgesv.o dlauu2.o  \
-        dlauum.o dtrti2.o dtrtri.o dtrtrs.o \
-	dsymv.o dsyr.o dspmv.o dspr.o
+        dlauum.o dtrti2.o dtrtri.o dtrtrs.o
 
 CLAPACKOBJS     = \
         cgetrf.o cgetrs.o cpotrf.o cgetf2.o \
         cpotf2.o claswp.o cgesv.o clauu2.o \
-        clauum.o ctrti2.o ctrtri.o ctrtrs.o \
-	csymv.o csyr.o cspmv.o cspr.o
+        clauum.o ctrti2.o ctrtri.o ctrtrs.o 
 
 ZLAPACKOBJS     = \
         zgetrf.o zgetrs.o zpotrf.o zgetf2.o \
         zpotf2.o zlaswp.o zgesv.o  zlauu2.o \
-        zlauum.o ztrti2.o ztrtri.o ztrtrs.o \
-	zsymv.o zsyr.o zspmv.o zspr.o
+        zlauum.o ztrti2.o ztrtri.o ztrtrs.o
 
 ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O))
 SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O))

From bf39c0d8b53c8899fb81a9515da91a004d25ec0b Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Fri, 23 Jun 2023 14:51:39 +0300
Subject: [PATCH 596/718] Added new tests for BLAS-like and BLAS API in utest

---
 .gitignore                             |    1 +
 utest/CMakeLists.txt                   |   75 +
 utest/Makefile                         |   30 +-
 utest/test_extensions/common.c         |  259 +++
 utest/test_extensions/common.h         |   76 +
 utest/test_extensions/test_caxpby.c    |  631 ++++++++
 utest/test_extensions/test_caxpyc.c    |  158 ++
 utest/test_extensions/test_cgbmv.c     |  279 ++++
 utest/test_extensions/test_cgeadd.c    |  880 +++++++++++
 utest/test_extensions/test_cgemm.c     |  273 ++++
 utest/test_extensions/test_cgemmt.c    | 2010 ++++++++++++++++++++++++
 utest/test_extensions/test_cgemv_n.c   |  340 ++++
 utest/test_extensions/test_cgemv_t.c   | 1132 +++++++++++++
 utest/test_extensions/test_cimatcopy.c |  850 ++++++++++
 utest/test_extensions/test_comatcopy.c |  728 +++++++++
 utest/test_extensions/test_crot.c      |  792 ++++++++++
 utest/test_extensions/test_crotg.c     |  290 ++++
 utest/test_extensions/test_csbmv.c     |  606 +++++++
 utest/test_extensions/test_cscal.c     |  164 ++
 utest/test_extensions/test_cspmv.c     |  428 +++++
 utest/test_extensions/test_ctrmv.c     |  266 ++++
 utest/test_extensions/test_ctrsv.c     |  267 ++++
 utest/test_extensions/test_damin.c     |  354 +++++
 utest/test_extensions/test_daxpby.c    |  799 ++++++++++
 utest/test_extensions/test_dgeadd.c    |  878 +++++++++++
 utest/test_extensions/test_dgemmt.c    | 1442 +++++++++++++++++
 utest/test_extensions/test_dimatcopy.c |  947 +++++++++++
 utest/test_extensions/test_domatcopy.c |  672 ++++++++
 utest/test_extensions/test_drotmg.c    |  414 +++++
 utest/test_extensions/test_dsum.c      |  403 +++++
 utest/test_extensions/test_dzamax.c    |  294 ++++
 utest/test_extensions/test_dzamin.c    |  310 ++++
 utest/test_extensions/test_dzsum.c     |  403 +++++
 utest/test_extensions/test_icamin.c    |  625 ++++++++
 utest/test_extensions/test_idamin.c    |  787 ++++++++++
 utest/test_extensions/test_isamin.c    |  787 ++++++++++
 utest/test_extensions/test_izamin.c    |  625 ++++++++
 utest/test_extensions/test_samin.c     |  354 +++++
 utest/test_extensions/test_saxpby.c    |  794 ++++++++++
 utest/test_extensions/test_scamax.c    |  294 ++++
 utest/test_extensions/test_scamin.c    |  310 ++++
 utest/test_extensions/test_scsum.c     |  403 +++++
 utest/test_extensions/test_sgeadd.c    |  880 +++++++++++
 utest/test_extensions/test_sgemmt.c    | 1442 +++++++++++++++++
 utest/test_extensions/test_simatcopy.c |  947 +++++++++++
 utest/test_extensions/test_somatcopy.c |  672 ++++++++
 utest/test_extensions/test_srotmg.c    |  414 +++++
 utest/test_extensions/test_ssum.c      |  403 +++++
 utest/test_extensions/test_zaxpby.c    |  630 ++++++++
 utest/test_extensions/test_zaxpyc.c    |  159 ++
 utest/test_extensions/test_zgbmv.c     |  280 ++++
 utest/test_extensions/test_zgeadd.c    |  880 +++++++++++
 utest/test_extensions/test_zgemm.c     |  273 ++++
 utest/test_extensions/test_zgemmt.c    | 2010 ++++++++++++++++++++++++
 utest/test_extensions/test_zgemv_n.c   |  341 ++++
 utest/test_extensions/test_zgemv_t.c   | 1136 +++++++++++++
 utest/test_extensions/test_zimatcopy.c |  850 ++++++++++
 utest/test_extensions/test_zomatcopy.c |  745 +++++++++
 utest/test_extensions/test_zrot.c      |  790 ++++++++++
 utest/test_extensions/test_zrotg.c     |  290 ++++
 utest/test_extensions/test_zsbmv.c     |  606 +++++++
 utest/test_extensions/test_zscal.c     |  165 ++
 utest/test_extensions/test_zspmv.c     |  427 +++++
 utest/test_extensions/test_ztrmv.c     |  266 ++++
 utest/test_extensions/test_ztrsv.c     |  267 ++++
 utest/test_extensions/xerbla.c         |   88 ++
 66 files changed, 37387 insertions(+), 4 deletions(-)
 create mode 100644 utest/test_extensions/common.c
 create mode 100644 utest/test_extensions/common.h
 create mode 100644 utest/test_extensions/test_caxpby.c
 create mode 100644 utest/test_extensions/test_caxpyc.c
 create mode 100644 utest/test_extensions/test_cgbmv.c
 create mode 100644 utest/test_extensions/test_cgeadd.c
 create mode 100644 utest/test_extensions/test_cgemm.c
 create mode 100644 utest/test_extensions/test_cgemmt.c
 create mode 100644 utest/test_extensions/test_cgemv_n.c
 create mode 100644 utest/test_extensions/test_cgemv_t.c
 create mode 100644 utest/test_extensions/test_cimatcopy.c
 create mode 100644 utest/test_extensions/test_comatcopy.c
 create mode 100644 utest/test_extensions/test_crot.c
 create mode 100644 utest/test_extensions/test_crotg.c
 create mode 100644 utest/test_extensions/test_csbmv.c
 create mode 100644 utest/test_extensions/test_cscal.c
 create mode 100644 utest/test_extensions/test_cspmv.c
 create mode 100644 utest/test_extensions/test_ctrmv.c
 create mode 100644 utest/test_extensions/test_ctrsv.c
 create mode 100644 utest/test_extensions/test_damin.c
 create mode 100644 utest/test_extensions/test_daxpby.c
 create mode 100644 utest/test_extensions/test_dgeadd.c
 create mode 100644 utest/test_extensions/test_dgemmt.c
 create mode 100644 utest/test_extensions/test_dimatcopy.c
 create mode 100644 utest/test_extensions/test_domatcopy.c
 create mode 100644 utest/test_extensions/test_drotmg.c
 create mode 100644 utest/test_extensions/test_dsum.c
 create mode 100644 utest/test_extensions/test_dzamax.c
 create mode 100644 utest/test_extensions/test_dzamin.c
 create mode 100644 utest/test_extensions/test_dzsum.c
 create mode 100644 utest/test_extensions/test_icamin.c
 create mode 100644 utest/test_extensions/test_idamin.c
 create mode 100644 utest/test_extensions/test_isamin.c
 create mode 100644 utest/test_extensions/test_izamin.c
 create mode 100644 utest/test_extensions/test_samin.c
 create mode 100644 utest/test_extensions/test_saxpby.c
 create mode 100644 utest/test_extensions/test_scamax.c
 create mode 100644 utest/test_extensions/test_scamin.c
 create mode 100644 utest/test_extensions/test_scsum.c
 create mode 100644 utest/test_extensions/test_sgeadd.c
 create mode 100644 utest/test_extensions/test_sgemmt.c
 create mode 100644 utest/test_extensions/test_simatcopy.c
 create mode 100644 utest/test_extensions/test_somatcopy.c
 create mode 100644 utest/test_extensions/test_srotmg.c
 create mode 100644 utest/test_extensions/test_ssum.c
 create mode 100644 utest/test_extensions/test_zaxpby.c
 create mode 100644 utest/test_extensions/test_zaxpyc.c
 create mode 100644 utest/test_extensions/test_zgbmv.c
 create mode 100644 utest/test_extensions/test_zgeadd.c
 create mode 100644 utest/test_extensions/test_zgemm.c
 create mode 100644 utest/test_extensions/test_zgemmt.c
 create mode 100644 utest/test_extensions/test_zgemv_n.c
 create mode 100644 utest/test_extensions/test_zgemv_t.c
 create mode 100644 utest/test_extensions/test_zimatcopy.c
 create mode 100644 utest/test_extensions/test_zomatcopy.c
 create mode 100644 utest/test_extensions/test_zrot.c
 create mode 100644 utest/test_extensions/test_zrotg.c
 create mode 100644 utest/test_extensions/test_zsbmv.c
 create mode 100644 utest/test_extensions/test_zscal.c
 create mode 100644 utest/test_extensions/test_zspmv.c
 create mode 100644 utest/test_extensions/test_ztrmv.c
 create mode 100644 utest/test_extensions/test_ztrsv.c
 create mode 100644 utest/test_extensions/xerbla.c

diff --git a/.gitignore b/.gitignore
index 0fe20ecaa..e3e783c46 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,6 +46,7 @@ config_last.h
 getarch
 getarch_2nd
 utest/openblas_utest
+utest/openblas_utest_ext
 ctest/xccblat1
 ctest/xccblat2
 ctest/xccblat3
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 2e32827d3..d78701707 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -18,6 +18,69 @@ else ()
   )
 endif ()
 
+
+set(DIR_EXT test_extensions)
+set(OpenBLAS_utest_ext_src
+utest_main.c
+${DIR_EXT}/xerbla.c
+${DIR_EXT}/test_isamin.c 
+${DIR_EXT}/test_idamin.c 
+${DIR_EXT}/test_icamin.c 
+${DIR_EXT}/test_izamin.c 
+${DIR_EXT}/test_ssum.c 
+${DIR_EXT}/test_dsum.c 
+${DIR_EXT}/test_scsum.c 
+${DIR_EXT}/test_dzsum.c
+${DIR_EXT}/test_samin.c
+${DIR_EXT}/test_damin.c
+${DIR_EXT}/test_scamin.c
+${DIR_EXT}/test_dzamin.c
+${DIR_EXT}/test_scamax.c
+${DIR_EXT}/test_dzamax.c
+${DIR_EXT}/test_zrotg.c
+${DIR_EXT}/test_crotg.c
+$(DIR_EXT)/test_drotmg.c
+$(DIR_EXT)/test_srotmg.c
+$(DIR_EXT)/test_zscal.c
+$(DIR_EXT)/test_cscal.c
+$(DIR_EXT)/test_domatcopy.c
+$(DIR_EXT)/test_somatcopy.c
+$(DIR_EXT)/test_zomatcopy.c
+$(DIR_EXT)/test_comatcopy.c
+${DIR_EXT}/test_simatcopy.c
+${DIR_EXT}/test_dimatcopy.c
+${DIR_EXT}/test_cimatcopy.c
+${DIR_EXT}/test_zimatcopy.c
+${DIR_EXT}/test_sgeadd.c
+${DIR_EXT}/test_dgeadd.c
+${DIR_EXT}/test_cgeadd.c
+${DIR_EXT}/test_zgeadd.c
+${DIR_EXT}/test_saxpby.c
+${DIR_EXT}/test_daxpby.c
+${DIR_EXT}/test_caxpby.c
+${DIR_EXT}/test_zaxpby.c
+${DIR_EXT}/test_caxpyc.c
+${DIR_EXT}/test_zaxpyc.c
+$(DIR_EXT)/test_cgemv_t.c
+$(DIR_EXT)/test_zgemv_t.c
+$(DIR_EXT)/test_cgemv_n.c
+$(DIR_EXT)/test_zgemv_n.c
+${DIR_EXT}/test_crot.c
+${DIR_EXT}/test_zrot.c
+${DIR_EXT}/test_cgbmv.c
+${DIR_EXT}/test_zgbmv.c
+${DIR_EXT}/test_dgemmt.c
+${DIR_EXT}/test_sgemmt.c
+${DIR_EXT}/test_cgemmt.c
+${DIR_EXT}/test_zgemmt.c
+${DIR_EXT}/test_ztrmv.c
+${DIR_EXT}/test_ctrmv.c
+$(DIR_EXT)/test_ztrsv.c
+$(DIR_EXT)/test_ctrsv.c
+$(DIR_EXT)/test_zgemm.c
+$(DIR_EXT)/test_cgemm.c
+)
+
 # crashing on travis cl with an error code suggesting resource not found
 if (NOT MSVC)
 set(OpenBLAS_utest_src
@@ -46,6 +109,13 @@ set(OpenBLAS_utest_src
   ${OpenBLAS_utest_src}
   test_potrs.c
   )
+set(OpenBLAS_utest_ext_src
+  ${OpenBLAS_utest_ext_src}
+  ${DIR_EXT}/test_cspmv.c
+  ${DIR_EXT}/test_zspmv.c
+  ${DIR_EXT}/test_csbmv.c
+  ${DIR_EXT}/test_zsbmv.c
+  )
 if (NOT NO_CBLAS AND NOT NO_LAPACKE)
 set(OpenBLAS_utest_src
   ${OpenBLAS_utest_src}
@@ -57,7 +127,11 @@ endif()
 set(OpenBLAS_utest_bin openblas_utest)
 add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src})
 
+set(OpenBLAS_utest_ext_bin openblas_utest_ext)
+add_executable(${OpenBLAS_utest_ext_bin} ${OpenBLAS_utest_ext_src})
+
 target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME})
+target_link_libraries(${OpenBLAS_utest_ext_bin} ${OpenBLAS_LIBNAME})
 
 if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX"  )
 target_link_libraries(${OpenBLAS_utest_bin} m)
@@ -82,3 +156,4 @@ add_custom_command(TARGET ${OpenBLAS_utest_bin}
 endif()
 
 add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin})
+add_test(${OpenBLAS_utest_ext_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin})
diff --git a/utest/Makefile b/utest/Makefile
index f99035440..55561c770 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -1,21 +1,38 @@
 UTEST_CHECK = 1
 TOPDIR	= ..
+DIR_EXT=test_extensions
 
 override TARGET_ARCH=
 override TARGET_MACH=
 
 UTESTBIN=openblas_utest
+UTESTEXTBIN=openblas_utest_ext
 
 .PHONY : all
-.NOTPARALLEL : all run_test $(UTESTBIN)
+.NOTPARALLEL : all run_test $(UTESTBIN) $(UTESTEXTBIN)
 
 include $(TOPDIR)/Makefile.system
 
 OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o
 #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o
+OBJS_EXT=utest_main.o $(DIR_EXT)/xerbla.o $(DIR_EXT)/common.o 
+OBJS_EXT+=$(DIR_EXT)/test_isamin.o $(DIR_EXT)/test_idamin.o $(DIR_EXT)/test_icamin.o $(DIR_EXT)/test_izamin.o 
+OBJS_EXT+=$(DIR_EXT)/test_ssum.o $(DIR_EXT)/test_dsum.o $(DIR_EXT)/test_scsum.o $(DIR_EXT)/test_dzsum.o
+OBJS_EXT+=$(DIR_EXT)/test_saxpby.o $(DIR_EXT)/test_daxpby.o $(DIR_EXT)/test_caxpby.o $(DIR_EXT)/test_zaxpby.o $(DIR_EXT)/test_zaxpyc.o $(DIR_EXT)/test_caxpyc.o
+OBJS_EXT+=$(DIR_EXT)/test_samin.o $(DIR_EXT)/test_damin.o $(DIR_EXT)/test_scamin.o $(DIR_EXT)/test_dzamin.o $(DIR_EXT)/test_scamax.o $(DIR_EXT)/test_dzamax.o
+OBJS_EXT+=$(DIR_EXT)/test_drotmg.o $(DIR_EXT)/test_srotmg.o $(DIR_EXT)/test_zrotg.o $(DIR_EXT)/test_crotg.o $(DIR_EXT)/test_crot.o $(DIR_EXT)/test_zrot.o
+OBJS_EXT+=$(DIR_EXT)/test_zscal.o $(DIR_EXT)/test_cscal.o
+OBJS_EXT+=$(DIR_EXT)/test_domatcopy.o $(DIR_EXT)/test_somatcopy.o $(DIR_EXT)/test_zomatcopy.o $(DIR_EXT)/test_comatcopy.o
+OBJS_EXT+=$(DIR_EXT)/test_simatcopy.o $(DIR_EXT)/test_dimatcopy.o $(DIR_EXT)/test_cimatcopy.o $(DIR_EXT)/test_zimatcopy.o
+OBJS_EXT+=$(DIR_EXT)/test_sgeadd.o $(DIR_EXT)/test_dgeadd.o $(DIR_EXT)/test_cgeadd.o $(DIR_EXT)/test_zgeadd.o
+OBJS_EXT+=$(DIR_EXT)/test_cgemv_t.o $(DIR_EXT)/test_zgemv_t.o $(DIR_EXT)/test_cgemv_n.o $(DIR_EXT)/test_zgemv_n.o
+OBJS_EXT+=$(DIR_EXT)/test_sgemmt.o $(DIR_EXT)/test_dgemmt.o $(DIR_EXT)/test_cgemmt.o $(DIR_EXT)/test_zgemmt.o
+OBJS_EXT+=$(DIR_EXT)/test_ztrmv.o $(DIR_EXT)/test_ctrmv.o $(DIR_EXT)/test_ztrsv.o $(DIR_EXT)/test_ctrsv.o
+OBJS_EXT+=$(DIR_EXT)/test_zgemm.o $(DIR_EXT)/test_cgemm.o $(DIR_EXT)/test_zgbmv.o $(DIR_EXT)/test_cgbmv.o
 
 ifneq ($(NO_LAPACK), 1)
 OBJS += test_potrs.o
+OBJS_EXT += $(DIR_EXT)/test_zspmv.o $(DIR_EXT)/test_cspmv.o $(DIR_EXT)/test_zsbmv.o $(DIR_EXT)/test_csbmv.o
 ifneq ($(NO_CBLAS), 1)
 ifneq ($(NO_LAPACKE), 1)
 OBJS += test_kernel_regress.o
@@ -47,12 +64,17 @@ all : run_test
 $(UTESTBIN): $(OBJS)
 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
 
-run_test: $(UTESTBIN)
+$(UTESTEXTBIN): $(OBJS_EXT)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
+
+run_test: $(UTESTBIN) $(UTESTEXTBIN)
 ifneq ($(CROSS), 1)
 	./$(UTESTBIN)
+	./$(UTESTEXTBIN)
 endif
 
 clean:
-	-rm -f *.o $(UTESTBIN)
+	-rm -f *.o $(UTESTBIN) $(UTESTEXTBIN)
+	-rm -f $(DIR_EXT)/*.o
 
-libs:
+libs:
\ No newline at end of file
diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c
new file mode 100644
index 000000000..c3bdcefc7
--- /dev/null
+++ b/utest/test_extensions/common.c
@@ -0,0 +1,259 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common.h"
+
+/**
+ * Generate random array
+ */
+void srand_generate(float *alpha, blasint n)
+{
+    blasint i;
+    for (i = 0; i < n; i++)
+        alpha[i] = (float)rand() / (float)RAND_MAX * 5.0f;
+}
+
+void drand_generate(double *alpha, blasint n)
+{
+    blasint i;
+    for (i = 0; i < n; i++)
+        alpha[i] = (double)rand() / (double)RAND_MAX * 5.0;
+}
+
+/**
+ * Find difference between two rectangle matrix
+ * return norm of differences
+ */
+float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint ld)
+{
+    blasint i = 0;
+    blasint j = 0;
+    blasint inc = 1;
+    float norm = 0.0f;
+
+    float *a_ptr = a;
+    float *b_ptr = b;
+
+    for(i = 0; i < rows; i++)
+    {
+        for (j = 0; j < cols; j++) {
+            a_ptr[j] -= b_ptr[j];
+        }
+        norm += cblas_snrm2(cols, a_ptr, inc);
+        
+        a_ptr += ld;
+        b_ptr += ld;
+    }
+    return norm/(float)(rows);
+}
+
+double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blasint ld)
+{
+    blasint i = 0;
+    blasint j = 0;
+    blasint inc = 1;
+    double norm = 0.0;
+
+    double *a_ptr = a;
+    double *b_ptr = b;
+
+    for(i = 0; i < rows; i++)
+    {
+        for (j = 0; j < cols; j++) {
+            a_ptr[j] -= b_ptr[j];
+        }
+        norm += cblas_dnrm2(cols, a_ptr, inc);
+        
+        a_ptr += ld;
+        b_ptr += ld;
+    }
+    return norm/(double)(rows);
+}
+
+/**
+ * Complex conjugate operation for vector
+ * 
+ * param n specifies number of elements in vector x
+ * param inc_x specifies increment of vector x
+ * param x_ptr specifies buffer holding vector x
+ */
+void cconjugate_vector(blasint n, blasint inc_x, float *x_ptr)
+{
+    blasint i;
+    inc_x *= 2;
+
+    for (i = 0; i < n; i++)
+    {
+        x_ptr[1] *= (-1.0f);
+        x_ptr +=  inc_x;
+    }
+}
+
+void zconjugate_vector(blasint n, blasint inc_x, double *x_ptr)
+{
+    blasint i;
+    inc_x *= 2;
+
+    for (i = 0; i < n; i++)
+    {
+        x_ptr[1] *= (-1.0);
+        x_ptr +=  inc_x;
+    }
+}
+
+/**
+ * Transpose matrix
+ * 
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param a_src - buffer holding input matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param a_dst - buffer holding output matrix A
+ * param lda_dst - leading dimension of output matrix A
+ */
+void stranspose(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, 
+                float *a_dst, blasint lda_dst)
+{
+    blasint i, j;
+    for (i = 0; i != cols; i++)
+    {
+        for (j = 0; j != rows; j++)
+            a_dst[i*lda_dst+j] = alpha*a_src[j*lda_src+i];
+    }
+}
+
+void dtranspose(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, 
+                double *a_dst, blasint lda_dst)
+{
+    blasint i, j;
+    for (i = 0; i != cols; i++)
+    {
+        for (j = 0; j != rows; j++)
+            a_dst[i*lda_dst+j] = alpha*a_src[j*lda_src+i];
+    }
+}
+
+void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, 
+                float *a_dst, blasint lda_dst, int conj)
+{
+    blasint i, j;
+    lda_dst *= 2;
+    lda_src *= 2;
+    for (i = 0; i != cols*2; i+=2)
+    {
+        for (j = 0; j != rows*2; j+=2){
+            a_dst[(i/2)*lda_dst+j] = alpha[0] * a_src[(j/2)*lda_src+i] + conj * alpha[1] * a_src[(j/2)*lda_src+i+1];
+            a_dst[(i/2)*lda_dst+j+1] = (-1.0f) * conj * alpha[0] * a_src[(j/2)*lda_src+i+1] + alpha[1] * a_src[(j/2)*lda_src+i];
+        } 
+    }
+}
+
+void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, 
+                double *a_dst, blasint lda_dst, int conj)
+{
+    blasint i, j;
+    lda_dst *= 2;
+    lda_src *= 2;
+    for (i = 0; i != cols*2; i+=2)
+    {
+        for (j = 0; j != rows*2; j+=2){
+            a_dst[(i/2)*lda_dst+j] = alpha[0] * a_src[(j/2)*lda_src+i] + conj * alpha[1] * a_src[(j/2)*lda_src+i+1];
+            a_dst[(i/2)*lda_dst+j+1] = (-1.0) * conj * alpha[0] * a_src[(j/2)*lda_src+i+1] + alpha[1] * a_src[(j/2)*lda_src+i];
+        } 
+    }
+}
+
+/**
+ * Copy matrix from source A to destination A
+ * 
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param a_src - buffer holding input matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param a_dst - buffer holding output matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param conj specifies conjugation
+ */
+void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, 
+           float *a_dst, blasint lda_dst)
+{
+    blasint i, j;
+    for (i = 0; i != rows; i++)
+    {
+        for (j = 0; j != cols; j++)
+            a_dst[i*lda_dst+j] = alpha*a_src[i*lda_src+j];
+    }
+}
+
+void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, 
+           double *a_dst, blasint lda_dst)
+{
+    blasint i, j;
+    for (i = 0; i != rows; i++)
+    {
+        for (j = 0; j != cols; j++)
+            a_dst[i*lda_dst+j] = alpha*a_src[i*lda_src+j];
+    }
+}
+
+void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, 
+           float *a_dst, blasint lda_dst, int conj)
+{
+    blasint i, j;
+    lda_dst *= 2;
+    lda_src *= 2;
+    for (i = 0; i != rows; i++)
+    {
+        for (j = 0; j != cols*2; j+=2){
+            a_dst[i*lda_dst+j] = alpha[0] * a_src[i*lda_src+j] + conj * alpha[1] * a_src[i*lda_src+j+1];
+            a_dst[i*lda_dst+j+1] = (-1.0f) * conj *alpha[0] * a_src[i*lda_src+j+1] + alpha[1] * a_src[i*lda_src+j];
+        }
+    }
+}
+
+void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, 
+           double *a_dst, blasint lda_dst, int conj)
+{
+    blasint i, j;
+    lda_dst *= 2;
+    lda_src *= 2;
+    for (i = 0; i != rows; i++)
+    {
+        for (j = 0; j != cols*2; j+=2){
+            a_dst[i*lda_dst+j] = alpha[0] * a_src[i*lda_src+j] + conj * alpha[1] * a_src[i*lda_src+j+1];
+            a_dst[i*lda_dst+j+1] = (-1.0) * conj *alpha[0] * a_src[i*lda_src+j+1] + alpha[1] * a_src[i*lda_src+j];
+        }
+    }
+}
\ No newline at end of file
diff --git a/utest/test_extensions/common.h b/utest/test_extensions/common.h
new file mode 100644
index 000000000..62b84325c
--- /dev/null
+++ b/utest/test_extensions/common.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#ifndef _TEST_EXTENSION_COMMON_H_
+#define _TEST_EXTENSION_COMMON_H_
+
+#include <cblas.h>
+#include <ctype.h>
+
+#define TRUE 1
+#define FALSE 0
+#define INVALID -1
+#define SINGLE_TOL 1e-02f
+#define DOUBLE_TOL 1e-10
+
+extern int check_error(void);
+extern void set_xerbla(char* current_rout, int expected_info);
+extern int BLASFUNC(xerbla)(char *name, blasint *info, blasint length);
+
+extern void srand_generate(float *alpha, blasint n);
+extern void drand_generate(double *alpha, blasint n);
+
+extern float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint ld);
+extern double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blasint ld);
+
+extern void cconjugate_vector(blasint n, blasint inc_x, float *x_ptr);
+extern void zconjugate_vector(blasint n, blasint inc_x, double *x_ptr);
+
+extern void stranspose(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, 
+                       float *a_dst, blasint lda_dst);
+extern void dtranspose(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, 
+                double *a_dst, blasint lda_dst);
+extern void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, 
+                      float *a_dst, blasint lda_dst, int conj);
+extern void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, 
+                double *a_dst, blasint lda_dst, int conj);
+
+extern void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, 
+           float *a_dst, blasint lda_dst);
+extern void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, 
+           double *a_dst, blasint lda_dst);
+extern void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, 
+           float *a_dst, blasint lda_dst, int conj);
+extern void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, 
+           double *a_dst, blasint lda_dst, int conj);                
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_caxpby.c b/utest/test_extensions/test_caxpby.c
new file mode 100644
index 000000000..221a48ac7
--- /dev/null
+++ b/utest/test_extensions/test_caxpby.c
@@ -0,0 +1,631 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CAXPBY {
+    float x_test[DATASIZE * INCREMENT * 2];
+    float x_verify[DATASIZE * INCREMENT * 2];
+    float y_test[DATASIZE * INCREMENT * 2];
+    float y_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CAXPBY data_caxpby;
+
+/**
+ * Fortran API specific function
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float check_caxpby(blasint n, float *alpha, blasint incx, float *beta, blasint incy)
+{
+    blasint i;
+
+    // cscal accept only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Fill vectors x, y
+    srand_generate(data_caxpby.x_test, n * incx_abs * 2);
+    srand_generate(data_caxpby.y_test, n * incy_abs * 2);
+
+    // Copy vector x for caxpy
+    for (i = 0; i < n * incx_abs * 2; i++)
+        data_caxpby.x_verify[i] = data_caxpby.x_test[i];
+
+    // Copy vector y for cscal
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_caxpby.y_verify[i] = data_caxpby.y_test[i];
+
+    // Find beta*y
+    BLASFUNC(cscal)(&n, beta, data_caxpby.y_verify, &incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    BLASFUNC(caxpy)(&n, alpha, data_caxpby.x_verify, &incx,
+                        data_caxpby.y_verify, &incy);
+    
+    BLASFUNC(caxpby)(&n, alpha, data_caxpby.x_test, &incx,
+                        beta, data_caxpby.y_test, &incy);
+
+    // Find the differences between output vector caculated by caxpby and caxpy
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_caxpby.y_test[i] -= data_caxpby.y_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(scnrm2)(&n, data_caxpby.y_test, &incy_abs);
+}
+
+/**
+ * C API specific function 
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float c_api_check_caxpby(blasint n, float *alpha, blasint incx, float *beta, blasint incy)
+{
+    blasint i;
+
+    // cscal accept only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Fill vectors x, y
+    srand_generate(data_caxpby.x_test, n * incx_abs * 2);
+    srand_generate(data_caxpby.y_test, n * incy_abs * 2);
+
+    // Copy vector x for caxpy
+    for (i = 0; i < n * incx_abs * 2; i++)
+        data_caxpby.x_verify[i] = data_caxpby.x_test[i];
+
+    // Copy vector y for cscal
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_caxpby.y_verify[i] = data_caxpby.y_test[i];
+
+    // Find beta*y
+    cblas_cscal(n, beta, data_caxpby.y_verify, incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    cblas_caxpy(n, alpha, data_caxpby.x_verify, incx,
+                        data_caxpby.y_verify, incy);
+    
+    cblas_caxpby(n, alpha, data_caxpby.x_test, incx,
+                        beta, data_caxpby.y_test, incy);
+
+    // Find the differences between output vector caculated by caxpby and caxpy
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_caxpby.y_test[i] -= data_caxpby.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_scnrm2(n, data_caxpby.y_test, incy_abs);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(caxpby, inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(caxpby, inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(caxpby, inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(caxpby, inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    float alpha[] = {3.0f, 1.0f};
+    float beta[] = {4.0f, 3.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(caxpby, inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    float alpha[] = {5.0f, 2.2f};
+    float beta[] = {4.0f, 5.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(caxpby, inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {6.0f, 3.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(caxpby, inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    float alpha[] = {7.0f, 2.0f};
+    float beta[] = {3.5f, 1.3f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(caxpby, inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(caxpby, inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(caxpby, inc_x_1_inc_y_1_N_100_a_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(caxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(caxpby, check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_caxpby(n, alpha, incx, beta, incy);
+    
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(caxpby, c_api_inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 2.1f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(caxpby, c_api_inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    float alpha[] = {3.0f, 2.0f};
+    float beta[] = {4.0f, 3.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(caxpby, c_api_inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    float alpha[] = {5.0f, 2.0f};
+    float beta[] = {4.0f, 3.1f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(caxpby, c_api_inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {6.0f, 2.3f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(caxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {3.5f, 1.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_a_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test caxpby by comparing it with cscal and caxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(caxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(caxpby, c_api_check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
diff --git a/utest/test_extensions/test_caxpyc.c b/utest/test_extensions/test_caxpyc.c
new file mode 100644
index 000000000..ed1899e57
--- /dev/null
+++ b/utest/test_extensions/test_caxpyc.c
@@ -0,0 +1,158 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CAXPYC {
+    float x_test[DATASIZE * INCREMENT * 2];
+    float x_verify[DATASIZE * INCREMENT * 2];
+    float y_test[DATASIZE * INCREMENT * 2];
+    float y_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CAXPYC data_caxpyc;
+
+/**
+ * Test caxpyc by conjugating vector x and comparing with caxpy.
+ * Compare with the following options:
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float check_caxpyc(blasint n, float *alpha, blasint incx, blasint incy)
+{
+    blasint i;
+
+    srand_generate(data_caxpyc.x_test, n * incx * 2);
+    srand_generate(data_caxpyc.y_test, n * incy * 2);
+
+    for (i = 0; i < n * incx * 2; i++)
+        data_caxpyc.x_verify[i] = data_caxpyc.x_test[i];
+
+    for (i = 0; i < n * incy * 2; i++)
+        data_caxpyc.y_verify[i] = data_caxpyc.y_test[i];
+
+    cconjugate_vector(n, incx, data_caxpyc.x_verify);
+
+    BLASFUNC(caxpy)(&n, alpha, data_caxpyc.x_verify, &incx,
+                    data_caxpyc.y_verify, &incy);
+
+    BLASFUNC(caxpyc)(&n, alpha, data_caxpyc.x_test, &incx,
+                     data_caxpyc.y_test, &incy);
+
+    for (i = 0; i < n * incy * 2; i++)
+        data_caxpyc.y_verify[i] -= data_caxpyc.y_test[i];
+
+    return BLASFUNC(scnrm2)(&n, data_caxpyc.y_verify, &incy);
+}
+
+/**
+ * Test caxpyc by conjugating vector x and comparing with caxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(caxpyc, conj_strides_one)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha[] = {5.0f, 2.2f};
+
+    float norm = check_caxpyc(n, alpha, incx, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test caxpyc by conjugating vector x and comparing with caxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(caxpyc, conj_incx_one)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha[] = {5.0f, 2.2f};
+
+    float norm = check_caxpyc(n, alpha, incx, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test caxpyc by conjugating vector x and comparing with caxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(caxpyc, conj_incy_one)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    float alpha[] = {5.0f, 2.2f};
+
+    float norm = check_caxpyc(n, alpha, incx, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test caxpyc by conjugating vector x and comparing with caxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(caxpyc, conj_strides_two)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    float alpha[] = {5.0f, 2.2f};
+
+    float norm = check_caxpyc(n, alpha, incx, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
diff --git a/utest/test_extensions/test_cgbmv.c b/utest/test_extensions/test_cgbmv.c
new file mode 100644
index 000000000..8e0640c5d
--- /dev/null
+++ b/utest/test_extensions/test_cgbmv.c
@@ -0,0 +1,279 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 1
+
+struct DATA_CGBMV {
+    float a_test[DATASIZE * DATASIZE * 2];
+    float a_band_storage[DATASIZE * DATASIZE * 2];
+    float matrix[DATASIZE * DATASIZE * 2];
+    float b_test[DATASIZE * 2 * INCREMENT];
+    float c_test[DATASIZE * 2 * INCREMENT];
+    float c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CGBMV data_cgbmv;
+
+/** 
+ * Transform full-storage band matrix A to band-packed storage mode.
+ * 
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * output param a - buffer for holding band-packed matrix
+ * param lda - specifies the leading dimension of a
+ * param matrix - buffer holding full-storage band matrix A 
+ * param ldm - specifies the leading full-storage band matrix A
+ */
+static void transform_to_band_storage(blasint m, blasint n, blasint kl, 
+                                      blasint ku, float* a, blasint lda,
+                                      float* matrix, blasint ldm)
+{
+    blasint i, j, k;
+    for (j = 0; j < n; j++) 
+    {
+        k = 2 * (ku - j);
+        for (i = MAX(0, 2*(j - ku)); i < MIN(m, j + kl + 1) * 2; i+=2) 
+        {
+            a[(k + i) + j * lda * 2] = matrix[i + j * ldm * 2];
+            a[(k + i) + j * lda * 2 + 1] = matrix[i + j * ldm * 2 + 1];
+        }
+    }
+}
+
+/** 
+ * Generate full-storage band matrix A with kl sub-diagonals and ku super-diagonals
+ * 
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * output param band_matrix - buffer for full-storage band matrix.
+ * param matrix - buffer holding input general matrix
+ * param ldm - specifies the leading of input general matrix
+*/
+static void get_band_matrix(blasint m, blasint n, blasint kl, blasint ku, 
+                            float *band_matrix, float *matrix, blasint ldm)
+{
+    blasint i, j;
+    blasint k = 0;
+    for (i = 0; i < n; i++)
+    {
+        for (j = 0; j < m * 2; j += 2)
+        {
+            if ((blasint)(j/2) > kl + i || i > ku + (blasint)(j/2)) 
+            {
+                band_matrix[i * ldm * 2 + j] = 0.0f;
+                band_matrix[i * ldm * 2 + j + 1] = 0.0f;
+                continue;
+            }
+
+            band_matrix[i * ldm * 2 + j] = matrix[k++];
+            band_matrix[i * ldm * 2 + j + 1] = matrix[k++];
+        }
+    }
+}
+
+/**
+ * Comapare results computed by cgbmv and cgemv 
+ * since gbmv is gemv for band matrix
+ * 
+ * param trans specifies op(A), the transposition operation applied to A
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences 
+ */
+static float check_cgbmv(char trans, blasint m, blasint n, blasint kl, blasint ku,
+    float *alpha, blasint lda, blasint inc_b, float *beta, blasint inc_c)
+{
+    blasint i;
+    blasint lenb, lenc;
+    
+    if(trans == 'T' || trans == 'C' || trans == 'D' || trans == 'U'){
+        lenb = m;
+        lenc = n;
+    } else {
+        lenb = n;
+        lenc = m;
+    }
+    
+    srand_generate(data_cgbmv.matrix, m * n * 2);
+    srand_generate(data_cgbmv.b_test, 2 * (1 + (lenb - 1) * inc_b));
+    srand_generate(data_cgbmv.c_test, 2 * (1 + (lenc - 1) * inc_c));
+
+    for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++)
+        data_cgbmv.c_verify[i] = data_cgbmv.c_test[i];
+
+    get_band_matrix(m, n, kl, ku, data_cgbmv.a_test, data_cgbmv.matrix, m);
+
+    transform_to_band_storage(m, n, kl, ku, data_cgbmv.a_band_storage, lda, data_cgbmv.a_test, m);
+
+    BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgbmv.a_test, &m, data_cgbmv.b_test,
+                    &inc_b, beta, data_cgbmv.c_verify, &inc_c);
+
+    BLASFUNC(cgbmv)(&trans, &m, &n, &kl, &ku, alpha, data_cgbmv.a_band_storage, &lda, data_cgbmv.b_test, 
+                    &inc_b, beta, data_cgbmv.c_test, &inc_c);
+
+    for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++)
+        data_cgbmv.c_verify[i] -= data_cgbmv.c_test[i];
+
+    return BLASFUNC(scnrm2)(&lenc, data_cgbmv.c_verify, &inc_c);
+}
+
+/**
+ * Test cgbmv by comparing it against cgemv
+ * with param trans is D
+ */
+CTEST(cgbmv, trans_D)
+{
+    blasint m = 50, n = 25;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    char trans = 'D';
+
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+
+    float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgbmv by comparing it against cgemv
+ * with param trans is O
+ */
+CTEST(cgbmv, trans_O)
+{
+    blasint m = 50, n = 25;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 20, ku = 10;
+    blasint lda = 50;
+    char trans = 'O';
+
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+
+    float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgbmv by comparing it against cgemv
+ * with param trans is S
+ */
+CTEST(cgbmv, trans_S)
+{
+    blasint m = 50, n = 25;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 6, ku = 9;
+    blasint lda = 50;
+    char trans = 'S';
+
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+
+    float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgbmv by comparing it against cgemv
+ * with param trans is U
+ */
+CTEST(cgbmv, trans_U)
+{
+    blasint m = 25, n = 50;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 7, ku = 11;
+    blasint lda = kl + ku + 1;
+    char trans = 'U';
+
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+
+    float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgbmv by comparing it against cgemv
+ * with param trans is C
+ */
+CTEST(cgbmv, trans_C)
+{
+    blasint m = 50, n = 25;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    char trans = 'C';
+
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+
+    float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgbmv by comparing it against cgemv
+ * with param trans is R
+ */
+CTEST(cgbmv, trans_R)
+{
+    blasint m = 50, n = 100;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    char trans = 'R';
+
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+
+    float norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+#endif
diff --git a/utest/test_extensions/test_cgeadd.c b/utest/test_extensions/test_cgeadd.c
new file mode 100644
index 000000000..0cf6cbf87
--- /dev/null
+++ b/utest/test_extensions/test_cgeadd.c
@@ -0,0 +1,880 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+
+struct DATA_CGEADD {
+    float a_test[M * N * 2];
+    float c_test[M * N * 2];
+    float c_verify[M * N * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CGEADD data_cgeadd;
+
+/**
+ * cgeadd reference implementation
+ *
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param aptr - refer to matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param cptr - refer to matrix C
+ * param ldc - leading dimension of C
+ */
+static void cgeadd_trusted(blasint m, blasint n, float *alpha, float *aptr,
+                           blasint lda, float *beta, float *cptr, blasint ldc)
+{
+    blasint i;
+
+    lda *= 2; 
+	ldc *= 2;
+
+    for (i = 0; i < n; i++)
+    {
+        cblas_caxpby(m, alpha, aptr, 1, beta, cptr, 1);
+        aptr += lda;
+        cptr += ldc;
+    }
+}
+
+/**
+ * Test cgeadd by comparing it against reference
+ * Compare with the following options:
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static float check_cgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                          blasint m, blasint n, float *alpha, blasint lda,
+                          float *beta, blasint ldc)
+{
+    blasint i;
+    blasint cols = m, rows = n;
+
+    if (order == CblasRowMajor)
+    {
+        rows = m;
+        cols = n;
+    }
+
+    // Fill matrix A, C
+    srand_generate(data_cgeadd.a_test, lda * rows * 2);
+    srand_generate(data_cgeadd.c_test, ldc * rows * 2);
+
+    // Copy matrix C for cgeadd
+    for (i = 0; i < ldc * rows * 2; i++)
+        data_cgeadd.c_verify[i] = data_cgeadd.c_test[i];
+
+    cgeadd_trusted(cols, rows, alpha, data_cgeadd.a_test, lda,
+                   beta, data_cgeadd.c_verify, ldc);
+
+    if (api == 'F')
+        BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda,
+                         beta, data_cgeadd.c_test, &ldc);
+    else
+        cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda,
+                     beta, data_cgeadd.c_test, ldc);
+
+    // Find the differences between output matrix caculated by cgeadd and sgemm
+    return smatrix_difference(data_cgeadd.c_test, data_cgeadd.c_verify, cols, rows, ldc*2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param lda - leading dimension of A
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in cgeadd
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                         blasint m, blasint n, blasint lda,
+                         blasint ldc, int expected_info)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    set_xerbla("CGEADD ", expected_info);
+
+    if (api == 'F')
+        BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda,
+                         beta, data_cgeadd.c_test, &ldc);
+    else 
+        cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda,
+                 beta, data_cgeadd.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(cgeadd, matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {3.0f, 2.0f};
+    float beta[] = {1.0f, 3.0f};
+
+    float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(cgeadd, matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.5f, 1.0f};
+
+    float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(cgeadd, matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {3.0f, 1.5f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(cgeadd, matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(cgeadd, matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C
+ * Must be at least zero.
+ */
+CTEST(cgeadd, xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific tests
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ */
+CTEST(cgeadd, xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ */
+CTEST(cgeadd, xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 6;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ */
+CTEST(cgeadd, xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - number of columns of A, C equal zero.
+ */
+CTEST(cgeadd, n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if m - number of rows of A and C equal zero.
+ */
+CTEST(cgeadd, m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 3.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {4.0f, 1.5f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(cgeadd, c_api_matrix_n_50_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N / 2;
+    blasint m = M;
+
+    blasint lda = n;
+    blasint ldc = n;
+
+    float alpha[] = {3.0f, 2.5f};
+    float beta[] = {1.0f, 2.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {3.0f, 1.5f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against sgemm
+ * with the following options:
+ *
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha[] = {2.0f, 3.0f};
+    float beta[] = {2.0f, 4.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order -
+ * specifies whether A and C stored in
+ * row-major order or column-major order
+ */
+CTEST(cgeadd, c_api_xerbla_invalid_order)
+{
+    CBLAS_ORDER order = INVALID;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 0;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(cgeadd, c_api_xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(cgeadd, c_api_xerbla_n_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(cgeadd, c_api_xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(cgeadd, c_api_xerbla_m_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(cgeadd, c_api_xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(cgeadd, c_api_xerbla_lda_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(cgeadd, c_api_xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(cgeadd, c_api_xerbla_ldc_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Check if n - number of columns of A, C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(cgeadd, c_api_n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if m - number of rows of A and C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(cgeadd, c_api_m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cgemm.c b/utest/test_extensions/test_cgemm.c
new file mode 100644
index 000000000..cd38d710b
--- /dev/null
+++ b/utest/test_extensions/test_cgemm.c
@@ -0,0 +1,273 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CGEMM {
+	float a_test[DATASIZE * DATASIZE * 2];
+    float a_verify[DATASIZE * DATASIZE * 2];
+	float b_test[DATASIZE * DATASIZE * 2];
+    float b_verify[DATASIZE * DATASIZE * 2];
+    float c_test[DATASIZE * DATASIZE * 2];
+	float c_verify[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CGEMM data_cgemm;
+
+/**
+ * Test cgemm with the conjugate matrices by conjugating and not transposed matrices
+ * and comparing it with the non-conjugate cgemm.
+ *
+ * param transa specifies op(A), the transposition (conjugation) operation applied to A
+ * param transb specifies op(B), the transposition (conjugation) operation applied to B
+ * param m specifies the number of rows of the matrix op(A) and of the matrix C
+ * param n specifies the number of columns of the matrix op(B) and the number of columns of the matrix C
+ * param k specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of matrix A
+ * param ldb - leading dimension of matrix B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of matrix C
+ * return norm of difference
+ */
+static float check_cgemm(char transa, char transb, blasint m, blasint n, blasint k, 
+                         float *alpha, blasint lda, blasint ldb, float *beta, blasint ldc)
+{
+	blasint i;
+	float alpha_conj[] = {1.0f, 0.0f}; 
+	char transa_verify = transa;
+    char transb_verify = transb;
+
+    int arows = k, acols = m;
+    int brows = n, bcols = k;
+
+    if (transa == 'T' || transa == 'C'){
+        arows = m; acols = k;
+    }
+
+    if (transb == 'T' || transb == 'C'){
+        brows = k; bcols = n;
+    }
+
+	srand_generate(data_cgemm.a_test, arows * lda * 2);
+	srand_generate(data_cgemm.b_test, brows * ldb * 2);
+    srand_generate(data_cgemm.c_test, n * ldc * 2);
+
+	for (i = 0; i < arows * lda * 2; i++)
+		data_cgemm.a_verify[i] = data_cgemm.a_test[i];
+
+	for (i = 0; i < brows * ldb * 2; i++)
+		data_cgemm.b_verify[i] = data_cgemm.b_test[i];
+
+    for (i = 0; i < n * ldc * 2; i++)
+		data_cgemm.c_verify[i] = data_cgemm.c_test[i];
+
+	if (transa == 'R'){
+		cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_cgemm.a_verify, lda, lda);
+		transa_verify = 'N';
+	}
+
+    if (transb == 'R'){
+		cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_cgemm.b_verify, ldb, ldb);
+		transb_verify = 'N';
+	}
+
+	BLASFUNC(cgemm)(&transa_verify, &transb_verify, &m, &n, &k, alpha, data_cgemm.a_verify, &lda,
+	 				data_cgemm.b_verify, &ldb, beta, data_cgemm.c_verify, &ldc);
+
+	BLASFUNC(cgemm)(&transa, &transb, &m, &n, &k, alpha, data_cgemm.a_test, &lda,
+	 				data_cgemm.b_test, &ldb, beta, data_cgemm.c_test, &ldc);
+
+	return smatrix_difference(data_cgemm.c_test, data_cgemm.c_verify, m, n, ldc*2);
+}
+
+/**
+ * Test cgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate cgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(cgemm, conjtransa_conjnotransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'C';
+	char transb = 'R';
+	float alpha[] = {-2.0, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+	float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate cgemm.
+ * Test with the following options:
+ *
+ * matrix A is not conjugate and not transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(cgemm, notransa_conjnotransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'N';
+	char transb = 'R';
+	float alpha[] = {-2.0, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+	float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate cgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is conjugate and transposed
+ */
+CTEST(cgemm, conjnotransa_conjtransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'R';
+	char transb = 'C';
+	float alpha[] = {-2.0, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+	float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate cgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is not conjugate and not transposed
+ */
+CTEST(cgemm, conjnotransa_notransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'R';
+	char transb = 'N';
+	float alpha[] = {-2.0, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+	float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate cgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(cgemm, conjnotransa_conjnotransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'R';
+	char transb = 'R';
+	float alpha[] = {-2.0, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+	float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate cgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is transposed
+ */
+CTEST(cgemm, conjnotransa_transb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'R';
+	char transb = 'T';
+	float alpha[] = {-2.0, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+	float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate cgemm.
+ * Test with the following options:
+ *
+ * matrix A is transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(cgemm, transa_conjnotransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'T';
+	char transb = 'R';
+	float alpha[] = {-2.0, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+	float norm = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cgemmt.c b/utest/test_extensions/test_cgemmt.c
new file mode 100644
index 000000000..ed9279933
--- /dev/null
+++ b/utest/test_extensions/test_cgemmt.c
@@ -0,0 +1,2010 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_CGEMMT {
+    float a_test[DATASIZE * DATASIZE * 2];
+    float b_test[DATASIZE * DATASIZE * 2];
+    float c_test[DATASIZE * DATASIZE * 2];
+    float c_verify[DATASIZE * DATASIZE * 2];
+    float c_gemm[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CGEMMT data_cgemmt;
+
+/**
+ * Compute gemmt via gemm since gemmt is gemm but updates only 
+ * the upper or lower triangular part of the result matrix
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ */
+static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                           char transb, blasint m, blasint k, float *alpha, blasint lda, 
+                           blasint ldb, float *beta, blasint ldc)
+{
+    blasint i, j;
+
+    if(api == 'F')
+        BLASFUNC(cgemm)(&transa, &transb, &m, &m, &k, alpha, data_cgemmt.a_test, &lda,
+                        data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_gemm, &ldc);
+    else
+        cblas_cgemm(order, transa, transb, m, m, k, alpha, data_cgemmt.a_test, lda,
+                data_cgemmt.b_test, ldb, beta, data_cgemmt.c_gemm, ldc);
+
+    ldc *= 2;
+
+    if (uplo == 'L' || uplo == CblasLower)
+    {
+        for (i = 0; i < m; i++)
+            for (j = i * 2; j < m * 2; j+=2){
+                data_cgemmt.c_verify[i * ldc + j] =
+                    data_cgemmt.c_gemm[i * ldc + j];
+                data_cgemmt.c_verify[i * ldc + j + 1] =
+                    data_cgemmt.c_gemm[i * ldc + j + 1];
+            }
+    } else {
+        for (i = 0; i < m; i++)
+            for (j = 0; j <= i * 2; j+=2){
+                data_cgemmt.c_verify[i * ldc + j] =
+                    data_cgemmt.c_gemm[i * ldc + j];
+                data_cgemmt.c_verify[i * ldc + j + 1] =
+                    data_cgemmt.c_gemm[i * ldc + j + 1];
+            }
+    }
+}
+
+/**
+ * Comapare results computed by cgemmt and cgemmt_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static float check_cgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                          char transb, blasint m, blasint k, float *alpha, blasint lda, 
+                          blasint ldb, float *beta, blasint ldc)
+{
+    blasint i;
+    blasint b_cols;
+    blasint a_cols;
+    blasint inc = 1;
+    blasint size_c = m * ldc * 2;
+
+    if(order == CblasColMajor){
+        if (transa == 'T' || transa == 'C' || 
+            transa == CblasTrans || transa == CblasConjTrans) 
+            a_cols = m;
+        else a_cols = k;
+
+        if (transb == 'T' || transb == 'C' || 
+            transb == CblasTrans || transb == CblasConjTrans) 
+            b_cols = k;
+        else b_cols = m;
+    } else {
+        if (transa == 'T' || transa == 'C' || 
+            transa == CblasTrans || transa == CblasConjTrans) 
+            a_cols = k;
+        else a_cols = m;
+
+        if (transb == 'T' || transb == 'C' ||
+            transb == CblasTrans || transb == CblasConjTrans) 
+            b_cols = m;
+        else b_cols = k;
+    }
+
+    srand_generate(data_cgemmt.a_test, a_cols * lda * 2);
+    srand_generate(data_cgemmt.b_test, b_cols * ldb  * 2);
+    srand_generate(data_cgemmt.c_test, m * ldc * 2);
+
+    for (i = 0; i < m * ldc * 2; i++)
+        data_cgemmt.c_gemm[i] = data_cgemmt.c_verify[i] = data_cgemmt.c_test[i];
+
+    cgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc);
+
+    if (api == 'F')
+        BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test,
+                         &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc);
+    else
+        cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda,
+                     data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc);
+
+    for (i = 0; i < m * ldc * 2; i++)
+        data_cgemmt.c_verify[i] -= data_cgemmt.c_test[i];
+
+    return BLASFUNC(snrm2)(&size_c, data_cgemmt.c_verify, &inc) / size_c;
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in cgemmt
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                         char transb, blasint m, blasint k, blasint lda, blasint ldb,
+                         blasint ldc, int expected_info)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    set_xerbla("CGEMMT ", expected_info);
+
+    if (api == 'F')
+        BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test,
+                         &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc);
+    else
+        cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda,
+                     data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(cgemmt, upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(cgemmt, upper_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(cgemmt, upper_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    char transa = 'N', transb = 'T';
+    char uplo = 'U';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(cgemmt, upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'T';
+    char uplo = 'U';
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    char transa = 'R', transb = 'R';
+    char uplo = 'U';
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'C', transb = 'R';
+    char uplo = 'U';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'R', transb = 'C';
+    char uplo = 'U';
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, upper_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'C', transb = 'C';
+    char uplo = 'U';
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(cgemmt, upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(cgemmt, upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 0.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(cgemmt, lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(cgemmt, lower_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'L';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(cgemmt, lower_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    char transa = 'N', transb = 'T';
+    char uplo = 'L';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(cgemmt, lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'T';
+    char uplo = 'L';
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    char transa = 'R', transb = 'R';
+    char uplo = 'L';
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'C', transb = 'R';
+    char uplo = 'L';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'R', transb = 'C';
+    char uplo = 'L';
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'C', transb = 'C';
+    char uplo = 'L';
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(cgemmt, lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(cgemmt, lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 0.0f};
+
+    float norm = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(cgemmt, c_api_colmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(cgemmt, c_api_colmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 0.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(cgemmt, c_api_colmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(cgemmt, c_api_colmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 0.0f};
+
+    float norm = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 25;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {-1.0f, -1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 25, ldc = 25;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjtrans_b_conjtrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 0.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 25;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 25, ldc = 25;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 25, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 0.0f};
+
+    float norm = check_cgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param uplo.
+ * Must be upper (U) or lower (L).
+ */
+CTEST(cgemmt, xerbla_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'O';
+    int expected_info = 1;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transa.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(cgemmt, xerbla_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'O', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 2;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transb.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(cgemmt, xerbla_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'O';
+    char uplo = 'U';
+    int expected_info = 3;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(cgemmt, xerbla_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 4;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(cgemmt, xerbla_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 5;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(cgemmt, xerbla_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 8;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B not transposed.
+ */
+CTEST(cgemmt, xerbla_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 10;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(cgemmt, xerbla_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 13;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test.
+ * Test error function for an invalid param order.
+ * Must be column or row major.
+ */
+CTEST(cgemmt, xerbla_c_api_major_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 0;
+
+    int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B not transposed.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B transposed.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cgemv_n.c b/utest/test_extensions/test_cgemv_n.c
new file mode 100644
index 000000000..60c9af86a
--- /dev/null
+++ b/utest/test_extensions/test_cgemv_n.c
@@ -0,0 +1,340 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CSPMV_N {
+    float a_test[DATASIZE * DATASIZE * 2];
+    float b_test[DATASIZE * 2 * INCREMENT];
+    float c_test[DATASIZE * 2 * INCREMENT];
+    float c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CSPMV_N data_cgemv_n;
+
+/**
+ * cgemv not transposed reference code
+ *
+ * param trans specifies whether matris A is conj or/and xconj
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param alpha - scaling factor for the matrib-vector product
+ * param a - buffer holding input matrib A
+ * param lda - leading dimension of matrix A
+ * param b - Buffer holding input vector b
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param c - buffer holding input/output vector c
+ * param inc_c - stride of vector c
+ */
+static void cgemv_n_trusted(char trans, blasint m, blasint n, float *alpha, float *a,
+                          blasint lda, float *b, blasint inc_b, float *beta, float *c,
+                          blasint inc_c)
+{
+	blasint i, j;
+    blasint i2 = 0;
+	blasint ib = 0, ic = 0;
+
+    float temp_r, temp_i;
+
+	float *a_ptr = a;
+    blasint lda2 = 2*lda;
+
+	blasint inc_b2 = 2 * inc_b;
+    blasint inc_c2 = 2 * inc_c;
+
+    BLASFUNC(cscal)(&m, beta, c, &inc_c);
+
+	for (j = 0; j < n; j++)
+	{
+
+        if (trans == 'N' || trans == 'R') {
+            temp_r = alpha[0] * b[ib] - alpha[1] * b[ib+1];
+            temp_i = alpha[0] * b[ib+1] + alpha[1] * b[ib];
+        } else {
+            temp_r = alpha[0] * b[ib] + alpha[1] * b[ib+1];
+            temp_i = alpha[0] * b[ib+1] - alpha[1] * b[ib];
+        }
+
+		ic = 0;
+		i2 = 0;
+
+		for (i = 0; i < m; i++)
+		{
+                if (trans == 'N') {
+                    c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
+                    c[ic+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+                } 
+                if (trans == 'O') {
+                    c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
+                    c[ic+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+                }
+                if (trans == 'R') {
+                    c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
+                    c[ic+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+                }
+                if (trans == 'S') {
+                    c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
+                    c[ic+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+                }
+			i2 += 2;
+			ic += inc_c2;
+		}
+		a_ptr += lda2;
+		ib += inc_b2;
+	}
+
+}
+
+/**
+ * Comapare results computed by cgemv and cgemv_n_trusted
+ *
+ * param trans specifies whether matris A is conj or/and xconj
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param alpha - scaling factor for the matrib-vector product
+ * param lda - leading dimension of matrix A
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences
+ */
+static float check_cgemv_n(char trans, blasint m, blasint n, float *alpha, blasint lda, 
+                            blasint inc_b, float *beta, blasint inc_c)
+{
+    blasint i;
+
+    srand_generate(data_cgemv_n.a_test, n * lda);
+    srand_generate(data_cgemv_n.b_test, 2 * n * inc_b);
+    srand_generate(data_cgemv_n.c_test, 2 * m * inc_c);
+
+    for (i = 0; i < m * 2 * inc_c; i++)
+        data_cgemv_n.c_verify[i] = data_cgemv_n.c_test[i];
+
+    cgemv_n_trusted(trans, m, n, alpha, data_cgemv_n.a_test, lda, data_cgemv_n.b_test, 
+                  inc_b, beta, data_cgemv_n.c_test, inc_c);
+    BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_n.a_test, &lda, data_cgemv_n.b_test, 
+                    &inc_b, beta, data_cgemv_n.c_verify, &inc_c);
+
+    for (i = 0; i < m * 2 * inc_c; i++)
+        data_cgemv_n.c_verify[i] -= data_cgemv_n.c_test[i];
+
+    return BLASFUNC(scnrm2)(&n, data_cgemv_n.c_verify, &inc_c);
+}
+
+/**
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cgemv, trans_o_square_matrix)
+{
+    blasint n = 100, m = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'O';
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+
+    float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows of A is 50
+ * Number of colums of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cgemv, trans_o_rectangular_matrix_rows_less_then_cols)
+{
+    blasint n = 100, m = 50, lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'O';
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+
+    float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows of A is 100
+ * Number of colums of A is 50
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cgemv, trans_o_rectangular_matrix_cols_less_then_rows)
+{
+    blasint n = 50, m = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'O';
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+
+    float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(cgemv, trans_o_double_strides)
+{
+    blasint n = 100, m = 100, lda = 100;
+    blasint inc_b = 2, inc_c = 2;
+    char trans = 'O';
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+
+    float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cgemv, trans_s_square_matrix)
+{
+    blasint n = 100, m = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'S';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.4f, 5.0f};
+
+    float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows of A is 50
+ * Number of colums of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cgemv, trans_s_rectangular_matrix_rows_less_then_cols)
+{
+    blasint n = 100, m = 50, lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'S';
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+
+    float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows of A is 100
+ * Number of colums of A is 50
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cgemv, trans_s_rectangular_matrix_cols_less_then_rows)
+{
+    blasint n = 50, m = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'S';
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 0.0f};
+
+    float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(cgemv, trans_s_double_strides)
+{
+    blasint n = 100, m = 100, lda = 100;
+    blasint inc_b = 2, inc_c = 2;
+    char trans = 'S';
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.0f, 5.0f};
+
+    float norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+#endif
diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c
new file mode 100644
index 000000000..aa3281e66
--- /dev/null
+++ b/utest/test_extensions/test_cgemv_t.c
@@ -0,0 +1,1132 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+#define INCREMENT 2
+
+struct DATA_CGEMV_T {
+    float a_test[N * M * 2];
+    float a_verify[N * M * 2];
+    float y_test[M * INCREMENT * 2];
+    float y_verify[M * INCREMENT * 2];
+    float x_test[M * INCREMENT * 2];
+    float x_verify[M * INCREMENT * 2];
+};
+
+// SINGLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * FLT_EPSILON
+// SINGLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 1.19e-07 = 5*e-03
+#define SINGLE_EPS_ZGEMV 5e-03
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CGEMV_T data_cgemv_t;
+
+/**
+ * Find product of matrix-vector multiplication
+ * 
+ * param n specifies number of columns of A
+ * param m specifies number of rows of A and size of vector x
+ * param lda specifies leading dimension of A
+ * param inc_x specifies increment of vector x
+ */
+static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x)
+{
+    blasint i;
+    float *a_ptr = data_cgemv_t.a_verify;
+    float *x_ptr = data_cgemv_t.x_test;
+    float *x_res = data_cgemv_t.x_verify;
+
+    openblas_complex_float result;
+
+    for (i = 0; i < n * inc_x; i+= inc_x)
+    {
+        result = cblas_cdotu(lda, a_ptr, 1, x_ptr, inc_x);
+        x_res[0] = CREAL(result);
+        x_res[1] = CIMAG(result);
+        a_ptr += lda * 2;
+        x_res += 2 * inc_x;
+    }
+}
+
+/**
+ * Test cgemv by comparing it against comatcopy, caxpby and
+ * reference func matrix_vector_product
+ * 
+ * comatcopy perform operation: op(A)
+ * matrix_vector_product perform operation: A*x
+ * caxpby perform operation: alpha*x + beta*y
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param alpha specifies scalar alpha
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param beta specifies scalar beta
+ * param inc_y specifies increment for vector y
+ * return norm of difference between cgemv and result of reference funcs
+ */
+static float check_cgemv(char api, char order, char trans, blasint m, blasint n, float *alpha, 
+                            blasint lda, blasint inc_x, float *beta, blasint inc_y)
+{
+    blasint i;
+
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+
+    // Transpose parameters for comatcopy
+    // cgemv_t perform operation on transposed matrix, no need to transpose a_verify
+    char trans_copy;
+    char ctrans_copy;
+
+    // Param alpha for comatcopy, scale on alpha perform caxpby
+    float alpha_one[] = {1.0f, 0.0f};
+
+    memset(data_cgemv_t.x_verify, 0.0f, m * inc_x * 2 * sizeof(float));
+
+    // Fill matrix A, vectors x, y    
+    srand_generate(data_cgemv_t.a_test, lda * n * 2);
+    srand_generate(data_cgemv_t.x_test, m * inc_x * 2);
+    srand_generate(data_cgemv_t.y_test, m * inc_y * 2);
+
+    // Copy vector y for reference funcs
+    for (int i = 0; i < m * inc_y * 2; i++) {
+        data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i];
+    }
+
+    if (api == 'F') {
+        if (trans == 'T') trans_copy = 'N';
+        if (trans == 'C') trans_copy = 'R';
+        if (trans == 'U') trans_copy = 'R';
+        if (trans == 'D') trans_copy = 'N';
+
+        // Perform operation: op(A)
+        BLASFUNC(comatcopy)(&order, &trans_copy, &m, &n, alpha_one, data_cgemv_t.a_test, &lda, data_cgemv_t.a_verify, &lda);
+
+        // Find A*x
+        matrix_vector_product(n, m, lda, inc_x);
+
+        // Find conj(x)
+        if (trans == 'U' || trans == 'D') {
+            cconjugate_vector(m, inc_x, data_cgemv_t.x_verify);
+        }
+
+        // Find alpha*x+beta*y
+        BLASFUNC(caxpby)(&n, alpha, data_cgemv_t.x_verify, &inc_x, beta, data_cgemv_t.y_verify, &inc_y);
+
+        BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_t.a_test, 
+                        &lda, data_cgemv_t.x_test, &inc_x, beta, data_cgemv_t.y_test, &inc_y);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') {ctrans = CblasTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasTrans : CblasNoTrans;}
+        if (trans == 'N') {ctrans = CblasNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasNoTrans : CblasTrans;}
+        if (trans == 'C') {ctrans = CblasConjTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjTrans : CblasConjNoTrans;}
+        if (trans == 'R') {ctrans = CblasConjNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjNoTrans : CblasConjTrans;}
+
+        // Perform operation: op(A)
+        cblas_comatcopy(corder, ctrans_copy, m, n, alpha_one, data_cgemv_t.a_test, lda, data_cgemv_t.a_verify, lda);
+
+        // Find A*x
+        matrix_vector_product(n, m, lda, inc_x);
+
+        // Find alpha*x+beta*y
+        cblas_caxpby(n, alpha, data_cgemv_t.x_verify, inc_x, beta, data_cgemv_t.y_verify, inc_y);
+
+        cblas_cgemv(corder, ctrans, m, n, alpha, data_cgemv_t.a_test, 
+                        lda, data_cgemv_t.x_test, inc_x, beta, data_cgemv_t.y_test, inc_y);
+    }
+
+    // Find the differences between output vector caculated by cgemv and reference funcs
+    for (i = 0; i < m * inc_y * 2; i++)
+        data_cgemv_t.y_test[i] -= data_cgemv_t.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_scnrm2(m, data_cgemv_t.y_test, inc_y);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param inc_y specifies increment for vector y
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint m, blasint n, 
+                        blasint lda, blasint inc_x, blasint inc_y, int expected_info)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float a[] = {1.0f, 1.0f};
+    float x[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    float y[] = {1.0f, 1.0f};
+
+    set_xerbla("CGEMV ", expected_info);
+
+    BLASFUNC(cgemv)(&trans, &m, &n, alpha, a, &lda, x, &inc_x, beta, y, &inc_y);
+
+    return check_error();
+}
+
+/**
+ * C API specific function
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param inc_y specifies increment for vector y
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int c_api_check_badargs(CBLAS_ORDER corder, CBLAS_TRANSPOSE ctrans, blasint m, blasint n, 
+                        blasint lda, blasint inc_x, blasint inc_y, int expected_info)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float a[] = {1.0f, 1.0f};
+    float x[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    float y[] = {1.0f, 1.0f};
+
+    set_xerbla("CGEMV ", expected_info);
+
+    cblas_cgemv(corder, ctrans, m, n, alpha, a, lda, x, inc_x, beta, y, inc_y);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 2.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, colmajor_trans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'T';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 2.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, colmajor_trans_col_100_row_100_inc_x_2_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'T';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 2.0f};
+
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 2.0f, beta_i = 1.0f
+ */
+CTEST(cgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 1, inc y = 2
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 2.0f, beta_i = 1.0f
+ */
+CTEST(cgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and x conjugate
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 2.0f, beta_i = 1.0f
+ */
+CTEST(cgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and x conjugate
+ * Square matrix
+ * inc x = 2, inc y = 2
+ * alpha_r = 1.0f, alpha_i = 2.0f
+ * beta_r = 1.0f, beta_i = 1.0f
+ */
+CTEST(cgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_2_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'U';
+
+    float alpha[] = {1.0f, 2.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition, conjugate A, conjugate x
+ * Square matrix
+ * inc x = 2, inc y = 2
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, colmajor_conjtrans_x_conj_col_100_row_100_inc_x_1_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'D';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test 
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition, conjugate A, conjugate x
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, c_api_colmajor_trans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'T';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test 
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test 
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 1, inc y = 2
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test 
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 1.0f
+ */
+CTEST(cgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'N';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test 
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * No trans
+ * Square matrix
+ * inc x = 2, inc y = 2
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 3.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_2_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'N';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {3.0f, 1.0f};
+
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test 
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Conjugate
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 1.0f, alpha_i = 3.0f
+ * beta_r = 1.0f, beta_i = 2.5f
+ */
+CTEST(cgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'R';
+
+    float alpha[] = {1.0f, 3.0f};
+    float beta[] = {1.0f, 2.5f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test 
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Conjugate
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 1.5f
+ */
+CTEST(cgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_2_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'R';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.5f};
+
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param inc_y.
+ * Must be positive
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_inc_y)
+{
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 0;
+
+    int expected_info = 11;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_y.
+ * Must be positive
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_inc_y_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 0;
+
+    int expected_info = 11;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_y.
+ * Must be positive
+ * 
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_inc_y_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 0;
+
+    int expected_info = 11;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be positive
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_inc_x)
+{
+    char order = 'C';
+    char trans = 'T';
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 0;
+    blasint inc_y = 1;
+    
+    int expected_info = 8;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be positive
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_inc_x_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 0;
+    blasint inc_y = 1;
+    
+    int expected_info = 8;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be positive
+ * 
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_inc_x_row_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 0;
+    blasint inc_y = 1;
+    
+    int expected_info = 8;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n.
+ * Must be positive.
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_n)
+{
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = 1, n = INVALID;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n.
+ * Must be positive.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_n_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = INVALID;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+    
+    int expected_info = 3;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n.
+ * Must be positive.
+ * 
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_n_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = INVALID, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 3;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m.
+ * Must be positive.
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_m)
+{
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = INVALID, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+    
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m.
+ * Must be positive.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_m_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = INVALID, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 2;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m.
+ * Must be positive.
+ * 
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_m_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = 1, n = INVALID;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 2;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * lda must be at least n.
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_lda)
+{
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = 1, n = 1;
+    blasint lda = INVALID;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 6;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda.
+ * If matrices are stored using col major layout, 
+ * lda must be at least m.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_lda_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = INVALID;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 6;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda.
+ * If matrices are stored using col major layout, 
+ * lda must be at least n.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_lda_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = INVALID;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 6;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param trans.
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_trans)
+{
+    char order = 'C';
+    char trans = 'Z';
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param trans.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_trans_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = INVALID;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 1;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param trans.
+ * 
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_trans_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = INVALID;
+    
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 1;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order. 
+ */
+CTEST(cgemv, c_api_xerbla_invalid_order_col_major)
+{
+    enum CBLAS_ORDER corder = INVALID;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 0;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c
new file mode 100644
index 000000000..800f8a2d1
--- /dev/null
+++ b/utest/test_extensions/test_cimatcopy.c
@@ -0,0 +1,850 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_CIMATCOPY {
+    float a_test[DATASIZE * DATASIZE * 2];
+    float a_verify[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CIMATCOPY data_cimatcopy;
+
+/**
+ * Comapare results computed by cimatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * return norm of difference between openblas and reference func
+ */
+static float check_cimatcopy(char api, char order, char trans, blasint rows, blasint cols, float *alpha, 
+                             blasint lda_src, blasint lda_dst)
+{
+    blasint m, n;
+    blasint rows_out, cols_out;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+    int conj = -1;
+
+    if (order == 'C') {
+        n = rows; m = cols;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        rows_out = n; cols_out = m*2;
+        if (trans == 'C')
+            conj = 1;
+    }
+    else {
+        rows_out = m; cols_out = n*2;
+        if (trans == 'R')
+            conj = 1;
+    }
+
+    srand_generate(data_cimatcopy.a_test, lda_src*m*2);
+
+    if (trans == 'T' || trans == 'C') {
+        ctranspose(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj);
+    } 
+    else {
+        ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test, 
+                            &lda_src, &lda_dst);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_cimatcopy(corder, ctrans, rows, cols, alpha, data_cimatcopy.a_test, 
+                    lda_src, lda_dst);
+    }
+
+    // Find the differences between output matrix computed by cimatcopy and reference func
+    return smatrix_difference(data_cimatcopy.a_test, data_cimatcopy.a_verify, cols_out, rows_out, 2*lda_dst);    
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda_src, blasint lda_dst, int expected_info)
+{
+    float alpha[] = {1.0f, 1.0f};
+
+    set_xerbla("CIMATCOPY", expected_info);
+
+    BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test, 
+                        &lda_src, &lda_dst);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha_r = -3.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha[] = {-3.0f, 1.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'R';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, colmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'C';
+    float alpha[] = {2.0f, 1.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_notrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 50, lda_dst = 50;
+    char order = 'C';
+    char trans = 'N';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, colmajor_conjtrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'C';
+    float alpha[] = {1.0f, 1.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_conj_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 50, lda_dst = 50;
+    char order = 'C';
+    char trans = 'R';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 2.0, alpha_i = 3.0
+ */
+CTEST(cimatcopy, rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha[] = {2.0f, 3.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'R';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'C';
+    float alpha[] = {2.0f, 1.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, rowmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 50;
+    char order = 'R';
+    char trans = 'N'; 
+    float alpha[] = {2.0f, 1.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, rowmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(cimatcopy, rowmajor_conj_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 50;
+    char order = 'R';
+    char trans = 'R'; 
+    float alpha[] = {1.5f, -1.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, rowmajor_conjtrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'R';
+    char trans = 'C';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha[] = {3.0f, 2.0f};
+
+    float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 1.5
+ */
+CTEST(cimatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha[] = {3.0f, 1.5f};
+
+    float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha[] = {3.0f, 1.0f};
+
+    float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, c_api_colmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'R';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, c_api_colmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'C';
+    float alpha[] = {2.0f, 1.0f};
+
+    float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha[] = {1.0f, 1.0f};
+
+    float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(cimatcopy, c_api_rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'R'; 
+    float alpha[] = {1.5f, -1.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, c_api_rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'C';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_cimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(cimatcopy, xerbla_invalid_order)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'O';
+    char trans = 'T';
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(cimatcopy, xerbla_invalid_trans)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'O';
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param m.
+ * Must be positive.
+ */
+CTEST(cimatcopy, xerbla_invalid_rows)
+{
+    blasint m = 0, n = 100;
+    blasint lda_src = 0, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param n.
+ * Must be positive.
+ */
+CTEST(cimatcopy, xerbla_invalid_cols)
+{
+    blasint m = 100, n = 0;
+    blasint lda_src = 100, lda_dst = 0;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 4;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using row major layout, 
+ * lda_src must be at least n.
+ */
+CTEST(cimatcopy, xerbla_rowmajor_invalid_lda)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using column major layout,
+ * lda_src must be at least m.
+ */
+CTEST(cimatcopy, xerbla_colmajor_invalid_lda)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, lda_dst must be at least n.
+ */
+CTEST(cimatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is transposition, lda_dst must be at least m.
+ */
+CTEST(cimatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, lda_dst must be at least m.
+ */
+CTEST(cimatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is transposition, lda_dst must be at least n.
+ */
+CTEST(cimatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c
new file mode 100644
index 000000000..8a3d5ee7b
--- /dev/null
+++ b/utest/test_extensions/test_comatcopy.c
@@ -0,0 +1,728 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_COMATCOPY {
+    float a_test[DATASIZE * DATASIZE * 2];
+    float b_test[DATASIZE * DATASIZE * 2];
+    float b_verify[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_COMATCOPY data_comatcopy;
+
+/**
+ * Comapare results computed by comatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param alpha - scaling factor for matrix B
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * return norm of difference between openblas and reference func
+ */
+static float check_comatcopy(char api, char order, char trans, blasint rows, blasint cols, float* alpha, 
+                             blasint lda, blasint ldb)
+{
+    blasint b_rows, b_cols;
+    blasint m, n;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+    int conj = -1;
+
+    if (order == 'C') {
+        m = cols; n = rows;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        b_rows = n; b_cols = m*2;
+        if (trans == 'C')
+            conj = 1;
+    }
+    else {
+        b_rows = m; b_cols = n*2;
+        if (trans == 'R')
+            conj = 1;
+    }
+
+    srand_generate(data_comatcopy.a_test, lda*m*2);
+
+    if (trans == 'T' || trans == 'C') {
+        ctranspose(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj);
+    } 
+    else {
+        ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, 
+                            &lda, data_comatcopy.b_test, &ldb);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_comatcopy(corder, ctrans, rows, cols, alpha, data_comatcopy.a_test, 
+                    lda, data_comatcopy.b_test, ldb);
+    }
+    
+    return smatrix_difference(data_comatcopy.b_test, data_comatcopy.b_verify, b_cols, b_rows, ldb*2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda, blasint ldb, int expected_info)
+{
+    float alpha[] = {1.0f, 1.0f};
+
+    set_xerbla("COMATCOPY", expected_info);
+
+    BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, 
+                        &lda, data_comatcopy.b_test, &ldb);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, colmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * alpha_r = -1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    float alpha[] = {-1.0f, 2.0f};
+
+    float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, colmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'R';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(comatcopy, colmajor_conjtrnas_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'C';
+    float alpha[] = {2.0f, 1.0f};
+
+    float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, rowmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 50;
+    char order = 'R';
+    char trans = 'N'; 
+    float alpha[] = {1.5f, -1.0f};
+
+    float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, rowmajor_trans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'T';
+    float alpha[] = {1.5f, -1.0f};
+
+    float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'R'; 
+    float alpha[] = {1.5f, -1.0f};
+
+    float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'C';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_comatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, c_api_colmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * alpha_r = -1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, c_api_colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    float alpha[] = {-1.0f, 2.0f};
+
+    float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, c_api_colmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'R';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(comatcopy, c_api_colmajor_conjtrnas_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'C';
+    float alpha[] = {2.0f, 1.0f};
+
+    float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, c_api_rowmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 50;
+    char order = 'R';
+    char trans = 'N'; 
+    float alpha[] = {1.5f, -1.0f};
+
+    float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, c_api_rowmajor_trans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'T';
+    float alpha[] = {1.5f, -1.0f};
+
+    float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, c_api_rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'R'; 
+    float alpha[] = {1.5f, -1.0f};
+
+    float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, c_api_rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'C';
+    float alpha[] = {1.0f, 2.0f};
+
+    float norm = check_comatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(comatcopy, xerbla_invalid_order)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'O';
+    char trans = 'T';
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(comatcopy, xerbla_invalid_trans)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'O';
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param m.
+ * Must be positive.
+ */
+CTEST(comatcopy, xerbla_invalid_rows)
+{
+    blasint m = 0, n = 100;
+    blasint lda = 0, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param n.
+ * Must be positive.
+ */
+CTEST(comatcopy, xerbla_invalid_cols)
+{
+    blasint m = 100, n = 0;
+    blasint lda = 100, ldb = 0;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 4;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using row major layout, 
+ * lda must be at least n.
+ */
+CTEST(comatcopy, xerbla_rowmajor_invalid_lda)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 50, ldb = 100;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using column major layout,
+ * lda must be at least m.
+ */
+CTEST(comatcopy, xerbla_colmajor_invalid_lda)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, ldb must be at least n.
+ */
+CTEST(comatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and 
+ * there is transposition, ldb must be at least m.
+ */
+CTEST(comatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, ldb must be at least n.
+ */
+CTEST(comatcopy, xerbla_rowmajor_conj_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'R';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and 
+ * there is transposition, ldb must be at least m.
+ */
+CTEST(comatcopy, xerbla_rowmajor_transconj_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'C';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, ldb must be at least m.
+ */
+CTEST(comatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and 
+ * there is transposition, ldb must be at least n.
+ */
+CTEST(comatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, ldb must be at least m.
+ */
+CTEST(comatcopy, xerbla_colmajor_conj_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'R';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and 
+ * there is transposition, ldb must be at least n.
+ */
+CTEST(comatcopy, xerbla_colmajor_transconj_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'C';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_crot.c b/utest/test_extensions/test_crot.c
new file mode 100644
index 000000000..1c55216d9
--- /dev/null
+++ b/utest/test_extensions/test_crot.c
@@ -0,0 +1,792 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CROT {
+    float x_test[DATASIZE * INCREMENT * 2];
+    float y_test[DATASIZE * INCREMENT * 2];
+    float x_verify[DATASIZE * INCREMENT * 2];
+    float y_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CROT data_crot;
+
+/**
+ * Fortran API specific function
+ * Comapare results computed by csrot and caxpby 
+ * 
+ * param n specifies size of vector x
+ * param inc_x specifies increment of vector x
+ * param inc_y specifies increment of vector y
+ * param c specifies cosine
+ * param s specifies sine
+ * return norm of differences 
+ */
+static float check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, float *s)
+{
+    blasint i;
+    float norm = 0;
+    float s_neg[] = {-s[0], s[1]};
+
+    blasint inc_x_abs = labs(inc_x);
+    blasint inc_y_abs = labs(inc_y);
+
+    // Fill vectors x, y
+    srand_generate(data_crot.x_test, n * inc_x_abs * 2);
+    srand_generate(data_crot.y_test, n * inc_y_abs * 2);
+
+    if (inc_x == 0 && inc_y == 0) {
+        srand_generate(data_crot.x_test, n * 2);
+        srand_generate(data_crot.y_test, n * 2);
+    }
+
+    // Copy vector x for caxpby
+    for (i = 0; i < n * inc_x_abs * 2; i++)
+        data_crot.x_verify[i] = data_crot.x_test[i];
+
+    // Copy vector y for caxpby
+    for (i = 0; i < n * inc_y_abs * 2; i++)
+        data_crot.y_verify[i] = data_crot.y_test[i];
+    
+    // Find cx = c*x + s*y
+    BLASFUNC(caxpby)(&n, s, data_crot.y_test, &inc_y, c, data_crot.x_verify, &inc_x);
+
+    // Find cy = -conjg(s)*x + c*y
+    BLASFUNC(caxpby)(&n, s_neg, data_crot.x_test, &inc_x, c, data_crot.y_verify, &inc_y);
+
+    BLASFUNC(csrot)(&n, data_crot.x_test, &inc_x, data_crot.y_test, &inc_y, c, s);
+
+    // Find the differences between vector x caculated by caxpby and csrot
+    for (i = 0; i < n * 2 * inc_x_abs; i++)
+        data_crot.x_test[i] -= data_crot.x_verify[i];
+
+    // Find the differences between vector y caculated by caxpby and csrot
+    for (i = 0; i < n * 2 * inc_y_abs; i++)
+        data_crot.y_test[i] -= data_crot.y_verify[i];
+
+    // Find the norm of differences
+    norm += BLASFUNC(scnrm2)(&n, data_crot.x_test, &inc_x_abs);
+    norm += BLASFUNC(scnrm2)(&n, data_crot.y_test, &inc_y_abs);
+    return (norm / 2);
+}
+
+/**
+ * C API specific function
+ * Comapare results computed by csrot and caxpby 
+ * 
+ * param n specifies size of vector x
+ * param inc_x specifies increment of vector x
+ * param inc_y specifies increment of vector y
+ * param c specifies cosine
+ * param s specifies sine
+ * return norm of differences 
+ */
+static float c_api_check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, float *s)
+{
+    blasint i;
+    float norm = 0;
+    float s_neg[] = {-s[0], s[1]};
+
+    blasint inc_x_abs = labs(inc_x);
+    blasint inc_y_abs = labs(inc_y);
+
+    // Fill vectors x, y
+    srand_generate(data_crot.x_test, n * inc_x_abs * 2);
+    srand_generate(data_crot.y_test, n * inc_y_abs * 2);
+
+    if (inc_x == 0 && inc_y == 0) {
+        srand_generate(data_crot.x_test, n * 2);
+        srand_generate(data_crot.y_test, n * 2);
+    }
+
+    // Copy vector x for caxpby
+    for (i = 0; i < n * inc_x_abs * 2; i++)
+        data_crot.x_verify[i] = data_crot.x_test[i];
+
+    // Copy vector y for caxpby
+    for (i = 0; i < n * inc_y_abs * 2; i++)
+        data_crot.y_verify[i] = data_crot.y_test[i];
+    
+    // Find cx = c*x + s*y
+    cblas_caxpby(n, s, data_crot.y_test, inc_y, c, data_crot.x_verify, inc_x);
+
+    // Find cy = -conjg(s)*x + c*y
+    cblas_caxpby(n, s_neg, data_crot.x_test, inc_x, c, data_crot.y_verify, inc_y);
+
+    cblas_csrot(n, data_crot.x_test, inc_x, data_crot.y_test, inc_y, c[0], s[0]);
+
+    // Find the differences between vector x caculated by caxpby and csrot
+    for (i = 0; i < n * 2 * inc_x_abs; i++)
+        data_crot.x_test[i] -= data_crot.x_verify[i];
+
+    // Find the differences between vector y caculated by caxpby and csrot
+    for (i = 0; i < n * 2 * inc_y_abs; i++)
+        data_crot.y_test[i] -= data_crot.y_verify[i];
+
+    // Find the norm of differences
+    norm += cblas_scnrm2(n, data_crot.x_test, inc_x_abs);
+    norm += cblas_scnrm2(n, data_crot.y_test, inc_y_abs);
+    return (norm / 2);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 0
+ * Stride of vector y is 0
+ * c = 1.0f
+ * s = 2.0f
+ */
+CTEST(crot, inc_x_0_inc_y_0)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 0;
+    blasint inc_y = 0;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_1_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is -1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_neg_1_inc_y_neg_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -1;
+    blasint inc_y = -1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * c = 3.0f
+ * s = 2.0f
+ */
+CTEST(crot, inc_x_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {3.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_neg_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_1_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is -2
+ * c = 2.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_1_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = -2;
+
+    // Imaginary  part for caxpby
+    float c[] = {2.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 2.0f
+ */
+CTEST(crot, inc_x_2_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_neg_2_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = -2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 0.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_2_inc_y_2_c_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {0.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 0.0f
+ */
+CTEST(crot, inc_x_2_inc_y_2_s_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {0.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 0
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, check_n_zero)
+{
+    blasint n = 0;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test 
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 0
+ * Stride of vector y is 0
+ * c = 1.0f
+ * s = 2.0f
+ */
+CTEST(crot, c_api_inc_x_0_inc_y_0)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 0;
+    blasint inc_y = 0;
+
+    // Imaginary  part for caxpby
+    float c[] = {3.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_1_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is -1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_neg_1_inc_y_neg_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -1;
+    blasint inc_y = -1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * c = 3.0f
+ * s = 2.0f
+ */
+CTEST(crot, c_api_inc_x_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {3.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_neg_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_1_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is -2
+ * c = 2.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_1_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = -2;
+
+    // Imaginary  part for caxpby
+    float c[] = {2.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 2.0f
+ */
+CTEST(crot, c_api_inc_x_2_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_neg_2_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = -2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 0.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_2_inc_y_2_c_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {0.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 0.0f
+ */
+CTEST(crot, c_api_inc_x_2_inc_y_2_s_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {0.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 0
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_check_n_zero)
+{
+    blasint n = 0;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_crotg.c b/utest/test_extensions/test_crotg.c
new file mode 100644
index 000000000..9db7dc7d3
--- /dev/null
+++ b/utest/test_extensions/test_crotg.c
@@ -0,0 +1,290 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, zero_a)
+{
+    float sa[2] = {0.0f, 0.0f};
+    float sb[2] = {1.0f, 1.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, zero_b)
+{
+    float sa[2] = {1.0f, 1.0f};
+    float sb[2] = {0.0f, 0.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(1.0f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, zero_real)
+{
+    float sa[2] = {0.0f, 1.0f};
+    float sb[2] = {0.0f, 1.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.7071f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.7071f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, positive_real_positive_img)
+{
+    float sa[2] = {3.0f, 4.0f};
+    float sb[2] = {4.0f, 6.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, negative_real_positive_img)
+{
+    float sa[2] = {-3.0f, 4.0f};
+    float sb[2] = {-4.0f, 6.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, positive_real_negative_img)
+{
+    float sa[2] = {3.0f, -4.0f};
+    float sb[2] = {4.0f, -6.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, negative_real_negative_img)
+{
+    float sa[2] = {-3.0f, -4.0f};
+    float sb[2] = {-4.0f, -6.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_zero_a)
+{
+    float sa[2] = {0.0f, 0.0f};
+    float sb[2] = {1.0f, 1.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_zero_b)
+{
+    float sa[2] = {1.0f, 1.0f};
+    float sb[2] = {0.0f, 0.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(1.0f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_zero_real)
+{
+    float sa[2] = {0.0f, 1.0f};
+    float sb[2] = {0.0f, 1.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.7071f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.7071f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_positive_real_positive_img)
+{
+    float sa[2] = {3.0f, 4.0f};
+    float sb[2] = {4.0f, 6.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_negative_real_positive_img)
+{
+    float sa[2] = {-3.0f, 4.0f};
+    float sb[2] = {-4.0f, 6.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_positive_real_negative_img)
+{
+    float sa[2] = {3.0f, -4.0f};
+    float sb[2] = {4.0f, -6.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_negative_real_negative_img)
+{
+    float sa[2] = {-3.0f, -4.0f};
+    float sb[2] = {-4.0f, -6.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_csbmv.c b/utest/test_extensions/test_csbmv.c
new file mode 100644
index 000000000..8e8ce4530
--- /dev/null
+++ b/utest/test_extensions/test_csbmv.c
@@ -0,0 +1,606 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CSBMV {
+    float sp_matrix[DATASIZE * (DATASIZE + 1)];
+    float sb_matrix[DATASIZE * DATASIZE * 2];
+    float b_test[DATASIZE * 2 * INCREMENT];
+    float c_test[DATASIZE * 2 * INCREMENT];
+    float c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+// SINGLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * FLT_EPSILON
+// SINGLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 1.19e-07 = 5*e-03
+#define SINGLE_EPS_ZGEMV 5e-03
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CSBMV data_csbmv;
+
+/** 
+ * Transform full-storage symmetric band matrix A to upper (U) or lower (L)
+ * band-packed storage mode.
+ * 
+ * param uplo specifies whether matrix a is upper or lower band-packed.
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * output param a - buffer for holding symmetric band-packed matrix
+ * param lda - specifies the leading dimension of a
+ * param sb_matrix - buffer holding full-storage symmetric band matrix A 
+ * param ldm - specifies the leading dimension of A
+ */
+static void transform_to_band_storage(char uplo, blasint n, blasint k, float* a, blasint lda,
+                                     float* sb_matrix, blasint ldm) 
+{
+    blasint i, j, m;
+    if (uplo == 'L') {
+        for (j = 0; j < n; j++)
+        {
+            m = -j;
+            for (i = 2 * j; i < MIN(2 * n, 2 * (j + k + 1)); i += 2)
+            {
+                a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2];
+                a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2];
+            }
+        }
+    }
+    else {
+        for (j = 0; j < n; j++)
+        {   
+            m = k - j;
+            for (i = MAX(0, 2*(j - k)); i <= j*2; i += 2)
+            {
+                a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2];
+                a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2];
+            }
+        }
+    }
+}
+
+/** 
+ * Generate full-storage symmetric band matrix A with k - super-diagonals
+ * from input symmetric packed matrix in lower packed mode (L)
+ * 
+ * output param sb_matrix - buffer for holding full-storage symmetric band matrix.
+ * param sp_matrix - buffer holding input symmetric packed matrix
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ */
+static void get_symmetric_band_matr(float *sb_matrix, float *sp_matrix, blasint n, blasint k)
+{
+    blasint m;
+    blasint i, j;
+    m = 0;
+    for (i = 0; i < n; i++)
+    {
+        for (j = 0; j < n * 2; j += 2)
+        {
+            // Make matrix band with k super-diagonals
+            if (fabs((i+1) - ceil((j+1)/2.0f)) > k) 
+            {
+                sb_matrix[i * n * 2 + j] = 0.0f;
+                sb_matrix[i * n * 2 + j + 1] = 0.0f;
+                continue;
+            }
+
+            if (j / 2 < i)
+            {
+                sb_matrix[i * n * 2 + j] = 
+                        sb_matrix[j * n + i * 2];
+                sb_matrix[i * n * 2 + j + 1] = 
+                        sb_matrix[j * n + i * 2 + 1];
+            }
+            else
+            {
+                sb_matrix[i * n * 2 + j] = sp_matrix[m++];
+                sb_matrix[i * n * 2 + j + 1] = sp_matrix[m++];
+            }
+        }
+    }
+}
+
+/** 
+ * Check if error function was called with expected function name
+ * and param info
+ * 
+ * param uplo specifies whether matrix a is upper or lower band-packed.
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b_test
+ * param inc_c - stride of vector c_test
+ * param expected_info - expected invalid parameter number in csbmv
+ * return TRUE if everything is ok, otherwise FALSE 
+ */
+static int check_badargs(char uplo, blasint n, blasint k, blasint lda, blasint inc_b,
+                          blasint inc_c, int expected_info)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float a[2];
+    srand_generate(a, 2);
+
+    set_xerbla("CSBMV ", expected_info);
+
+    BLASFUNC(csbmv)(&uplo, &n, &k, alpha, a, &lda, data_csbmv.b_test, 
+                    &inc_b, beta, data_csbmv.c_test, &inc_c);
+
+    return check_error();
+}
+
+/**
+ * Comapare results computed by csbmv and cgemv 
+ * since csbmv is cgemv for symmetric band matrix
+ * 
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b_test
+ * param beta - scaling factor for vector c_test
+ * param inc_c - stride of vector c_test
+ * param lda - specifies the leading dimension of a
+ * return norm of differences 
+ */
+static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint lda, 
+    blasint inc_b, float *beta, blasint inc_c, blasint ldm)
+{
+    blasint i;
+
+    // Trans param for gemv (can use any, since the input matrix is symmetric)
+    char trans = 'N';
+
+    // Symmetric band packed matrix for sbmv
+    float a[lda * n * 2];
+
+    // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test 
+    srand_generate(data_csbmv.sp_matrix, n * (n + 1));
+    srand_generate(data_csbmv.b_test, n * inc_b * 2);
+    srand_generate(data_csbmv.c_test, n * inc_c * 2);
+
+    // Copy vector c_test for cgemv
+    for (i = 0; i < n * inc_c * 2; i++)
+        data_csbmv.c_verify[i] = data_csbmv.c_test[i];
+
+    // Generate full-storage symmetric band matrix
+    // with k super-diagonals from symmetric packed matrix
+    get_symmetric_band_matr(data_csbmv.sb_matrix, data_csbmv.sp_matrix, n, k);
+
+    // Transform symmetric band matrix from conventional
+    // full matrix storage  to band storage for csbmv
+    transform_to_band_storage(uplo, n, k, a, lda, data_csbmv.sb_matrix, ldm);
+
+    BLASFUNC(cgemv)(&trans, &n, &n, alpha, data_csbmv.sb_matrix, &ldm, data_csbmv.b_test,
+                    &inc_b, beta, data_csbmv.c_verify, &inc_c);
+
+    BLASFUNC(csbmv)(&uplo, &n, &k, alpha, a, &lda,
+                    data_csbmv.b_test, &inc_b, beta, data_csbmv.c_test, &inc_c);
+
+    // Find the differences between output vector caculated by csbmv and cgemv
+    for (i = 0; i < n * inc_c * 2; i++)
+        data_csbmv.c_test[i] -= data_csbmv.c_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 0 
+ */
+CTEST(csbmv, upper_k_0_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 1
+ */
+CTEST(csbmv, upper_k_1_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 1;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, upper_k_2_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, upper_k_2_inc_b_2_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 2
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, upper_k_2_inc_b_2_inc_c_2_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 2;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 0
+ */
+CTEST(csbmv, lower_k_0_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 1
+ */
+CTEST(csbmv, lower_k_1_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 1;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, lower_k_2_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, lower_k_2_inc_b_2_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 2
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, lower_k_2_inc_b_2_inc_c_2_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 2;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/** 
+ * Check if output matrix a contains any NaNs
+ */
+CTEST(csbmv, check_for_NaN)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+
+    ASSERT_TRUE(norm == norm); /* NaN == NaN is false  */
+}
+
+/**
+ * Test error function for an invalid param uplo.
+ * Uplo specifies whether a is in upper (U) or lower (L) band-packed storage mode.
+ */
+CTEST(csbmv, xerbla_uplo_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'O';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 1;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/** 
+ * Test error function for an invalid param N -
+ * number of rows and columns of A. Must be at least zero.
+ */
+CTEST(csbmv, xerbla_n_invalid)
+{
+    blasint n = INVALID, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 2;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Check if n - number of rows and columns of A equal zero.
+ */
+CTEST(csbmv, check_n_zero)
+{
+    blasint n = 0, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = 1;
+    char uplo = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test error function for an invalid param inc_b -
+ * stride of vector b_test. Can't be zero. 
+ */
+CTEST(csbmv, xerbla_inc_b_zero)
+{
+    blasint n = 1, inc_b = 0, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 8;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_c -
+ * stride of vector c_test. Can't be zero. 
+ */
+CTEST(csbmv, xerbla_inc_c_zero)
+{
+    blasint n = 1, inc_b = 1, inc_c = 0;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 11;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param k -
+ * number of super-diagonals of A. Must be at least zero.
+ */
+CTEST(csbmv, xerbla_k_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = INVALID;
+    blasint lda = 1;
+    int expected_info = 3;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of a. Must be at least (k+1).
+ */
+CTEST(csbmv, xerbla_lda_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = INVALID;
+    int expected_info = 6;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cscal.c b/utest/test_extensions/test_cscal.c
new file mode 100644
index 000000000..009c600ad
--- /dev/null
+++ b/utest/test_extensions/test_cscal.c
@@ -0,0 +1,164 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CSCAL {
+    float x_test[DATASIZE * 2 * INCREMENT];
+    float x_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CSCAL data_cscal;
+
+/**
+ * cscal reference code
+ *
+ * param n - number of elements of vector x
+ * param alpha - scaling factor for the vector product
+ * param x - buffer holding input vector x
+ * param inc - stride of vector x
+ */
+static void cscal_trusted(blasint n, float *alpha, float* x, blasint inc){
+    blasint i, ip = 0;
+    blasint inc_x2 = 2 * inc;
+    float temp;
+    for (i = 0; i < n; i++)
+	{
+        temp = alpha[0] * x[ip] - alpha[1] * x[ip+1];
+		x[ip+1] = alpha[0] * x[ip+1] + alpha[1] * x[ip];
+        x[ip] = temp;
+        ip += inc_x2;
+    }
+}
+
+/**
+ * Comapare results computed by cscal and cscal_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param n - number of elements of vector x
+ * param alpha - scaling factor for the vector product
+ * param inc - stride of vector x
+ * return norm of differences
+ */
+static float check_cscal(char api, blasint n, float *alpha, blasint inc)
+{
+    blasint i;
+
+    // Fill vectors a 
+    srand_generate(data_cscal.x_test, n * inc * 2);
+
+    // Copy vector x for cscal_trusted
+    for (i = 0; i < n * 2 * inc; i++)
+        data_cscal.x_verify[i] = data_cscal.x_test[i];
+
+    cscal_trusted(n, alpha, data_cscal.x_verify, inc);
+
+    if(api == 'F')
+        BLASFUNC(cscal)(&n, alpha, data_cscal.x_test, &inc);
+    else
+        cblas_cscal(n, alpha, data_cscal.x_test, inc);
+
+    // Find the differences between output vector computed by cscal and cscal_trusted
+    for (i = 0; i < n * 2 * inc; i++)
+        data_cscal.x_verify[i] -= data_cscal.x_test[i];
+
+    // Find the norm of differences
+    return BLASFUNC(scnrm2)(&n, data_cscal.x_verify, &inc);
+}
+
+/**
+ * Fortran API specific test
+ * Test cscal by comparing it against reference
+ */
+CTEST(cscal, alpha_r_zero_alpha_i_not_zero)
+{
+    blasint N = DATASIZE;
+    blasint inc = 1;
+    float alpha[2] = {0.0f, 1.0f};
+
+    float norm = check_cscal('F', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cscal by comparing it against reference
+ */
+CTEST(cscal, alpha_r_zero_alpha_i_zero_inc_2)
+{
+    blasint N = DATASIZE;
+    blasint inc = 2;
+    float alpha[2] = {0.0f, 0.0f};
+
+    float norm = check_cscal('F', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cscal by comparing it against reference
+ */
+CTEST(cscal, c_api_alpha_r_zero_alpha_i_not_zero)
+{
+    blasint N = DATASIZE;
+    blasint inc = 1;
+    float alpha[2] = {0.0f, 1.0f};
+
+    float norm = check_cscal('C', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cscal by comparing it against reference
+ */
+CTEST(cscal, c_api_alpha_r_zero_alpha_i_zero_inc_2)
+{
+    blasint N = DATASIZE;
+    blasint inc = 2;
+    float alpha[2] = {0.0f, 0.0f};
+
+    float norm = check_cscal('C', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cspmv.c b/utest/test_extensions/test_cspmv.c
new file mode 100644
index 000000000..b64c90e3a
--- /dev/null
+++ b/utest/test_extensions/test_cspmv.c
@@ -0,0 +1,428 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CSPMV {
+    float a_verify[DATASIZE * DATASIZE * 2];
+    float a_test[DATASIZE * (DATASIZE + 1)];
+    float b_test[DATASIZE * 2 * INCREMENT];
+    float c_test[DATASIZE * 2 * INCREMENT];
+    float c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CSPMV data_cspmv;
+
+/**
+ * Compute spmv via gemv since spmv is gemv for symmetric packed matrix
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param a - buffer holding input matrix A
+ * param b - Buffer holding input vector b
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param c - buffer holding input/output vector c
+ * param inc_c - stride of vector c
+ * output param data_cspmv.c_verify - matrix computed by gemv
+ */
+static void cspmv_trusted(char uplo, blasint n, float *alpha, float *a,
+                          float *b, blasint inc_b, float *beta, float *c,
+                          blasint inc_c)
+{
+    blasint k;
+    blasint i, j;
+
+    // param for gemv (can use any, since the input matrix is symmetric)
+    char trans = 'N';
+
+    // Unpack the input symmetric packed matrix
+    if (uplo == 'L')
+    {
+        k = 0;
+        for (i = 0; i < n; i++)
+        {
+            for (j = 0; j < n * 2; j += 2)
+            {
+                if (j / 2 < i)
+                {
+                    data_cspmv.a_verify[i * n * 2 + j] =
+                        data_cspmv.a_verify[j * n + i * 2];
+                    data_cspmv.a_verify[i * n * 2 + j + 1] =
+                        data_cspmv.a_verify[j * n + i * 2 + 1];
+                }
+                else
+                {
+                    data_cspmv.a_verify[i * n * 2 + j] = a[k++];
+                    data_cspmv.a_verify[i * n * 2 + j + 1] = a[k++];
+                }
+            }
+        }
+    }
+    else
+    {
+        k = n * (n + 1) - 1;
+        for (j = 2 * n - 1; j >= 0; j -= 2)
+        {
+            for (i = n - 1; i >= 0; i--)
+            {
+                if (j / 2 < i)
+                {
+                    data_cspmv.a_verify[i * n * 2 + j] =
+                        data_cspmv.a_verify[(j - 1) * n + i * 2 + 1];
+                    data_cspmv.a_verify[i * n * 2 + j - 1] =
+                        data_cspmv.a_verify[(j - 1) * n + i * 2];
+                }
+                else
+                {
+                    data_cspmv.a_verify[i * n * 2 + j] = a[k--];
+                    data_cspmv.a_verify[i * n * 2 + j - 1] = a[k--];
+                }
+            }
+        }
+    }
+
+    // Run gemv with the unpacked matrix
+    BLASFUNC(cgemv)(&trans, &n, &n, alpha, data_cspmv.a_verify, &n, b,
+                    &inc_b, beta, data_cspmv.c_verify, &inc_c);
+}
+
+/**
+ * Comapare results computed by cspmv and cspmv_trusted
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences
+ */
+static float check_cspmv(char uplo, blasint n, float *alpha, blasint inc_b,
+                         float *beta, blasint inc_c)
+{
+    blasint i;
+
+    // Fill symmetric packed maxtix a, vectors b and c
+    srand_generate(data_cspmv.a_test, n * (n + 1));
+    srand_generate(data_cspmv.b_test, 2 * n * inc_b);
+    srand_generate(data_cspmv.c_test, 2 * n * inc_c);
+
+    // Copy vector c for cspmv_trusted
+    for (i = 0; i < n * 2 * inc_c; i++)
+        data_cspmv.c_verify[i] = data_cspmv.c_test[i];
+
+    cspmv_trusted(uplo, n, alpha, data_cspmv.a_test, data_cspmv.b_test,
+                  inc_b, beta, data_cspmv.c_verify, inc_c);
+
+    BLASFUNC(cspmv)(&uplo, &n, alpha, data_cspmv.a_test, data_cspmv.b_test,
+                    &inc_b, beta, data_cspmv.c_test, &inc_c);
+
+    // Find the differences between output vector computed by cspmv and cspmv_trusted
+    for (i = 0; i < n * 2 * inc_c; i++)
+        data_cspmv.c_test[i] -= data_cspmv.c_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(scnrm2)(&n, data_cspmv.c_test, &inc_c);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param inc_b - stride of vector b
+ * param inc_c - stride of vector c
+ * param expected_info - expected invalid parameter number in cspmv
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char uplo, blasint n, blasint inc_b,
+                         blasint inc_c, int expected_info)
+{
+    float alpha[] = {1.0, 1.0};
+    float beta[] = {0.0, 0.0};
+
+    set_xerbla("CSPMV ", expected_info);
+
+    BLASFUNC(cspmv)(&uplo, &n, alpha, data_cspmv.a_test, data_cspmv.b_test,
+                    &inc_b, beta, data_cspmv.c_test, &inc_c);
+
+    return check_error();
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cspmv, upper_inc_b_1_inc_c_1_N_100)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 2
+ */
+CTEST(cspmv, upper_inc_b_1_inc_c_2_N_100)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 2;
+    char uplo = 'U';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 1
+ */
+CTEST(cspmv, upper_inc_b_2_inc_c_1_N_100)
+{
+    blasint N = DATASIZE, inc_b = 2, inc_c = 1;
+    char uplo = 'U';
+    float alpha[] = {1.0f, 0.0f};
+    float beta[] = {1.0f, 0.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(cspmv, upper_inc_b_2_inc_c_2_N_100)
+{
+    blasint N = DATASIZE, inc_b = 2, inc_c = 2;
+    char uplo = 'U';
+    float alpha[] = {2.5, -2.1};
+    float beta[] = {0.0f, 1.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cspmv, lower_inc_b_1_inc_c_1_N_100)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    char uplo = 'L';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 2
+ */
+CTEST(cspmv, lower_inc_b_1_inc_c_2_N_100)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 2;
+    char uplo = 'L';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 1
+ */
+CTEST(cspmv, lower_inc_b_2_inc_c_1_N_100)
+{
+    blasint N = DATASIZE, inc_b = 2, inc_c = 1;
+    char uplo = 'L';
+    float alpha[] = {1.0f, 0.0f};
+    float beta[] = {1.0f, 0.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(cspmv, lower_inc_b_2_inc_c_2_N_100)
+{
+    blasint N = DATASIZE, inc_b = 2, inc_c = 2;
+    char uplo = 'L';
+    float alpha[] = {2.5, -2.1};
+    float beta[] = {0.0f, 1.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Check if output matrix A contains any NaNs
+ */
+CTEST(cspmv, check_for_NaN)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_TRUE(norm == norm); /* NaN == NaN is false */
+}
+
+/**
+ * Test error function for an invalid param uplo.
+ * uplo specifies whether A is upper or lower triangular.
+ */
+CTEST(cspmv, xerbla_uplo_invalid)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    char uplo = 'O';
+    int expected_info = 1;
+
+    int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param N -
+ * number of rows and columns of A. Must be at least zero.
+ */
+CTEST(cspmv, xerbla_N_invalid)
+{
+    blasint N = INVALID, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    int expected_info = 2;
+
+    int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_b -
+ * stride of vector b. Can't be zero.
+ */
+CTEST(cspmv, xerbla_inc_b_zero)
+{
+    blasint N = DATASIZE, inc_b = 0, inc_c = 1;
+    char uplo = 'U';
+    int expected_info = 6;
+
+    int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_c -
+ * stride of vector c. Can't be zero.
+ */
+CTEST(cspmv, xerbla_inc_c_zero)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 0;
+    char uplo = 'U';
+    int expected_info = 9;
+
+    int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_ctrmv.c b/utest/test_extensions/test_ctrmv.c
new file mode 100644
index 000000000..2a3f27416
--- /dev/null
+++ b/utest/test_extensions/test_ctrmv.c
@@ -0,0 +1,266 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 300
+#define INCREMENT 2
+
+struct DATA_CTRMV {
+	float a_test[DATASIZE * DATASIZE * 2];
+	float a_verify[DATASIZE * DATASIZE * 2];
+	float x_test[DATASIZE * INCREMENT * 2];
+	float x_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CTRMV data_ctrmv;
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A
+ * and comparing it with the non-conjugate ctrmv.
+ *
+ * param uplo specifies whether A is upper or lower triangular
+ * param trans specifies op(A), the transposition (conjugation) operation applied to A
+ * param diag specifies whether the matrix A is unit triangular or not.
+ * param n - numbers of rows and columns of A
+ * param lda - leading dimension of matrix A
+ * param incx - increment for the elements of x
+ * return norm of difference
+ */
+static float check_ctrmv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx)
+{
+	blasint i;
+	float alpha_conj[] = {1.0f, 0.0f}; 
+	char trans_verify = trans;
+
+	srand_generate(data_ctrmv.a_test, n * lda * 2);
+	srand_generate(data_ctrmv.x_test, n * incx * 2);
+
+	for (i = 0; i < n * lda * 2; i++)
+		data_ctrmv.a_verify[i] = data_ctrmv.a_test[i];
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_ctrmv.x_verify[i] = data_ctrmv.x_test[i];
+
+	if (trans == 'R'){
+		cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ctrmv.a_verify, lda, lda);
+		trans_verify = 'N';
+	}
+
+	BLASFUNC(ctrmv)(&uplo, &trans_verify, &diag, &n, data_ctrmv.a_verify, &lda,
+	 				data_ctrmv.x_verify, &incx);
+
+	BLASFUNC(ctrmv)(&uplo, &trans, &diag, &n, data_ctrmv.a_test, &lda,
+	 				data_ctrmv.x_test, &incx);
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_ctrmv.x_verify[i] -= data_ctrmv.x_test[i];
+
+	return BLASFUNC(scnrm2)(&n, data_ctrmv.x_verify, &incx);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ctrmv, conj_notrans_upper_not_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'N';
+	char trans = 'R';
+
+	float norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ */
+CTEST(ctrmv, conj_notrans_upper_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'U';
+	char trans = 'R';
+
+	float norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ctrmv, conj_notrans_lower_not_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'N';
+	char trans = 'R';
+
+	float norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ */
+CTEST(ctrmv, conj_notrans_lower_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'U';
+	char trans = 'R';
+
+	float norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrmv, conj_notrans_upper_not_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'N';
+	char trans = 'R';
+
+	float norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrmv, conj_notrans_upper_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'U';
+	char trans = 'R';
+
+	float norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrmv, conj_notrans_lower_not_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'N';
+	char trans = 'R';
+
+	float norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrmv, conj_notrans_lower_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'U';
+	char trans = 'R';
+
+	float norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_ctrsv.c b/utest/test_extensions/test_ctrsv.c
new file mode 100644
index 000000000..0e639bb2a
--- /dev/null
+++ b/utest/test_extensions/test_ctrsv.c
@@ -0,0 +1,267 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 300
+#define INCREMENT 2
+
+struct DATA_CTRSV {
+	float a_test[DATASIZE * DATASIZE * 2];
+	float a_verify[DATASIZE * DATASIZE * 2];
+	float x_test[DATASIZE * INCREMENT * 2];
+	float x_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CTRSV data_ctrsv;
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A
+ * and comparing it with the non-conjugate ctrsv.
+ *
+ * param uplo specifies whether A is upper or lower triangular
+ * param trans specifies op(A), the transposition (conjugation) operation applied to A
+ * param diag specifies whether the matrix A is unit triangular or not.
+ * param n - numbers of rows and columns of A
+ * param lda - leading dimension of matrix A
+ * param incx - increment for the elements of x
+ * return norm of difference
+ */
+static float check_ctrsv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx)
+{
+	blasint i;
+	float alpha_conj[] = {1.0f, 0.0f}; 
+	char trans_verify = trans;
+
+	srand_generate(data_ctrsv.a_test, n * lda * 2);
+	srand_generate(data_ctrsv.x_test, n * incx * 2);
+
+	for (i = 0; i < n * lda * 2; i++)
+		data_ctrsv.a_verify[i] = data_ctrsv.a_test[i];
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_ctrsv.x_verify[i] = data_ctrsv.x_test[i];
+
+	if (trans == 'R'){
+		cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, 
+						alpha_conj, data_ctrsv.a_verify, lda, lda);
+		trans_verify = 'N';
+	}
+
+	BLASFUNC(ctrsv)(&uplo, &trans_verify, &diag, &n, data_ctrsv.a_verify, 
+					&lda, data_ctrsv.x_verify, &incx);
+
+	BLASFUNC(ctrsv)(&uplo, &trans, &diag, &n, data_ctrsv.a_test, &lda,
+	 				data_ctrsv.x_test, &incx);
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_ctrsv.x_verify[i] -= data_ctrsv.x_test[i];
+
+	return BLASFUNC(scnrm2)(&n, data_ctrsv.x_verify, &incx);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ctrsv, conj_notrans_upper_not_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'N';
+	char trans = 'R';
+
+	float norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ */
+CTEST(ctrsv, conj_notrans_upper_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'U';
+	char trans = 'R';
+
+	float norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ctrsv, conj_notrans_lower_not_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'N';
+	char trans = 'R';
+
+	float norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ */
+CTEST(ctrsv, conj_notrans_lower_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'U';
+	char trans = 'R';
+
+	float norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrsv, conj_notrans_upper_not_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'N';
+	char trans = 'R';
+
+	float norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrsv, conj_notrans_upper_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'U';
+	char trans = 'R';
+
+	float norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrsv, conj_notrans_lower_not_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'N';
+	char trans = 'R';
+
+	float norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrsv, conj_notrans_lower_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'U';
+	char trans = 'R';
+
+	float norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c
new file mode 100644
index 000000000..d492343ed
--- /dev/null
+++ b/utest/test_extensions/test_damin.c
@@ -0,0 +1,354 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_DOUBLE
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.1};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {-1.1};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {-1.1, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, 1.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {-1.1, 1.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0;
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 1.0;
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   double amin = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, amin, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_daxpby.c b/utest/test_extensions/test_daxpby.c
new file mode 100644
index 000000000..6e77c7c7c
--- /dev/null
+++ b/utest/test_extensions/test_daxpby.c
@@ -0,0 +1,799 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_DAXPBY{
+    double x_test[DATASIZE * INCREMENT];
+    double x_verify[DATASIZE * INCREMENT];
+    double y_test[DATASIZE * INCREMENT];
+    double y_verify[DATASIZE * INCREMENT];
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DAXPBY data_daxpby;
+
+/**
+ * Fortran API specific function
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static double check_daxpby(blasint n, double alpha, blasint incx, double beta, blasint incy)
+{
+    blasint i;
+
+    // dscal accept only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Fill vectors x, y
+    drand_generate(data_daxpby.x_test, n * incx_abs);
+    drand_generate(data_daxpby.y_test, n * incy_abs);
+
+    // Copy vector x for daxpy
+    for (i = 0; i < n * incx_abs; i++)
+        data_daxpby.x_verify[i] = data_daxpby.x_test[i];
+
+    // Copy vector y for dscal
+    for (i = 0; i < n * incy_abs; i++)
+        data_daxpby.y_verify[i] = data_daxpby.y_test[i];
+
+    // Find beta*y
+    BLASFUNC(dscal)(&n, &beta, data_daxpby.y_verify, &incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    BLASFUNC(daxpy)(&n, &alpha, data_daxpby.x_verify, &incx,
+                        data_daxpby.y_verify, &incy);
+    
+    BLASFUNC(daxpby)(&n, &alpha, data_daxpby.x_test, &incx,
+                        &beta, data_daxpby.y_test, &incy);
+
+    // Find the differences between output vector caculated by daxpby and daxpy
+    for (i = 0; i < n * incy_abs; i++)
+        data_daxpby.y_test[i] -= data_daxpby.y_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(dnrm2)(&n, data_daxpby.y_test, &incy_abs);
+}
+
+/**
+ * C API specific function
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static double c_api_check_daxpby(blasint n, double alpha, blasint incx, double beta, blasint incy)
+{
+    blasint i;
+
+    // dscal accept only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Copy vector x for daxpy
+    for (i = 0; i < n * incx_abs; i++)
+        data_daxpby.x_verify[i] = data_daxpby.x_test[i];
+
+    // Copy vector y for dscal
+    for (i = 0; i < n * incy_abs; i++)
+        data_daxpby.y_verify[i] = data_daxpby.y_test[i];
+
+    // Find beta*y
+    cblas_dscal(n, beta, data_daxpby.y_verify, incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    cblas_daxpy(n, alpha, data_daxpby.x_verify, incx,
+                        data_daxpby.y_verify, incy);
+    
+    cblas_daxpby(n, alpha, data_daxpby.x_test, incx,
+                        beta, data_daxpby.y_test, incy);
+
+    // Find the differences between output vector caculated by daxpby and daxpy
+    for (i = 0; i < n * incy_abs; i++)
+        data_daxpby.y_test[i] -= data_daxpby.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_dnrm2(n, data_daxpby.y_test, incy_abs);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(daxpby, inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(daxpby, inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double alpha = 3.0;
+    double beta = 4.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    double alpha = 5.0;
+    double beta = 4.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(daxpby, inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    double alpha = 1.0;
+    double beta = 6.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(daxpby, inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    double alpha = 7.0;
+    double beta = 3.5;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(daxpby, inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 0.0;
+    double beta = 1.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+*/
+CTEST(daxpby, inc_x_1_inc_y_2_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 0.0;
+    double beta = 1.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(daxpby, inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * Scalar beta is zero
+*/
+CTEST(daxpby, inc_x_2_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(daxpby, inc_x_1_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(daxpby, inc_x_2_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(daxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(daxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(daxpby, check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_daxpby(n, alpha, incx, beta, incy);
+    
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(daxpby, c_api_inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, c_api_inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double alpha = 3.0;
+    double beta = 4.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, c_api_inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    double alpha = 5.0;
+    double beta = 4.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(daxpby, c_api_inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    double alpha = 1.0;
+    double beta = 6.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(daxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    double alpha = 7.0;
+    double beta = 3.5;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 0.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+*/
+CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 0.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * 
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * Scalar beta is zero
+*/
+CTEST(daxpby, c_api_inc_x_2_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * 
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * 
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(daxpby, c_api_inc_x_2_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * 
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(daxpby, c_api_check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dgeadd.c b/utest/test_extensions/test_dgeadd.c
new file mode 100644
index 000000000..4654c51a3
--- /dev/null
+++ b/utest/test_extensions/test_dgeadd.c
@@ -0,0 +1,878 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+
+struct DATA_DGEADD{
+    double a_test[M * N];
+    double c_test[M * N];
+    double c_verify[M * N];
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DGEADD data_dgeadd;
+
+/**
+ * dgeadd reference implementation
+ *
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param aptr - refer to matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param cptr - refer to matrix C
+ * param ldc - leading dimension of C
+ */
+static void dgeadd_trusted(blasint m, blasint n, double alpha, double *aptr,
+                           blasint lda, double beta, double *cptr, blasint ldc)
+{
+    blasint i;
+
+    for (i = 0; i < n; i++)
+    {
+        cblas_daxpby(m, alpha, aptr, 1, beta, cptr, 1);
+        aptr += lda;
+        cptr += ldc;
+    }
+}
+
+/**
+ * Test dgeadd by comparing it against reference
+ * Compare with the following options:
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static double check_dgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                          blasint m, blasint n, double alpha, blasint lda,
+                          double beta, blasint ldc)
+{
+    blasint i;
+    blasint cols = m, rows = n;
+
+    if (order == CblasRowMajor)
+    {
+        rows = m;
+        cols = n;
+    }
+
+    // Fill matrix A, C
+    drand_generate(data_dgeadd.a_test, lda * rows);
+    drand_generate(data_dgeadd.c_test, ldc * rows);
+
+    // Copy matrix C for dgeadd
+    for (i = 0; i < ldc * rows; i++)
+        data_dgeadd.c_verify[i] = data_dgeadd.c_test[i];
+
+    dgeadd_trusted(cols, rows, alpha, data_dgeadd.a_test, lda,
+                   beta, data_dgeadd.c_verify, ldc);
+
+    if (api == 'F')
+        BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda,
+         &beta, data_dgeadd.c_test, &ldc);
+    else
+        cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda,
+                     beta, data_dgeadd.c_test, ldc);
+
+    // Find the differences between output matrix caculated by dgeadd and sgemm
+    return dmatrix_difference(data_dgeadd.c_test, data_dgeadd.c_verify, cols, rows, ldc);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param lda - leading dimension of A
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in dgeadd
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                         blasint m, blasint n, blasint lda,
+                         blasint ldc, int expected_info)
+{
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    set_xerbla("DGEADD ", expected_info);
+
+    if (api == 'F')
+        BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda,
+                         &beta, data_dgeadd.c_test, &ldc);
+    else 
+        cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda,
+                 beta, data_dgeadd.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(dgeadd, matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 3.0;
+    double beta = 3.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(dgeadd, matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 0.0;
+    double beta = 2.5;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(dgeadd, matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 3.0;
+    double beta = 0.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(dgeadd, matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(dgeadd, matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C
+ * Must be at least zero.
+ */
+CTEST(dgeadd, xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ */
+CTEST(dgeadd, xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ */
+CTEST(dgeadd, xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 6;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ */
+CTEST(dgeadd, xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - number of columns of A, C equal zero.
+ */
+CTEST(dgeadd, n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if m - number of rows of A and C equal zero.
+ */
+CTEST(dgeadd, m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 2.0;
+    double beta = 3.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 4.0;
+    double beta = 2.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(dgeadd, c_api_matrix_n_50_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N / 2;
+    blasint m = M;
+
+    blasint lda = n;
+    blasint ldc = n;
+
+    double alpha = 3.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 0.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 3.0;
+    double beta = 0.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 3.0;
+    double beta = 4.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order -
+ * specifies whether A and C stored in
+ * row-major order or column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_invalid_order)
+{
+    CBLAS_ORDER order = INVALID;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 0;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(dgeadd, c_api_xerbla_n_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(dgeadd, c_api_xerbla_m_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(dgeadd, c_api_xerbla_lda_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(dgeadd, c_api_xerbla_ldc_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Check if n - number of columns of A, C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if m - number of rows of A and C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dgemmt.c b/utest/test_extensions/test_dgemmt.c
new file mode 100644
index 000000000..22dcaf2aa
--- /dev/null
+++ b/utest/test_extensions/test_dgemmt.c
@@ -0,0 +1,1442 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_DGEMMT {
+    double a_test[DATASIZE * DATASIZE];
+    double b_test[DATASIZE * DATASIZE];
+    double c_test[DATASIZE * DATASIZE];
+    double c_verify[DATASIZE * DATASIZE];
+    double c_gemm[DATASIZE * DATASIZE];
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DGEMMT data_dgemmt;
+
+/**
+ * Compute gemmt via gemm since gemmt is gemm but updates only 
+ * the upper or lower triangular part of the result matrix
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ */
+static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                           char transb, blasint m, blasint k, double alpha, blasint lda, 
+                           blasint ldb, double beta, blasint ldc)
+{
+    blasint i, j;
+
+    if(api == 'F')
+        BLASFUNC(dgemm)(&transa, &transb, &m, &m, &k, &alpha, data_dgemmt.a_test, &lda,
+                        data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_gemm, &ldc);
+    else
+        cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda,
+                data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc);
+
+    if (uplo == 'L' || uplo == CblasLower)
+    {
+        for (i = 0; i < m; i++)
+            for (j = i; j < m; j++)
+                data_dgemmt.c_verify[i * ldc + j] =
+                    data_dgemmt.c_gemm[i * ldc + j];
+    } else {
+        for (i = 0; i < m; i++)
+            for (j = 0; j <= i; j++)
+                data_dgemmt.c_verify[i * ldc + j] =
+                    data_dgemmt.c_gemm[i * ldc + j];
+    }
+}
+
+/**
+ * Comapare results computed by dgemmt and dgemmt_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static double check_dgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                          char transb, blasint m, blasint k, double alpha, blasint lda, 
+                          blasint ldb, double beta, blasint ldc)
+{
+    blasint i;
+    blasint b_cols;
+    blasint a_cols;
+    blasint inc = 1;
+    blasint size_c = m * ldc;
+
+    if(order == CblasColMajor){
+        if (transa == 'T' || transa == 'C' || 
+            transa == CblasTrans || transa == CblasConjTrans) 
+            a_cols = m;
+        else a_cols = k;
+
+        if (transb == 'T' || transb == 'C' || 
+            transb == CblasTrans || transb == CblasConjTrans) 
+            b_cols = k;
+        else b_cols = m;
+    } else {
+        if (transa == 'T' || transa == 'C' || 
+            transa == CblasTrans || transa == CblasConjTrans) 
+            a_cols = k;
+        else a_cols = m;
+
+        if (transb == 'T' || transb == 'C' ||
+            transb == CblasTrans || transb == CblasConjTrans) 
+            b_cols = m;
+        else b_cols = k;
+    }
+
+    drand_generate(data_dgemmt.a_test, a_cols * lda);
+    drand_generate(data_dgemmt.b_test, b_cols * ldb);
+    drand_generate(data_dgemmt.c_test, m * ldc);
+
+    for (i = 0; i < m * ldc; i++)
+        data_dgemmt.c_gemm[i] = data_dgemmt.c_verify[i] = data_dgemmt.c_test[i];
+
+    dgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc);
+
+    if (api == 'F')
+        BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test,
+                         &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc);
+    else
+        cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda,
+                    data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc);
+
+    for (i = 0; i < m * ldc; i++)
+        data_dgemmt.c_verify[i] -= data_dgemmt.c_test[i];
+
+    return BLASFUNC(dnrm2)(&size_c, data_dgemmt.c_verify, &inc) / size_c;
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in dgemmt
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                         char transb, blasint m, blasint k, blasint lda, blasint ldb,
+                         blasint ldc, int expected_info)
+{
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    set_xerbla("DGEMMT ", expected_info);
+
+    if (api == 'F')
+        BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test,
+                         &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc);
+    else
+        cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda,
+                    data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, upper_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 100;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, upper_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'T';
+    char uplo = 'U';
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'T';
+    char uplo = 'U';
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    double alpha = 0.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, lower_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 100;
+    char transa = 'T', transb = 'N';
+    char uplo = 'L';
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, lower_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'T';
+    char uplo = 'L';
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'T';
+    char uplo = 'L';
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    double alpha = 0.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = check_dgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_colmajor_upper_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 100;
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_colmajor_upper_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, c_api_colmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 0.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, c_api_colmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_colmajor_lower_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 100;
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_colmajor_lower_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, c_api_colmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 0.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, c_api_colmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = check_dgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 100;
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 0.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 100;
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 1.5;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 0.0;
+    double beta = 2.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = check_dgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param uplo.
+ * Must be upper (U) or lower (L).
+ */
+CTEST(dgemmt, xerbla_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'O';
+    int expected_info = 1;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transa.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(dgemmt, xerbla_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'O', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 2;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transb.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(dgemmt, xerbla_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'O';
+    char uplo = 'U';
+    int expected_info = 3;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(dgemmt, xerbla_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 4;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(dgemmt, xerbla_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 5;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(dgemmt, xerbla_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 8;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B not transposed.
+ */
+CTEST(dgemmt, xerbla_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 10;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(dgemmt, xerbla_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 13;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test.
+ * Test error function for an invalid param order.
+ * Must be column or row major.
+ */
+CTEST(dgemmt, xerbla_c_api_major_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 0;
+
+    int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B not transposed.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B transposed.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c
new file mode 100644
index 000000000..4debb50e8
--- /dev/null
+++ b/utest/test_extensions/test_dimatcopy.c
@@ -0,0 +1,947 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_DIMATCOPY {
+    double a_test[DATASIZE* DATASIZE];
+    double a_verify[DATASIZE* DATASIZE];
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DIMATCOPY data_dimatcopy;
+
+/**
+ * Comapare results computed by dimatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * return norm of difference between openblas and reference func
+ */
+static double check_dimatcopy(char api, char order, char trans, blasint rows, blasint cols, double alpha, 
+                             blasint lda_src, blasint lda_dst)
+{
+    blasint m, n;
+    blasint rows_out, cols_out;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+
+    if (order == 'C') {
+        n = rows; m = cols;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        rows_out = n; cols_out = m;
+    }
+    else {
+        rows_out = m; cols_out = n;
+    }
+
+    drand_generate(data_dimatcopy.a_test, lda_src*m);
+
+    if (trans == 'T' || trans == 'C') {
+        dtranspose(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst);
+    } 
+    else {
+        dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test, 
+                            &lda_src, &lda_dst);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_dimatcopy(corder, ctrans, rows, cols, alpha, data_dimatcopy.a_test, 
+                    lda_src, lda_dst);
+    }
+
+    // Find the differences between output matrix computed by dimatcopy and reference func
+    return dmatrix_difference(data_dimatcopy.a_test, data_dimatcopy.a_verify, cols_out, rows_out, lda_dst);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda_src, blasint lda_dst, int expected_info)
+{
+    double alpha = 1.0;
+
+    set_xerbla("DIMATCOPY", expected_info);
+
+    BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test, 
+                        &lda_src, &lda_dst);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 1.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 1.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 0.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 0.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_50_row_100_alpha_one)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 1.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_50_row_100_alpha_one)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 1.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_50_row_100_alpha_zero)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 0.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_50_row_100_alpha_zero)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 0.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha = 1.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 1.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha = 0.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 0.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_50_alpha_one)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'C'; // same as trans for real matrix
+    double alpha = 1.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_50_alpha_one)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 1.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_50_alpha_zero)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'C'; // same as trans for real matrix
+    double alpha = 0.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_50_alpha_zero)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 0.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'C'; // same as trans for real matrix
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 2.0;
+
+    double norm = check_dimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(dimatcopy, xerbla_invalid_order)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'O';
+    char trans = 'T';
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(dimatcopy, xerbla_invalid_trans)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'O';
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param m.
+ * Must be positive.
+ */
+CTEST(dimatcopy, xerbla_invalid_rows)
+{
+    blasint m = 0, n = 100;
+    blasint lda_src = 0, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param n.
+ * Must be positive.
+ */
+CTEST(dimatcopy, xerbla_invalid_cols)
+{
+    blasint m = 100, n = 0;
+    blasint lda_src = 100, lda_dst = 0;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 4;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using row major layout, 
+ * lda_src must be at least n.
+ */
+CTEST(dimatcopy, xerbla_rowmajor_invalid_lda)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using column major layout,
+ * lda_src must be at least m.
+ */
+CTEST(dimatcopy, xerbla_colmajor_invalid_lda)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, lda_dst must be at least n.
+ */
+CTEST(dimatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is transposition, lda_dst must be at least m.
+ */
+CTEST(dimatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, lda_dst must be at least m.
+ */
+CTEST(dimatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is transposition, lda_dst must be at least n.
+ */
+CTEST(dimatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c
new file mode 100644
index 000000000..f692e8784
--- /dev/null
+++ b/utest/test_extensions/test_domatcopy.c
@@ -0,0 +1,672 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_DOMATCOPY {
+    double a_test[DATASIZE * DATASIZE];
+    double b_test[DATASIZE * DATASIZE];
+    double b_verify[DATASIZE * DATASIZE];
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DOMATCOPY data_domatcopy;
+
+/**
+ * Comapare results computed by domatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param alpha - scaling factor for matrix B
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * return norm of difference between openblas and reference func
+ */
+static double check_domatcopy(char api, char order, char trans, blasint rows, blasint cols, double alpha, 
+                             blasint lda, blasint ldb)
+{
+    blasint b_rows, b_cols;
+    blasint m, n;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+
+    if (order == 'C') {
+        m = cols; n = rows;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        b_rows = n; b_cols = m;
+    }
+    else {
+        b_rows = m; b_cols = n;
+    }
+
+    drand_generate(data_domatcopy.a_test, lda*m);
+
+    if (trans == 'T' || trans == 'C') {
+        dtranspose(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb);
+    } 
+    else {
+        dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test, 
+                            &lda, data_domatcopy.b_test, &ldb);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_domatcopy(corder, ctrans, rows, cols, alpha, data_domatcopy.a_test, 
+                    lda, data_domatcopy.b_test, ldb);
+    }
+    
+    return dmatrix_difference(data_domatcopy.b_test, data_domatcopy.b_verify, b_cols, b_rows, ldb);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda, blasint ldb, int expected_info)
+{
+    double alpha = 1.0;
+
+    set_xerbla("DOMATCOPY", expected_info);
+
+    BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test, 
+                        &lda, data_domatcopy.b_test, &ldb);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 1.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 1.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(domatcopy, colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 2.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(domatcopy, colmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 2.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(domatcopy, colmajor_trans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 50, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 0.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(domatcopy, colmajor_notrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 50, ldb = 50;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 0.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha = 1.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 1.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(domatcopy, rowmajor_conjtrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'C'; // same as trans for real matrix
+    double alpha = 2.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(domatcopy, rowmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 50;
+    char order = 'R';
+    char trans = 'N'; 
+    double alpha = 2.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Matrix dimensions leave residues from 4 and 2 (specialize
+ * for rt case)
+ * alpha = 1.5
+ */
+CTEST(domatcopy, rowmajor_trans_col_27_row_27)
+{
+    blasint m = 27, n = 27;
+    blasint lda = 27, ldb = 27;
+    char order = 'R';
+    char trans = 'T'; 
+    double alpha = 1.5;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(domatcopy, rowmajor_notrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'N'; 
+    double alpha = 0.0;
+
+    double norm = check_domatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha = 1.0;
+
+    double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha = 1.0;
+
+    double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha = 1.0;
+
+    double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test domatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha = 1.0;
+
+    double norm = check_domatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(domatcopy, xerbla_invalid_order)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'O';
+    char trans = 'T';
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(domatcopy, xerbla_invalid_trans)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'O';
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param m.
+ * Must be positive.
+ */
+CTEST(domatcopy, xerbla_invalid_rows)
+{
+    blasint m = 0, n = 100;
+    blasint lda = 0, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param n.
+ * Must be positive.
+ */
+CTEST(domatcopy, xerbla_invalid_cols)
+{
+    blasint m = 100, n = 0;
+    blasint lda = 100, ldb = 0;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 4;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using row major layout,
+ * lda must be at least n.
+ */
+CTEST(domatcopy, xerbla_rowmajor_invalid_lda)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 50, ldb = 100;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using column major layout,
+ * lda must be at least m.
+ */
+CTEST(domatcopy, xerbla_colmajor_invalid_lda)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and
+ * there is no transposition, ldb must be at least n.
+ */
+CTEST(domatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and
+ * there is transposition, ldb must be at least m.
+ */
+CTEST(domatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and
+ * there is no transposition, ldb must be at least m.
+ */
+CTEST(domatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and
+ * there is transposition, ldb must be at least n.
+ */
+CTEST(domatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_drotmg.c b/utest/test_extensions/test_drotmg.c
new file mode 100644
index 000000000..3073c8e3e
--- /dev/null
+++ b/utest/test_extensions/test_drotmg.c
@@ -0,0 +1,414 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#ifdef BUILD_DOUBLE
+
+/**
+ * Fortran API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ */
+CTEST(drotmg, y1_zero)
+{
+	double te_d1, tr_d1;
+	double te_d2, tr_d2;
+	double te_x1, tr_x1;
+	double te_y1, tr_y1;
+	double te_param[5];
+	double tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 2.0;
+	te_d2 = tr_d2 = 2.0;
+	te_x1 = tr_x1 = 8.0;
+	te_y1 = tr_y1 = 0.0;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 2.0;
+	tr_d2 = 2.0;
+	tr_x1 = 8.0;
+	tr_y1 = 0.0;
+
+	tr_param[0] = -2.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	//OpenBLAS
+	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ */
+CTEST(drotmg, d1_negative)
+{
+	double te_d1, tr_d1;
+	double te_d2, tr_d2;
+	double te_x1, tr_x1;
+	double te_y1, tr_y1;
+	double te_param[5];
+	double tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = -1.0;
+	te_d2 = tr_d2 = 2.0;
+	te_x1 = tr_x1 = 8.0;
+	te_y1 = tr_y1 = 8.0;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 0.0;
+	tr_d2 = 0.0;
+	tr_x1 = 0.0;
+	tr_y1 = 8.0;
+
+	tr_param[0] = -1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	//OpenBLAS
+	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ */
+CTEST(drotmg, d1_positive_d2_positive_x1_zero)
+{
+	double te_d1, tr_d1;
+	double te_d2, tr_d2;
+	double te_x1, tr_x1;
+	double te_y1, tr_y1;
+	double te_param[5];
+	double tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 2.0;
+	te_d2 = tr_d2 = 2.0;
+	te_x1 = tr_x1 = 0.0;
+	te_y1 = tr_y1 = 8.0;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 2.0;
+	tr_d2 = 2.0;
+	tr_x1 = 8.0;
+	tr_y1 = 8.0;
+
+	tr_param[0] = 1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	//OpenBLAS
+	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ */
+CTEST(drotmg, scaled_y_greater_than_scaled_x)
+{
+	double te_d1, tr_d1;
+	double te_d2, tr_d2;
+	double te_x1, tr_x1;
+	double te_y1, tr_y1;
+	double te_param[5];
+	double tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 1.0;
+	te_d2 = tr_d2 = -2.0;
+	te_x1 = tr_x1 = 8.0;
+	te_y1 = tr_y1 = 8.0;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 0.0;
+	tr_d2 = 0.0;
+	tr_x1 = 0.0;
+	tr_y1 = 8.0;
+
+	tr_param[0] = -1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	//OpenBLAS
+	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ */
+CTEST(drotmg, c_api_y1_zero)
+{
+	double te_d1, tr_d1;
+	double te_d2, tr_d2;
+	double te_x1, tr_x1;
+	double te_y1, tr_y1;
+	double te_param[5];
+	double tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 2.0;
+	te_d2 = tr_d2 = 2.0;
+	te_x1 = tr_x1 = 8.0;
+	te_y1 = tr_y1 = 0.0;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 2.0;
+	tr_d2 = 2.0;
+	tr_x1 = 8.0;
+	tr_y1 = 0.0;
+
+	tr_param[0] = -2.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	//OpenBLAS
+	cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ */
+CTEST(drotmg, c_api_d1_negative)
+{
+	double te_d1, tr_d1;
+	double te_d2, tr_d2;
+	double te_x1, tr_x1;
+	double te_y1, tr_y1;
+	double te_param[5];
+	double tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = -1.0;
+	te_d2 = tr_d2 = 2.0;
+	te_x1 = tr_x1 = 8.0;
+	te_y1 = tr_y1 = 8.0;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 0.0;
+	tr_d2 = 0.0;
+	tr_x1 = 0.0;
+	tr_y1 = 8.0;
+
+	tr_param[0] = -1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	//OpenBLAS
+	cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ */
+CTEST(drotmg, c_api_d1_positive_d2_positive_x1_zero)
+{
+	double te_d1, tr_d1;
+	double te_d2, tr_d2;
+	double te_x1, tr_x1;
+	double te_y1, tr_y1;
+	double te_param[5];
+	double tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 2.0;
+	te_d2 = tr_d2 = 2.0;
+	te_x1 = tr_x1 = 0.0;
+	te_y1 = tr_y1 = 8.0;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 2.0;
+	tr_d2 = 2.0;
+	tr_x1 = 8.0;
+	tr_y1 = 8.0;
+
+	tr_param[0] = 1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	//OpenBLAS
+	cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ */
+CTEST(drotmg, c_api_scaled_y_greater_than_scaled_x)
+{
+	double te_d1, tr_d1;
+	double te_d2, tr_d2;
+	double te_x1, tr_x1;
+	double te_y1, tr_y1;
+	double te_param[5];
+	double tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 1.0;
+	te_d2 = tr_d2 = -2.0;
+	te_x1 = tr_x1 = 8.0;
+	te_y1 = tr_y1 = 8.0;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 0.0;
+	tr_d2 = 0.0;
+	tr_x1 = 0.0;
+	tr_y1 = 8.0;
+
+	tr_param[0] = -1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	//OpenBLAS
+	cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
+	}
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dsum.c b/utest/test_extensions/test_dsum.c
new file mode 100644
index 000000000..e987c5a42
--- /dev/null
+++ b/utest/test_extensions/test_dsum.c
@@ -0,0 +1,403 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_DOUBLE
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.1};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, -1.0};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, -1.5, 1.0, 1.0};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.3, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, -1.0, -3.0, 2.2, 3.0};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, -2.2, 3.3};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.2, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 2.0, 2.2, 2.7, -3.3, -5.9};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {0.0, 1.0, 2.2, 3.3, 0.0};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {0.0, 3.0, 1.0, -2.2, 2.2, -1.7, 3.3, 14.5, 0.0, -9.0};
+
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = (i & 1) ? -1.0 : 1.0;
+   }
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = (i & 1) ? -1.0 : 1.0;
+   }
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.1};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, -1.0};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, -1.5, 1.0, 1.0};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(2.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(4.3, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, -1.0, -3.0, 2.2, 3.0};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, -2.2, 3.3};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(3.2, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 2.0, 2.2, 2.7, -3.3, -5.9};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {0.0, 1.0, 2.2, 3.3, 0.0};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {0.0, 3.0, 1.0, -2.2, 2.2, -1.7, 3.3, 14.5, 0.0, -9.0};
+
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(6.5, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = (i & 1) ? -1.0 : 1.0;
+   }
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dsum by comparing it against pre-calculated values
+ */
+CTEST(dsum, c_api_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = (i & 1) ? -1.0 : 1.0;
+   }
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(50.0, sum, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dzamax.c b/utest/test_extensions/test_dzamax.c
new file mode 100644
index 000000000..edea3de8f
--- /dev/null
+++ b/utest/test_extensions/test_dzamax.c
@@ -0,0 +1,294 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, step_zero){
+   blasint i;
+   blasint N = ELEMENTS * 2, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.0, 2.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {-1.0, -2.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -3.0, -1.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -3.0, -1.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i;
+   }
+   x[7 * inc * 2] = 1000.0;
+   x[7 * inc * 2 + 1] = 1000.0;
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = -i;
+   }
+   x[7 * inc * 2] = 1000.0;
+   x[7 * inc * 2 + 1] = 1000.0;
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, positive_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i;
+   }
+   x[7 * inc * 2] = 1000.0;
+   x[7 * inc * 2 + 1] = 1000.0;
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamax by comparing it against pre-calculated values
+ */
+CTEST(dzamax, negative_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = -i;
+   }
+   x[7 * inc * 2] = 1000.0;
+   x[7 * inc * 2 + 1] = 1000.0;
+   double amax = BLASFUNC(dzamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0, amax, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dzamin.c b/utest/test_extensions/test_dzamin.c
new file mode 100644
index 000000000..916eede92
--- /dev/null
+++ b/utest/test_extensions/test_dzamin.c
@@ -0,0 +1,310 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS * 2, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.0, 2.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {-1.0, -2.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[7 * inc * 2] = 0.0;
+   x[7 * inc * 2 + 1] = 0.0;
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = - i - 1000;
+   }
+   x[7 * inc * 2] = 0.0;
+   x[7 * inc * 2 + 1] = 0.0;
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, positive_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[7 * inc * 2] = 0.0;
+   x[7 * inc * 2 + 1] = 0.0;
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+
+/**
+ * Test dzamin by comparing it against pre-calculated values
+ */
+CTEST(dzamin, negative_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = - i - 1000;
+   }
+   x[7 * inc * 2] = 0.0;
+   x[7 * inc * 2 + 1] = 0.0;
+   double amin = BLASFUNC(dzamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dzsum.c b/utest/test_extensions/test_dzsum.c
new file mode 100644
index 000000000..5139f59cb
--- /dev/null
+++ b/utest/test_extensions/test_dzsum.c
@@ -0,0 +1,403 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.1, -1.0};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0, 2.3, -1.0};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, -1.0, 2.3, -1.0};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.4, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, -1.5, 1.1, -1.0, 1.0, 1.0, 1.1, -1.0};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.6, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 1.1, -1.0, 0.0};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.4, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, -1.0, 0.0, -1.0, -3.0, -1.0, 0.0, 2.2, 3.0, -1.0, 0.0};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, -2.2, 3.3, 1.1, 1.0, -2.2, 3.3};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(6.4, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.1, 1.0, 1.0, 2.0, 1.1, 1.0, 2.2, 2.7, 1.1, 1.0, -3.3, -5.9};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(-0.2, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {0.0, 1.0, 2.2, 3.3, 0.0, 0.0, 1.0, 2.2, 3.3, 0.0};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(13.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {0.0, 3.0, 1.0, 2.2, 1.0, -2.2, 1.0, 2.2, 2.2, -1.7, 1.0, 2.2, 3.3, 14.5, 1.0, 2.2, 0.0, -9.0, 1.0, 2.2};
+
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(11.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = (i & 1) ? -1.0 : 1.0;
+   }
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = (i & 1) ? -1.0 : 1.0;
+   }
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.1, -1.0};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0, 2.3, -1.0};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, -1.0, 2.3, -1.0};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.4, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, -1.5, 1.1, -1.0, 1.0, 1.0, 1.1, -1.0};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.6, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 1.1, -1.0, 0.0};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(4.4, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, -1.0, 0.0, -1.0, -3.0, -1.0, 0.0, 2.2, 3.0, -1.0, 0.0};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(2.3, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, -2.2, 3.3, 1.1, 1.0, -2.2, 3.3};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(6.4, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.1, 1.0, 1.0, 2.0, 1.1, 1.0, 2.2, 2.7, 1.1, 1.0, -3.3, -5.9};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(-0.2, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {0.0, 1.0, 2.2, 3.3, 0.0, 0.0, 1.0, 2.2, 3.3, 0.0};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(13.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {0.0, 3.0, 1.0, 2.2, 1.0, -2.2, 1.0, 2.2, 2.2, -1.7, 1.0, 2.2, 3.3, 14.5, 1.0, 2.2, 0.0, -9.0, 1.0, 2.2};
+
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(11.1, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = (i & 1) ? -1.0 : 1.0;
+   }
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dzsum by comparing it against pre-calculated values
+ */
+CTEST(dzsum, c_api_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = (i & 1) ? -1.0 : 1.0;
+   }
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_icamin.c b/utest/test_extensions/test_icamin.c
new file mode 100644
index 000000000..cca464eac
--- /dev/null
+++ b/utest/test_extensions/test_icamin.c
@@ -0,0 +1,625 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * 2; i ++) {
+      x[i] = i - 1000;
+   }
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.0f, 2.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {-1.0f, -2.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = - i - 1000;
+   }
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = - i - 1000;
+   }
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc * 2] = 0.0f;
+   x[(N - 1) * inc * 2 + 1] = 0.0f;
+   blasint index = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_bad_args_N_0){
+    blasint i;
+    blasint N = 0, inc = 1;
+    float x[ELEMENTS * 2];
+    for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+        x[i] = 1000 - i;
+    }
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_step_zero){
+    blasint i;
+    blasint N = ELEMENTS, inc = 0;
+    float x[ELEMENTS * 2];
+    for (i = 0; i < N * 2; i ++) {
+        x[i] = i - 1000;
+    }
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_1_N_1){
+    blasint N = 1, inc = 1;
+    float x[] = {1.0f, 2.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_1_N_1){
+    blasint N = 1, inc = 1;
+    float x[] = {-1.0f, -2.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_2_N_1){
+    blasint N = 1, inc = 2;
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_2_N_1){
+    blasint N = 1, inc = 2;
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_1_N_2){
+    blasint N = 2, inc = 1;
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_1_N_2){
+    blasint N = 2, inc = 1;
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_2_N_2){
+    blasint N = 2, inc = 2;
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_2_N_2){
+    blasint N = 2, inc = 2;
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_1_N_3){
+    blasint N = 3, inc = 1;
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_1_N_3){
+    blasint N = 3, inc = 1;
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_2_N_3){
+    blasint N = 3, inc = 2;
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_2_N_3){
+    blasint N = 3, inc = 2;
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_1_N_4){
+    blasint N = 4, inc = 1;
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_1_N_4){
+    blasint N = 4, inc = 1;
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_2_N_4){
+    blasint N = 4, inc = 2;
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_2_N_4){
+    blasint N = 4, inc = 2;
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_1_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = 1;
+    float x[ELEMENTS * 2];
+    for (i = 0; i < N * inc * 2; i ++) {
+        x[i] = i + 1000;
+    }
+    x[7 * inc * 2] = 0.0f;
+    x[7 * inc * 2 + 1] = 0.0f;
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(7, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_1_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = 1;
+    float x[ELEMENTS * 2];
+    for (i = 0; i < N * inc * 2; i ++) {
+        x[i] = - i - 1000;
+    }
+    x[7 * inc * 2] = 0.0f;
+    x[7 * inc * 2 + 1] = 0.0f;
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(7, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_positive_step_2_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = INCREMENT;
+    float x[ELEMENTS * INCREMENT * 2];
+    for (i = 0; i < N * inc * 2; i ++) {
+        x[i] = i + 1000;
+    }
+    x[7 * inc * 2] = 0.0f;
+    x[7 * inc * 2 + 1] = 0.0f;
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(7, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_negative_step_2_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = INCREMENT;
+    float x[ELEMENTS * INCREMENT * 2];
+    for (i = 0; i < N * inc * 2; i ++) {
+        x[i] = - i - 1000;
+    }
+    x[7 * inc * 2] = 0.0f;
+    x[7 * inc * 2 + 1] = 0.0f;
+    blasint index = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(7, index);
+}
+
+/**
+ * C API specific test
+ * Test icamin by comparing it against pre-calculated values
+ */
+CTEST(icamin, c_api_min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc * 2] = 0.0f;
+   x[(N - 1) * inc * 2 + 1] = 0.0f;
+   blasint index = cblas_icamin(N, x, inc);
+   ASSERT_EQUAL(N - 1, index);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c
new file mode 100644
index 000000000..9f099f666
--- /dev/null
+++ b/utest/test_extensions/test_idamin.c
@@ -0,0 +1,787 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_DOUBLE
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.1};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {-1.1};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {-1.1, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, 1.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {-1.1, 1.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, min_idx_in_vec_tail_inc_1){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * inc];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0f;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.1};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {-1.1};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {-1.1, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, 1.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {-1.1, 1.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(N - 1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * inc];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(N - 1, index);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c
new file mode 100644
index 000000000..df8dead07
--- /dev/null
+++ b/utest/test_extensions/test_isamin.c
@@ -0,0 +1,787 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_SINGLE
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.1f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {-1.1f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.1f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {-1.1f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.1f, 1.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {-1.1f, 1.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, min_idx_in_vec_tail_inc_1){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * inc];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.1f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {-1.1f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.1f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {-1.1f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.1f, 1.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {-1.1f, 1.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f};
+
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_1_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = 1;
+    float x[ELEMENTS];
+    for (i = 0; i < N * inc; i ++) {
+        x[i] = i + 1000;
+    }
+
+    x[8 * inc] = 0.0f;
+    blasint index = cblas_isamin(N, x, inc);
+    ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_1_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = 1;
+    float x[ELEMENTS];
+    for (i = 0; i < N  * inc; i ++) {
+        x[i] = - i - 1000;
+    }
+
+    x[8 * inc] = -1.0f;
+    blasint index = cblas_isamin(N, x, inc);
+    ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0f;
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0f;
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0f;
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(N - 1, index);
+}
+
+/**
+ * C API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * inc];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0f;
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(N - 1, index);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_izamin.c b/utest/test_extensions/test_izamin.c
new file mode 100644
index 000000000..a0bdae8e2
--- /dev/null
+++ b/utest/test_extensions/test_izamin.c
@@ -0,0 +1,625 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * 2; i ++) {
+      x[i] = i - 1000;
+   }
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {1.0, 2.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {-1.0, -2.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[7 * inc * 2] = 0.0;
+   x[7 * inc * 2 + 1] = 0.0;
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = - i - 1000;
+   }
+   x[7 * inc * 2] = 0.0;
+   x[7 * inc * 2 + 1] = 0.0;
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[7 * inc * 2] = 0.0;
+   x[7 * inc * 2 + 1] = 0.0;
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = - i - 1000;
+   }
+   x[7 * inc * 2] = 0.0;
+   x[7 * inc * 2 + 1] = 0.0;
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc * 2] = 0.0;
+   x[(N - 1) * inc * 2 + 1] = 0.0;
+   blasint index = BLASFUNC(izamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_bad_args_N_0){
+    blasint i;
+    blasint N = 0, inc = 1;
+    double x[ELEMENTS * 2];
+    for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+        x[i] = 1000 - i;
+    }
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_step_zero){
+    blasint i;
+    blasint N = ELEMENTS, inc = 0;
+    double x[ELEMENTS * 2];
+    for (i = 0; i < N * 2; i ++) {
+        x[i] = i - 1000;
+    }
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_1_N_1){
+    blasint N = 1, inc = 1;
+    double x[] = {1.0, 2.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_1_N_1){
+    blasint N = 1, inc = 1;
+    double x[] = {-1.0, -2.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_2_N_1){
+    blasint N = 1, inc = 2;
+    double x[] = {1.0, 2.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_2_N_1){
+    blasint N = 1, inc = 2;
+    double x[] = {-1.0, -2.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_1_N_2){
+    blasint N = 2, inc = 1;
+    double x[] = {1.0, 2.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_1_N_2){
+    blasint N = 2, inc = 1;
+    double x[] = {-1.0, -2.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_2_N_2){
+    blasint N = 2, inc = 2;
+    double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_2_N_2){
+    blasint N = 2, inc = 2;
+    double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_1_N_3){
+    blasint N = 3, inc = 1;
+    double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_1_N_3){
+    blasint N = 3, inc = 1;
+    double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_2_N_3){
+    blasint N = 3, inc = 2;
+    double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_2_N_3){
+    blasint N = 3, inc = 2;
+    double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_1_N_4){
+    blasint N = 4, inc = 1;
+    double x[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_1_N_4){
+    blasint N = 4, inc = 1;
+    double x[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_2_N_4){
+    blasint N = 4, inc = 2;
+    double x[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_2_N_4){
+    blasint N = 4, inc = 2;
+    double x[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_1_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = 1;
+    double x[ELEMENTS * 2];
+    for (i = 0; i < N * inc * 2; i ++) {
+        x[i] = i + 1000;
+    }
+    x[7 * inc * 2] = 0.0;
+    x[7 * inc * 2 + 1] = 0.0;
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(7, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_1_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = 1;
+    double x[ELEMENTS * 2];
+    for (i = 0; i < N * inc * 2; i ++) {
+        x[i] = - i - 1000;
+    }
+    x[7 * inc * 2] = 0.0;
+    x[7 * inc * 2 + 1] = 0.0;
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(7, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_positive_step_2_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = INCREMENT;
+    double x[ELEMENTS * INCREMENT * 2];
+    for (i = 0; i < N * inc * 2; i ++) {
+        x[i] = i + 1000;
+    }
+    x[7 * inc * 2] = 0.0;
+    x[7 * inc * 2 + 1] = 0.0;
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(7, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_negative_step_2_N_50){
+    blasint i;
+    blasint N = ELEMENTS, inc = INCREMENT;
+    double x[ELEMENTS * INCREMENT * 2];
+    for (i = 0; i < N * inc * 2; i ++) {
+        x[i] = - i - 1000;
+    }
+    x[7 * inc * 2] = 0.0;
+    x[7 * inc * 2 + 1] = 0.0;
+    blasint index = cblas_izamin(N, x, inc);
+    ASSERT_EQUAL(7, index);
+}
+
+/**
+ * C API specific test
+ * Test izamin by comparing it against pre-calculated values
+ */
+CTEST(izamin, c_api_min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc * 2] = 0.0;
+   x[(N - 1) * inc * 2 + 1] = 0.0;
+   blasint index = cblas_izamin(N, x, inc);
+   ASSERT_EQUAL(N - 1, index);
+}
+#endif
diff --git a/utest/test_extensions/test_samin.c b/utest/test_extensions/test_samin.c
new file mode 100644
index 000000000..5c747a0f6
--- /dev/null
+++ b/utest/test_extensions/test_samin.c
@@ -0,0 +1,354 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_SINGLE
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.1f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {-1.1f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.1f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {-1.1f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.1f, 1.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {-1.1f, 1.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0f;
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0f;
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, positive_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 1.0f;
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test samin by comparing it against pre-calculated values
+ */
+CTEST(samin, negative_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0f;
+   float amin = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, amin, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_saxpby.c b/utest/test_extensions/test_saxpby.c
new file mode 100644
index 000000000..b4bd5cf0b
--- /dev/null
+++ b/utest/test_extensions/test_saxpby.c
@@ -0,0 +1,794 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_SAXPBY {
+    float x_test[DATASIZE * INCREMENT];
+    float x_verify[DATASIZE * INCREMENT];
+    float y_test[DATASIZE * INCREMENT];
+    float y_verify[DATASIZE * INCREMENT];
+};
+#ifdef BUILD_SINGLE
+static struct DATA_SAXPBY data_saxpby;
+
+/**
+ * Fortran API specific function
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float check_saxpby(blasint n, float alpha, blasint incx, float beta, blasint incy)
+{
+    blasint i;
+
+    // sscal accept only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Fill vectors x, y
+    srand_generate(data_saxpby.x_test, n * incx_abs);
+    srand_generate(data_saxpby.y_test, n * incy_abs);
+
+    // Copy vector x for saxpy
+    for (i = 0; i < n * incx_abs; i++)
+        data_saxpby.x_verify[i] = data_saxpby.x_test[i];
+
+    // Copy vector y for sscal
+    for (i = 0; i < n * incy_abs; i++)
+        data_saxpby.y_verify[i] = data_saxpby.y_test[i];
+
+    // Find beta*y
+    BLASFUNC(sscal)(&n, &beta, data_saxpby.y_verify, &incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    BLASFUNC(saxpy)(&n, &alpha, data_saxpby.x_verify, &incx,
+                        data_saxpby.y_verify, &incy);
+    
+    BLASFUNC(saxpby)(&n, &alpha, data_saxpby.x_test, &incx,
+                        &beta, data_saxpby.y_test, &incy);
+
+    // Find the differences between output vector caculated by saxpby and saxpy
+    for (i = 0; i < n * incy_abs; i++)
+        data_saxpby.y_test[i] -= data_saxpby.y_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(snrm2)(&n, data_saxpby.y_test, &incy_abs);
+}
+
+/**
+ * C API specific function
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float c_api_check_saxpby(blasint n, float alpha, blasint incx, float beta, blasint incy)
+{
+    blasint i;
+
+    // sscal accept only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Copy vector x for saxpy
+    for (i = 0; i < n * incx_abs; i++)
+        data_saxpby.x_verify[i] = data_saxpby.x_test[i];
+
+    // Copy vector y for sscal
+    for (i = 0; i < n * incy_abs; i++)
+        data_saxpby.y_verify[i] = data_saxpby.y_test[i];
+
+    // Find beta*y
+    cblas_sscal(n, beta, data_saxpby.y_verify, incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    cblas_saxpy(n, alpha, data_saxpby.x_verify, incx,
+                        data_saxpby.y_verify, incy);
+    
+    cblas_saxpby(n, alpha, data_saxpby.x_test, incx,
+                        beta, data_saxpby.y_test, incy);
+
+    // Find the differences between output vector caculated by saxpby and saxpy
+    for (i = 0; i < n * incy_abs; i++)
+        data_saxpby.y_test[i] -= data_saxpby.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_snrm2(n, data_saxpby.y_test, incy_abs);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(saxpby, inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(saxpby, inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    float alpha = 3.0f;
+    float beta = 4.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    float alpha = 5.0f;
+    float beta = 4.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(saxpby, inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    float alpha = 1.0f;
+    float beta = 6.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(saxpby, inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    float alpha = 7.0f;
+    float beta = 3.5f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(saxpby, inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha = 0.0f;
+    float beta = 1.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+*/
+CTEST(saxpby, inc_x_1_inc_y_2_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha = 0.0f;
+    float beta = 1.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(saxpby, inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * Scalar beta is zero
+*/
+CTEST(saxpby, inc_x_2_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(saxpby, inc_x_1_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(saxpby, inc_x_2_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(saxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha = 0.0f;
+    float beta = 0.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(saxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha = 0.0f;
+    float beta = 0.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(saxpby, check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = check_saxpby(n, alpha, incx, beta, incy);
+    
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(saxpby, c_api_inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, c_api_inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    float alpha = 3.0f;
+    float beta = 4.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, c_api_inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    float alpha = 5.0f;
+    float beta = 4.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(saxpby, c_api_inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    float alpha = 1.0f;
+    float beta = 6.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(saxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    float alpha = 7.0f;
+    float beta = 3.5f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha = 0.0f;
+    float beta = 1.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+*/
+CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha = 0.0f;
+    float beta = 1.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * Scalar beta is zero
+*/
+CTEST(saxpby, c_api_inc_x_2_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(saxpby, c_api_inc_x_2_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    float alpha = 0.0f;
+    float beta = 0.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    float alpha = 0.0f;
+    float beta = 0.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(saxpby, c_api_check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_scamax.c b/utest/test_extensions/test_scamax.c
new file mode 100644
index 000000000..39d7201ff
--- /dev/null
+++ b/utest/test_extensions/test_scamax.c
@@ -0,0 +1,294 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, step_zero){
+   blasint i;
+   blasint N = ELEMENTS * 2, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.0f, 2.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {-1.0f, -2.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -3.0f, -1.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 3.0f, 1.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -3.0f, -1.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i;
+   }
+   x[7 * inc * 2] = 1000.0f;
+   x[7 * inc * 2 + 1] = 1000.0f;
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = -i;
+   }
+   x[7 * inc * 2] = 1000.0f;
+   x[7 * inc * 2 + 1] = 1000.0f;
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, positive_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i;
+   }
+   x[7 * inc * 2] = 1000.0f;
+   x[7 * inc * 2 + 1] = 1000.0f;
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS);
+}
+
+/**
+ * Test scamax by comparing it against pre-calculated values
+ */
+CTEST(scamax, negative_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = -i;
+   }
+   x[7 * inc * 2] = 1000.0f;
+   x[7 * inc * 2 + 1] = 1000.0f;
+   float amax = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0f, amax, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_scamin.c b/utest/test_extensions/test_scamin.c
new file mode 100644
index 000000000..4baa23184
--- /dev/null
+++ b/utest/test_extensions/test_scamin.c
@@ -0,0 +1,310 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS * 2, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.0f, 2.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {-1.0f, -2.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = - i - 1000;
+   }
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_70){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = - i - 1000;
+   }
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   float amin = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_scsum.c b/utest/test_extensions/test_scsum.c
new file mode 100644
index 000000000..492e1a4ca
--- /dev/null
+++ b/utest/test_extensions/test_scsum.c
@@ -0,0 +1,403 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.1f, -1.0f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.1f, 0.0f, 2.3f, -1.0f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.1f, -1.0f, 2.3f, -1.0f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.4f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.1f, -1.5f, 1.1f, -1.0f, 1.0f, 1.0f, 1.1f, -1.0f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.6f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 1.1f, -1.0f, 0.0f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.4f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.1f, 0.0f, -1.0f, 0.0f, -1.0f, -3.0f, -1.0f, 0.0f, 2.2f, 3.0f, -1.0f, 0.0f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.1f, 1.0f, -2.2f, 3.3f, 1.1f, 1.0f, -2.2f, 3.3f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(6.4f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.1f, 1.0f, 1.0f, 2.0f, 1.1f, 1.0f, 2.2f, 2.7f, 1.1f, 1.0f, -3.3f, -5.9f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(-0.2f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f, 0.0f, 1.0f, 2.2f, 3.3f, 0.0f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(13.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {0.0f, 3.0f, 1.0f, 2.2f, 1.0f, -2.2f, 1.0f, 2.2f, 2.2f, -1.7f, 1.0f, 2.2f, 3.3f, 14.5f, 1.0f, 2.2f, 0.0f, -9.0f, 1.0f, 2.2f};
+
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(11.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = (i & 1) ? -1.0f : 1.0f;
+   }
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = (i & 1) ? -1.0f : 1.0f;
+   }
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < ELEMENTS * inc * 2; i ++) {
+      x[i] = 1000 - i;
+   }
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.1f, -1.0f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.1f, 0.0f, 2.3f, -1.0f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.1f, -1.0f, 2.3f, -1.0f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.4f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.1f, -1.5f, 1.1f, -1.0f, 1.0f, 1.0f, 1.1f, -1.0f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.6f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 1.1f, -1.0f, 0.0f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(4.4f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.1f, 0.0f, -1.0f, 0.0f, -1.0f, -3.0f, -1.0f, 0.0f, 2.2f, 3.0f, -1.0f, 0.0f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.1f, 1.0f, -2.2f, 3.3f, 1.1f, 1.0f, -2.2f, 3.3f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(6.4f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.1f, 1.0f, 1.0f, 2.0f, 1.1f, 1.0f, 2.2f, 2.7f, 1.1f, 1.0f, -3.3f, -5.9f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(-0.2f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f, 0.0f, 1.0f, 2.2f, 3.3f, 0.0f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(13.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {0.0f, 3.0f, 1.0f, 2.2f, 1.0f, -2.2f, 1.0f, 2.2f, 2.2f, -1.7f, 1.0f, 2.2f, 3.3f, 14.5f, 1.0f, 2.2f, 0.0f, -9.0f, 1.0f, 2.2f};
+
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(11.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = (i & 1) ? -1.0f : 1.0f;
+   }
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (i = 0; i < N * inc * 2; i ++) {
+      x[i] = (i & 1) ? -1.0f : 1.0f;
+   }
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_sgeadd.c b/utest/test_extensions/test_sgeadd.c
new file mode 100644
index 000000000..b42ce9c0e
--- /dev/null
+++ b/utest/test_extensions/test_sgeadd.c
@@ -0,0 +1,880 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+
+struct DATA_SGEADD
+{
+    float a_test[M * N];
+    float c_test[M * N];
+    float c_verify[M * N];
+};
+
+#ifdef BUILD_SINGLE
+static struct DATA_SGEADD data_sgeadd;
+
+/**
+ * sgeadd reference implementation
+ *
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param aptr - refer to matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param cptr - refer to matrix C
+ * param ldc - leading dimension of C
+ */
+static void sgeadd_trusted(blasint m, blasint n, float alpha, float *aptr,
+                           blasint lda, float beta, float *cptr, blasint ldc)
+{
+    blasint i;
+
+    for (i = 0; i < n; i++)
+    {
+        cblas_saxpby(m, alpha, aptr, 1, beta, cptr, 1);
+        aptr += lda;
+        cptr += ldc;
+    }
+}
+
+/**
+ * Test sgeadd by comparing it against reference
+ * Compare with the following options:
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static float check_sgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                          blasint m, blasint n, float alpha, blasint lda,
+                          float beta, blasint ldc)
+{
+    blasint i;
+    blasint cols = m, rows = n;
+
+    if (order == CblasRowMajor)
+    {
+        rows = m;
+        cols = n;
+    }
+
+    // Fill matrix A, C
+    srand_generate(data_sgeadd.a_test, lda * rows);
+    srand_generate(data_sgeadd.c_test, ldc * rows);
+
+    // Copy matrix C for sgeadd
+    for (i = 0; i < ldc * rows; i++)
+        data_sgeadd.c_verify[i] = data_sgeadd.c_test[i];
+
+    sgeadd_trusted(cols, rows, alpha, data_sgeadd.a_test, lda,
+                   beta, data_sgeadd.c_verify, ldc);
+
+    if (api == 'F')
+        BLASFUNC(sgeadd)
+        (&m, &n, &alpha, data_sgeadd.a_test, &lda,
+         &beta, data_sgeadd.c_test, &ldc);
+    else
+        cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda,
+                     beta, data_sgeadd.c_test, ldc);
+
+    // Find the differences between output matrix caculated by sgeadd and sgemm
+    return smatrix_difference(data_sgeadd.c_test, data_sgeadd.c_verify, cols, rows, ldc);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param lda - leading dimension of A
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in sgeadd
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                         blasint m, blasint n, blasint lda,
+                         blasint ldc, int expected_info)
+{
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    set_xerbla("SGEADD ", expected_info);
+
+    if (api == 'F')
+        BLASFUNC(sgeadd)
+        (&m, &n, &alpha, data_sgeadd.a_test, &lda,
+         &beta, data_sgeadd.c_test, &ldc);
+    else
+        cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda,
+                     beta, data_sgeadd.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(sgeadd, matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 3.0f;
+    float beta = 3.0f;
+
+    float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(sgeadd, matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 0.0f;
+    float beta = 2.5f;
+
+    float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(sgeadd, matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 3.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(sgeadd, matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 0.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(sgeadd, matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C
+ * Must be at least zero.
+ */
+CTEST(sgeadd, xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ */
+CTEST(sgeadd, xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ */
+CTEST(sgeadd, xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 6;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ */
+CTEST(sgeadd, xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - number of columns of A, C equal zero.
+ */
+CTEST(sgeadd, n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if m - number of rows of A and C equal zero.
+ */
+CTEST(sgeadd, m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 2.0f;
+    float beta = 3.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                              lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 4.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                              lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(sgeadd, c_api_matrix_n_50_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N / 2;
+    blasint m = M;
+
+    blasint lda = n;
+    blasint ldc = n;
+
+    float alpha = 3.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                              lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 0.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                              lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 3.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                              lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 0.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                              lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    float alpha = 3.0f;
+    float beta = 4.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                              lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order -
+ * specifies whether A and C stored in
+ * row-major order or column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_invalid_order)
+{
+    CBLAS_ORDER order = INVALID;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 0;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(sgeadd, c_api_xerbla_n_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(sgeadd, c_api_xerbla_m_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(sgeadd, c_api_xerbla_lda_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(sgeadd, c_api_xerbla_ldc_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Check if n - number of columns of A, C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if m - number of rows of A and C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    float alpha = 1.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_sgemmt.c b/utest/test_extensions/test_sgemmt.c
new file mode 100644
index 000000000..5b51e3579
--- /dev/null
+++ b/utest/test_extensions/test_sgemmt.c
@@ -0,0 +1,1442 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_SGEMMT {
+    float a_test[DATASIZE * DATASIZE];
+    float b_test[DATASIZE * DATASIZE];
+    float c_test[DATASIZE * DATASIZE];
+    float c_verify[DATASIZE * DATASIZE];
+    float c_gemm[DATASIZE * DATASIZE];
+};
+
+#ifdef BUILD_SINGLE
+static struct DATA_SGEMMT data_sgemmt;
+
+/**
+ * Compute gemmt via gemm since gemmt is gemm but updates only 
+ * the upper or lower triangular part of the result matrix
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ */
+static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                           char transb, blasint m, blasint k, float alpha, blasint lda, 
+                           blasint ldb, float beta, blasint ldc)
+{
+    blasint i, j;
+
+    if(api == 'F')
+        BLASFUNC(sgemm)(&transa, &transb, &m, &m, &k, &alpha, data_sgemmt.a_test, &lda,
+                        data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_gemm, &ldc);
+    else
+        cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda,
+                data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc);
+
+    if (uplo == 'L' || uplo == CblasLower)
+    {
+        for (i = 0; i < m; i++)
+            for (j = i; j < m; j++)
+                data_sgemmt.c_verify[i * ldc + j] =
+                    data_sgemmt.c_gemm[i * ldc + j];
+    } else {
+        for (i = 0; i < m; i++)
+            for (j = 0; j <= i; j++)
+                data_sgemmt.c_verify[i * ldc + j] =
+                    data_sgemmt.c_gemm[i * ldc + j];
+    }
+}
+
+/**
+ * Comapare results computed by sgemmt and sgemmt_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static float check_sgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                          char transb, blasint m, blasint k, float alpha, blasint lda, 
+                          blasint ldb, float beta, blasint ldc)
+{
+    blasint i;
+    blasint b_cols;
+    blasint a_cols;
+    blasint inc = 1;
+    blasint size_c = m * ldc;
+
+    if(order == CblasColMajor){
+        if (transa == 'T' || transa == 'C' || 
+            transa == CblasTrans || transa == CblasConjTrans) 
+            a_cols = m;
+        else a_cols = k;
+
+        if (transb == 'T' || transb == 'C' || 
+            transb == CblasTrans || transb == CblasConjTrans) 
+            b_cols = k;
+        else b_cols = m;
+    } else {
+        if (transa == 'T' || transa == 'C' || 
+            transa == CblasTrans || transa == CblasConjTrans) 
+            a_cols = k;
+        else a_cols = m;
+
+        if (transb == 'T' || transb == 'C' ||
+            transb == CblasTrans || transb == CblasConjTrans) 
+            b_cols = m;
+        else b_cols = k;
+    }
+
+    srand_generate(data_sgemmt.a_test, a_cols * lda);
+    srand_generate(data_sgemmt.b_test, b_cols * ldb);
+    srand_generate(data_sgemmt.c_test, m * ldc);
+
+    for (i = 0; i < m * ldc; i++)
+        data_sgemmt.c_gemm[i] = data_sgemmt.c_verify[i] = data_sgemmt.c_test[i];
+
+    sgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc);
+
+    if (api == 'F')
+        BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test,
+                         &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc);
+    else
+        cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda,
+                    data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc);
+
+    for (i = 0; i < m * ldc; i++)
+        data_sgemmt.c_verify[i] -= data_sgemmt.c_test[i];
+
+    return BLASFUNC(snrm2)(&size_c, data_sgemmt.c_verify, &inc) / size_c;
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in sgemmt
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                         char transb, blasint m, blasint k, blasint lda, blasint ldb,
+                         blasint ldc, int expected_info)
+{
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    set_xerbla("SGEMMT ", expected_info);
+
+    if (api == 'F')
+        BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_sgemmt.a_test,
+                         &lda, data_sgemmt.b_test, &ldb, &beta, data_sgemmt.c_test, &ldc);
+    else
+        cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, data_sgemmt.a_test, lda,
+                    data_sgemmt.b_test, ldb, beta, data_sgemmt.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, upper_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 100;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, upper_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'T';
+    char uplo = 'U';
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'T';
+    char uplo = 'U';
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, lower_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 100;
+    char transa = 'T', transb = 'N';
+    char uplo = 'L';
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, lower_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'T';
+    char uplo = 'L';
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'T';
+    char uplo = 'L';
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_colmajor_upper_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 100;
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_colmajor_upper_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, c_api_colmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, c_api_colmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_colmajor_lower_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 100;
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_colmajor_lower_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, c_api_colmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, c_api_colmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 100;
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 100;
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param uplo.
+ * Must be upper (U) or lower (L).
+ */
+CTEST(sgemmt, xerbla_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'O';
+    int expected_info = 1;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transa.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(sgemmt, xerbla_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'O', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 2;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transb.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(sgemmt, xerbla_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'O';
+    char uplo = 'U';
+    int expected_info = 3;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 4;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 5;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(sgemmt, xerbla_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 8;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B not transposed.
+ */
+CTEST(sgemmt, xerbla_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 10;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(sgemmt, xerbla_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 13;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test.
+ * Test error function for an invalid param order.
+ * Must be column or row major.
+ */
+CTEST(sgemmt, xerbla_c_api_major_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 0;
+
+    int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B not transposed.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B transposed.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c
new file mode 100644
index 000000000..0d9c44e73
--- /dev/null
+++ b/utest/test_extensions/test_simatcopy.c
@@ -0,0 +1,947 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_SIMATCOPY {
+    float a_test[DATASIZE* DATASIZE];
+    float a_verify[DATASIZE* DATASIZE];
+};
+
+#ifdef BUILD_SINGLE
+static struct DATA_SIMATCOPY data_simatcopy;
+
+/**
+ * Comapare results computed by simatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * return norm of difference between openblas and reference func
+ */
+static float check_simatcopy(char api, char order, char trans, blasint rows, blasint cols, float alpha, 
+                             blasint lda_src, blasint lda_dst)
+{
+    blasint m, n;
+    blasint rows_out, cols_out;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+
+    if (order == 'C') {
+        n = rows; m = cols;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        rows_out = n; cols_out = m;
+    }
+    else {
+        rows_out = m; cols_out = n;
+    }
+
+    srand_generate(data_simatcopy.a_test, lda_src*m);
+
+    if (trans == 'T' || trans == 'C') {
+        stranspose(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst);
+    } 
+    else {
+        scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, 
+                            &lda_src, &lda_dst);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_simatcopy(corder, ctrans, rows, cols, alpha, data_simatcopy.a_test, 
+                    lda_src, lda_dst);
+    }
+
+    // Find the differences between output matrix computed by simatcopy and reference func
+    return smatrix_difference(data_simatcopy.a_test, data_simatcopy.a_verify, cols_out, rows_out, lda_dst);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda_src, blasint lda_dst, int expected_info)
+{
+    float alpha = 1.0f;
+
+    set_xerbla("SIMATCOPY", expected_info);
+
+    BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, 
+                        &lda_src, &lda_dst);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_50_row_100_alpha_one)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_50_row_100_alpha_one)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_50_row_100_alpha_zero)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_50_row_100_alpha_zero)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_50_alpha_one)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'C'; // same as trans for real matrix
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_50_alpha_one)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_50_alpha_zero)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'C'; // same as trans for real matrix
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_50_alpha_zero)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'C'; // same as trans for real matrix
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(simatcopy, xerbla_invalid_order)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'O';
+    char trans = 'T';
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(simatcopy, xerbla_invalid_trans)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'O';
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param m.
+ * Must be positive.
+ */
+CTEST(simatcopy, xerbla_invalid_rows)
+{
+    blasint m = 0, n = 100;
+    blasint lda_src = 0, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param n.
+ * Must be positive.
+ */
+CTEST(simatcopy, xerbla_invalid_cols)
+{
+    blasint m = 100, n = 0;
+    blasint lda_src = 100, lda_dst = 0;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 4;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using row major layout, 
+ * lda_src must be at least n.
+ */
+CTEST(simatcopy, xerbla_rowmajor_invalid_lda)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using column major layout,
+ * lda_src must be at least m.
+ */
+CTEST(simatcopy, xerbla_colmajor_invalid_lda)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, lda_dst must be at least n.
+ */
+CTEST(simatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is transposition, lda_dst must be at least m.
+ */
+CTEST(simatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, lda_dst must be at least m.
+ */
+CTEST(simatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is transposition, lda_dst must be at least n.
+ */
+CTEST(simatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c
new file mode 100644
index 000000000..c75bbc75e
--- /dev/null
+++ b/utest/test_extensions/test_somatcopy.c
@@ -0,0 +1,672 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_SOMATCOPY {
+    float a_test[DATASIZE * DATASIZE];
+    float b_test[DATASIZE * DATASIZE];
+    float b_verify[DATASIZE * DATASIZE];
+};
+
+#ifdef BUILD_SINGLE
+static struct DATA_SOMATCOPY data_somatcopy;
+
+/**
+ * Comapare results computed by somatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param alpha - scaling factor for matrix B
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * return norm of difference between openblas and reference func
+ */
+static float check_somatcopy(char api, char order, char trans, blasint rows, blasint cols, float alpha, 
+                             blasint lda, blasint ldb)
+{
+    blasint b_rows, b_cols;
+    blasint m, n;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+
+    if (order == 'C') {
+        m = cols; n = rows;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        b_rows = n; b_cols = m;
+    }
+    else {
+        b_rows = m; b_cols = n;
+    }
+
+    srand_generate(data_somatcopy.a_test, lda*m);
+
+    if (trans == 'T' || trans == 'C') {
+        stranspose(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb);
+    } 
+    else {
+        scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test, 
+                            &lda, data_somatcopy.b_test, &ldb);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_somatcopy(corder, ctrans, rows, cols, alpha, data_somatcopy.a_test, 
+                    lda, data_somatcopy.b_test, ldb);
+    }
+    
+    return smatrix_difference(data_somatcopy.b_test, data_somatcopy.b_verify, b_cols, b_rows, ldb);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda, blasint ldb, int expected_info)
+{
+    float alpha = 1.0;
+
+    set_xerbla("SOMATCOPY", expected_info);
+
+    BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test, 
+                        &lda, data_somatcopy.b_test, &ldb);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(somatcopy, colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 2.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(somatcopy, colmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 2.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(somatcopy, colmajor_trans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 50, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 0.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(somatcopy, colmajor_notrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 50, ldb = 50;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 0.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(somatcopy, rowmajor_conjtrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'C'; // same as trans for real matrix
+    float alpha = 2.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(somatcopy, rowmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 50;
+    char order = 'R';
+    char trans = 'N'; 
+    float alpha = 2.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Matrix dimensions leave residues from 4 and 2 (specialize
+ * for rt case)
+ * alpha = 1.5
+ */
+CTEST(somatcopy, rowmajor_trans_col_27_row_27)
+{
+    blasint m = 27, n = 27;
+    blasint lda = 27, ldb = 27;
+    char order = 'R';
+    char trans = 'T'; 
+    float alpha = 1.5f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(somatcopy, rowmajor_notrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'N'; 
+    float alpha = 0.0f;
+
+    float norm = check_somatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test somatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_somatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(somatcopy, xerbla_invalid_order)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'O';
+    char trans = 'T';
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(somatcopy, xerbla_invalid_trans)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'O';
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param m.
+ * Must be positive.
+ */
+CTEST(somatcopy, xerbla_invalid_rows)
+{
+    blasint m = 0, n = 100;
+    blasint lda = 0, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param n.
+ * Must be positive.
+ */
+CTEST(somatcopy, xerbla_invalid_cols)
+{
+    blasint m = 100, n = 0;
+    blasint lda = 100, ldb = 0;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 4;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using row major layout,
+ * lda must be at least n.
+ */
+CTEST(somatcopy, xerbla_rowmajor_invalid_lda)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 50, ldb = 100;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using column major layout,
+ * lda must be at least m.
+ */
+CTEST(somatcopy, xerbla_colmajor_invalid_lda)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and
+ * there is no transposition, ldb must be at least n.
+ */
+CTEST(somatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and
+ * there is transposition, ldb must be at least m.
+ */
+CTEST(somatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and
+ * there is no transposition, ldb must be at least m.
+ */
+CTEST(somatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and
+ * there is transposition, ldb must be at least n.
+ */
+CTEST(somatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_srotmg.c b/utest/test_extensions/test_srotmg.c
new file mode 100644
index 000000000..3c97e3b4d
--- /dev/null
+++ b/utest/test_extensions/test_srotmg.c
@@ -0,0 +1,414 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#ifdef BUILD_SINGLE
+
+/**
+ * Fortran API specific test
+ * Test srotmg by comparing it against pre-calculated values
+ */
+CTEST(srotmg, y1_zero)
+{
+	float te_d1, tr_d1;
+	float te_d2, tr_d2;
+	float te_x1, tr_x1;
+	float te_y1, tr_y1;
+	float te_param[5];
+	float tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 2.0f;
+	te_d2 = tr_d2 = 2.0f;
+	te_x1 = tr_x1 = 8.0f;
+	te_y1 = tr_y1 = 0.0f;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0f;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 2.0f;
+	tr_d2 = 2.0f;
+	tr_x1 = 8.0f;
+	tr_y1 = 0.0f;
+
+	tr_param[0] = -2.0f;
+	tr_param[1] = 0.0f;
+	tr_param[2] = 0.0f;
+	tr_param[3] = 0.0f;
+	tr_param[4] = 0.0f;
+
+	//OpenBLAS
+	BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test srotmg by comparing it against pre-calculated values
+ */
+CTEST(srotmg, d1_negative)
+{
+	float te_d1, tr_d1;
+	float te_d2, tr_d2;
+	float te_x1, tr_x1;
+	float te_y1, tr_y1;
+	float te_param[5];
+	float tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = -1.0f;
+	te_d2 = tr_d2 = 2.0f;
+	te_x1 = tr_x1 = 8.0f;
+	te_y1 = tr_y1 = 8.0f;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0f;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 0.0f;
+	tr_d2 = 0.0f;
+	tr_x1 = 0.0f;
+	tr_y1 = 8.0f;
+
+	tr_param[0] = -1.0f;
+	tr_param[1] = 0.0f;
+	tr_param[2] = 0.0f;
+	tr_param[3] = 0.0f;
+	tr_param[4] = 0.0f;
+
+	//OpenBLAS
+	BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test srotmg by comparing it against pre-calculated values
+ */
+CTEST(srotmg, d1_positive_d2_positive_x1_zero)
+{
+	float te_d1, tr_d1;
+	float te_d2, tr_d2;
+	float te_x1, tr_x1;
+	float te_y1, tr_y1;
+	float te_param[5];
+	float tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 2.0f;
+	te_d2 = tr_d2 = 2.0f;
+	te_x1 = tr_x1 = 0.0f;
+	te_y1 = tr_y1 = 8.0f;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0f;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 2.0f;
+	tr_d2 = 2.0f;
+	tr_x1 = 8.0f;
+	tr_y1 = 8.0f;
+
+	tr_param[0] = 1.0f;
+	tr_param[1] = 0.0f;
+	tr_param[2] = 0.0f;
+	tr_param[3] = 0.0f;
+	tr_param[4] = 0.0f;
+
+	//OpenBLAS
+	BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test srotmg by comparing it against pre-calculated values
+ */
+CTEST(srotmg, scaled_y_greater_than_scaled_x)
+{
+	float te_d1, tr_d1;
+	float te_d2, tr_d2;
+	float te_x1, tr_x1;
+	float te_y1, tr_y1;
+	float te_param[5];
+	float tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 1.0f;
+	te_d2 = tr_d2 = -2.0f;
+	te_x1 = tr_x1 = 8.0f;
+	te_y1 = tr_y1 = 8.0f;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0f;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 0.0f;
+	tr_d2 = 0.0f;
+	tr_x1 = 0.0f;
+	tr_y1 = 8.0f;
+
+	tr_param[0] = -1.0f;
+	tr_param[1] = 0.0f;
+	tr_param[2] = 0.0f;
+	tr_param[3] = 0.0f;
+	tr_param[4] = 0.0f;
+
+	//OpenBLAS
+	BLASFUNC(srotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test srotmg by comparing it against pre-calculated values
+ */
+CTEST(srotmg, c_api_y1_zero)
+{
+	float te_d1, tr_d1;
+	float te_d2, tr_d2;
+	float te_x1, tr_x1;
+	float te_y1, tr_y1;
+	float te_param[5];
+	float tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 2.0f;
+	te_d2 = tr_d2 = 2.0f;
+	te_x1 = tr_x1 = 8.0f;
+	te_y1 = tr_y1 = 0.0f;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0f;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 2.0f;
+	tr_d2 = 2.0f;
+	tr_x1 = 8.0f;
+	tr_y1 = 0.0f;
+
+	tr_param[0] = -2.0f;
+	tr_param[1] = 0.0f;
+	tr_param[2] = 0.0f;
+	tr_param[3] = 0.0f;
+	tr_param[4] = 0.0f;
+
+	//OpenBLAS
+	cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test srotmg by comparing it against pre-calculated values
+ */
+CTEST(srotmg, c_api_d1_negative)
+{
+	float te_d1, tr_d1;
+	float te_d2, tr_d2;
+	float te_x1, tr_x1;
+	float te_y1, tr_y1;
+	float te_param[5];
+	float tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = -1.0f;
+	te_d2 = tr_d2 = 2.0f;
+	te_x1 = tr_x1 = 8.0f;
+	te_y1 = tr_y1 = 8.0f;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0f;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 0.0f;
+	tr_d2 = 0.0f;
+	tr_x1 = 0.0f;
+	tr_y1 = 8.0f;
+
+	tr_param[0] = -1.0f;
+	tr_param[1] = 0.0f;
+	tr_param[2] = 0.0f;
+	tr_param[3] = 0.0f;
+	tr_param[4] = 0.0f;
+
+	//OpenBLAS
+	cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test srotmg by comparing it against pre-calculated values
+ */
+CTEST(srotmg, c_api_d1_positive_d2_positive_x1_zero)
+{
+	float te_d1, tr_d1;
+	float te_d2, tr_d2;
+	float te_x1, tr_x1;
+	float te_y1, tr_y1;
+	float te_param[5];
+	float tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 2.0f;
+	te_d2 = tr_d2 = 2.0f;
+	te_x1 = tr_x1 = 0.0f;
+	te_y1 = tr_y1 = 8.0f;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0f;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 2.0f;
+	tr_d2 = 2.0f;
+	tr_x1 = 8.0f;
+	tr_y1 = 8.0f;
+
+	tr_param[0] = 1.0f;
+	tr_param[1] = 0.0f;
+	tr_param[2] = 0.0f;
+	tr_param[3] = 0.0f;
+	tr_param[4] = 0.0f;
+
+	//OpenBLAS
+	cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test srotmg by comparing it against pre-calculated values
+ */
+CTEST(srotmg, c_api_scaled_y_greater_than_scaled_x)
+{
+	float te_d1, tr_d1;
+	float te_d2, tr_d2;
+	float te_x1, tr_x1;
+	float te_y1, tr_y1;
+	float te_param[5];
+	float tr_param[5];
+	int i = 0;
+	te_d1 = tr_d1 = 1.0f;
+	te_d2 = tr_d2 = -2.0f;
+	te_x1 = tr_x1 = 8.0f;
+	te_y1 = tr_y1 = 8.0f;
+
+	for(i=0; i<5; i++){
+	  te_param[i] = tr_param[i] = 0.0f;
+	}
+	
+	//reference values as calculated by netlib blas
+	tr_d1 = 0.0f;
+	tr_d2 = 0.0f;
+	tr_x1 = 0.0f;
+	tr_y1 = 8.0f;
+
+	tr_param[0] = -1.0f;
+	tr_param[1] = 0.0f;
+	tr_param[2] = 0.0f;
+	tr_param[3] = 0.0f;
+	tr_param[4] = 0.0f;
+
+	//OpenBLAS
+	cblas_srotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, SINGLE_EPS);
+
+	for(i=0; i<5; i++){
+		ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], SINGLE_EPS);
+	}
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_ssum.c b/utest/test_extensions/test_ssum.c
new file mode 100644
index 000000000..971a0d2e0
--- /dev/null
+++ b/utest/test_extensions/test_ssum.c
@@ -0,0 +1,403 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_SINGLE
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.1f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.1f, 0.0f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.1f, -1.0f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.1f, -1.5f, 1.0f, 1.0f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.3f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.1f, 0.0f, -1.0f, -3.0f, 2.2f, 3.0f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.1f, 1.0f, -2.2f, 3.3f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.2f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 2.0f, 2.2f, 2.7f, -3.3f, -5.9f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {0.0f, 3.0f, 1.0f, -2.2f, 2.2f, -1.7f, 3.3f, 14.5f, 0.0f, -9.0f};
+
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = (i & 1) ? -1.0f : 1.0f;
+   }
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = (i & 1) ? -1.0f : 1.0f;
+   }
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.1f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.1f, 0.0f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.1f, -1.0f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.1f, -1.5f, 1.0f, 1.0f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(2.1f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(4.3f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.1f, 0.0f, -1.0f, -3.0f, 2.2f, 3.0f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(2.3f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.1f, 1.0f, -2.2f, 3.3f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(3.2f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 2.0f, 2.2f, 2.7f, -3.3f, -5.9f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {0.0f, 3.0f, 1.0f, -2.2f, 2.2f, -1.7f, 3.3f, 14.5f, 0.0f, -9.0f};
+
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(6.5f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = (i & 1) ? -1.0f : 1.0f;
+   }
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test ssum by comparing it against pre-calculated values
+ */
+CTEST(ssum, c_api_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = (i & 1) ? -1.0f : 1.0f;
+   }
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(50.0f, sum, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zaxpby.c b/utest/test_extensions/test_zaxpby.c
new file mode 100644
index 000000000..6148f44c5
--- /dev/null
+++ b/utest/test_extensions/test_zaxpby.c
@@ -0,0 +1,630 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZAXPBY {
+    double x_test[DATASIZE * INCREMENT * 2];
+    double x_verify[DATASIZE * INCREMENT * 2];
+    double y_test[DATASIZE * INCREMENT * 2];
+    double y_verify[DATASIZE * INCREMENT * 2];
+};
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZAXPBY data_zaxpby;
+
+/**
+ * Fortran API specific function
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static double check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, blasint incy)
+{
+    blasint i;
+
+    // zscal accept only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Fill vectors x, y
+    drand_generate(data_zaxpby.x_test, n * incx_abs * 2);
+    drand_generate(data_zaxpby.y_test, n * incy_abs * 2);
+
+    // Copy vector x for zaxpy
+    for (i = 0; i < n * incx_abs * 2; i++)
+        data_zaxpby.x_verify[i] = data_zaxpby.x_test[i];
+
+    // Copy vector y for zscal
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_zaxpby.y_verify[i] = data_zaxpby.y_test[i];
+
+    // Find beta*y
+    BLASFUNC(zscal)(&n, beta, data_zaxpby.y_verify, &incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    BLASFUNC(zaxpy)(&n, alpha, data_zaxpby.x_verify, &incx,
+                        data_zaxpby.y_verify, &incy);
+    
+    BLASFUNC(zaxpby)(&n, alpha, data_zaxpby.x_test, &incx,
+                        beta, data_zaxpby.y_test, &incy);
+
+    // Find the differences between output vector caculated by zaxpby and zaxpy
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_zaxpby.y_test[i] -= data_zaxpby.y_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(dznrm2)(&n, data_zaxpby.y_test, &incy_abs);
+}
+
+/**
+ * C API specific function
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static double c_api_check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, blasint incy)
+{
+    blasint i;
+
+    // zscal accept only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Fill vectors x, y
+    drand_generate(data_zaxpby.x_test, n * incx_abs * 2);
+    drand_generate(data_zaxpby.y_test, n * incy_abs * 2);
+
+    // Copy vector x for zaxpy
+    for (i = 0; i < n * incx_abs * 2; i++)
+        data_zaxpby.x_verify[i] = data_zaxpby.x_test[i];
+
+    // Copy vector y for zscal
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_zaxpby.y_verify[i] = data_zaxpby.y_test[i];
+
+    // Find beta*y
+    cblas_zscal(n, beta, data_zaxpby.y_verify, incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    cblas_zaxpy(n, alpha, data_zaxpby.x_verify, incx,
+                        data_zaxpby.y_verify, incy);
+    
+    cblas_zaxpby(n, alpha, data_zaxpby.x_test, incx,
+                        beta, data_zaxpby.y_test, incy);
+
+    // Find the differences between output vector caculated by zaxpby and zaxpy
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_zaxpby.y_test[i] -= data_zaxpby.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_dznrm2(n, data_zaxpby.y_test, incy_abs);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(zaxpby, inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(zaxpby, inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double alpha[] = {3.0, 1.0};
+    double beta[] = {4.0, 3.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    double alpha[] = {5.0, 2.2};
+    double beta[] = {4.0, 5.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(zaxpby, inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {6.0, 3.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(zaxpby, inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    double alpha[] = {7.0, 2.0};
+    double beta[] = {3.5, 1.3};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(zaxpby, inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(zaxpby, inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(zaxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(zaxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(zaxpby, check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zaxpby(n, alpha, incx, beta, incy);
+    
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(zaxpby, c_api_inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 2.1};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, c_api_inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double alpha[] = {3.0, 2.0};
+    double beta[] = {4.0, 3.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, c_api_inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    double alpha[] = {5.0, 2.0};
+    double beta[] = {4.0, 3.1};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(zaxpby, c_api_inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {6.0, 2.3};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(zaxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {3.5, 1.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(zaxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(zaxpby, c_api_check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
diff --git a/utest/test_extensions/test_zaxpyc.c b/utest/test_extensions/test_zaxpyc.c
new file mode 100644
index 000000000..7c11cd920
--- /dev/null
+++ b/utest/test_extensions/test_zaxpyc.c
@@ -0,0 +1,159 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZAXPYC {
+	double x_test[DATASIZE * INCREMENT * 2];
+	double x_verify[DATASIZE * INCREMENT * 2];
+	double y_test[DATASIZE * INCREMENT * 2];
+	double y_verify[DATASIZE * INCREMENT * 2];
+};
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZAXPYC data_zaxpyc;
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Compare with the following options:
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static double check_zaxpyc(blasint n, double *alpha, blasint incx, blasint incy)
+{
+	blasint i;
+
+	drand_generate(data_zaxpyc.x_test, n * incx * 2);
+	drand_generate(data_zaxpyc.y_test, n * incy * 2);
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_zaxpyc.x_verify[i] = data_zaxpyc.x_test[i];
+
+	for (i = 0; i < n * incy * 2; i++)
+		data_zaxpyc.y_verify[i] = data_zaxpyc.y_test[i];
+
+	zconjugate_vector(n, incx, data_zaxpyc.x_verify);
+
+	BLASFUNC(zaxpy)
+	(&n, alpha, data_zaxpyc.x_verify, &incx,
+	 data_zaxpyc.y_verify, &incy);
+
+	BLASFUNC(zaxpyc)
+	(&n, alpha, data_zaxpyc.x_test, &incx,
+	 data_zaxpyc.y_test, &incy);
+
+	for (i = 0; i < n * incy * 2; i++)
+		data_zaxpyc.y_verify[i] -= data_zaxpyc.y_test[i];
+
+	return BLASFUNC(dznrm2)(&n, data_zaxpyc.y_verify, &incy);
+}
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(zaxpyc, conj_strides_one)
+{
+	blasint n = DATASIZE, incx = 1, incy = 1;
+	double alpha[] = {5.0, 2.2};
+
+	double norm = check_zaxpyc(n, alpha, incx, incy);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(zaxpyc, conj_incx_one)
+{
+	blasint n = DATASIZE, incx = 1, incy = 2;
+	double alpha[] = {5.0, 2.2};
+
+	double norm = check_zaxpyc(n, alpha, incx, incy);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(zaxpyc, conj_incy_one)
+{
+	blasint n = DATASIZE, incx = 2, incy = 1;
+	double alpha[] = {5.0, 2.2};
+
+	double norm = check_zaxpyc(n, alpha, incx, incy);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(zaxpyc, conj_strides_two)
+{
+	blasint n = DATASIZE, incx = 2, incy = 2;
+	double alpha[] = {5.0, 2.2};
+
+	double norm = check_zaxpyc(n, alpha, incx, incy);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
diff --git a/utest/test_extensions/test_zgbmv.c b/utest/test_extensions/test_zgbmv.c
new file mode 100644
index 000000000..55473361c
--- /dev/null
+++ b/utest/test_extensions/test_zgbmv.c
@@ -0,0 +1,280 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 1
+
+struct DATA_ZGBMV {
+    double a_test[DATASIZE * DATASIZE * 2];
+    double a_band_storage[DATASIZE * DATASIZE * 2];
+    double matrix[DATASIZE * DATASIZE * 2];
+    double b_test[DATASIZE * 2 * INCREMENT];
+    double c_test[DATASIZE * 2 * INCREMENT];
+    double c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX16
+
+static struct DATA_ZGBMV data_zgbmv;
+
+/** 
+ * Transform full-storage band matrix A to band-packed storage mode.
+ * 
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * output param a - buffer for holding band-packed matrix
+ * param lda - specifies the leading dimension of a
+ * param matrix - buffer holding full-storage band matrix A 
+ * param ldm - specifies the leading full-storage band matrix A
+ */
+static void transform_to_band_storage(blasint m, blasint n, blasint kl, 
+                                      blasint ku, double* a, blasint lda,
+                                      double* matrix, blasint ldm)
+{
+    blasint i, j, k;
+    for (j = 0; j < n; j++) 
+    {
+        k = 2 * (ku - j);
+        for (i = MAX(0, 2*(j - ku)); i < MIN(m, j + kl + 1) * 2; i+=2) 
+        {
+            a[(k + i) + j * lda * 2] = matrix[i + j * ldm * 2];
+            a[(k + i) + j * lda * 2 + 1] = matrix[i + j * ldm * 2 + 1];
+        }
+    }
+}
+
+/** 
+ * Generate full-storage band matrix A with kl sub-diagonals and ku super-diagonals
+ * 
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * output param band_matrix - buffer for full-storage band matrix.
+ * param matrix - buffer holding input general matrix
+ * param ldm - specifies the leading of input general matrix
+ */
+static void get_band_matrix(blasint m, blasint n, blasint kl, blasint ku, 
+                            double *band_matrix, double *matrix, blasint ldm)
+{
+    blasint i, j;
+    blasint k = 0;
+    for (i = 0; i < n; i++)
+    {
+        for (j = 0; j < m * 2; j += 2)
+        {
+            if ((blasint)(j/2) > kl + i || i > ku + (blasint)(j/2)) 
+            {
+                band_matrix[i * ldm * 2 + j] = 0.0;
+                band_matrix[i * ldm * 2 + j + 1] = 0.0;
+                continue;
+            }
+
+            band_matrix[i * ldm * 2 + j] = matrix[k++];
+            band_matrix[i * ldm * 2 + j + 1] = matrix[k++];
+        }
+    }
+}
+
+/**
+ * Comapare results computed by zgbmv and zgemv 
+ * since gbmv is gemv for band matrix
+ * 
+ * param trans specifies op(A), the transposition operation applied to A
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences 
+ */
+static double check_zgbmv(char trans, blasint m, blasint n, blasint kl, blasint ku,
+    double *alpha, blasint lda, blasint inc_b, double *beta, blasint inc_c)
+{
+    blasint i;
+    blasint lenb, lenc;
+    
+    if(trans == 'T' || trans == 'C' || trans == 'D' || trans == 'U'){
+        lenb = m;
+        lenc = n;
+    } else {
+        lenb = n;
+        lenc = m;
+    }
+
+    drand_generate(data_zgbmv.matrix, m * n * 2);
+    drand_generate(data_zgbmv.b_test, 2 * (1 + (lenb - 1) * inc_b));
+    drand_generate(data_zgbmv.c_test, 2 * (1 + (lenc - 1) * inc_c));
+
+    for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++)
+        data_zgbmv.c_verify[i] = data_zgbmv.c_test[i];
+
+    get_band_matrix(m, n, kl, ku, data_zgbmv.a_test, data_zgbmv.matrix, m);
+
+    transform_to_band_storage(m, n, kl, ku, data_zgbmv.a_band_storage, lda, data_zgbmv.a_test, m);
+
+    BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgbmv.a_test, &m, data_zgbmv.b_test,
+                    &inc_b, beta, data_zgbmv.c_verify, &inc_c);
+
+    BLASFUNC(zgbmv)(&trans, &m, &n, &kl, &ku, alpha, data_zgbmv.a_band_storage, &lda, data_zgbmv.b_test, 
+                    &inc_b, beta, data_zgbmv.c_test, &inc_c);
+
+    for (i = 0; i < 2 * (1 + (lenc - 1) * inc_c); i++)
+        data_zgbmv.c_verify[i] -= data_zgbmv.c_test[i];
+
+    return BLASFUNC(dznrm2)(&lenc, data_zgbmv.c_verify, &inc_c);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is D
+ */
+CTEST(zgbmv, trans_D)
+{
+    blasint m = 50, n = 25;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    char trans = 'D';
+
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+
+    double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is O
+ */
+CTEST(zgbmv, trans_O)
+{
+    blasint m = 50, n = 25;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 20, ku = 10;
+    blasint lda = 50;
+    char trans = 'O';
+
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+
+    double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is S
+ */
+CTEST(zgbmv, trans_S)
+{
+    blasint m = 50, n = 25;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 6, ku = 9;
+    blasint lda = 50;
+    char trans = 'S';
+
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+
+    double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is U
+ */
+CTEST(zgbmv, trans_U)
+{
+    blasint m = 25, n = 50;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 7, ku = 11;
+    blasint lda = kl + ku + 1;
+    char trans = 'U';
+
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+
+    double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is C
+ */
+CTEST(zgbmv, trans_C)
+{
+    blasint m = 50, n = 25;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    char trans = 'C';
+
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+
+    double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is R
+ */
+CTEST(zgbmv, trans_R)
+{
+    blasint m = 50, n = 100;
+    blasint inc_b = 1, inc_c = 1;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    char trans = 'R';
+
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+
+    double norm = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+#endif
diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c
new file mode 100644
index 000000000..917c04829
--- /dev/null
+++ b/utest/test_extensions/test_zgeadd.c
@@ -0,0 +1,880 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+
+struct DATA_ZGEADD {
+    double a_test[M * N * 2];
+    double c_test[M * N * 2];
+    double c_verify[M * N * 2];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZGEADD data_zgeadd;
+
+/**
+ * zgeadd reference implementation
+ *
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param aptr - refer to matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param cptr - refer to matrix C
+ * param ldc - leading dimension of C
+ */
+static void zgeadd_trusted(blasint m, blasint n, double *alpha, double *aptr,
+                           blasint lda, double *beta, double *cptr, blasint ldc)
+{
+    blasint i;
+
+    lda *= 2;
+    ldc *= 2;
+
+    for (i = 0; i < n; i++)
+    {
+        cblas_zaxpby(m, alpha, aptr, 1, beta, cptr, 1);
+        aptr += lda;
+        cptr += ldc;
+    }
+}
+
+/**
+ * Test zgeadd by comparing it against reference
+ * Compare with the following options:
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static double check_zgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                           blasint m, blasint n, double *alpha, blasint lda,
+                           double *beta, blasint ldc)
+{
+    blasint i;
+    blasint cols = m, rows = n;
+
+    if (order == CblasRowMajor)
+    {
+        rows = m;
+        cols = n;
+    }
+
+    // Fill matrix A, C
+    srand_generate(data_zgeadd.a_test, lda * rows * 2);
+    srand_generate(data_zgeadd.c_test, ldc * rows * 2);
+
+    // Copy matrix C for zgeadd
+    for (i = 0; i < ldc * rows * 2; i++)
+        data_zgeadd.c_verify[i] = data_zgeadd.c_test[i];
+
+    zgeadd_trusted(cols, rows, alpha, data_zgeadd.a_test, lda,
+                   beta, data_zgeadd.c_verify, ldc);
+
+    if (api == 'F')
+        BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda,
+                         beta, data_zgeadd.c_test, &ldc);
+    else
+        cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda,
+                     beta, data_zgeadd.c_test, ldc);
+
+    // Find the differences between output matrix caculated by zgeadd and sgemm
+    return dmatrix_difference(data_zgeadd.c_test, data_zgeadd.c_verify, cols, rows, ldc * 2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param lda - leading dimension of A
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in zgeadd
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                         blasint m, blasint n, blasint lda,
+                         blasint ldc, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    set_xerbla("ZGEADD ", expected_info);
+
+    if (api == 'F')
+        BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda,
+                         beta, data_zgeadd.c_test, &ldc);
+    else
+        cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda,
+                     beta, data_zgeadd.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(zgeadd, matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {3.0, 2.0};
+    double beta[] = {1.0, 3.0};
+
+    double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(zgeadd, matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(zgeadd, matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {3.0, 1.5};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(zgeadd, matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(zgeadd, matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C
+ * Must be at least zero.
+ */
+CTEST(zgeadd, xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ */
+CTEST(zgeadd, xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ */
+CTEST(zgeadd, xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 6;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ */
+CTEST(zgeadd, xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - number of columns of A, C equal zero.
+ */
+CTEST(zgeadd, n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if m - number of rows of A and C equal zero.
+ */
+CTEST(zgeadd, m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 3.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {4.0, 1.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(zgeadd, c_api_matrix_n_50_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N / 2;
+    blasint m = M;
+
+    blasint lda = n;
+    blasint ldc = n;
+
+    double alpha[] = {3.0, 2.5};
+    double beta[] = {1.0, 2.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {3.0, 1.5};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of colums is 100
+ * For C number of rows is 100, number of colums is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of colums is 100
+ * For C number of rows is 50, number of colums is 100
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha[] = {2.0, 3.0};
+    double beta[] = {2.0, 4.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order -
+ * specifies whether A and C stored in
+ * row-major order or column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_invalid_order)
+{
+    CBLAS_ORDER order = INVALID;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 0;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(zgeadd, c_api_xerbla_n_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(zgeadd, c_api_xerbla_m_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(zgeadd, c_api_xerbla_lda_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(zgeadd, c_api_xerbla_ldc_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Check if n - number of columns of A, C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(zgeadd, c_api_n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if m - number of rows of A and C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(zgeadd, c_api_m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgeadd('C', order, m, n, alpha,
+                               lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zgemm.c b/utest/test_extensions/test_zgemm.c
new file mode 100644
index 000000000..4160a5086
--- /dev/null
+++ b/utest/test_extensions/test_zgemm.c
@@ -0,0 +1,273 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZGEMM {
+	double a_test[DATASIZE * DATASIZE * 2];
+    double a_verify[DATASIZE * DATASIZE * 2];
+	double b_test[DATASIZE * DATASIZE * 2];
+    double b_verify[DATASIZE * DATASIZE * 2];
+    double c_test[DATASIZE * DATASIZE * 2];
+	double c_verify[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZGEMM data_zgemm;
+
+/**
+ * Test zgemm with the conjugate matrices by conjugating and not transposed matrices
+ * and comparing it with the non-conjugate zgemm.
+ *
+ * param transa specifies op(A), the transposition (conjugation) operation applied to A
+ * param transb specifies op(B), the transposition (conjugation) operation applied to B
+ * param m specifies the number of rows of the matrix op(A) and of the matrix C
+ * param n specifies the number of columns of the matrix op(B) and the number of columns of the matrix C
+ * param k specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of matrix A
+ * param ldb - leading dimension of matrix B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of matrix C
+ * return norm of difference
+ */
+static double check_zgemm(char transa, char transb, blasint m, blasint n, blasint k, 
+                         double *alpha, blasint lda, blasint ldb, double *beta, blasint ldc)
+{
+	blasint i;
+	double alpha_conj[] = {1.0, 0.0}; 
+	char transa_verify = transa;
+    char transb_verify = transb;
+
+    int arows = k, acols = m;
+    int brows = n, bcols = k;
+
+    if (transa == 'T' || transa == 'C'){
+        arows = m; acols = k;
+    }
+
+    if (transb == 'T' || transb == 'C'){
+        brows = k; bcols = n;
+    }
+
+	drand_generate(data_zgemm.a_test, arows * lda * 2);
+	drand_generate(data_zgemm.b_test, brows * ldb * 2);
+    drand_generate(data_zgemm.c_test, n * ldc * 2);
+
+	for (i = 0; i < arows * lda * 2; i++)
+		data_zgemm.a_verify[i] = data_zgemm.a_test[i];
+
+	for (i = 0; i < brows * ldb * 2; i++)
+		data_zgemm.b_verify[i] = data_zgemm.b_test[i];
+
+    for (i = 0; i < n * ldc * 2; i++)
+		data_zgemm.c_verify[i] = data_zgemm.c_test[i];
+
+	if (transa == 'R'){
+		cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_zgemm.a_verify, lda, lda);
+		transa_verify = 'N';
+	}
+
+    if (transb == 'R'){
+		cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_zgemm.b_verify, ldb, ldb);
+	    transb_verify = 'N';
+	}
+
+	BLASFUNC(zgemm)(&transa_verify, &transb_verify, &m, &n, &k, alpha, data_zgemm.a_verify, &lda,
+	 				data_zgemm.b_verify, &ldb, beta, data_zgemm.c_verify, &ldc);
+
+	BLASFUNC(zgemm)(&transa, &transb, &m, &n, &k, alpha, data_zgemm.a_test, &lda,
+	 				data_zgemm.b_test, &ldb, beta, data_zgemm.c_test, &ldc);
+
+	return dmatrix_difference(data_zgemm.c_test, data_zgemm.c_verify, m, n, ldc*2);
+}
+
+/**
+ * Test zgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate zgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(zgemm, conjtransa_conjnotransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'C';
+	char transb = 'R';
+	double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+	double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate zgemm.
+ * Test with the following options:
+ *
+ * matrix A is not conjugate and not transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(zgemm, notransa_conjnotransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'N';
+	char transb = 'R';
+	double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+	double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate zgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is conjugate and transposed
+ */
+CTEST(zgemm, conjnotransa_conjtransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'R';
+	char transb = 'C';
+	double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+	double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate zgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is not conjugate and not transposed
+ */
+CTEST(zgemm, conjnotransa_notransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'R';
+	char transb = 'N';
+	double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+	double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate zgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(zgemm, conjnotransa_conjnotransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'R';
+	char transb = 'R';
+	double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+	double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate zgemm.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is transposed
+ */
+CTEST(zgemm, conjnotransa_transb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'R';
+	char transb = 'T';
+	double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+	double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test zgemm with the conjugate matrices by conjugating and not transposed matrices 
+ * and comparing it with the non-conjugate zgemm.
+ * Test with the following options:
+ *
+ * matrix A is transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(zgemm, transa_conjnotransb)
+{
+	blasint n = DATASIZE, m = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+	char transa = 'T';
+	char transb = 'R';
+	double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+	double norm = check_zgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zgemmt.c b/utest/test_extensions/test_zgemmt.c
new file mode 100644
index 000000000..c55381008
--- /dev/null
+++ b/utest/test_extensions/test_zgemmt.c
@@ -0,0 +1,2010 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_ZGEMMT {
+    double a_test[DATASIZE * DATASIZE * 2];
+    double b_test[DATASIZE * DATASIZE * 2];
+    double c_test[DATASIZE * DATASIZE * 2];
+    double c_verify[DATASIZE * DATASIZE * 2];
+    double c_gemm[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZGEMMT data_zgemmt;
+
+/**
+ * Compute gemmt via gemm since gemmt is gemm but updates only 
+ * the upper or lower triangular part of the result matrix
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ */
+static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                           char transb, blasint m, blasint k, double *alpha, blasint lda, 
+                           blasint ldb, double *beta, blasint ldc)
+{
+    blasint i, j;
+
+    if(api == 'F')
+        BLASFUNC(zgemm)(&transa, &transb, &m, &m, &k, alpha, data_zgemmt.a_test, &lda,
+                        data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_gemm, &ldc);
+    else
+        cblas_zgemm(order, transa, transb, m, m, k, alpha, data_zgemmt.a_test, lda,
+                data_zgemmt.b_test, ldb, beta, data_zgemmt.c_gemm, ldc);
+
+    ldc *= 2;
+
+    if (uplo == 'L' || uplo == CblasLower)
+    {
+        for (i = 0; i < m; i++)
+            for (j = i * 2; j < m * 2; j+=2){
+                data_zgemmt.c_verify[i * ldc + j] =
+                    data_zgemmt.c_gemm[i * ldc + j];
+                data_zgemmt.c_verify[i * ldc + j + 1] =
+                    data_zgemmt.c_gemm[i * ldc + j + 1];
+            }
+    } else {
+        for (i = 0; i < m; i++)
+            for (j = 0; j <= i * 2; j+=2){
+                data_zgemmt.c_verify[i * ldc + j] =
+                    data_zgemmt.c_gemm[i * ldc + j];
+                data_zgemmt.c_verify[i * ldc + j + 1] =
+                    data_zgemmt.c_gemm[i * ldc + j + 1];
+            }
+    }
+}
+
+/**
+ * Comapare results computed by zgemmt and zgemmt_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static double check_zgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                          char transb, blasint m, blasint k, double *alpha, blasint lda, 
+                          blasint ldb, double *beta, blasint ldc)
+{
+    blasint i;
+    blasint b_cols;
+    blasint a_cols;
+    blasint inc = 1;
+    blasint size_c = m * ldc * 2;
+
+    if(order == CblasColMajor){
+        if (transa == 'T' || transa == 'C' || 
+            transa == CblasTrans || transa == CblasConjTrans) 
+            a_cols = m;
+        else a_cols = k;
+
+        if (transb == 'T' || transb == 'C' || 
+            transb == CblasTrans || transb == CblasConjTrans) 
+            b_cols = k;
+        else b_cols = m;
+    } else {
+        if (transa == 'T' || transa == 'C' || 
+            transa == CblasTrans || transa == CblasConjTrans) 
+            a_cols = k;
+        else a_cols = m;
+
+        if (transb == 'T' || transb == 'C' ||
+            transb == CblasTrans || transb == CblasConjTrans) 
+            b_cols = m;
+        else b_cols = k;
+    }
+
+    drand_generate(data_zgemmt.a_test, a_cols * lda * 2);
+    drand_generate(data_zgemmt.b_test, b_cols * ldb  * 2);
+    drand_generate(data_zgemmt.c_test, m * ldc * 2);
+
+    for (i = 0; i < m * ldc * 2; i++)
+        data_zgemmt.c_gemm[i] = data_zgemmt.c_verify[i] = data_zgemmt.c_test[i];
+
+    zgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc);
+
+    if (api == 'F')
+        BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test,
+                         &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc);
+    else
+        cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda,
+                    data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc);
+
+    for (i = 0; i < m * ldc * 2; i++)
+        data_zgemmt.c_verify[i] -= data_zgemmt.c_test[i];
+
+    return BLASFUNC(dnrm2)(&size_c, data_zgemmt.c_verify, &inc) / size_c;
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in zgemmt
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa, 
+                         char transb, blasint m, blasint k, blasint lda, blasint ldb,
+                         blasint ldc, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    set_xerbla("ZGEMMT ", expected_info);
+
+    if (api == 'F')
+        BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test,
+                         &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc);
+    else
+        cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda,
+                    data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(zgemmt, upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(zgemmt, upper_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(zgemmt, upper_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    char transa = 'N', transb = 'T';
+    char uplo = 'U';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(zgemmt, upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'T';
+    char uplo = 'U';
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    char transa = 'R', transb = 'R';
+    char uplo = 'U';
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'C', transb = 'R';
+    char uplo = 'U';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'R', transb = 'C';
+    char uplo = 'U';
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, upper_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'C', transb = 'C';
+    char uplo = 'U';
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 0.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(zgemmt, lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(zgemmt, lower_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'L';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(zgemmt, lower_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    char transa = 'N', transb = 'T';
+    char uplo = 'L';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(zgemmt, lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'T';
+    char uplo = 'L';
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    char transa = 'R', transb = 'R';
+    char uplo = 'L';
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'C', transb = 'R';
+    char uplo = 'L';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'R', transb = 'C';
+    char uplo = 'L';
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'C', transb = 'C';
+    char uplo = 'L';
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'L';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 0.0};
+
+    double norm = check_zgemmt('F', CblasColMajor, uplo, transa, transb,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, c_api_colmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, c_api_colmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 0.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, c_api_colmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, c_api_colmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 0.0};
+
+    double norm = check_zgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 25;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {-1.0, -1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 25, ldc = 25;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjtrans_b_conjtrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasConjTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 0.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_25_a_trans_b_notrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_25_K_50_a_notrans_b_trans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 25;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 25, ldc = 25;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjNoTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 25, ldc = 50;
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.5, 2.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasConjTrans, CblasConjTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 0.0};
+
+    double norm = check_zgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param uplo.
+ * Must be upper (U) or lower (L).
+ */
+CTEST(zgemmt, xerbla_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'O';
+    int expected_info = 1;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transa.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(zgemmt, xerbla_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'O', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 2;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transb.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(zgemmt, xerbla_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'O';
+    char uplo = 'U';
+    int expected_info = 3;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 4;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 5;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(zgemmt, xerbla_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 8;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B not transposed.
+ */
+CTEST(zgemmt, xerbla_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 10;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(zgemmt, xerbla_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 13;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test.
+ * Test error function for an invalid param order.
+ * Must be column or row major.
+ */
+CTEST(zgemmt, xerbla_c_api_major_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 0;
+
+    int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B not transposed.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param lda.
+ * Must be must be at least K if matrix A transposed.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldb.
+ * Must be must be at least K if matrix B transposed.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldc.
+ * Must be must be at least M.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zgemv_n.c b/utest/test_extensions/test_zgemv_n.c
new file mode 100644
index 000000000..903b855e1
--- /dev/null
+++ b/utest/test_extensions/test_zgemv_n.c
@@ -0,0 +1,341 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZSPMV_N {
+    double a_test[DATASIZE * DATASIZE * 2];
+    double b_test[DATASIZE * 2 * INCREMENT];
+    double c_test[DATASIZE * 2 * INCREMENT];
+    double c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX16
+
+static struct DATA_ZSPMV_N data_zgemv_n;
+
+/**
+ * zgemv not transposed reference code
+ *
+ * param trans specifies whether matris A is conj or/and xconj
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param alpha - scaling factor for the matrib-vector product
+ * param a - buffer holding input matrib A
+ * param lda - leading dimension of matrix A
+ * param b - Buffer holding input vector b
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param c - buffer holding input/output vector c
+ * param inc_c - stride of vector c
+ */
+static void zgemv_n_trusted(char trans, blasint m, blasint n, double *alpha, double *a,
+                          blasint lda, double *b, blasint inc_b, double *beta, double *c,
+                          blasint inc_c)
+{
+	blasint i, j;
+    blasint i2 = 0;
+	blasint ib = 0, ic = 0;
+
+    double temp_r, temp_i;
+
+	double *a_ptr = a;
+    blasint lda2 = 2*lda;
+
+	blasint inc_b2 = 2 * inc_b;
+    blasint inc_c2 = 2 * inc_c;
+
+    BLASFUNC(zscal)(&m, beta, c, &inc_c);
+
+	for (j = 0; j < n; j++)
+	{
+
+        if (trans == 'N' || trans == 'R') {
+            temp_r = alpha[0] * b[ib] - alpha[1] * b[ib+1];
+            temp_i = alpha[0] * b[ib+1] + alpha[1] * b[ib];
+        } else {
+            temp_r = alpha[0] * b[ib] + alpha[1] * b[ib+1];
+            temp_i = alpha[0] * b[ib+1] - alpha[1] * b[ib];
+        }
+
+		ic = 0;
+		i2 = 0;
+
+		for (i = 0; i < m; i++)
+		{
+                if (trans == 'N') {
+                    c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
+                    c[ic+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+                } 
+                if (trans == 'O') {
+                    c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
+                    c[ic+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+                }
+                if (trans == 'R') {
+                    c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
+                    c[ic+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+                }
+                if (trans == 'S') {
+                    c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
+                    c[ic+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+                }
+			i2 += 2;
+			ic += inc_c2;
+		}
+		a_ptr += lda2;
+		ib += inc_b2;
+	}
+
+}
+
+/**
+ * Comapare results computed by zgemv and zgemv_n_trusted
+ *
+ * param trans specifies whether matris A is conj or/and xconj
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param alpha - scaling factor for the matrib-vector product
+ * param lda - leading dimension of matrix A
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences
+ */
+static double check_zgemv_n(char trans, blasint m, blasint n, double *alpha, blasint lda, 
+                            blasint inc_b, double *beta, blasint inc_c)
+{
+    blasint i;
+
+    drand_generate(data_zgemv_n.a_test, n * lda);
+    drand_generate(data_zgemv_n.b_test, 2 * n * inc_b);
+    drand_generate(data_zgemv_n.c_test, 2 * m * inc_c);
+
+    for (i = 0; i < m * 2 * inc_c; i++)
+        data_zgemv_n.c_verify[i] = data_zgemv_n.c_test[i];
+
+    zgemv_n_trusted(trans, m, n, alpha, data_zgemv_n.a_test, lda, data_zgemv_n.b_test, 
+                  inc_b, beta, data_zgemv_n.c_test, inc_c);
+    BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_n.a_test, &lda, data_zgemv_n.b_test, 
+                    &inc_b, beta, data_zgemv_n.c_verify, &inc_c);
+
+    for (i = 0; i < m * 2 * inc_c; i++)
+        data_zgemv_n.c_verify[i] -= data_zgemv_n.c_test[i];
+
+    return BLASFUNC(dznrm2)(&n, data_zgemv_n.c_verify, &inc_c);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_o_square_matrix)
+{
+    blasint n = 100, m = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'O';
+    double alpha[] = {2.0, -1.0};
+    double beta[] = {1.4, 5.0};
+
+    double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows of A is 50
+ * Number of colums of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_o_rectangular_matrix_rows_less_then_cols)
+{
+    blasint n = 100, m = 50, lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'O';
+    double alpha[] = {2.0, -1.0};
+    double beta[] = {1.4, 5.0};
+
+    double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows of A is 100
+ * Number of colums of A is 50
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_o_rectangular_matrix_cols_less_then_rows)
+{
+    blasint n = 50, m = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'O';
+    double alpha[] = {2.0, -1.0};
+    double beta[] = {1.4, 5.0};
+
+    double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(zgemv, trans_o_double_strides)
+{
+    blasint n = 100, m = 100, lda = 100;
+    blasint inc_b = 2, inc_c = 2;
+    char trans = 'O';
+    double alpha[] = {2.0, -1.0};
+    double beta[] = {1.4, 5.0};
+
+    double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_s_square_matrix)
+{
+    blasint n = 100, m = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'S';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.4, 5.0};
+
+    double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows of A is 50
+ * Number of colums of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_s_rectangular_matrix_rows_less_then_cols)
+{
+    blasint n = 100, m = 50, lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'S';
+    double alpha[] = {2.0, -1.0};
+    double beta[] = {1.4, 5.0};
+
+    double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows of A is 100
+ * Number of colums of A is 50
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_s_rectangular_matrix_cols_less_then_rows)
+{
+    blasint n = 50, m = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    char trans = 'S';
+    double alpha[] = {2.0, -1.0};
+    double beta[] = {1.4, 0.0};
+
+    double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(zgemv, trans_s_double_strides)
+{
+    blasint n = 100, m = 100, lda = 100;
+    blasint inc_b = 2, inc_c = 2;
+    char trans = 'S';
+    double alpha[] = {2.0, -1.0};
+    double beta[] = {1.0, 5.0};
+
+    double norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+#endif
diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c
new file mode 100644
index 000000000..2e0ee65f0
--- /dev/null
+++ b/utest/test_extensions/test_zgemv_t.c
@@ -0,0 +1,1136 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+#define INCREMENT 2
+
+struct DATA_ZGEMV_T {
+    double a_test[N * M * 2];
+    double a_verify[N * M * 2];
+    double y_test[M * INCREMENT * 2];
+    double y_verify[M * INCREMENT * 2];
+    double x_test[M * INCREMENT * 2];
+    double x_verify[M * INCREMENT * 2];
+};
+
+// DOUBLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * DBL_EPSILON
+// DOUBLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 2.2e-16 = 1e-11
+#define DOUBLE_EPS_ZGEMV 1e-11
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZGEMV_T data_zgemv_t;
+
+/**
+ * Find product of matrix-vector multiplication
+ *
+ * param n specifies number of columns of A
+ * param m specifies number of rows of A and size of vector x
+ * param lda specifies leading dimension of A
+ * param inc_x specifies increment of vector x
+ */
+static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x)
+{
+    blasint i;
+    double *a_ptr = data_zgemv_t.a_verify;
+    double *x_ptr = data_zgemv_t.x_test;
+    double *x_res = data_zgemv_t.x_verify;
+
+    openblas_complex_double result;
+
+    for (i = 0; i < n * inc_x; i += inc_x)
+    {
+        result = cblas_zdotu(lda, a_ptr, 1, x_ptr, inc_x);
+        x_res[0] = CREAL(result);
+        x_res[1] = CIMAG(result);
+        a_ptr += lda * 2;
+        x_res += 2 * inc_x;
+    }
+}
+
+/**
+ * Test zgemv by comparing it against zomatcopy, zaxpby and
+ * reference func matrix_vector_product
+ *
+ * zomatcopy perform operation: op(A)
+ * matrix_vector_product perform operation: A*x
+ * zaxpby perform operation: alpha*x + beta*y
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param alpha specifies scalar alpha
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param beta specifies scalar beta
+ * param inc_y specifies increment for vector y
+ * return norm of difference between zgemv and result of reference funcs
+ */
+static double check_zgemv(char api, char order, char trans, blasint m, blasint n, double *alpha,
+                          blasint lda, blasint inc_x, double *beta, blasint inc_y)
+{
+    blasint i;
+
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+
+    // Transpose parameters for zomatcopy
+    // zgemv_t perform operation on transposed matrix, no need to transpose a_verify
+    char trans_copy;
+    char ctrans_copy;
+
+    // Param alpha for zomatcopy, scale on alpha perform zaxpby
+    double alpha_one[] = {1.0, 0.0};
+
+    memset(data_zgemv_t.x_verify, 0.0, m * inc_x * 2 * sizeof(double));
+
+    // Fill matrix A, vectors x, y
+    drand_generate(data_zgemv_t.a_test, lda * n * 2);
+    drand_generate(data_zgemv_t.x_test, m * inc_x * 2);
+    drand_generate(data_zgemv_t.y_test, m * inc_y * 2);
+
+    // Copy vector y for reference funcs
+    for (int i = 0; i < m * inc_y * 2; i++)
+    {
+        data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i];
+    }
+
+    if (api == 'F') {
+        if (trans == 'T') trans_copy = 'N';
+        if (trans == 'C') trans_copy = 'R';
+        if (trans == 'U') trans_copy = 'R';
+        if (trans == 'D') trans_copy = 'N';
+
+        // Perform operation: op(A)
+        BLASFUNC(zomatcopy)(&order, &trans_copy, &m, &n, alpha_one, 
+                            data_zgemv_t.a_test, &lda, data_zgemv_t.a_verify, &lda);
+
+        // Find A*x
+        matrix_vector_product(n, m, lda, inc_x);
+
+        // Find conj(x)
+        if (trans == 'U' || trans == 'D')
+        {
+            zconjugate_vector(m, inc_x, data_zgemv_t.x_verify);
+        }
+
+        // Find alpha*x+beta*y
+        BLASFUNC(zaxpby)(&n, alpha, data_zgemv_t.x_verify, &inc_x, beta, 
+                         data_zgemv_t.y_verify, &inc_y);
+
+        BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_t.a_test, &lda, 
+                        data_zgemv_t.x_test, &inc_x, beta, data_zgemv_t.y_test, &inc_y);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') {ctrans = CblasTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasTrans : CblasNoTrans;}
+        if (trans == 'N') {ctrans = CblasNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasNoTrans : CblasTrans;}
+        if (trans == 'C') {ctrans = CblasConjTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjTrans : CblasConjNoTrans;}
+        if (trans == 'R') {ctrans = CblasConjNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjNoTrans : CblasConjTrans;}
+
+        // Perform operation: op(A)
+        cblas_zomatcopy(corder, ctrans_copy, m, n, alpha_one, data_zgemv_t.a_test, lda, data_zgemv_t.a_verify, lda);
+
+        // Find A*x
+        matrix_vector_product(n, m, lda, inc_x);
+
+        // Find alpha*x+beta*y
+        cblas_zaxpby(n, alpha, data_zgemv_t.x_verify, inc_x, beta, data_zgemv_t.y_verify, inc_y);
+
+        cblas_zgemv(corder, ctrans, m, n, alpha, data_zgemv_t.a_test,
+                    lda, data_zgemv_t.x_test, inc_x, beta, data_zgemv_t.y_test, inc_y);
+    }
+
+    // Find the differences between output vector caculated by zgemv and reference funcs
+    for (i = 0; i < m * inc_y * 2; i++)
+        data_zgemv_t.y_test[i] -= data_zgemv_t.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_dznrm2(m, data_zgemv_t.y_test, inc_y);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param inc_y specifies increment for vector y
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint m, blasint n,
+                         blasint lda, blasint inc_x, blasint inc_y, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+    double a[] = {1.0, 1.0};
+    double x[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+    double y[] = {1.0, 1.0};
+
+    set_xerbla("ZGEMV ", expected_info);
+
+    BLASFUNC(zgemv)(&trans, &m, &n, alpha, a, &lda, x, 
+                    &inc_x, beta, y, &inc_y);
+
+    return check_error();
+}
+
+/**
+ * C API specific function
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param inc_y specifies increment for vector y
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int c_api_check_badargs(CBLAS_ORDER corder, CBLAS_TRANSPOSE ctrans, blasint m, blasint n,
+                               blasint lda, blasint inc_x, blasint inc_y, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+    double a[] = {1.0, 1.0};
+    double x[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+    double y[] = {1.0, 1.0};
+
+    set_xerbla("ZGEMV ", expected_info);
+
+    cblas_zgemv(corder, ctrans, m, n, alpha, a, lda, x, inc_x, beta, y, inc_y);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 2.0, beta_i = 2.0
+ */
+CTEST(zgemv, colmajor_trans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'T';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 2.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('F', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 2.0, beta_i = 2.0
+ */
+CTEST(zgemv, colmajor_trans_col_100_row_100_inc_x_2_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'T';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 2.0};
+
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('F', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 2.0, beta_i = 1.0
+ */
+CTEST(zgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('F', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 1, inc y = 2
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 2.0, beta_i = 1.0
+ */
+CTEST(zgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    double norm = check_zgemv('F', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and x conjugate
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 2.0, beta_i = 1.0
+ */
+CTEST(zgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('F', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and x conjugate
+ * Square matrix
+ * inc x = 2, inc y = 2
+ * alpha_r = 1.0, alpha_i = 2.0
+ * beta_r = 1.0, beta_i = 1.0
+ */
+CTEST(zgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_2_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'U';
+
+    double alpha[] = {1.0, 2.0};
+    double beta[] = {1.0, 1.0};
+
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    double norm = check_zgemv('F', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition, conjugate A, conjugate x
+ * Square matrix
+ * inc x = 2, inc y = 2
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, colmajor_conjtrans_x_conj_col_100_row_100_inc_x_1_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'D';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 2.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    double norm = check_zgemv('F', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition, conjugate A, conjugate x
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, c_api_colmajor_trans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'T';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 2.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('C', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 2.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('C', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 1, inc y = 2
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 2.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    double norm = check_zgemv('C', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 1.0
+ */
+CTEST(zgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'N';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('C', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * No trans
+ * Square matrix
+ * inc x = 2, inc y = 2
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 3.0, beta_i = 2.0
+ */
+CTEST(zgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_2_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'N';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {3.0, 1.0};
+
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    double norm = check_zgemv('C', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Conjugate
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 1.0, alpha_i = 3.0
+ * beta_r = 1.0, beta_i = 2.5
+ */
+CTEST(zgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'R';
+
+    double alpha[] = {1.0, 3.0};
+    double beta[] = {1.0, 2.5};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('C', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Conjugate
+ * Square matrix
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 1.5
+ */
+CTEST(zgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_2_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'R';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.5};
+
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    double norm = check_zgemv('C', order, trans, m, n, alpha, lda,
+                              inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param inc_y.
+ * Must be positive
+ *
+ * Column major
+ */
+CTEST(zgemv, xerbla_invalid_inc_y)
+{
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 0;
+
+    int expected_info = 11;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_y.
+ * Must be positive
+ *
+ * Column major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_inc_y_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 0;
+
+    int expected_info = 11;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_y.
+ * Must be positive
+ *
+ * Row major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_inc_y_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 0;
+
+    int expected_info = 11;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be positive
+ *
+ * Column major
+ */
+CTEST(zgemv, xerbla_invalid_inc_x)
+{
+    char order = 'C';
+    char trans = 'T';
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 0;
+    blasint inc_y = 1;
+
+    int expected_info = 8;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be positive
+ *
+ * Column major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_inc_x_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 0;
+    blasint inc_y = 1;
+
+    int expected_info = 8;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be positive
+ *
+ * Row major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_inc_x_row_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 0;
+    blasint inc_y = 1;
+
+    int expected_info = 8;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n.
+ * Must be positive.
+ *
+ * Column major
+ */
+CTEST(zgemv, xerbla_invalid_n)
+{
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = 1, n = INVALID;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n.
+ * Must be positive.
+ *
+ * Column major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_n_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = INVALID;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 3;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n.
+ * Must be positive.
+ *
+ * Row major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_n_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = INVALID, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 3;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m.
+ * Must be positive.
+ *
+ * Column major
+ */
+CTEST(zgemv, xerbla_invalid_m)
+{
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = INVALID, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m.
+ * Must be positive.
+ *
+ * Column major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_m_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = INVALID, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 2;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m.
+ * Must be positive.
+ *
+ * Row major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_m_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = 1, n = INVALID;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 2;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * lda must be at least n.
+ *
+ * Column major
+ */
+CTEST(zgemv, xerbla_invalid_lda)
+{
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = 1, n = 1;
+    blasint lda = INVALID;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 6;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda.
+ * If matrices are stored using col major layout,
+ * lda must be at least m.
+ *
+ * Column major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_lda_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = INVALID;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 6;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda.
+ * If matrices are stored using col major layout,
+ * lda must be at least n.
+ *
+ * Column major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_lda_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = INVALID;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 6;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param trans.
+ *
+ * Column major
+ */
+CTEST(zgemv, xerbla_invalid_trans)
+{
+    char order = 'C';
+    char trans = 'Z';
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param trans.
+ *
+ * Column major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_trans_col_major)
+{
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = INVALID;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 1;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param trans.
+ *
+ * Row major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_trans_row_major)
+{
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = INVALID;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 1;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order.
+ */
+CTEST(zgemv, c_api_xerbla_invalid_order_col_major)
+{
+    enum CBLAS_ORDER corder = INVALID;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    int expected_info = 0;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c
new file mode 100644
index 000000000..6461ce88f
--- /dev/null
+++ b/utest/test_extensions/test_zimatcopy.c
@@ -0,0 +1,850 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_ZIMATCOPY {
+    double a_test[DATASIZE * DATASIZE * 2];
+    double a_verify[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZIMATCOPY data_zimatcopy;
+
+/**
+ * Comapare results computed by zimatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * return norm of difference between openblas and reference func
+ */
+static double check_zimatcopy(char api, char order, char trans, blasint rows, blasint cols, double *alpha, 
+                             blasint lda_src, blasint lda_dst)
+{
+    blasint m, n;
+    blasint rows_out, cols_out;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+    int conj = -1;
+
+    if (order == 'C') {
+        n = rows; m = cols;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        rows_out = n; cols_out = m*2;
+        if (trans == 'C')
+            conj = 1;
+    }
+    else {
+        rows_out = m; cols_out = n*2;
+        if (trans == 'R')
+            conj = 1;
+    }
+
+    drand_generate(data_zimatcopy.a_test, lda_src*m*2);
+
+    if (trans == 'T' || trans == 'C') {
+        ztranspose(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj);
+    } 
+    else {
+        zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test, 
+                            &lda_src, &lda_dst);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_zimatcopy(corder, ctrans, rows, cols, alpha, data_zimatcopy.a_test, 
+                    lda_src, lda_dst);
+    }
+
+    // Find the differences between output matrix computed by zimatcopy and reference func
+    return dmatrix_difference(data_zimatcopy.a_test, data_zimatcopy.a_verify, cols_out, rows_out, lda_dst*2);    
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda_src, blasint lda_dst, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+
+    set_xerbla("ZIMATCOPY", expected_info);
+
+    BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test, 
+                        &lda_src, &lda_dst);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha_r = -3.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha[] = {-3.0, 1.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'R';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, colmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'C';
+    double alpha[] = {2.0, 1.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_notrans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 50, lda_dst = 50;
+    char order = 'C';
+    char trans = 'N';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, colmajor_conjtrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'C';
+    double alpha[] = {1.0, 1.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_conj_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 50, lda_dst = 50;
+    char order = 'C';
+    char trans = 'R';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 2.0, alpha_i = 3.0
+ */
+CTEST(zimatcopy, rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha[] = {2.0, 3.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'R';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'C';
+    double alpha[] = {2.0, 1.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, rowmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 50;
+    char order = 'R';
+    char trans = 'N'; 
+    double alpha[] = {2.0, 1.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, rowmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zimatcopy, rowmajor_conj_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 50;
+    char order = 'R';
+    char trans = 'R'; 
+    double alpha[] = {1.5, -1.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, rowmajor_conjtrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'R';
+    char trans = 'C';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    double alpha[] = {3.0, 2.0};
+
+    double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 1.5
+ */
+CTEST(zimatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha[] = {3.0, 1.5};
+
+    double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    double alpha[] = {3.0, 1.0};
+
+    double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, c_api_colmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'R';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, c_api_colmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'C';
+    double alpha[] = {2.0, 1.0};
+
+    double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'N';
+    double alpha[] = {1.0, 1.0};
+
+    double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zimatcopy, c_api_rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'R'; 
+    double alpha[] = {1.5, -1.0};
+
+    double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, c_api_rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'C';
+    double alpha[] = {1.0, 2.0};
+
+    double norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(zimatcopy, xerbla_invalid_order)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'O';
+    char trans = 'T';
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(zimatcopy, xerbla_invalid_trans)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'O';
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param m.
+ * Must be positive.
+ */
+CTEST(zimatcopy, xerbla_invalid_rows)
+{
+    blasint m = 0, n = 100;
+    blasint lda_src = 0, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param n.
+ * Must be positive.
+ */
+CTEST(zimatcopy, xerbla_invalid_cols)
+{
+    blasint m = 100, n = 0;
+    blasint lda_src = 100, lda_dst = 0;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 4;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using row major layout, 
+ * lda_src must be at least n.
+ */
+CTEST(zimatcopy, xerbla_rowmajor_invalid_lda)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using column major layout,
+ * lda_src must be at least m.
+ */
+CTEST(zimatcopy, xerbla_colmajor_invalid_lda)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, lda_dst must be at least n.
+ */
+CTEST(zimatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is transposition, lda_dst must be at least m.
+ */
+CTEST(zimatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, lda_dst must be at least m.
+ */
+CTEST(zimatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is transposition, lda_dst must be at least n.
+ */
+CTEST(zimatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c
new file mode 100644
index 000000000..8df3dd80f
--- /dev/null
+++ b/utest/test_extensions/test_zomatcopy.c
@@ -0,0 +1,745 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_ZOMATCOPY {
+    double a_test[DATASIZE * DATASIZE * 2];
+    double b_test[DATASIZE * DATASIZE * 2];
+    double b_verify[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZOMATCOPY data_zomatcopy;
+
+/**
+ * Comapare results computed by zomatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param alpha - scaling factor for matrix B
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * return norm of difference between openblas and reference func
+ */
+static double check_zomatcopy(char api, char order, char trans, blasint rows, blasint cols, double* alpha, 
+                             blasint lda, blasint ldb)
+{
+    blasint b_rows, b_cols;
+    blasint m, n;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+    int conj = -1;
+
+    if (order == 'C') {
+        m = cols; n = rows;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        b_rows = n; b_cols = m*2;
+        if (trans == 'C')
+            conj = 1;
+    }
+    else {
+        b_rows = m; b_cols = n*2;
+        if (trans == 'R')
+            conj = 1;
+    }
+
+    drand_generate(data_zomatcopy.a_test, lda*m*2);
+
+    if (trans == 'T' || trans == 'C') {
+        ztranspose(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj);
+    } 
+    else {
+        zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test, 
+                            &lda, data_zomatcopy.b_test, &ldb);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_zomatcopy(corder, ctrans, rows, cols, alpha, data_zomatcopy.a_test, 
+                    lda, data_zomatcopy.b_test, ldb);
+    }
+    
+    return dmatrix_difference(data_zomatcopy.b_test, data_zomatcopy.b_verify, b_cols, b_rows, ldb*2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda, blasint ldb, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+
+    set_xerbla("ZOMATCOPY", expected_info);
+
+    BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test, 
+                        &lda, data_zomatcopy.b_test, &ldb);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, colmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha[] = {1.0, 2.0};
+    double norm;
+
+    norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * alpha_r = -1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    double alpha[] = {-1.0, 2.0};
+    double norm;
+
+    norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, colmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'R';
+    double alpha[] = {1.0, 2.0};
+    double norm;
+
+    norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zomatcopy, colmajor_conjtrnas_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'C';
+    double alpha[] = {2.0, 1.0};
+    double norm;
+
+    norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, rowmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 50;
+    char order = 'R';
+    char trans = 'N'; 
+    double alpha[] = {1.5, -1.0};
+    double norm;
+
+    norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, rowmajor_trans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'T';
+    double alpha[] = {1.5, -1.0};
+    double norm;
+
+    norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'R'; 
+    double alpha[] = {1.5, -1.0};
+    double norm;
+
+    norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'C';
+    double alpha[] = {1.0, 2.0};
+    double norm;
+
+    norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, c_api_colmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'N';
+    double alpha[] = {1.0, 2.0};
+    double norm;
+
+    norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * alpha_r = -1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, c_api_colmajor_trans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    double alpha[] = {-1.0, 2.0};
+    double norm;
+
+    norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, c_api_colmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'R';
+    double alpha[] = {1.0, 2.0};
+    double norm;
+
+    norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zomatcopy, c_api_colmajor_conjtrnas_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'C';
+    double alpha[] = {2.0, 1.0};
+    double norm;
+
+    norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, c_api_rowmajor_notrans_col_50_row_100)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 50;
+    char order = 'R';
+    char trans = 'N'; 
+    double alpha[] = {1.5, -1.0};
+    double norm;
+
+    norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, c_api_rowmajor_trans_col_100_row_50)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'T';
+    double alpha[] = {1.5, -1.0};
+    double norm;
+
+    norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, c_api_rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'R'; 
+    double alpha[] = {1.5, -1.0};
+    double norm;
+
+    norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zomatcopy by comparing it against refernce
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, c_api_rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'R';
+    char trans = 'C';
+    double alpha[] = {1.0, 2.0};
+    double norm;
+
+    norm = check_zomatcopy('C', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+* Test error function for an invalid param order.
+* Must be column (C) or row major (R).
+*/
+CTEST(zomatcopy, xerbla_invalid_order)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'O';
+    char trans = 'T';
+    int expected_info = 1;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param trans.
+* Must be trans (T/C) or no-trans (N/R).
+*/
+CTEST(zomatcopy, xerbla_invalid_trans)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100, ldb = 100;
+    char order = 'C';
+    char trans = 'O';
+    int expected_info = 2;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param m.
+* Must be positive.
+*/
+CTEST(zomatcopy, xerbla_invalid_rows)
+{
+    blasint m = 0, n = 100;
+    blasint lda = 0, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 3;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param n.
+* Must be positive.
+*/
+CTEST(zomatcopy, xerbla_invalid_cols)
+{
+    blasint m = 100, n = 0;
+    blasint lda = 100, ldb = 0;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 4;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param lda.
+* If matrices are stored using row major layout, 
+* lda must be at least n.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_invalid_lda)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 50, ldb = 100;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param lda.
+* If matrices are stored using column major layout,
+* lda must be at least m.
+*/
+CTEST(zomatcopy, xerbla_colmajor_invalid_lda)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 50, ldb = 100;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 7;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param ldb.
+* If matrices are stored using row major layout and 
+* there is no transposition, ldb must be at least n.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param ldb.
+* If matrices are stored using row major layout and 
+* there is transposition, ldb must be at least m.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param ldb.
+* If matrices are stored using row major layout and 
+* there is no transposition, ldb must be at least n.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_conj_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'R';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param ldb.
+* If matrices are stored using row major layout and 
+* there is transposition, ldb must be at least m.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_transconj_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'R';
+    char trans = 'C';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param ldb.
+* If matrices are stored using column major layout and 
+* there is no transposition, ldb must be at least m.
+*/
+CTEST(zomatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'N';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param ldb.
+* If matrices are stored using column major layout and 
+* there is transposition, ldb must be at least n.
+*/
+CTEST(zomatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'T';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param ldb.
+* If matrices are stored using column major layout and 
+* there is no transposition, ldb must be at least m.
+*/
+CTEST(zomatcopy, xerbla_colmajor_conj_invalid_ldb)
+{
+    blasint m = 100, n = 50;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'R';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+* Test error function for an invalid param ldb.
+* If matrices are stored using column major layout and 
+* there is transposition, ldb must be at least n.
+*/
+CTEST(zomatcopy, xerbla_colmajor_transconj_invalid_ldb)
+{
+    blasint m = 50, n = 100;
+    blasint lda = 100, ldb = 50;
+    char order = 'C';
+    char trans = 'C';
+    int expected_info = 9;
+
+    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_zrot.c b/utest/test_extensions/test_zrot.c
new file mode 100644
index 000000000..5471e051a
--- /dev/null
+++ b/utest/test_extensions/test_zrot.c
@@ -0,0 +1,790 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZROT {
+    double x_test[DATASIZE * INCREMENT * 2];
+    double y_test[DATASIZE * INCREMENT * 2];
+    double x_verify[DATASIZE * INCREMENT * 2];
+    double y_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZROT data_zrot;
+
+/**
+ * Comapare results computed by zdrot and zaxpby 
+ * 
+ * param n specifies size of vector x
+ * param inc_x specifies increment of vector x
+ * param inc_y specifies increment of vector y
+ * param c specifies cosine
+ * param s specifies sine
+ * return norm of differences 
+ */
+static double check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, double *s)
+{
+    blasint i;
+    double norm = 0;
+    double s_neg[] = {-s[0], s[1]};
+
+    blasint inc_x_abs = labs(inc_x);
+    blasint inc_y_abs = labs(inc_y);
+
+    // Fill vectors x, y
+    drand_generate(data_zrot.x_test, n * inc_x_abs * 2);
+    drand_generate(data_zrot.y_test, n * inc_y_abs * 2);
+
+    if (inc_x == 0 && inc_y == 0) {
+        drand_generate(data_zrot.x_test, n * 2);
+        drand_generate(data_zrot.y_test, n * 2);
+    }
+
+    // Copy vector x for zaxpby
+    for (i = 0; i < n * inc_x_abs * 2; i++)
+        data_zrot.x_verify[i] = data_zrot.x_test[i];
+
+    // Copy vector y for zaxpby
+    for (i = 0; i < n * inc_y_abs * 2; i++)
+        data_zrot.y_verify[i] = data_zrot.y_test[i];
+    
+    // Find cx = c*x + s*y
+    BLASFUNC(zaxpby)(&n, s, data_zrot.y_test, &inc_y, c, data_zrot.x_verify, &inc_x);
+
+    // Find cy = -conjg(s)*x + c*y
+    BLASFUNC(zaxpby)(&n, s_neg, data_zrot.x_test, &inc_x, c, data_zrot.y_verify, &inc_y);
+
+    BLASFUNC(zdrot)(&n, data_zrot.x_test, &inc_x, data_zrot.y_test, &inc_y, c, s);
+
+    // Find the differences between vector x caculated by zaxpby and zdrot
+    for (i = 0; i < n * 2 * inc_x_abs; i++)
+        data_zrot.x_test[i] -= data_zrot.x_verify[i];
+
+    // Find the differences between vector y caculated by zaxpby and zdrot
+    for (i = 0; i < n * 2 * inc_y_abs; i++)
+        data_zrot.y_test[i] -= data_zrot.y_verify[i];
+
+    // Find the norm of differences
+    norm += BLASFUNC(dznrm2)(&n, data_zrot.x_test, &inc_x_abs);
+    norm += BLASFUNC(dznrm2)(&n, data_zrot.y_test, &inc_y_abs);
+    return (norm / 2);
+}
+
+/**
+ * C API specific function
+ * Comapare results computed by zdrot and zaxpby 
+ * 
+ * param n specifies size of vector x
+ * param inc_x specifies increment of vector x
+ * param inc_y specifies increment of vector y
+ * param c specifies cosine
+ * param s specifies sine
+ * return norm of differences 
+ */
+static double c_api_check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, double *s)
+{
+    blasint i;
+    double norm = 0;
+    double s_neg[] = {-s[0], s[1]};
+
+    blasint inc_x_abs = labs(inc_x);
+    blasint inc_y_abs = labs(inc_y);
+
+    // Fill vectors x, y
+    drand_generate(data_zrot.x_test, n * inc_x_abs * 2);
+    drand_generate(data_zrot.y_test, n * inc_y_abs * 2);
+
+    if (inc_x == 0 && inc_y == 0) {
+        drand_generate(data_zrot.x_test, n * 2);
+        drand_generate(data_zrot.y_test, n * 2);
+    }
+
+    // Copy vector x for zaxpby
+    for (i = 0; i < n * inc_x_abs * 2; i++)
+        data_zrot.x_verify[i] = data_zrot.x_test[i];
+
+    // Copy vector y for zaxpby
+    for (i = 0; i < n * inc_y_abs * 2; i++)
+        data_zrot.y_verify[i] = data_zrot.y_test[i];
+    
+    // Find cx = c*x + s*y
+    cblas_zaxpby(n, s, data_zrot.y_test, inc_y, c, data_zrot.x_verify, inc_x);
+
+    // Find cy = -conjg(s)*x + c*y
+    cblas_zaxpby(n, s_neg, data_zrot.x_test, inc_x, c, data_zrot.y_verify, inc_y);
+
+    cblas_zdrot(n, data_zrot.x_test, inc_x, data_zrot.y_test, inc_y, c[0], s[0]);
+
+    // Find the differences between vector x caculated by zaxpby and zdrot
+    for (i = 0; i < n * 2 * inc_x_abs; i++)
+        data_zrot.x_test[i] -= data_zrot.x_verify[i];
+
+    // Find the differences between vector y caculated by zaxpby and zdrot
+    for (i = 0; i < n * 2 * inc_y_abs; i++)
+        data_zrot.y_test[i] -= data_zrot.y_verify[i];
+
+    // Find the norm of differences
+    norm += cblas_dznrm2(n, data_zrot.x_test, inc_x_abs);
+    norm += cblas_dznrm2(n, data_zrot.y_test, inc_y_abs);
+    return (norm / 2);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 0
+ * Stride of vector y is 0
+ * c = 1.0
+ * s = 2.0
+ */
+CTEST(zrot, inc_x_0_inc_y_0)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 0;
+    blasint inc_y = 0;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {2.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_1_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is -1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_neg_1_inc_y_neg_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -1;
+    blasint inc_y = -1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * c = 3.0
+ * s = 2.0
+ */
+CTEST(zrot, inc_x_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {3.0, 0.0};
+    double s[] = {2.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_neg_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_1_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is -2
+ * c = 2.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_1_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = -2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {2.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 2.0
+ */
+CTEST(zrot, inc_x_2_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {2.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_neg_2_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = -2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 0.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_2_inc_y_2_c_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {0.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 0.0
+ */
+CTEST(zrot, inc_x_2_inc_y_2_s_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {0.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 0
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, check_n_zero)
+{
+    blasint n = 0;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test 
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 0
+ * Stride of vector y is 0
+ * c = 1.0
+ * s = 2.0
+ */
+CTEST(zrot, c_api_inc_x_0_inc_y_0)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 0;
+    blasint inc_y = 0;
+
+    // Imaginary  part for zaxpby
+    double c[] = {3.0, 0.0};
+    double s[] = {2.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_1_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is -1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_neg_1_inc_y_neg_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -1;
+    blasint inc_y = -1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * c = 3.0
+ * s = 2.0
+ */
+CTEST(zrot, c_api_inc_x_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {3.0, 0.0};
+    double s[] = {2.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_neg_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_1_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is -2
+ * c = 2.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_1_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = -2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {2.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 2.0
+ */
+CTEST(zrot, c_api_inc_x_2_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {2.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_neg_2_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = -2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 0.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_2_inc_y_2_c_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {0.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 0.0
+ */
+CTEST(zrot, c_api_inc_x_2_inc_y_2_s_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {0.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 0
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_check_n_zero)
+{
+    blasint n = 0;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for zaxpby
+    double c[] = {1.0, 0.0};
+    double s[] = {1.0, 0.0};
+
+    double norm = c_api_check_zdrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zrotg.c b/utest/test_extensions/test_zrotg.c
new file mode 100644
index 000000000..310121422
--- /dev/null
+++ b/utest/test_extensions/test_zrotg.c
@@ -0,0 +1,290 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * Fortran API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, zero_a)
+{
+    double sa[2] = {0.0, 0.0};
+    double sb[2] = {1.0, 1.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, zero_b)
+{
+    double sa[2] = {1.0, 1.0};
+    double sb[2] = {0.0, 0.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(1.0, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, zero_real)
+{
+    double sa[2] = {0.0, 1.0};
+    double sb[2] = {0.0, 1.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.70710678118654, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70710678118654, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421356237309, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, positive_real_positive_img)
+{
+    double sa[2] = {3.0, 4.0};
+    double sb[2] = {4.0, 6.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, negative_real_positive_img)
+{
+    double sa[2] = {-3.0, 4.0};
+    double sb[2] = {-4.0, 6.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, positive_real_negative_img)
+{
+    double sa[2] = {3.0, -4.0};
+    double sb[2] = {4.0, -6.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, negative_real_negative_img)
+{
+    double sa[2] = {-3.0, -4.0};
+    double sb[2] = {-4.0, -6.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, c_api_zero_a)
+{
+    double sa[2] = {0.0, 0.0};
+    double sb[2] = {1.0, 1.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, c_api_zero_b)
+{
+    double sa[2] = {1.0, 1.0};
+    double sb[2] = {0.0, 0.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(1.0, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, c_api_zero_real)
+{
+    double sa[2] = {0.0, 1.0};
+    double sb[2] = {0.0, 1.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.70710678118654, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70710678118654, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421356237309, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, c_api_positive_real_positive_img)
+{
+    double sa[2] = {3.0, 4.0};
+    double sb[2] = {4.0, 6.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, c_api_negative_real_positive_img)
+{
+    double sa[2] = {-3.0, 4.0};
+    double sb[2] = {-4.0, 6.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, c_api_positive_real_negative_img)
+{
+    double sa[2] = {3.0, -4.0};
+    double sb[2] = {4.0, -6.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrotg by comparing it against pre-calculated values
+ */
+CTEST(zrotg, c_api_negative_real_negative_img)
+{
+    double sa[2] = {-3.0, -4.0};
+    double sb[2] = {-4.0, -6.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zsbmv.c b/utest/test_extensions/test_zsbmv.c
new file mode 100644
index 000000000..afdb208c1
--- /dev/null
+++ b/utest/test_extensions/test_zsbmv.c
@@ -0,0 +1,606 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZSBMV {
+    double sp_matrix[DATASIZE * (DATASIZE + 1)];
+    double sb_matrix[DATASIZE * DATASIZE * 2];
+    double b_test[DATASIZE * 2 * INCREMENT];
+    double c_test[DATASIZE * 2 * INCREMENT];
+    double c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+// DOUBLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * DBL_EPSILON
+// DOUBLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 2.2e-16 = 1e-11
+#define DOUBLE_EPS_ZGEMV 1e-11
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZSBMV data_zsbmv;
+
+/** 
+ * Transform full-storage symmetric band matrix A to upper (U) or lower (L)
+ * band-packed storage mode.
+ * 
+ * param uplo specifies whether matrix a is upper or lower band-packed.
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * output param a - buffer for holding symmetric band-packed matrix
+ * param lda - specifies the leading dimension of a
+ * param sb_matrix - buffer holding full-storage symmetric band matrix A 
+ * param ldm - specifies the leading dimension of A
+ */
+static void transform_to_band_storage(char uplo, blasint n, blasint k, double* a, blasint lda,
+                                     double* sb_matrix, blasint ldm) 
+{
+    blasint i, j, m;
+    if (uplo == 'L') {
+        for (j = 0; j < n; j++)
+        {
+            m = -j;
+            for (i = 2 * j; i < MIN(2 * n, 2 * (j + k + 1)); i += 2)
+            {
+                a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2];
+                a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2];
+            }
+        }
+    }
+    else {
+        for (j = 0; j < n; j++)
+        {   
+            m = k - j;
+            for (i = MAX(0, 2*(j - k)); i <= j*2; i += 2)
+            {
+                a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2];
+                a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2];
+            }
+        }
+    }
+}
+
+/** 
+ * Generate full-storage symmetric band matrix A with k - super-diagonals
+ * from input symmetric packed matrix in lower packed mode (L)
+ * 
+ * output param sb_matrix - buffer for holding full-storage symmetric band matrix.
+ * param sp_matrix - buffer holding input symmetric packed matrix
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ */
+static void get_symmetric_band_matr(double *sb_matrix, double *sp_matrix, blasint n, blasint k)
+{
+    blasint m;
+    blasint i, j;
+    m = 0;
+    for (i = 0; i < n; i++)
+    {
+        for (j = 0; j < n * 2; j += 2)
+        {
+            // Make matrix band with k super-diagonals
+            if (fabs((i+1) - ceil((j+1)/2.0)) > k) 
+            {
+                sb_matrix[i * n * 2 + j] = 0.0;
+                sb_matrix[i * n * 2 + j + 1] = 0.0;
+                continue;
+            }
+
+            if (j / 2 < i)
+            {
+                sb_matrix[i * n * 2 + j] = 
+                        sb_matrix[j * n + i * 2];
+                sb_matrix[i * n * 2 + j + 1] = 
+                        sb_matrix[j * n + i * 2 + 1];
+            }
+            else
+            {
+                sb_matrix[i * n * 2 + j] = sp_matrix[m++];
+                sb_matrix[i * n * 2 + j + 1] = sp_matrix[m++];
+            }
+        }
+    }
+}
+
+/** 
+ * Check if error function was called with expected function name
+ * and param info
+ * 
+ * param uplo specifies whether matrix a is upper or lower band-packed.
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b_test
+ * param inc_c - stride of vector c_test
+ * param expected_info - expected invalid parameter number in zsbmv
+ * return TRUE if everything is ok, otherwise FALSE 
+ */
+static int check_badargs(char uplo, blasint n, blasint k, blasint lda, blasint inc_b,
+                          blasint inc_c, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double a[2];
+    drand_generate(a, 2);
+
+    set_xerbla("ZSBMV ", expected_info);
+
+    BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, data_zsbmv.b_test, 
+                    &inc_b, beta, data_zsbmv.c_test, &inc_c);
+
+    return check_error();
+}
+
+/**
+ * Comapare results computed by zsbmv and zgemv 
+ * since zsbmv is zgemv for symmetric band matrix
+ * 
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b_test
+ * param beta - scaling factor for vector c_test
+ * param inc_c - stride of vector c_test
+ * param lda - specifies the leading dimension of a
+ * return norm of differences 
+ */
+static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasint lda, 
+    blasint inc_b, double *beta, blasint inc_c, blasint ldm)
+{
+    blasint i;
+
+    // Trans param for gemv (can use any, since the input matrix is symmetric)
+    char trans = 'N';
+
+    // Symmetric band packed matrix for sbmv
+    double a[lda * n * 2];
+
+    // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test 
+    drand_generate(data_zsbmv.sp_matrix, n * (n + 1));
+    drand_generate(data_zsbmv.b_test, n * inc_b * 2);
+    drand_generate(data_zsbmv.c_test, n * inc_c * 2);
+
+    // Copy vector c_test for zgemv
+    for (i = 0; i < n * inc_c * 2; i++)
+        data_zsbmv.c_verify[i] = data_zsbmv.c_test[i];
+
+    // Generate full-storage symmetric band matrix
+    // with k super-diagonals from symmetric packed matrix
+    get_symmetric_band_matr(data_zsbmv.sb_matrix, data_zsbmv.sp_matrix, n, k);
+
+    // Transform symmetric band matrix from conventional
+    // full matrix storage  to band storage for zsbmv
+    transform_to_band_storage(uplo, n, k, a, lda, data_zsbmv.sb_matrix, ldm);
+
+    BLASFUNC(zgemv)(&trans, &n, &n, alpha, data_zsbmv.sb_matrix, &ldm, data_zsbmv.b_test,
+                    &inc_b, beta, data_zsbmv.c_verify, &inc_c);
+
+    BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda,
+                    data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c);
+
+    // Find the differences between output vector caculated by zsbmv and zgemv
+    for (i = 0; i < n * inc_c * 2; i++)
+        data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(dznrm2)(&n, data_zsbmv.c_test, &inc_c);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 0
+ */
+CTEST(zsbmv, upper_k_0_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 1
+ */
+CTEST(zsbmv, upper_k_1_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 1;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, upper_k_2_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, upper_k_2_inc_b_2_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 2
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, upper_k_2_inc_b_2_inc_c_2_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 2;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 0
+ */
+CTEST(zsbmv, lower_k_0_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 1
+ */
+CTEST(zsbmv, lower_k_1_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 1;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, lower_k_2_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, lower_k_2_inc_b_2_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 2
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, lower_k_2_inc_b_2_inc_c_2_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 2;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/** 
+ * Check if output matrix a contains any NaNs
+ */
+CTEST(zsbmv, check_for_NaN)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    
+    ASSERT_TRUE(norm == norm); /* NaN == NaN is false  */
+}
+
+/**
+ * Test error function for an invalid param uplo.
+ * Uplo specifies whether a is in upper (U) or lower (L) band-packed storage mode.
+ */
+CTEST(zsbmv, xerbla_uplo_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'O';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 1;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/** 
+ * Test error function for an invalid param N -
+ * number of rows and columns of A. Must be at least zero.
+ */
+CTEST(zsbmv, xerbla_n_invalid)
+{
+    blasint n = INVALID, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 2;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Check if n - number of rows and columns of A equal zero.
+ */
+CTEST(zsbmv, check_n_zero)
+{
+    blasint n = 0, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = 1;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test error function for an invalid param inc_b -
+ * stride of vector b_test. Can't be zero. 
+ */
+CTEST(zsbmv, xerbla_inc_b_zero)
+{
+    blasint n = 1, inc_b = 0, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 8;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_c -
+ * stride of vector c_test. Can't be zero. 
+ */
+CTEST(zsbmv, xerbla_inc_c_zero)
+{
+    blasint n = 1, inc_b = 1, inc_c = 0;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 11;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param k -
+ * number of super-diagonals of A. Must be at least zero.
+ */
+CTEST(zsbmv, xerbla_k_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = INVALID;
+    blasint lda = 1;
+    int expected_info = 3;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of a. Must be at least (k+1).
+ */
+CTEST(zsbmv, xerbla_lda_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = INVALID;
+    int expected_info = 6;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zscal.c b/utest/test_extensions/test_zscal.c
new file mode 100644
index 000000000..132f4ee5b
--- /dev/null
+++ b/utest/test_extensions/test_zscal.c
@@ -0,0 +1,165 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZSCAL {
+    double x_test[DATASIZE * 2 * INCREMENT];
+    double x_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZSCAL data_zscal;
+
+
+/**
+ * zscal reference code
+ *
+ * param n - number of elements of vector x
+ * param alpha - scaling factor for the vector product
+ * param x - buffer holding input vector x
+ * param inc - stride of vector x
+ */
+static void zscal_trusted(blasint n, double *alpha, double* x, blasint inc){
+    blasint i, ip = 0;
+    blasint inc_x2 = 2 * inc;
+    double temp;
+    for (i = 0; i < n; i++)
+	{
+        temp = alpha[0] * x[ip] - alpha[1] * x[ip+1];
+		x[ip+1] = alpha[0] * x[ip+1] + alpha[1] * x[ip];
+        x[ip] = temp;
+        ip += inc_x2;
+    }
+}
+
+/**
+ * Comapare results computed by zscal and zscal_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param n - number of elements of vector x
+ * param alpha - scaling factor for the vector product
+ * param inc - stride of vector x
+ * return norm of differences
+ */
+static double check_zscal(char api, blasint n, double *alpha, blasint inc)
+{
+    blasint i;
+
+    // Fill vectors x
+    drand_generate(data_zscal.x_test, n * inc * 2);
+
+    // Copy vector x for zscal_trusted
+    for (i = 0; i < n * 2 * inc; i++)
+        data_zscal.x_verify[i] = data_zscal.x_test[i];
+
+    zscal_trusted(n, alpha, data_zscal.x_verify, inc);
+
+    if(api == 'F')
+        BLASFUNC(zscal)(&n, alpha, data_zscal.x_test, &inc);
+    else
+        cblas_zscal(n, alpha, data_zscal.x_test, inc);
+
+    // Find the differences between output vector computed by zscal and zscal_trusted
+    for (i = 0; i < n * 2 * inc; i++)
+        data_zscal.x_verify[i] -= data_zscal.x_test[i];
+
+    // Find the norm of differences
+    return BLASFUNC(dznrm2)(&n, data_zscal.x_verify, &inc);
+}
+
+/**
+ * Fortran API specific test
+ * Test zscal by comparing it against reference
+ */
+CTEST(zscal, alpha_r_zero_alpha_i_not_zero)
+{
+    blasint N = DATASIZE;
+    blasint inc = 1;
+    double alpha[2] = {0.0, 1.0};
+
+    double norm = check_zscal('F', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zscal by comparing it against reference
+ */
+CTEST(zscal, alpha_r_zero_alpha_i_zero_inc_2)
+{
+    blasint N = DATASIZE;
+    blasint inc = 2;
+    double alpha[2] = {0.0, 0.0};
+
+    double norm = check_zscal('F', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zscal by comparing it against reference
+ */
+CTEST(zscal, c_api_alpha_r_zero_alpha_i_not_zero)
+{
+    blasint N = DATASIZE;
+    blasint inc = 1;
+    double alpha[2] = {0.0, 1.0};
+
+    double norm = check_zscal('C', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zscal by comparing it against reference
+ */
+CTEST(zscal, c_api_alpha_r_zero_alpha_i_zero_inc_2)
+{
+    blasint N = DATASIZE;
+    blasint inc = 2;
+    double alpha[2] = {0.0, 0.0};
+
+    double norm = check_zscal('C', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zspmv.c b/utest/test_extensions/test_zspmv.c
new file mode 100644
index 000000000..510ac0579
--- /dev/null
+++ b/utest/test_extensions/test_zspmv.c
@@ -0,0 +1,427 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZSPMV {
+    double a_verify[DATASIZE * DATASIZE * 2];
+    double a_test[DATASIZE * (DATASIZE + 1)];
+    double b_test[DATASIZE * 2 * INCREMENT];
+    double c_test[DATASIZE * 2 * INCREMENT];
+    double c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZSPMV data_zspmv;
+
+/**
+ * Compute spmv via gemv since spmv is gemv for symmetric packed matrix
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param a - buffer holding input matrix A
+ * param b - Buffer holding input vector b
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param c - buffer holding input/output vector c
+ * param inc_c - stride of vector c
+ * output param data_zspmv.c_verify - matrix computed by gemv
+ */
+static void zspmv_trusted(char uplo, blasint n, double *alpha, double *a,
+                          double *b, blasint inc_b, double *beta, double *c,
+                          blasint inc_c)
+{
+    blasint k;
+    blasint i, j;
+
+    // param for gemv (can use any, since the input matrix is symmetric)
+    char trans = 'N';
+    
+    // Unpack the input symmetric packed matrix
+    if (uplo == 'L')
+    {
+        k = 0;
+        for (i = 0; i < n; i++)
+        {
+            for (j = 0; j < n * 2; j += 2)
+            {
+                if (j / 2 < i)
+                {
+                    data_zspmv.a_verify[i * n * 2 + j] = 
+                            data_zspmv.a_verify[j * n + i * 2];
+                    data_zspmv.a_verify[i * n * 2 + j + 1] = 
+                            data_zspmv.a_verify[j * n + i * 2 + 1];
+                }
+                else
+                {
+                    data_zspmv.a_verify[i * n * 2 + j] = a[k++];
+                    data_zspmv.a_verify[i * n * 2 + j + 1] = a[k++];
+                }
+            }
+        }
+    }
+    else
+    {
+        k = n * (n + 1) - 1;
+        for (j = 2 * n - 1; j >= 0; j -= 2)
+        {
+            for (i = n - 1; i >= 0; i--)
+            {
+                if (j / 2 < i)
+                {
+                    data_zspmv.a_verify[i * n * 2 + j] = 
+                            data_zspmv.a_verify[(j - 1) * n + i * 2 + 1];
+                    data_zspmv.a_verify[i * n * 2 + j - 1] = 
+                            data_zspmv.a_verify[(j - 1) * n + i * 2];
+                }
+                else
+                {
+                    data_zspmv.a_verify[i * n * 2 + j] = a[k--];
+                    data_zspmv.a_verify[i * n * 2 + j - 1] = a[k--];
+                }
+            }
+        }
+    }
+
+    // Run gemv with unpacked matrix
+    BLASFUNC(zgemv)(&trans, &n, &n, alpha, data_zspmv.a_verify, &n, b, 
+                    &inc_b, beta, c, &inc_c);
+}
+
+/**
+ * Comapare results computed by zspmv and zspmv_trusted
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences
+ */
+static double check_zspmv(char uplo, blasint n, double *alpha, blasint inc_b,
+                          double *beta, blasint inc_c)
+{
+    blasint i;
+
+    // Fill symmetric packed maxtix a, vectors b and c 
+    drand_generate(data_zspmv.a_test, n * (n + 1));
+    drand_generate(data_zspmv.b_test, 2 * n * inc_b);
+    drand_generate(data_zspmv.c_test, 2 * n * inc_c);
+
+    // Copy vector c for zspmv_trusted
+    for (i = 0; i < n * 2 * inc_c; i++)
+        data_zspmv.c_verify[i] = data_zspmv.c_test[i];
+
+    zspmv_trusted(uplo, n, alpha, data_zspmv.a_test, data_zspmv.b_test, 
+                  inc_b, beta, data_zspmv.c_verify, inc_c);
+    BLASFUNC(zspmv)(&uplo, &n, alpha, data_zspmv.a_test, data_zspmv.b_test, 
+                    &inc_b, beta, data_zspmv.c_test, &inc_c);
+
+    // Find the differences between output vector caculated by zspmv and zspmv_trusted
+    for (i = 0; i < n * 2 * inc_c; i++)
+        data_zspmv.c_test[i] -= data_zspmv.c_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(dznrm2)(&n, data_zspmv.c_test, &inc_c);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param inc_b - stride of vector b
+ * param inc_c - stride of vector c
+ * param expected_info - expected invalid parameter number in zspmv
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char uplo, blasint n, blasint inc_b,
+                          blasint inc_c, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    set_xerbla("ZSPMV ", expected_info);
+
+    BLASFUNC(zspmv)(&uplo, &n, alpha, data_zspmv.a_test, data_zspmv.b_test, 
+                    &inc_b, beta, data_zspmv.c_test, &inc_c);
+
+    return check_error();
+}
+
+/**
+ * Test zspmv by comparing it against zgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zspmv, upper_inc_b_1_inc_c_1_N_100)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zspmv by comparing it against zgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 2
+ */
+CTEST(zspmv, upper_inc_b_1_inc_c_2_N_100)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 2;
+    char uplo = 'U';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zspmv by comparing it against zgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 1
+ */
+CTEST(zspmv, upper_inc_b_2_inc_c_1_N_100)
+{
+    blasint N = DATASIZE, inc_b = 2, inc_c = 1;
+    char uplo = 'U';
+    double alpha[] = {1.0, 0.0};
+    double beta[] = {1.0, 0.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zspmv by comparing it against zgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(zspmv, upper_inc_b_2_inc_c_2_N_100)
+{
+    blasint N = DATASIZE, inc_b = 2, inc_c = 2;
+    char uplo = 'U';
+    double alpha[] = {2.5, -2.1};
+    double beta[] = {0.0, 1.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zspmv by comparing it against zgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zspmv, lower_inc_b_1_inc_c_1_N_100)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    char uplo = 'L';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zspmv by comparing it against zgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 2
+ */
+CTEST(zspmv, lower_inc_b_1_inc_c_2_N_100)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 2;
+    char uplo = 'L';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zspmv by comparing it against zgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 1
+ */
+CTEST(zspmv, lower_inc_b_2_inc_c_1_N_100)
+{
+    blasint N = DATASIZE, inc_b = 2, inc_c = 1;
+    char uplo = 'L';
+    double alpha[] = {1.0, 0.0};
+    double beta[] = {1.0, 0.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zspmv by comparing it against zgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(zspmv, lower_inc_b_2_inc_c_2_N_100)
+{
+    blasint N = DATASIZE, inc_b = 2, inc_c = 2;
+    char uplo = 'L';
+    double alpha[] = {2.5, -2.1};
+    double beta[] = {0.0, 1.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_TOL);
+}
+
+/**
+ * Check if output matrix A contains any NaNs
+ */
+CTEST(zspmv, check_for_NaN)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_TRUE(norm == norm); /* NaN == NaN is false */
+}
+
+/**
+ * Test error function for an invalid param uplo.
+ * uplo specifies whether A is upper or lower triangular.
+ */
+CTEST(zspmv, xerbla_uplo_invalid)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    char uplo = 'O';
+    int expected_info = 1;
+
+    int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param N -
+ * number of rows and columns of A. Must be at least zero.
+ */
+CTEST(zspmv, xerbla_N_invalid)
+{
+    blasint N = INVALID, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    int expected_info = 2;
+
+    int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_b -
+ * stride of vector b. Can't be zero.
+ */
+CTEST(zspmv, xerbla_inc_b_zero)
+{
+    blasint N = DATASIZE, inc_b = 0, inc_c = 1;
+    char uplo = 'U';
+    int expected_info = 6;
+
+    int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_c -
+ * stride of vector c. Can't be zero.
+ */
+CTEST(zspmv, xerbla_inc_c_zero)
+{
+    blasint N = DATASIZE, inc_b = 1, inc_c = 0;
+    char uplo = 'U';
+    int expected_info = 9;
+
+    int passed = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_ztrmv.c b/utest/test_extensions/test_ztrmv.c
new file mode 100644
index 000000000..aad64099e
--- /dev/null
+++ b/utest/test_extensions/test_ztrmv.c
@@ -0,0 +1,266 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 300
+#define INCREMENT 2
+
+struct DATA_ZTRMV {
+	double a_test[DATASIZE * DATASIZE * 2];
+	double a_verify[DATASIZE * DATASIZE * 2];
+	double x_test[DATASIZE * INCREMENT * 2];
+	double x_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZTRMV data_ztrmv;
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A
+ * and comparing it with the non-conjugate ztrmv.
+ *
+ * param uplo specifies whether A is upper or lower triangular
+ * param trans specifies op(A), the transposition (conjugation) operation applied to A
+ * param diag specifies whether the matrix A is unit triangular or not.
+ * param n - numbers of rows and columns of A
+ * param lda - leading dimension of matrix A
+ * param incx - increment for the elements of x
+ * return norm of difference
+ */
+static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx)
+{
+	blasint i;
+	double alpha_conj[] = {1.0, 0.0}; 
+	char trans_verify = trans;
+
+	srand_generate(data_ztrmv.a_test, n * lda * 2);
+	srand_generate(data_ztrmv.x_test, n * incx * 2);
+
+	for (i = 0; i < n * lda * 2; i++)
+		data_ztrmv.a_verify[i] = data_ztrmv.a_test[i];
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_ztrmv.x_verify[i] = data_ztrmv.x_test[i];
+
+	if (trans == 'R'){
+		cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, alpha_conj, data_ztrmv.a_verify, lda, lda);
+		trans_verify = 'N';
+	}
+
+	BLASFUNC(ztrmv)(&uplo, &trans_verify, &diag, &n, data_ztrmv.a_verify, &lda,
+	 				data_ztrmv.x_verify, &incx);
+
+	BLASFUNC(ztrmv)(&uplo, &trans, &diag, &n, data_ztrmv.a_test, &lda,
+	 				data_ztrmv.x_test, &incx);
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_ztrmv.x_verify[i] -= data_ztrmv.x_test[i];
+
+	return BLASFUNC(dznrm2)(&n, data_ztrmv.x_verify, &incx);
+}
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ztrmv, conj_notrans_upper_not_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'N';
+	char trans = 'R';
+
+	double norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ */
+CTEST(ztrmv, conj_notrans_upper_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'U';
+	char trans = 'R';
+
+	double norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ztrmv, conj_notrans_lower_not_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'N';
+	char trans = 'R';
+
+	double norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ */
+CTEST(ztrmv, conj_notrans_lower_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'U';
+	char trans = 'R';
+
+	double norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrmv, conj_notrans_upper_not_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'N';
+	char trans = 'R';
+
+	double norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrmv, conj_notrans_upper_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'U';
+	char trans = 'R';
+
+	double norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrmv, conj_notrans_lower_not_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'N';
+	char trans = 'R';
+
+	double norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrmv, conj_notrans_lower_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'U';
+	char trans = 'R';
+
+	double norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_ztrsv.c b/utest/test_extensions/test_ztrsv.c
new file mode 100644
index 000000000..ae556f5e2
--- /dev/null
+++ b/utest/test_extensions/test_ztrsv.c
@@ -0,0 +1,267 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 300
+#define INCREMENT 2
+
+struct DATA_ZTRSV {
+	double a_test[DATASIZE * DATASIZE * 2];
+	double a_verify[DATASIZE * DATASIZE * 2];
+	double x_test[DATASIZE * INCREMENT * 2];
+	double x_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZTRSV data_ztrsv;
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A
+ * and comparing it with the non-conjugate ztrsv.
+ *
+ * param uplo specifies whether A is upper or lower triangular
+ * param trans specifies op(A), the transposition (conjugation) operation applied to A
+ * param diag specifies whether the matrix A is unit triangular or not.
+ * param n - numbers of rows and columns of A
+ * param lda - leading dimension of matrix A
+ * param incx - increment for the elements of x
+ * return norm of difference
+ */
+static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx)
+{
+	blasint i;
+	double alpha_conj[] = {1.0, 0.0}; 
+	char trans_verify = trans;
+
+	srand_generate(data_ztrsv.a_test, n * lda * 2);
+	srand_generate(data_ztrsv.x_test, n * incx * 2);
+
+	for (i = 0; i < n * lda * 2; i++)
+		data_ztrsv.a_verify[i] = data_ztrsv.a_test[i];
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_ztrsv.x_verify[i] = data_ztrsv.x_test[i];
+
+	if (trans == 'R'){
+		cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, 
+						alpha_conj, data_ztrsv.a_verify, lda, lda);
+		trans_verify = 'N';
+	}
+
+	BLASFUNC(ztrsv)(&uplo, &trans_verify, &diag, &n, data_ztrsv.a_verify, 
+					&lda, data_ztrsv.x_verify, &incx);
+
+	BLASFUNC(ztrsv)(&uplo, &trans, &diag, &n, data_ztrsv.a_test, &lda, 
+					data_ztrsv.x_test, &incx);
+
+	for (i = 0; i < n * incx * 2; i++)
+		data_ztrsv.x_verify[i] -= data_ztrsv.x_test[i];
+
+	return BLASFUNC(dznrm2)(&n, data_ztrsv.x_verify, &incx);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ztrsv, conj_notrans_upper_not_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'N';
+	char trans = 'R';
+
+	double norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ */
+CTEST(ztrsv, conj_notrans_upper_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'U';
+	char trans = 'R';
+
+	double norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ztrsv, conj_notrans_lower_not_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'N';
+	char trans = 'R';
+
+	double norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ */
+CTEST(ztrsv, conj_notrans_lower_unit_triangular)
+{
+	blasint n = DATASIZE, incx = 1, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'U';
+	char trans = 'R';
+
+	double norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrsv, conj_notrans_upper_not_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'N';
+	char trans = 'R';
+
+	double norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrsv, conj_notrans_upper_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'U';
+	char diag = 'U';
+	char trans = 'R';
+
+	double norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrsv, conj_notrans_lower_not_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'N';
+	char trans = 'R';
+
+	double norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrsv, conj_notrans_lower_unit_triangular_incx_2)
+{
+	blasint n = DATASIZE, incx = 2, lda = DATASIZE;
+	char uplo = 'L';
+	char diag = 'U';
+	char trans = 'R';
+
+	double norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/xerbla.c b/utest/test_extensions/xerbla.c
new file mode 100644
index 000000000..9487b20a6
--- /dev/null
+++ b/utest/test_extensions/xerbla.c
@@ -0,0 +1,88 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common.h"
+
+static int link_xerbla=TRUE;
+static int lerr, _info, ok;
+static char *rout;
+
+static void F77_xerbla(char *srname, void *vinfo)
+{
+   int info=*(int*)vinfo;
+
+   if (link_xerbla)
+   {
+      link_xerbla = 0;
+      return;
+   }
+
+   if (rout != NULL && strcmp(rout, srname) != 0){
+      printf("***** XERBLA WAS CALLED WITH SRNAME = <%s> INSTEAD OF <%s> *******\n", srname, rout);
+      ok = FALSE;
+   }
+
+   if (info != _info){
+      printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d in %s *******\n",info, _info, srname);
+      lerr = TRUE;
+      ok = FALSE;
+   } else lerr = FALSE;
+}
+
+/**  
+* error function redefinition 
+*/
+int BLASFUNC(xerbla)(char *name, blasint *info, blasint length)
+{
+  F77_xerbla(name, info);
+  return 0;
+}
+
+int check_error(void) {
+   if (lerr == TRUE ) {
+       printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", _info, rout);
+      ok = FALSE;
+   }
+   lerr = TRUE;
+   return ok;
+}
+
+void set_xerbla(char* current_rout, int expected_info){
+   if (link_xerbla) /* call these first to link */
+      F77_xerbla(rout, &_info);
+
+   ok = TRUE;
+   lerr = TRUE;
+   _info = expected_info;
+   rout = current_rout;
+}
\ No newline at end of file

From c99e231fc5bb9ae661547089b561f0a66e99431e Mon Sep 17 00:00:00 2001
From: Andrey Sokolov <andrey.sokolov@syntacore.com>
Date: Thu, 18 Jan 2024 23:54:51 +0300
Subject: [PATCH 597/718] Fix rand_generate

---
 utest/test_extensions/common.c      | 4 ++--
 utest/test_extensions/test_zgeadd.c | 4 ++--
 utest/test_extensions/test_ztrmv.c  | 4 ++--
 utest/test_extensions/test_ztrsv.c  | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c
index c3bdcefc7..8a6a47795 100644
--- a/utest/test_extensions/common.c
+++ b/utest/test_extensions/common.c
@@ -40,14 +40,14 @@ void srand_generate(float *alpha, blasint n)
 {
     blasint i;
     for (i = 0; i < n; i++)
-        alpha[i] = (float)rand() / (float)RAND_MAX * 5.0f;
+        alpha[i] = (float)rand() / (float)RAND_MAX;
 }
 
 void drand_generate(double *alpha, blasint n)
 {
     blasint i;
     for (i = 0; i < n; i++)
-        alpha[i] = (double)rand() / (double)RAND_MAX * 5.0;
+        alpha[i] = (double)rand() / (double)RAND_MAX;
 }
 
 /**
diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c
index 917c04829..e50f86de0 100644
--- a/utest/test_extensions/test_zgeadd.c
+++ b/utest/test_extensions/test_zgeadd.c
@@ -103,8 +103,8 @@ static double check_zgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
     }
 
     // Fill matrix A, C
-    srand_generate(data_zgeadd.a_test, lda * rows * 2);
-    srand_generate(data_zgeadd.c_test, ldc * rows * 2);
+    drand_generate(data_zgeadd.a_test, lda * rows * 2);
+    drand_generate(data_zgeadd.c_test, ldc * rows * 2);
 
     // Copy matrix C for zgeadd
     for (i = 0; i < ldc * rows * 2; i++)
diff --git a/utest/test_extensions/test_ztrmv.c b/utest/test_extensions/test_ztrmv.c
index aad64099e..5668ec296 100644
--- a/utest/test_extensions/test_ztrmv.c
+++ b/utest/test_extensions/test_ztrmv.c
@@ -66,8 +66,8 @@ static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint l
 	double alpha_conj[] = {1.0, 0.0}; 
 	char trans_verify = trans;
 
-	srand_generate(data_ztrmv.a_test, n * lda * 2);
-	srand_generate(data_ztrmv.x_test, n * incx * 2);
+	drand_generate(data_ztrmv.a_test, n * lda * 2);
+	drand_generate(data_ztrmv.x_test, n * incx * 2);
 
 	for (i = 0; i < n * lda * 2; i++)
 		data_ztrmv.a_verify[i] = data_ztrmv.a_test[i];
diff --git a/utest/test_extensions/test_ztrsv.c b/utest/test_extensions/test_ztrsv.c
index ae556f5e2..4b7ec6aaf 100644
--- a/utest/test_extensions/test_ztrsv.c
+++ b/utest/test_extensions/test_ztrsv.c
@@ -66,8 +66,8 @@ static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint l
 	double alpha_conj[] = {1.0, 0.0}; 
 	char trans_verify = trans;
 
-	srand_generate(data_ztrsv.a_test, n * lda * 2);
-	srand_generate(data_ztrsv.x_test, n * incx * 2);
+	drand_generate(data_ztrsv.a_test, n * lda * 2);
+	drand_generate(data_ztrsv.x_test, n * incx * 2);
 
 	for (i = 0; i < n * lda * 2; i++)
 		data_ztrsv.a_verify[i] = data_ztrsv.a_test[i];

From ccbc3f875bc87e92c1ab05f3b361f64d7fd95c87 Mon Sep 17 00:00:00 2001
From: Octavian Maghiar <octavian.maghiar@imgtec.com>
Date: Fri, 19 Jan 2024 12:40:00 +0000
Subject: [PATCH 598/718] [RISC-V] Add RISCV64_ZVL128B target to
 common_riscv64.h

---
 common_riscv64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common_riscv64.h b/common_riscv64.h
index f11e8b75d..4b5f7dcc4 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS
 
-#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC)))
+#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B)
 # include <riscv_vector.h>
 #endif
 

From aaf65210ccba0c53408c242a2e0f5ad5d798d532 Mon Sep 17 00:00:00 2001
From: Chris Sidebottom <chris.sidebottom@arm.com>
Date: Fri, 19 Jan 2024 19:04:21 +0000
Subject: [PATCH 599/718] Add dynamic support for Arm(R) Neoverse(TM) V2
 processor

Whilst I figure out how best to map the L2 parameters without
duplicating all of `ARMV8SVE`, lets just map this to `NEOVERSEV1`.
---
 driver/others/dynamic_arm64.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 803e0b5eb..6b21028d1 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -1,6 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* Copyright 2023 The OpenBLAS Project                               */
+/* Copyright 2023-2024 The OpenBLAS Project                          */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -143,12 +143,13 @@ extern gotoblas_t  gotoblas_ARMV8SVE;
 #endif
 extern gotoblas_t  gotoblas_THUNDERX3T110;
 #endif
+#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
 
 extern void openblas_warning(int verbose, const char * msg);
 #define FALLBACK_VERBOSE 1
 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
 
-#define NUM_CORETYPES   16
+#define NUM_CORETYPES   17
 
 /*
  * In case asm/hwcap.h is outdated on the build system, make sure
@@ -178,6 +179,7 @@ static char *corename[] = {
   "emag8180",
   "neoversen1",
   "neoversev1",
+  "neoversev2",
   "neoversen2",
   "thunderx3t110",
   "cortexa55",
@@ -198,10 +200,11 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_EMAG8180)     return corename[ 9];
   if (gotoblas == &gotoblas_NEOVERSEN1)   return corename[10];
   if (gotoblas == &gotoblas_NEOVERSEV1)   return corename[11];
-  if (gotoblas == &gotoblas_NEOVERSEN2)   return corename[12];
-  if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
-  if (gotoblas == &gotoblas_CORTEXA55)    return corename[14];
-  if (gotoblas == &gotoblas_ARMV8SVE)     return corename[15];
+  if (gotoblas == &gotoblas_NEOVERSEV2)   return corename[12];
+  if (gotoblas == &gotoblas_NEOVERSEN2)   return corename[13];
+  if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
+  if (gotoblas == &gotoblas_CORTEXA55)    return corename[15];
+  if (gotoblas == &gotoblas_ARMV8SVE)     return corename[16];
   return corename[NUM_CORETYPES];
 }
 
@@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) {
     case  9: return (&gotoblas_EMAG8180);
     case 10: return (&gotoblas_NEOVERSEN1);
     case 11: return (&gotoblas_NEOVERSEV1);
-    case 12: return (&gotoblas_NEOVERSEN2);
-    case 13: return (&gotoblas_THUNDERX3T110);
-    case 14: return (&gotoblas_CORTEXA55);
-    case 15: return (&gotoblas_ARMV8SVE);
+    case 12: return (&gotoblas_NEOVERSEV2);
+    case 13: return (&gotoblas_NEOVERSEN2);
+    case 14: return (&gotoblas_THUNDERX3T110);
+    case 15: return (&gotoblas_CORTEXA55);
+    case 16: return (&gotoblas_ARMV8SVE);
   }
   snprintf(message, 128, "Core not found: %s\n", coretype);
   openblas_warning(1, message);
@@ -312,6 +316,13 @@ static gotoblas_t *get_coretype(void) {
 	    return &gotoblas_NEOVERSEN1;
       	  }else
 	    return &gotoblas_NEOVERSEV1;
+  case 0xd4f:
+      if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
+        openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
+        return &gotoblas_NEOVERSEN1;
+      } else {
+	      return &gotoblas_NEOVERSEV2;
+      }
 #endif
 	case 0xd05: // Cortex A55
 	  return &gotoblas_CORTEXA55;

From 304a9b60afbc79be77193cc3bdb5bb5d503aa533 Mon Sep 17 00:00:00 2001
From: Han Gao/Revy/Rabenda <rabenda.cn@gmail.com>
Date: Sun, 21 Jan 2024 14:32:52 +0000
Subject: [PATCH 600/718] Update T-Head toolchains v2.8.0

Signed-off-by: Han Gao/Revy/Rabenda <rabenda.cn@gmail.com>
---
 .github/workflows/c910v.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
index 30cf32b34..68ba2ddd7 100644
--- a/.github/workflows/c910v.yml
+++ b/.github/workflows/c910v.yml
@@ -14,8 +14,8 @@ jobs:
     if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ubuntu-latest
     env:
-      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
-      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
+      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
+      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
     strategy:
       fail-fast: false
       matrix:
@@ -76,7 +76,7 @@ jobs:
         run: |
             wget ${xuetie_toolchain}/${toolchain_file_name}
             tar -xvf ${toolchain_file_name} -C /opt
-            export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
+            export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"
 
             make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
 

From e1afb23811256b231c259ca57d7a5f6e81ac6da5 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Fri, 7 Apr 2023 11:13:23 +0300
Subject: [PATCH 601/718] Fix BLAS and LAPACK tests for C910V and
 RISCV64_ZVL256B targets

* Fixed bugs in dgemm, [a]min\max, asum kernels
* Added zero checks for BLAS kernels
* Added dsdot implementation for RVV 0.7.1
* Fixed bugs in _vector files for C910V and RISCV64_ZVL256B targets
* Added additional definitions for RISCV64_ZVL256B target
---
 Makefile.prebuild                       |   4 +
 Makefile.riscv64                        |   4 +
 TargetList.txt                          |   1 +
 getarch.c                               |  14 +++
 kernel/riscv64/KERNEL.C910V             |   1 +
 kernel/riscv64/amin_vector.c            |   6 +-
 kernel/riscv64/asum_vector.c            |   7 +-
 kernel/riscv64/axpby_vector.c           |   2 +-
 kernel/riscv64/dgemm_kernel_8x4_c910v.c |   2 +-
 kernel/riscv64/dsdot_vector.c           | 152 ++++++++++++++++++++++++
 kernel/riscv64/iamin_vector.c           |   4 +-
 kernel/riscv64/izamin_vector.c          |   2 +-
 kernel/riscv64/nrm2_vector.c            |   2 +-
 kernel/riscv64/nrm2_vector_dot.c        |   2 +-
 kernel/riscv64/swap_vector.c            |   2 +-
 kernel/riscv64/zamax_vector.c           |  17 +--
 kernel/riscv64/zamin_vector.c           |  17 +--
 kernel/riscv64/znrm2_vector.c           |   2 +-
 kernel/riscv64/zswap_vector.c           |   2 +-
 19 files changed, 205 insertions(+), 38 deletions(-)
 create mode 100644 kernel/riscv64/dsdot_vector.c

diff --git a/Makefile.prebuild b/Makefile.prebuild
index c4f4a2602..d30275f06 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -59,6 +59,10 @@ ifeq ($(TARGET), x280)
 TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 
+ifeq ($(TARGET), RISCV64_ZVL256B)
+TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
+endif
+
 ifeq ($(TARGET), RISCV64_GENERIC)
 TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
 endif
diff --git a/Makefile.riscv64 b/Makefile.riscv64
index ce7a27141..2239a3676 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -6,6 +6,10 @@ ifeq ($(CORE), x280)
 CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
+ifeq ($(CORE), RISCV64_ZVL256B)
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
+endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
diff --git a/TargetList.txt b/TargetList.txt
index f76f605cc..f65a18b50 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -121,6 +121,7 @@ Z14
 RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
 C910V
 x280
+RISCV64_ZVL256B
 
 11.LOONGARCH64:
 LOONGSONGENERIC
diff --git a/getarch.c b/getarch.c
index 772836347..12ea72052 100644
--- a/getarch.c
+++ b/getarch.c
@@ -1692,6 +1692,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 #endif
 
+#ifdef FORCE_RISCV64_ZVL256B
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "RISCV64_ZVL256B"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-DRISCV64_ZVL256B " \
+       "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "riscv64_zvl256b"
+#define CORENAME  "RISCV64_ZVL256B"
+#else
+#endif
+
 
 #if defined(FORCE_E2K) || defined(__e2k__)
 #define FORCE
diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V
index 0da66fa35..2798a870e 100644
--- a/kernel/riscv64/KERNEL.C910V
+++ b/kernel/riscv64/KERNEL.C910V
@@ -59,6 +59,7 @@ SDOTKERNEL   = dot_vector.c
 DDOTKERNEL   = dot_vector.c
 CDOTKERNEL   = zdot_vector.c
 ZDOTKERNEL   = zdot_vector.c
+DSDOTKERNEL  = dsdot_vector.c
 
 SNRM2KERNEL  = nrm2_vector.c
 DNRM2KERNEL  = nrm2_vector.c
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
index 1c541f0fd..c4578eabf 100644
--- a/kernel/riscv64/amin_vector.c
+++ b/kernel/riscv64/amin_vector.c
@@ -31,15 +31,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #       define LMUL m2
 #       if defined(DOUBLE)
 #               define ELEN 64
+#               define ABS fabs
 #       else
 #               define ELEN 32
+#               define ABS fabsf
 #       endif
 #else
 #       define LMUL m8
 #       if defined(DOUBLE)
 #               define ELEN 64
+#               define ABS fabs
 #       else
 #               define ELEN 32
+#               define ABS fabsf
 #       endif
 #endif
 
@@ -69,7 +73,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         FLOAT minf=0.0;
         if (n <= 0 || inc_x <= 0) return(minf);
 
-        minf = *x;
+        minf = ABS(*x);
         x += inc_x;
         --n;
         if (n == 0) return(minf);
diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c
index 995dbf9a1..a652eafdd 100644
--- a/kernel/riscv64/asum_vector.c
+++ b/kernel/riscv64/asum_vector.c
@@ -67,7 +67,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
-	BLASLONG ix=0;
 	FLOAT asumf=0.0;
 	if (n <= 0 || inc_x <= 0) return(asumf);
         unsigned int gvl = 0;
@@ -103,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 unsigned int stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/2){
                         v_sum = VFMVVF_FLOAT(0, gvl);
-                        BLASLONG inc_xv = inc_x * gvl;
                         for(i=0,j=0; i<n/(gvl*2); i++){
-                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
                                 v0 = VFABS_FLOAT(v0, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
-                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                v1 = VLSEV_FLOAT(&x[(j+gvl)*inc_x], stride_x, gvl);
                                 v1 = VFABS_FLOAT(v1, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                 j += gvl * 2;
-                                inc_xv += inc_xv * 2;
                         }
                         v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 }
diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c
index 386c4a5f1..850fc903e 100644
--- a/kernel/riscv64/axpby_vector.c
+++ b/kernel/riscv64/axpby_vector.c
@@ -60,7 +60,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
 {
-	if (n < 0)  return(0);
+	if (n <= 0)  return(0);
 
 	BLASLONG i=0, j=0;
 	unsigned int gvl = 0;
diff --git a/kernel/riscv64/dgemm_kernel_8x4_c910v.c b/kernel/riscv64/dgemm_kernel_8x4_c910v.c
index 79a7a98d9..b9bccbd49 100644
--- a/kernel/riscv64/dgemm_kernel_8x4_c910v.c
+++ b/kernel/riscv64/dgemm_kernel_8x4_c910v.c
@@ -196,7 +196,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
 		   
 		   asm volatile(
 				"vsetvli    zero, zero, e64,m1 \n\t"
-				"fmv.w.x    ft11, zero         \n\t"
+				"fmv.d.x    ft11, zero         \n\t"
 				"mv         t0,   %[BK]        \n\t"
 				
 				"vfmv.v.f   v16,  ft11         \n\t"
diff --git a/kernel/riscv64/dsdot_vector.c b/kernel/riscv64/dsdot_vector.c
new file mode 100644
index 000000000..e972828b5
--- /dev/null
+++ b/kernel/riscv64/dsdot_vector.c
@@ -0,0 +1,152 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+        BLASLONG i=0, j=0;
+        double dot = 0.0 ;
+
+        if ( n < 1 )  return(dot);
+        vfloat64m4_t vr;
+        vfloat32m2_t vx, vy;
+        unsigned int gvl = 0;
+        vfloat64m1_t v_res, v_z0;
+        gvl = vsetvlmax_e64m1();
+        v_res = vfmv_v_f_f64m1(0, gvl);
+        v_z0 = vfmv_v_f_f64m1(0, gvl);
+
+        if(inc_x == 1 && inc_y == 1){
+                gvl = vsetvl_e64m4(n);
+                vr = vfmv_v_f_f64m4(0, gvl);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = vle32_v_f32m2(&x[j], gvl);
+                        vy = vle32_v_f32m2(&y[j], gvl);
+                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+                //tail
+                if(j < n){
+                        gvl = vsetvl_e64m4(n-j);
+                        vx = vle32_v_f32m2(&x[j], gvl);
+                        vy = vle32_v_f32m2(&y[j], gvl);
+                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+                        //vr = vfdot_vv_f32m2(vx, vy, gvl);
+                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+        }else if(inc_y == 1){
+                gvl = vsetvl_e64m4(n);
+                vr = vfmv_v_f_f64m4(0, gvl);
+                 int stride_x = inc_x * sizeof(FLOAT);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = vle32_v_f32m2(&y[j], gvl);
+                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+
+                }
+                //tail
+                if(j < n){
+                        gvl = vsetvl_e64m4(n-j);
+                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = vle32_v_f32m2(&y[j], gvl);
+                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+                        //vr = vfdot_vv_f32m2(vx, vy, gvl);
+                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+
+                }
+        }else if(inc_x == 1){
+                gvl = vsetvl_e64m4(n);
+                vr = vfmv_v_f_f64m4(0, gvl);
+                 int stride_y = inc_y * sizeof(FLOAT);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = vle32_v_f32m2(&x[j], gvl);
+                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+
+                }
+                //tail
+                if(j < n){
+                        gvl = vsetvl_e64m4(n-j);
+                        vx = vle32_v_f32m2(&x[j], gvl);
+                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+                        //vr = vfdot_vv_f32m2(vx, vy, gvl);
+                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+
+                }
+        }else{
+                gvl = vsetvl_e64m4(n);
+                vr = vfmv_v_f_f64m4(0, gvl);
+                 int stride_x = inc_x * sizeof(FLOAT);
+                 int stride_y = inc_y * sizeof(FLOAT);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+
+                }
+                //tail
+                if(j < n){
+                        gvl = vsetvl_e64m4(n-j);
+                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+                        //vr = vfdot_vv_f32m2(vx, vy, gvl);
+                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+
+                }
+        }
+        return(dot);
+}
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
index a58872960..0e591e697 100644
--- a/kernel/riscv64/iamin_vector.c
+++ b/kernel/riscv64/iamin_vector.c
@@ -139,7 +139,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
                         v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
                         FLOAT cur_minf = EXTRACT_FLOAT(v_res);
-                        if(cur_minf > minf){
+                        if(cur_minf < minf){
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
@@ -185,7 +185,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
                         v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
                         FLOAT cur_minf = EXTRACT_FLOAT(v_res);
-                        if(cur_minf > minf){
+                        if(cur_minf < minf){
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c
index a3877a46c..c76a38099 100644
--- a/kernel/riscv64/izamin_vector.c
+++ b/kernel/riscv64/izamin_vector.c
@@ -156,7 +156,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
                 v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
                 FLOAT cur_minf = EXTRACT_FLOAT(v_res);
-                if(cur_minf > minf){
+                if(cur_minf < minf){
                         //tail index
                         v_min_index = VIDV_UINT(gvl);
                         v_min_index = VADDVX_UINT(v_min_index, j, gvl);
diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c
index 141dffebf..5c03fbec7 100644
--- a/kernel/riscv64/nrm2_vector.c
+++ b/kernel/riscv64/nrm2_vector.c
@@ -104,7 +104,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 
-	if(n <= 0)  return(0.0);
+	if (n <= 0 || inc_x <= 0) return(0.0);
         if(n == 1) return (ABS(x[0]));
 
         unsigned int gvl = 0;
diff --git a/kernel/riscv64/nrm2_vector_dot.c b/kernel/riscv64/nrm2_vector_dot.c
index 06e61d695..dfa13a6f5 100644
--- a/kernel/riscv64/nrm2_vector_dot.c
+++ b/kernel/riscv64/nrm2_vector_dot.c
@@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG i=0, j=0;
 	double len = 0.0 ;
 
-	if ( n < 0 )  return(0.0);
+	if ( n <= 0 )  return(0.0);
         if(n == 1) return (ABS(x[0]));
 
         FLOAT_V_T vr, v0, v1;
diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c
index 3b467a586..f583f5392 100644
--- a/kernel/riscv64/swap_vector.c
+++ b/kernel/riscv64/swap_vector.c
@@ -67,7 +67,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
         BLASLONG stride_x, stride_y;
         FLOAT_V_T vx0, vx1, vy0, vy1;
 
-	if (n < 0)  return(0);
+	if (n <= 0)  return(0);
 
         unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
         if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c
index 2dee5ab29..ec4a5a1e9 100644
--- a/kernel/riscv64/zamax_vector.c
+++ b/kernel/riscv64/zamax_vector.c
@@ -60,17 +60,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
 #ifdef RISCV_0p10_INTRINSICS
 #define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1)) (v_res, va, vb, gvl)
-#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f,  ELEN,   LMUL,   _m) (va, vb, vb, c, gvl)
 #else
 #define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
-#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f,  ELEN,   LMUL,   _m)
 #endif
 #define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
-#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt_vf_f), ELEN,  LMUL,   _b,     MLEN)
 #define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
 #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
 #define VFMAXVV_FLOAT   JOIN(RISCV_RVV(vfmax),     _vv_f,  ELEN,   LMUL,   _)
 #define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+#define VFABSV_FLOAT   JOIN(RISCV_RVV(vfabs),     _v_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -91,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for(; i<n/gvl; i++){
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
+
+                v0 = VFABSV_FLOAT(v0, gvl);
+                v1 = VFABSV_FLOAT(v1, gvl);
 
                 v0 = VFADDVV_FLOAT(v0, v1, gvl);
                 v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
@@ -108,10 +105,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 gvl = VSETVL(n-j);
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
+                v0 = VFABSV_FLOAT(v0, gvl);
+                v1 = VFABSV_FLOAT(v1, gvl);
                 v1 = VFADDVV_FLOAT(v0, v1, gvl);
                 v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
         }
diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c
index df9a7a7e1..45b3e0b9d 100644
--- a/kernel/riscv64/zamin_vector.c
+++ b/kernel/riscv64/zamin_vector.c
@@ -62,17 +62,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
 #ifdef RISCV_0p10_INTRINSICS
 #define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1)) (v_res, va, vb, gvl)
-#define VFRSUBVF_MASK_FLOAT(va,vb,c,gvl) JOIN(RISCV_RVV(vfrsub),_vf_f,  ELEN,   LMUL,   _m) (va, vb, vb, c, gvl)
 #else
 #define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
-#define VFRSUBVF_MASK_FLOAT JOIN(RISCV_RVV(vfrsub),_vf_f,  ELEN,   LMUL,   _m)
 #endif
 #define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
-#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt_vf_f), ELEN,  LMUL,   _b,     MLEN)
 #define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
 #define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
 #define VFMINVV_FLOAT   JOIN(RISCV_RVV(vfmin),     _vv_f,  ELEN,   LMUL,   _)
 #define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+#define VFABSV_FLOAT   JOIN(RISCV_RVV(vfabs),     _v_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -93,10 +91,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for(; i<n/gvl; i++){
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
+
+                v0 = VFABSV_FLOAT(v0, gvl);
+                v1 = VFABSV_FLOAT(v1, gvl);
 
                 v0 = VFADDVV_FLOAT(v0, v1, gvl);
                 v_min = VFMINVV_FLOAT(v_min, v0, gvl);
@@ -110,10 +107,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 gvl = VSETVL(n-j);
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, 0, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, 0, gvl);
+                v0 = VFABSV_FLOAT(v0, gvl);
+                v1 = VFABSV_FLOAT(v1, gvl);
                 v1 = VFADDVV_FLOAT(v0, v1, gvl);
                 v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
         }
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
index 8614f7539..6ee3be79e 100644
--- a/kernel/riscv64/znrm2_vector.c
+++ b/kernel/riscv64/znrm2_vector.c
@@ -96,7 +96,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
         BLASLONG i=0;
 
-        if(n < 0)  return(0.0);
+	if (n <= 0 || inc_x <= 0) return(0.0);
 
         FLOAT_V_T v_ssq, v_scale, v0, v1, v_zero;
         unsigned int gvl = 0;
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
index 02c98b588..f2734c4a9 100644
--- a/kernel/riscv64/zswap_vector.c
+++ b/kernel/riscv64/zswap_vector.c
@@ -69,7 +69,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
         unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
         if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
 
-	if (n < 0)  return(0);
+	if (n <= 0)  return(0);
         if(inc_x == 1 && inc_y == 1){
                 BLASLONG n2 = n * 2;
                 if(gvl <= n2/2){

From d938aed7fe20baf0e0dfa75d6b53ddcfe4319a6f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 23 Jan 2024 17:15:53 +0100
Subject: [PATCH 602/718] reset "mem structure overflowed" state on shutdown

---
 driver/others/memory.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index caef3e2b7..4ee8f9a2e 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -3214,7 +3214,7 @@ void blas_shutdown(void){
 #endif
     memory[pos].lock   = 0;
   }
-  if (memory_overflowed)
+  if (memory_overflowed) {
     for (pos = 0; pos < NEW_BUFFERS; pos ++){
       newmemory[pos].addr   = (void *)0;
       newmemory[pos].used   = 0;
@@ -3222,6 +3222,10 @@ void blas_shutdown(void){
       newmemory[pos].pos    = -1;
 #endif
       newmemory[pos].lock   = 0;
+    }
+    free(newmemory);
+    newmemory = NULL;
+    memory_overflowed = 0;  
   }
 
   UNLOCK_COMMAND(&alloc_lock);

From 86943afa9cb96a49ad036c1a484390967b322b5e Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Wed, 24 Jan 2024 10:53:13 +0300
Subject: [PATCH 603/718] Fix x280 taget include riscv_vector.h

---
 common_riscv64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common_riscv64.h b/common_riscv64.h
index 4b5f7dcc4..ab3bfa25a 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS
 
-#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B)
+#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
 # include <riscv_vector.h>
 #endif
 

From 73530b03fa6ecd03e7ceb2b37c234a0bb1626445 Mon Sep 17 00:00:00 2001
From: Andrey Sokolov <andrey.sokolov@syntacore.com>
Date: Wed, 24 Jan 2024 11:38:14 +0300
Subject: [PATCH 604/718] remove RISCV64_ZVL256B additional extentions

---
 Makefile.prebuild | 2 +-
 Makefile.riscv64  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile.prebuild b/Makefile.prebuild
index 7824e15a8..98acca80e 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -60,7 +60,7 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_ZVL256B)
-TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_ZVL128B)
diff --git a/Makefile.riscv64 b/Makefile.riscv64
index 9d314d074..113cc57c5 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -7,8 +7,8 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_ZVL256B)
-CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl256b -mabi=lp64d
-FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
+CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_ZVL128B)
 CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d 

From 276e3ebf9e1405196d36c2570bce8376b8bad2fd Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Fri, 26 Jan 2024 10:03:50 +0800
Subject: [PATCH 605/718] LoongArch64: Add dzamax and dzamin opt

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   2 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |   2 +
 kernel/loongarch64/camax_lasx.S          | 150 +++++++++-------
 kernel/loongarch64/camax_lsx.S           | 195 ++++++++++++---------
 kernel/loongarch64/camin_lasx.S          | 164 ++++++++++--------
 kernel/loongarch64/camin_lsx.S           | 209 +++++++++++++----------
 6 files changed, 418 insertions(+), 304 deletions(-)

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index c365e9a75..e27ce3bee 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -14,10 +14,12 @@ ZSCALKERNEL  = cscal_lsx.S
 SAMAXKERNEL =  amax_lsx.S
 DAMAXKERNEL =  amax_lsx.S
 CAMAXKERNEL =  camax_lsx.S
+ZAMAXKERNEL =  camax_lsx.S
 
 SAMINKERNEL =  amin_lsx.S
 DAMINKERNEL =  amin_lsx.S
 CAMINKERNEL =  camin_lsx.S
+ZAMINKERNEL =  camin_lsx.S
 
 SMAXKERNEL  =  max_lsx.S
 DMAXKERNEL  =  max_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 68360faaf..f4429cfba 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -14,10 +14,12 @@ ZSCALKERNEL  = cscal_lasx.S
 SAMAXKERNEL =  amax_lasx.S
 DAMAXKERNEL =  amax_lasx.S
 CAMAXKERNEL =  camax_lasx.S
+ZAMAXKERNEL =  camax_lasx.S
 
 SAMINKERNEL =  amin_lasx.S
 DAMINKERNEL =  amin_lasx.S
 CAMINKERNEL =  camin_lasx.S
+ZAMINKERNEL =  camin_lasx.S
 
 SMAXKERNEL  =  max_lsx.S
 DMAXKERNEL =   max_lsx.S
diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S
index 7013430cb..f9a4e9012 100644
--- a/kernel/loongarch64/camax_lasx.S
+++ b/kernel/loongarch64/camax_lasx.S
@@ -63,42 +63,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     bge $r0, N, .L999
     bge $r0, INCX, .L999
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    xvreplgr2vr.w neg1, I
-    xvffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3
 
 .L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 8 * SIZE
-    addi.d I, I, -1
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
     xvpickev.w x1, VX1, VX0
     xvpickod.w x2, VX1, VX0
-    xvfmul.s x3, neg1, x1
-    xvfmul.s x4, neg1, x2
-    xvfcmp.clt.s VT0, x1, res0
-    xvfcmp.clt.s VT1, x2, res0
-    xvbitsel.v x1, x1, x3, VT0
-    xvbitsel.v x2, x2, x4, VT1
+#endif
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+    xvld VX0, X, 64
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
     addi.d X, X, 16 * SIZE
-    xvfadd.s VM1, x1, x2
-    xvfmax.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3
 
 .L11:
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    XVFMAX VM0, x1, x2
+#else
     xvpickve.w x1, VM0, 0
     xvpickve.w x2, VM0, 1
     xvpickve.w x3, VM0, 2
     xvpickve.w x4, VM0, 3
-    xvfmax.s VM1, x1, x2
-    xvfmax.s VM0, x3, x4
-    xvfmax.s VM0, VM0, VM1
+    XVFMAX VM0, x1, x2
+    XVFMAX VM1, x3, x4
+    XVFMAX VM0, VM0, VM1
+#endif
     b .L23
     .align 3
 
@@ -107,66 +125,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 
 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s4, t1, t3
     blt $r0, I, .L21
     .align 3
 
 .L22:
-    fmax.s s1, s1, s2
-    fmax.s s3, s3, s4
-    fmax.s s1, s1, s3
+    FMAX s1, s1, s2
+    FMAX s3, s3, s4
+    FMAX s1, s1, s3
     .align 3
 
 .L23: //N<8
@@ -182,12 +200,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     FABS a1, a1
     ADD a0, a0, a1
     add.d  X, X, INCX
-    fmax.s s1, a0, s1
+    FMAX s1, a0, s1
     blt $r0, I, .L24
     .align 3
 
 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3
 
diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S
index 2e55629de..cf46cb016 100644
--- a/kernel/loongarch64/camax_lsx.S
+++ b/kernel/loongarch64/camax_lsx.S
@@ -63,54 +63,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     bge $r0, N, .L999
     bge $r0, INCX, .L999
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    vreplgr2vr.w neg1, I
-    vffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3
 
 .L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
+    vld VX0, X, 0
+    vld VX1, X, 16
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
-    vld VX0, X, 8 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vld VX1, X, 12 * SIZE
-    vfadd.s VM1, x1, x2
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1
+    VFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+    vld VX0, X, 64
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1
+    VFMAX VM0, VM0, VM1
+#endif
     addi.d X, X, 16 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vfadd.s x1, x1, x2
-    vfmax.s VM1, x1, VM1
-    vfmax.s VM0, VM0, VM1
+    addi.d I, I, -1
     blt $r0, I, .L10
     .align 3
 
 .L11:
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    VFMAX VM0, x1, x2
+#else
     vreplvei.w x1, VM0, 0
     vreplvei.w x2, VM0, 1
     vreplvei.w x3, VM0, 2
     vreplvei.w x4, VM0, 3
-    vfmax.s VM1, x1, x2
-    vfmax.s VM0, x3, x4
-    vfmax.s VM0, VM0, VM1
+    VFMAX VM1, x1, x2
+    VFMAX VM0, x3, x4
+    VFMAX VM0, VM0, VM1
+#endif
     b .L23
     .align 3
 
@@ -119,66 +152,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 
 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmax.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX s4, t1, t3
     blt $r0, I, .L21
     .align 3
 
 .L22:
-    fmax.s s1, s1, s2
-    fmax.s s3, s3, s4
-    fmax.s s1, s1, s3
+    FMAX s1, s1, s2
+    FMAX s3, s3, s4
+    FMAX s1, s1, s3
     .align 3
 
 .L23: //N<8
@@ -187,19 +220,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 
 .L24:
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
     addi.d I, I, -1
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s a0, a0, a1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
     add.d  X, X, INCX
-    fmax.s s1, a0, s1
+    FMAX s1, a0, s1
     blt $r0, I, .L24
     .align 3
 
 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3
 
diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S
index d7931d30a..c1c4c98c8 100644
--- a/kernel/loongarch64/camin_lasx.S
+++ b/kernel/loongarch64/camin_lasx.S
@@ -61,49 +61,71 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvxor.v res0, res0, res0
     bge $r0, N, .L999
     bge $r0, INCX, .L999
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s s1, a1, a0
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0
+#ifdef DOUBLE
+    xvreplve0.d VM0, VM0
+#else
     xvreplve0.w VM0, VM0
+#endif
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    xvreplgr2vr.w neg1, I
-    xvffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3
 
 .L10:
-    xvld VX0, X, 0 * SIZE
-    xvld VX1, X, 8 * SIZE
-    addi.d I, I, -1
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
     xvpickev.w x1, VX1, VX0
     xvpickod.w x2, VX1, VX0
-    xvfmul.s x3, neg1, x1
-    xvfmul.s x4, neg1, x2
-    xvfcmp.clt.s VT0, x1, res0
-    xvfcmp.clt.s VT1, x2, res0
-    xvbitsel.v x1, x1, x3, VT0
-    xvbitsel.v x2, x2, x4, VT1
+#endif
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMIN VM0, VM0, VM1
+#ifdef DOUBLE
+    xvld VX0, X, 64
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
     addi.d X, X, 16 * SIZE
-    xvfadd.s VM1, x1, x2
-    xvfmin.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3
 
 .L11:
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    XVFMIN VM0, x1, x2
+#else
     xvpickve.w x1, VM0, 0
     xvpickve.w x2, VM0, 1
     xvpickve.w x3, VM0, 2
     xvpickve.w x4, VM0, 3
-    xvfmin.s VM1, x1, x2
-    xvfmin.s VM0, x3, x4
-    xvfmin.s VM0, VM0, VM1
+    XVFMIN VM0, x1, x2
+    XVFMIN VM1, x3, x4
+    XVFMIN VM0, VM0, VM1
+#endif
     b .L23
     .align 3
 
@@ -112,66 +134,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 
 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s4, t1, t3
     blt $r0, I, .L21
     .align 3
 
 .L22:
-    fmin.s s1, s1, s2
-    fmin.s s3, s3, s4
-    fmin.s s1, s1, s3
+    FMIN s1, s1, s2
+    FMIN s3, s3, s4
+    FMIN s1, s1, s3
     .align 3
 
 .L23: //N<8
@@ -187,12 +209,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     FABS a1, a1
     ADD a0, a0, a1
     add.d  X, X, INCX
-    fmin.s s1, a0, s1
+    FMIN s1, a0, s1
     blt $r0, I, .L24
     .align 3
 
 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3
 
diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S
index e9ad6b04d..ff666ea8f 100644
--- a/kernel/loongarch64/camin_lsx.S
+++ b/kernel/loongarch64/camin_lsx.S
@@ -61,61 +61,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vxor.v res0, res0, res0
     bge $r0, N, .L999
     bge $r0, INCX, .L999
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s s1, a1, a0
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0
+#ifdef DOUBLE
+    vreplvei.d VM0, VM0, 0
+#else
     vreplvei.w VM0, VM0, 0
+#endif
     li.d TEMP, 1
-    li.w I, -1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
-    vreplgr2vr.w neg1, I
-    vffint.s.w neg1, neg1
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bge $r0, I, .L23
     .align 3
 
 .L10:
-    vld VX0, X, 0 * SIZE
-    vld VX1, X, 4 * SIZE
-    addi.d I, I, -1
+    vld VX0, X, 0
+    vld VX1, X, 16
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
-    vld VX0, X, 8 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vld VX1, X, 12 * SIZE
-    vfadd.s VM1, x1, x2
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
     vpickev.w x1, VX1, VX0
     vpickod.w x2, VX1, VX0
-    vfmul.s x3, neg1, x1
-    vfmul.s x4, neg1, x2
-    vfcmp.clt.s VT0, x1, res0
-    vfcmp.clt.s VT1, x2, res0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1
+    VFMIN VM0, VM0, VM1
+#ifdef DOUBLE
+    vld VX0, X, 64
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1
+    VFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
     addi.d X, X, 16 * SIZE
-    vbitsel.v x1, x1, x3, VT0
-    vbitsel.v x2, x2, x4, VT1
-    vfadd.s x1, x1, x2
-    vfmin.s VM1, x1, VM1
-    vfmin.s VM0, VM0, VM1
     blt $r0, I, .L10
     .align 3
 
 .L11:
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    VFMIN VM0, x1, x2
+#else
     vreplvei.w x1, VM0, 0
     vreplvei.w x2, VM0, 1
     vreplvei.w x3, VM0, 2
     vreplvei.w x4, VM0, 3
-    vfmin.s VM1, x1, x2
-    vfmin.s VM0, x3, x4
-    vfmin.s VM0, VM0, VM1
+    VFMIN VM1, x1, x2
+    VFMIN VM0, x3, x4
+    VFMIN VM0, VM0, VM1
+#endif
     b .L23
     .align 3
 
@@ -124,66 +161,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 
 .L21:
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s1, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s1, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
     addi.d I, I, -1
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s3, t1, t3
-    fld.s t1, X, 0 * SIZE
-    fld.s t2, X, 1 * SIZE
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s3, t1, t3
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
     add.d X, X, INCX
-    fld.s t3, X, 0 * SIZE
-    fld.s t4, X, 1 * SIZE
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
     add.d X, X, INCX
-    fabs.s t1, t1
-    fabs.s t2, t2
-    fabs.s t3, t3
-    fabs.s t4, t4
-    fadd.s t1, t1, t2
-    fadd.s t3, t3, t4
-    fmin.s s4, t1, t3
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN s4, t1, t3
     blt $r0, I, .L21
     .align 3
 
 .L22:
-    fmin.s s1, s1, s2
-    fmin.s s3, s3, s4
-    fmin.s s1, s1, s3
+    FMIN s1, s1, s2
+    FMIN s3, s3, s4
+    FMIN s1, s1, s3
     .align 3
 
 .L23: //N<8
@@ -192,19 +229,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     .align 3
 
 .L24:
-    fld.s a0, X, 0 * SIZE
-    fld.s a1, X, 1 * SIZE
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
     addi.d I, I, -1
-    fabs.s a0, a0
-    fabs.s a1, a1
-    fadd.s a0, a0, a1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
     add.d  X, X, INCX
-    fmin.s s1, a0, s1
+    FMIN s1, a0, s1
     blt $r0, I, .L24
     .align 3
 
 .L999:
-    fmov.s $f0, $f22
+    MOV $f0, $f22
     jirl $r0, $r1, 0x0
     .align 3
 

From ec2aa32eb069a8aa9b2013c1f6af6b4cddb1b0dc Mon Sep 17 00:00:00 2001
From: Mark Ryan <markdryan@rivosinc.com>
Date: Thu, 25 Jan 2024 15:20:58 +0000
Subject: [PATCH 606/718] Fix crash in cpuid_riscv64.c

The crash is reproducible when building OpenBLAS without forcing a
target in a riscv64 container running on an X86_64 machine with an
older version of QEMU, e.g., 7.0.0, registered with binfmt_misc to
run riscv64 binaries.  With this setup, cat /proc/cpuinfo in the
container returns the cpu information for the host, which contains a
"model name" string, and we execute the buggy code.  The code in
question is searching in an uninitialised buffer for the ':' character
and doesn't check to see whether it was found or not.  This can result
in pmodel containing the pointer value 1 and a crash when pmodel is
defererenced.  The algorithm to detect the C910V CPU has not been
modified, merely fixed to prevent the crash.

A few additional checks for NULL pointers are added to improve the
robustness of the code and a whitespace error is corrected.
---
 cpuid_riscv64.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 894d2b873..7e08af831 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -86,23 +86,29 @@ int detect(void){
   char *pmodel = NULL, *pisa = NULL;
 
   infile = fopen("/proc/cpuinfo", "r");
+  if (!infile)
+    return CPU_GENERIC;
   while (fgets(buffer, sizeof(buffer), infile)){
     if(!strncmp(buffer, "model name", 10)){
       strcpy(model_buffer, buffer);
-      pmodel = strchr(isa_buffer, ':') + 1;
+      pmodel = strchr(model_buffer, ':');
+      if (pmodel)
+        pmodel++;
     }
 
     if(!strncmp(buffer, "isa", 3)){
       strcpy(isa_buffer, buffer);
-      pisa = strchr(isa_buffer, '4') + 1;
+      pisa = strchr(isa_buffer, '4');
+      if (pisa)
+        pisa++;
     }
   }
 
   fclose(infile);
 
-  if (!pmodel)
+  if (!pmodel || !pisa)
    return(CPU_GENERIC);
-   
+
   if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
     return CPU_C910V;
 

From e0b610d01ff4cdb44fd2453b7efa7f51ea84e575 Mon Sep 17 00:00:00 2001
From: Mark Ryan <markdryan@rivosinc.com>
Date: Fri, 26 Jan 2024 13:57:33 +0000
Subject: [PATCH 607/718] Harmonize riscv64 LIBNAME for forced and non-forced
 targets

The forced values for LIBNAME were either riscv64_generic or c910v
while the non-forced value of LIBNAME was always riscv64.
---
 cpuid_riscv64.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 7e08af831..13009753a 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -78,6 +78,11 @@ static char *cpuname[] = {
   "C910V"
 };
 
+static char *cpuname_lower[] = {
+  "riscv64_generic",
+  "c910v"
+};
+
 int detect(void){
 #ifdef __linux
   FILE *infile;
@@ -146,5 +151,5 @@ void get_cpuconfig(void){
 }
 
 void get_libname(void){
-  printf("riscv64\n");
+  printf("%s", cpuname_lower[detect()]);
 }

From 48a4c4d454530d46c0143f4f9bd3f5f7a8374757 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 26 Jan 2024 16:30:52 +0100
Subject: [PATCH 608/718] Use +sve in arch declarations of the fallback paths
 for SVE targets

---
 Makefile.arm64 | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Makefile.arm64 b/Makefile.arm64
index ed52a9424..ca053b03d 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -116,13 +116,13 @@ endif
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 endif
@@ -138,7 +138,7 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
 ifneq ($(OSNAME), Darwin)
 CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
@@ -156,13 +156,13 @@ endif
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 endif

From 519ea6e87aa357787896986836f853192c829930 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 30 Jan 2024 10:39:22 +0800
Subject: [PATCH 609/718] utest: Add utest for the {sc/dz}amax and
 {s/d/sc/dz}amin

---
 utest/CMakeLists.txt |  1 +
 utest/Makefile       |  3 +-
 utest/test_amax.c    | 35 +++++++++++++++--
 utest/test_amin.c    | 89 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 123 insertions(+), 5 deletions(-)
 create mode 100644 utest/test_amin.c

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index c47954ce4..41829bd22 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -16,6 +16,7 @@ else ()
     test_dnrm2.c
     test_swap.c
     test_zscal.c
+    test_amin.c
   )
 endif ()
 
diff --git a/utest/Makefile b/utest/Makefile
index d0715c754..8acaa3ea9 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -11,7 +11,8 @@ UTESTBIN=openblas_utest
 
 include $(TOPDIR)/Makefile.system
 
-OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o
+OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \
+     test_amin.o
 #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o
 
 ifneq ($(NO_LAPACK), 1)
diff --git a/utest/test_amax.c b/utest/test_amax.c
index a9e5a1c85..e9775caf0 100644
--- a/utest/test_amax.c
+++ b/utest/test_amax.c
@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2016, The OpenBLAS Project
+Copyright (c) 2011-2024, The OpenBLAS Project
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -13,9 +13,9 @@ met:
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.
-   3. Neither the name of the OpenBLAS project nor the names of 
-      its contributors may be used to endorse or promote products 
-      derived from this software without specific prior written 
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@@ -57,4 +57,31 @@ CTEST(amax, damax){
   ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
 }
 #endif
+#ifdef BUILD_COMPLEX
+CTEST(amax, scamax){
+  blasint N = 9, inc = 1;
+  float te_max = 0.0, tr_max = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	        -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+		-7.7, 8.8 };
 
+  te_max = BLASFUNC(scamax)(&N, x, &inc);
+  tr_max = 20.0;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX16
+CTEST(amax, dzamax){
+  blasint N = 9, inc = 1;
+  double te_max = 0.0, tr_max = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	         -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+		 -7.7, 8.8 };
+
+  te_max = BLASFUNC(dzamax)(&N, x, &inc);
+  tr_max = 20.0;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
+}
+#endif
diff --git a/utest/test_amin.c b/utest/test_amin.c
new file mode 100644
index 000000000..1305ab8ef
--- /dev/null
+++ b/utest/test_amin.c
@@ -0,0 +1,89 @@
+/*****************************************************************************
+Copyright (c) 2011-2024, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+#ifdef BUILD_SINGLE
+CTEST(amin, samin){
+  blasint N = 3, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	        -9.9 };
+
+  te_min = BLASFUNC(samin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_DOUBLE
+CTEST(amin, damin){
+  blasint N = 3, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	         -9.9 };
+
+  te_min = BLASFUNC(damin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX
+CTEST(amin, scamin){
+  blasint N = 9, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	        -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+		-7.7, 8.8 };
+
+  te_min = BLASFUNC(scamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX16
+CTEST(amin, dzamin){
+  blasint N = 9, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	         -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+		 -7.7, 8.8 };
+
+  te_min = BLASFUNC(dzamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif

From a79d11740580db13f101b76c81a02f93654de9a7 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 30 Jan 2024 11:03:56 +0800
Subject: [PATCH 610/718] LoogArch64: Fixed bug for {s/d}amin

---
 kernel/loongarch64/amin_lasx.S | 1 -
 kernel/loongarch64/amin_lsx.S  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S
index 0a4359002..c91a33006 100644
--- a/kernel/loongarch64/amin_lasx.S
+++ b/kernel/loongarch64/amin_lasx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
     xvldrepl.w VM0, X, 0
 #endif
-    XVFSUB VM0, VM0, VM0
     bne INCX, TEMP, .L20
 
     srai.d I, N, 4
diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S
index 644caf43c..c3c3f4ae9 100644
--- a/kernel/loongarch64/amin_lsx.S
+++ b/kernel/loongarch64/amin_lsx.S
@@ -66,7 +66,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
     vldrepl.w VM0, X, 0
 #endif
-    VFSUB VM0, VM0, VM0
     bne INCX, TEMP, .L20
 
     srai.d I, N, 3

From 3d4dfd008556b5a722162def487e0553f807e6e8 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 30 Jan 2024 11:25:59 +0800
Subject: [PATCH 611/718] Benchmark: Rename the executable file names for
 {sc/dz}a{min/max}

No interface named {c/z}a{min/max}, keeping it would
cause ambiguity
---
 benchmark/Makefile | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/benchmark/Makefile b/benchmark/Makefile
index d9ddb9042..6a7c54636 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -265,9 +265,9 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
        ismax.goto idmax.goto \
        isamin.goto idamin.goto icamin.goto izamin.goto \
        ismin.goto idmin.goto \
-       samax.goto damax.goto camax.goto zamax.goto \
+       samax.goto damax.goto scamax.goto dzamax.goto \
        smax.goto dmax.goto \
-       samin.goto damin.goto camin.goto zamin.goto \
+       samin.goto damin.goto scamin.goto dzamin.goto \
        smin.goto dmin.goto \
        saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
        snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)
@@ -2832,12 +2832,12 @@ samax.goto : samax.$(SUFFIX) ../$(LIBNAME)
 damax.goto : damax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
-############################################## CAMAX ##############################################
-camax.goto : camax.$(SUFFIX) ../$(LIBNAME)
+############################################## SCAMAX ##############################################
+scamax.goto : scamax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
-############################################## ZAMAX ##############################################
-zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME)
+############################################## DZAMAX ##############################################
+dzamax.goto : dzamax.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
 ############################################## SMAX ##############################################
@@ -2856,12 +2856,12 @@ samin.goto : samin.$(SUFFIX) ../$(LIBNAME)
 damin.goto : damin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
-############################################## CAMIN ##############################################
-camin.goto : camin.$(SUFFIX) ../$(LIBNAME)
+############################################## SCAMIN ##############################################
+scamin.goto : scamin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
-############################################## ZAMIN ##############################################
-zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME)
+############################################## DZAMIN ##############################################
+dzamin.goto : dzamin.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
 ############################################## SMIN ##############################################
@@ -3383,10 +3383,10 @@ samax.$(SUFFIX) : amax.c
 damax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX  -DDOUBLE -o $(@F) $^
 
-camax.$(SUFFIX) : amax.c
+scamax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX  -UDOUBLE -o $(@F) $^
 
-zamax.$(SUFFIX) : amax.c
+dzamax.$(SUFFIX) : amax.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX  -DDOUBLE -o $(@F) $^
 
 
@@ -3403,10 +3403,10 @@ samin.$(SUFFIX) : amin.c
 damin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
 
-camin.$(SUFFIX) : amin.c
+scamin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
 
-zamin.$(SUFFIX) : amin.c
+dzamin.$(SUFFIX) : amin.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
 
 

From 0d7fe5ea610d46afaed9f5164f6a11729e2429de Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Mon, 29 Jan 2024 22:33:47 -0800
Subject: [PATCH 612/718] clean up whitespace

---
 driver/others/blas_server_win32.c | 192 +++++++++++++-----------------
 1 file changed, 85 insertions(+), 107 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 5820a55f4..68dde584b 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -72,19 +72,9 @@ static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
 static volatile int thread_target;	// target num of live threads, volatile for cross-thread reads
 
-#if defined (__GNUC__) && (__GNUC__ < 6)
-	#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
-#else
-	#if defined(_WIN64)
-		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
-	#else
-		#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
-	#endif
-#endif
-
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 
-      if (!(mode & BLAS_COMPLEX)){
+      if (!(mode & BLAS_COMPLEX)) {
 #ifdef EXPRECISION
 	if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
 	  /* REAL / Extended Double */
@@ -99,7 +89,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 		args -> c, args -> ldc, sb);
 	} else
 #endif
-	  if ((mode & BLAS_PREC) == BLAS_DOUBLE){
+	  if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
 	    /* REAL / Double */
 	    void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
 			  double *, BLASLONG, double *, BLASLONG,
@@ -110,7 +100,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 		  args -> a, args -> lda,
 		  args -> b, args -> ldb,
 		  args -> c, args -> ldc, sb);
-	  } else if ((mode & BLAS_PREC) == BLAS_SINGLE){
+	  } else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
 	    /* REAL / Single */
 	    void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
 			  float *, BLASLONG, float *, BLASLONG,
@@ -122,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 		  args -> b, args -> ldb,
 		  args -> c, args -> ldc, sb);
 #ifdef BUILD_BFLOAT16
-          } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
+          } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) {
             /* REAL / BFLOAT16 */
             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
                           bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
@@ -133,7 +123,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
                   args -> a, args -> lda,
                   args -> b, args -> ldb,
                   args -> c, args -> ldc, sb);
-          } else if ((mode & BLAS_PREC) == BLAS_STOBF16){
+          } else if ((mode & BLAS_PREC) == BLAS_STOBF16) {
             /* REAL / BLAS_STOBF16 */
             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
                           float *, BLASLONG, bfloat16 *, BLASLONG,
@@ -144,7 +134,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
                   args -> a, args -> lda,
                   args -> b, args -> ldb,
                   args -> c, args -> ldc, sb);
-          } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
+          } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) {
             /* REAL / BLAS_DTOBF16 */
             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
                           double *, BLASLONG, bfloat16 *, BLASLONG,
@@ -161,7 +151,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 	  }
       } else {
 #ifdef EXPRECISION
-	if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
+	if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
 	  /* COMPLEX / Extended Double */
 	  void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
 			xdouble *, BLASLONG, xdouble *, BLASLONG,
@@ -175,7 +165,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 		args -> c, args -> ldc, sb);
 	} else
 #endif
-	  if ((mode & BLAS_PREC) == BLAS_DOUBLE){
+	  if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
 	    /* COMPLEX / Double */
 	  void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
 			double *, BLASLONG, double *, BLASLONG,
@@ -205,10 +195,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
       }
 }
 
-/* This is a main routine of threads. Each thread waits until job is */
-/* queued.                                                           */
-
-static DWORD WINAPI blas_thread_server(void *arg){
+// This is a main routine of threads. Each thread waits until job is
+// queued.
+static DWORD WINAPI blas_thread_server(void *arg) {
 
   /* Thread identifier */
   BLASLONG  cpu = (BLASLONG)arg;
@@ -221,24 +210,22 @@ static DWORD WINAPI blas_thread_server(void *arg){
 
   MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
 
-  while (1){
+  while (1) {
 
     /* Waiting for Queue */
 
     MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
 
-	// event raised when work is added to the queue
-	WaitForSingleObject(kickoff_event, INFINITE);
+    // event raised when work is added to the queue
+    WaitForSingleObject(kickoff_event, INFINITE);
 
-	if (cpu > thread_target - 2)
-	{
-		//MT_TRACE("thread [%d] exiting.\n", cpu);
-		break;	// excess thread, so worker thread exits
-	}
+    if (cpu > thread_target - 2) {
+      //MT_TRACE("thread [%d] exiting.\n", cpu);
+      break;	// excess thread, so worker thread exits
+    }
 
     MT_TRACE("Server[%2ld] Got it.\n", cpu);
 
-#if 1
     EnterCriticalSection(&queue_lock);
 
     queue = work_queue;
@@ -246,51 +233,39 @@ static DWORD WINAPI blas_thread_server(void *arg){
         work_queue = work_queue->next;
 
     LeaveCriticalSection(&queue_lock);
-#else
-    volatile blas_queue_t* queue_next;
-
-    INT_PTR prev_value;
-    do {
-        queue = (volatile blas_queue_t*)work_queue;
-        if (!queue)
-            break;
-
-        queue_next = (volatile blas_queue_t*)queue->next;
-        prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
-    } while (prev_value != queue);
-#endif
 
-    if (queue)  {
+    if (queue) {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 
       sa = queue -> sa;
       sb = queue -> sb;
 
-#ifdef CONSISTENT_FPCSR
-      __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
-      __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
-#endif
+      #ifdef CONSISTENT_FPCSR
+        __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
+        __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
+      #endif
 
       MT_TRACE("Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
 	      cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
 
       // fprintf(stderr, "queue start[%ld]!!!\n", cpu);
 
-#ifdef MONITOR
-      main_status[cpu] = MAIN_RUNNING1;
-#endif
+      #ifdef MONITOR
+        main_status[cpu] = MAIN_RUNNING1;
+      #endif
 
-      if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
+      if (sa == NULL) 
+        sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
 
       if (sb == NULL) {
-	if (!(queue -> mode & BLAS_COMPLEX)){
+        if (!(queue -> mode & BLAS_COMPLEX)) {
 #ifdef EXPRECISION
-	  if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
+	  if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) {
 	    sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
 					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 	  } else
 #endif
-	    if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
+	    if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
 #ifdef BUILD_DOUBLE
 	      sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
 					  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
@@ -324,26 +299,25 @@ static DWORD WINAPI blas_thread_server(void *arg){
             /* Other types in future */
 	    }
 	}
-	queue->sb=sb;
+      	queue->sb=sb;
       }
 
-#ifdef MONITOR
-      main_status[cpu] = MAIN_RUNNING2;
-#endif
+      #ifdef MONITOR
+        main_status[cpu] = MAIN_RUNNING2;
+      #endif
 
       if (!(queue -> mode & BLAS_LEGACY)) {
-
-	(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
+      	(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
       } else {
-	legacy_exec(routine, queue -> mode, queue -> args, sb);
+  	    legacy_exec(routine, queue -> mode, queue -> args, sb);
       }
-    }else{
-		continue; //if queue == NULL
-	}
+    } else {
+  		continue; //if queue == NULL
+	  }
 
     MT_TRACE("Server[%2ld] Finished!\n", cpu);
 	
-	queue->finished = 1;
+	  queue->finished = 1;
   }
 
   /* Shutdown procedure */
@@ -353,10 +327,12 @@ static DWORD WINAPI blas_thread_server(void *arg){
   blas_memory_free(buffer);
 
   return 0;
-  }
+}
 
-/* Initializing routine */
-int blas_thread_init(void){
+//
+// Initializing routine
+//
+int blas_thread_init(void) {
   BLASLONG i;
 
   if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
@@ -365,16 +341,16 @@ int blas_thread_init(void){
 
   MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
 
-  if (!blas_server_avail){
-	// create the kickoff Event
-	kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
+  if (!blas_server_avail) {
+    // create the kickoff Event
+    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
-	thread_target = blas_cpu_number;
+    thread_target = blas_cpu_number;
 
     InitializeCriticalSection(&queue_lock);
 
-    for(i = 0; i < blas_cpu_number - 1; i++){
-	  //MT_TRACE("thread_init: creating thread [%d]\n", i);
+    for(i = 0; i < blas_cpu_number - 1; i++) {
+	    //MT_TRACE("thread_init: creating thread [%d]\n", i);
 
       blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,
@@ -391,13 +367,10 @@ int blas_thread_init(void){
 
 /*
    User can call one of two routines.
-
      exec_blas_async ... immediately returns after jobs are queued.
-
      exec_blas       ... returns after jobs are finished.
 */
-
-int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
+int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
 
 #if defined(SMP_SERVER)
   // Handle lazy re-init of the thread-pool after a POSIX fork
@@ -417,7 +390,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
     __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
 #endif
 
-	current->finished = 0;
+  	current->finished = 0;
     current = current -> next;
     pos ++;
   }
@@ -426,18 +399,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 
   if (!work_queue)
   {
-      work_queue = queue;
+    work_queue = queue;
   }
   else
   {
 	  blas_queue_t *next_item = work_queue;
 
-      // find the end of the work queue
-      while (next_item)
-          next_item = next_item->next;
+    // find the end of the work queue
+    while (next_item)
+        next_item = next_item->next;
 
-      // add new work to the end
-      next_item = queue;
+    // add new work to the end
+    next_item = queue;
   }
 
   LeaveCriticalSection(&queue_lock);
@@ -447,20 +420,24 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
   return 0;
 }
 
-int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
+//
+// Join. Wait for all queued tasks to complete
+//
+int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
 
-    MT_TRACE("Synchronization Waiting.\n");
+  MT_TRACE("Synchronization Waiting.\n");
 
-    while (num){
-      MT_TRACE("Waiting Queue ..\n");
-	  while (!queue->finished)
-		  YIELDING;
+  while (num) {
+    MT_TRACE("Waiting Queue ..\n");
 
-  	  queue = queue->next;
-      num--;
-    }
+    while (!queue->finished)
+      YIELDING;
 
-    MT_TRACE("Completely Done.\n\n");
+    queue = queue->next;
+    num--;
+  }
+
+  MT_TRACE("Completely Done.\n\n");
 
 	// if work was added to the queue after this batch we can't sleep the worker threads
 	// by resetting the event
@@ -474,8 +451,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 	return 0;
 }
 
-/* Execute Threads */
-int exec_blas(BLASLONG num, blas_queue_t *queue){
+//
+// Execute Threads
+//
+int exec_blas(BLASLONG num, blas_queue_t *queue) {
 
 #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
   // Handle lazy re-init of the thread-pool after a POSIX fork
@@ -507,9 +486,8 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
   return 0;
 }
 
-/* Shutdown procedure, but user don't have to call this routine. The */
-/* kernel automatically kill threads.                                */
-
+// Shutdown procedure, but user don't have to call this routine. The
+// kernel automatically kill threads.
 int BLASFUNC(blas_thread_shutdown)(void){
 
   int i;
@@ -518,9 +496,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
   LOCK_COMMAND(&server_lock);
 
-  if (blas_server_avail){
+  if (blas_server_avail) {
 
-    for(i = 0; i < blas_num_threads - 1; i++){
+    for(i = 0; i < blas_num_threads - 1; i++) {
       // Could also just use WaitForMultipleObjects
       DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
 
@@ -555,7 +533,7 @@ void goto_set_num_threads(int num_threads)
 
 	if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
 
-	if (blas_server_avail && num_threads < blas_num_threads)	{
+	if (blas_server_avail && num_threads < blas_num_threads) {
 		LOCK_COMMAND(&server_lock);
 
 		thread_target = num_threads;
@@ -586,7 +564,7 @@ void goto_set_num_threads(int num_threads)
 		thread_target = num_threads;
 
 		//increased_threads = 1;
-	    if (!blas_server_avail){
+	    if (!blas_server_avail) {
 			// create the kickoff Event
 			kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
@@ -595,7 +573,7 @@ void goto_set_num_threads(int num_threads)
 			blas_server_avail = 1;
 		}
 
-		for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
+		for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
 			//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
 
 			blas_threads[i] = CreateThread(NULL, 0,

From 83ce97a4ca44c1aedc9f825bcb11f3a999f09c60 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 30 Jan 2024 16:54:14 +0800
Subject: [PATCH 613/718] LoongArch64: Handle NAN and INF

---
 kernel/loongarch64/cscal_lasx.S | 149 +-------------------------------
 kernel/loongarch64/cscal_lsx.S  | 130 +---------------------------
 2 files changed, 4 insertions(+), 275 deletions(-)

diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S
index 3605a6c0e..f53526663 100644
--- a/kernel/loongarch64/cscal_lasx.S
+++ b/kernel/loongarch64/cscal_lasx.S
@@ -99,7 +99,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     b .L113 //alpha_r != 0.0 && alpha_i == 0.0
 
 .L14:
-    bceqz $fcc1, .L112  //alpha_r == 0.0 && alpha_i != 0.0
+    bceqz $fcc1, .L114  //alpha_r == 0.0 && alpha_i != 0.0
     b .L111 //alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
@@ -117,38 +117,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     b .L997
     .align 3
 
-.L112:  //alpha_r == 0.0 && alpha_i != 0.0
-    xvld VX0, X, 0 * SIZE
-#ifdef DOUBLE
-    xvld VX1, X, 4 * SIZE
-    xvpickev.d x1, VX1, VX0
-    xvpickod.d x2, VX1, VX0
-    xvfmul.d x3, VXAI, x2
-    xvfsub.d x3, VXZ, x3
-    xvfmul.d x4, VXAI, x1
-    xvilvl.d VX2, x4 ,x3
-    xvilvh.d VX3, x4, x3
-    xvst VX2, X, 0 * SIZE
-    xvst VX3, X, 4 * SIZE
-    addi.d X, X, 8 * SIZE
-#else
-    xvld VX1, X, 8 * SIZE
-    xvpickev.w x1, VX1, VX0
-    xvpickod.w x2, VX1, VX0
-    xvfmul.s x3, VXAI, x2
-    xvfsub.s x3, VXZ, x3
-    xvfmul.s x4, VXAI, x1
-    xvilvl.w VX2, x4 ,x3
-    xvilvh.w VX3, x4, x3
-    xvst VX2, X, 0 * SIZE
-    xvst VX3, X, 8 * SIZE
-    addi.d X, X, 16 * SIZE
-#endif
-    addi.d  I, I, -1
-    blt $r0, I, .L112
-    b .L997
-    .align 3
-
 .L113: //alpha_r != 0.0 && alpha_i == 0.0
     xvld VX0, X, 0 * SIZE
 #ifdef DOUBLE
@@ -227,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     b .L223 //alpha_r != 0.0 && alpha_i == 0.0
 
 .L24:
-    bceqz $fcc1, .L222  //alpha_r == 0.0 && alpha_i != 0.0
+    bceqz $fcc1, .L224  //alpha_r == 0.0 && alpha_i != 0.0
     b .L221 //alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
@@ -275,119 +243,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     b .L997
     .align 3
 
-.L222:  //alpha_r == 0.0 && alpha_i != 0.0
-#ifdef DOUBLE
-    ld.d t1, X, 0 * SIZE
-    ld.d t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    ld.d t4, X, 1 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.d x1, t1, 0
-    xvinsgr2vr.d x2, t2, 0
-    xvinsgr2vr.d x1, t3, 1
-    xvinsgr2vr.d x2, t4, 1
-    ld.d t1, X, 0 * SIZE
-    ld.d t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    ld.d t4, X, 1 * SIZE
-    xvinsgr2vr.d x1, t1, 2
-    xvinsgr2vr.d x2, t2, 2
-    xvinsgr2vr.d x1, t3, 3
-    xvinsgr2vr.d x2, t4, 3
-    add.d X, X, INCX
-
-    xvfmul.d x3, VXAI, x2
-    xvfsub.d x3, VXZ, x3
-    xvfmul.d x4, VXAI, x1
-    addi.d  I, I, -1
-    xvstelm.d x3, XX, 0 * SIZE, 0
-    xvstelm.d x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    xvstelm.d x3, XX, 0 * SIZE, 1
-    xvstelm.d x4, XX, 1 * SIZE, 1
-    add.d XX, XX, INCX
-    xvstelm.d x3, XX, 0 * SIZE, 2
-    xvstelm.d x4, XX, 1 * SIZE, 2
-    add.d XX, XX, INCX
-    xvstelm.d x3, XX, 0 * SIZE, 3
-    xvstelm.d x4, XX, 1 * SIZE, 3
-#else
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w x1, t1, 0
-    xvinsgr2vr.w x2, t2, 0
-    xvinsgr2vr.w x1, t3, 1
-    xvinsgr2vr.w x2, t4, 1
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    xvinsgr2vr.w x1, t1, 2
-    xvinsgr2vr.w x2, t2, 2
-    xvinsgr2vr.w x1, t3, 3
-    xvinsgr2vr.w x2, t4, 3
-    add.d X, X, INCX
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    add.d X, X, INCX
-    xvinsgr2vr.w x1, t1, 4
-    xvinsgr2vr.w x2, t2, 4
-    xvinsgr2vr.w x1, t3, 5
-    xvinsgr2vr.w x2, t4, 5
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    xvinsgr2vr.w x1, t1, 6
-    xvinsgr2vr.w x2, t2, 6
-    xvinsgr2vr.w x1, t3, 7
-    xvinsgr2vr.w x2, t4, 7
-    add.d X, X, INCX
-
-    xvfmul.s x3, VXAI, x2
-    xvfsub.s x3, VXZ, x3
-    xvfmul.s x4, VXAI, x1
-    addi.d  I, I, -1
-    xvstelm.w x3, XX, 0 * SIZE, 0
-    xvstelm.w x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 1
-    xvstelm.w x4, XX, 1 * SIZE, 1
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 2
-    xvstelm.w x4, XX, 1 * SIZE, 2
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 3
-    xvstelm.w x4, XX, 1 * SIZE, 3
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 4
-    xvstelm.w x4, XX, 1 * SIZE, 4
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 5
-    xvstelm.w x4, XX, 1 * SIZE, 5
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 6
-    xvstelm.w x4, XX, 1 * SIZE, 6
-    add.d XX, XX, INCX
-    xvstelm.w x3, XX, 0 * SIZE, 7
-    xvstelm.w x4, XX, 1 * SIZE, 7
-#endif
-    add.d XX, XX, INCX
-    blt $r0, I, .L222
-    b .L997
-    .align 3
-
 .L223: //alpha_r != 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S
index f442a754f..241d3d16e 100644
--- a/kernel/loongarch64/cscal_lsx.S
+++ b/kernel/loongarch64/cscal_lsx.S
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     b .L113 //alpha_r != 0.0 && alpha_i == 0.0
 
 .L14:
-    bceqz $fcc1, .L112  //alpha_r == 0.0 && alpha_i != 0.0
+    bceqz $fcc1, .L114  //alpha_r == 0.0 && alpha_i != 0.0
     b .L111 //alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
@@ -116,48 +116,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     b .L997
     .align 3
 
-.L112:  //alpha_r == 0.0 && alpha_i != 0.0
-    vld VX0, X, 0 * SIZE
-#ifdef DOUBLE
-    vld VX1, X, 2 * SIZE
-    vpickev.d x1, VX1, VX0
-    vpickod.d x2, VX1, VX0
-    vfmul.d x3, VXAI, x2
-    vfsub.d x3, VXZ, x3
-    vfmul.d x4, VXAI, x1
-    vilvl.d VX2, x4 ,x3
-    vilvh.d VX3, x4, x3
-    vst VX2, X, 0 * SIZE
-    vst VX3, X, 2 * SIZE
-    vld VX0, X, 4 * SIZE
-    vld VX1, X, 6 * SIZE
-    vpickev.d x1, VX1, VX0
-    vpickod.d x2, VX1, VX0
-    vfmul.d x3, VXAI, x2
-    vfsub.d x3, VXZ, x3
-    vfmul.d x4, VXAI, x1
-    vilvl.d VX2, x4 ,x3
-    vilvh.d VX3, x4, x3
-    vst VX2, X, 4 * SIZE
-    vst VX3, X, 6 * SIZE
-#else
-    vld VX1, X, 4 * SIZE
-    vpickev.w x1, VX1, VX0
-    vpickod.w x2, VX1, VX0
-    vfmul.s x3, VXAI, x2
-    vfsub.s x3, VXZ, x3
-    vfmul.s x4, VXAI, x1
-    vilvl.w VX2, x4 ,x3
-    vilvh.w VX3, x4, x3
-    vst VX2, X, 0 * SIZE
-    vst VX3, X, 4 * SIZE
-#endif
-    addi.d X, X, 8 * SIZE
-    addi.d  I, I, -1
-    blt $r0, I, .L112
-    b .L997
-    .align 3
-
 .L113: //alpha_r != 0.0 && alpha_i == 0.0
     vld VX0, X, 0 * SIZE
 #ifdef DOUBLE
@@ -256,7 +214,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     b .L223 //alpha_r != 0.0 && alpha_i == 0.0
 
 .L24:
-    bceqz $fcc1, .L222  //alpha_r == 0.0 && alpha_i != 0.0
+    bceqz $fcc1, .L224  //alpha_r == 0.0 && alpha_i != 0.0
     b .L221 //alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
@@ -292,90 +250,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     b .L997
     .align 3
 
-.L222:  //alpha_r == 0.0 && alpha_i != 0.0
-#ifdef DOUBLE
-    ld.d t1, X, 0 * SIZE
-    ld.d t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    ld.d t4, X, 1 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.d x1, t1, 0
-    vinsgr2vr.d x2, t2, 0
-    vinsgr2vr.d x1, t3, 1
-    vinsgr2vr.d x2, t4, 1
-    vfmul.d x3, VXAI, x2
-    vfsub.d x3, VXZ, x3
-    vfmul.d x4, VXAI, x1
-    vstelm.d x3, XX, 0 * SIZE, 0
-    vstelm.d x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    vstelm.d x3, XX, 0 * SIZE, 1
-    vstelm.d x4, XX, 1 * SIZE, 1
-    add.d XX, XX, INCX
-
-    ld.d t1, X, 0 * SIZE
-    ld.d t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.d t3, X, 0 * SIZE
-    ld.d t4, X, 1 * SIZE
-    vinsgr2vr.d x1, t1, 0
-    vinsgr2vr.d x2, t2, 0
-    vinsgr2vr.d x1, t3, 1
-    vinsgr2vr.d x2, t4, 1
-    add.d X, X, INCX
-    vfmul.d x3, VXAI, x2
-    vfsub.d x3, VXZ, x3
-    vfmul.d x4, VXAI, x1
-    addi.d  I, I, -1
-    vstelm.d x3, XX, 0 * SIZE, 0
-    vstelm.d x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    vstelm.d x3, XX, 0 * SIZE, 1
-    vstelm.d x4, XX, 1 * SIZE, 1
-#else
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    add.d X, X, INCX
-    vinsgr2vr.w x1, t1, 0
-    vinsgr2vr.w x2, t2, 0
-    vinsgr2vr.w x1, t3, 1
-    vinsgr2vr.w x2, t4, 1
-    ld.w t1, X, 0 * SIZE
-    ld.w t2, X, 1 * SIZE
-    add.d X, X, INCX
-    ld.w t3, X, 0 * SIZE
-    ld.w t4, X, 1 * SIZE
-    vinsgr2vr.w x1, t1, 2
-    vinsgr2vr.w x2, t2, 2
-    vinsgr2vr.w x1, t3, 3
-    vinsgr2vr.w x2, t4, 3
-    add.d X, X, INCX
-
-    vfmul.s x3, VXAI, x2
-    vfsub.s x3, VXZ, x3
-    vfmul.s x4, VXAI, x1
-    addi.d  I, I, -1
-    vstelm.w x3, XX, 0 * SIZE, 0
-    vstelm.w x4, XX, 1 * SIZE, 0
-    add.d XX, XX, INCX
-    vstelm.w x3, XX, 0 * SIZE, 1
-    vstelm.w x4, XX, 1 * SIZE, 1
-    add.d XX, XX, INCX
-    vstelm.w x3, XX, 0 * SIZE, 2
-    vstelm.w x4, XX, 1 * SIZE, 2
-    add.d XX, XX, INCX
-    vstelm.w x3, XX, 0 * SIZE, 3
-    vstelm.w x4, XX, 1 * SIZE, 3
-#endif
-    add.d XX, XX, INCX
-    blt $r0, I, .L222
-    b .L997
-    .align 3
-
 .L223: //alpha_r != 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
     ld.d t1, X, 0 * SIZE

From bb043a021f138a3915c835776fdfe90673644db4 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 30 Jan 2024 17:27:59 +0800
Subject: [PATCH 614/718] utest: Add tests for zscal

---
 utest/test_zscal.c | 52 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/utest/test_zscal.c b/utest/test_zscal.c
index 8992eee90..ffc851e8b 100644
--- a/utest/test_zscal.c
+++ b/utest/test_zscal.c
@@ -20,6 +20,18 @@ CTEST(zscal, i_nan)
     ASSERT_TRUE(isnan(nan[17]));
 }
 
+CTEST(zscal, i_nan_inc_2)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0,
+                    NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+    cblas_zscal(9, i, &nan, 2);
+    ASSERT_TRUE(isnan(nan[0]));
+    ASSERT_TRUE(isnan(nan[1]));
+    ASSERT_TRUE(isnan(nan[16]));
+    ASSERT_TRUE(isnan(nan[17]));
+}
+
 CTEST(zscal, nan_i)
 {
     double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
@@ -30,7 +42,19 @@ CTEST(zscal, nan_i)
     ASSERT_TRUE(isnan(i[16]));
     ASSERT_TRUE(isnan(i[17]));
 }
-	    
+
+CTEST(zscal, nan_i_inc_2)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
+                  0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double nan[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};
+    cblas_zscal(9, &nan, &i, 2);
+    ASSERT_TRUE(isnan(i[0]));
+    ASSERT_TRUE(isnan(i[1]));
+    ASSERT_TRUE(isnan(i[16]));
+    ASSERT_TRUE(isnan(i[17]));
+}
+
 CTEST(zscal, i_inf)
 {
     double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
@@ -40,7 +64,19 @@ CTEST(zscal, i_inf)
     ASSERT_TRUE(isinf(inf[1]));
     ASSERT_TRUE(isnan(inf[16]));
     ASSERT_TRUE(isinf(inf[17]));
-}    
+}
+
+CTEST(zscal, i_inf_inc_2)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0,
+                    INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+    cblas_zscal(9, i, &inf, 2);
+    ASSERT_TRUE(isnan(inf[0]));
+    ASSERT_TRUE(isinf(inf[1]));
+    ASSERT_TRUE(isnan(inf[16]));
+    ASSERT_TRUE(isinf(inf[17]));
+}
 
 CTEST(zscal, inf_i)
 {
@@ -53,4 +89,16 @@ CTEST(zscal, inf_i)
     ASSERT_TRUE(isinf(i[17]));
 }
 
+CTEST(zscal, inf_i_inc_2)
+{
+    double i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
+                  0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };
+    double inf[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};
+    cblas_zscal(9, &inf, &i, 2);
+    ASSERT_TRUE(isnan(i[0]));
+    ASSERT_TRUE(isinf(i[1]));
+    ASSERT_TRUE(isnan(i[16]));
+    ASSERT_TRUE(isinf(i[17]));
+}
+
 #endif

From 09bb48d1b97aaf60ea40707658bf6761a9637424 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <chip.kerchner@ibm.com>
Date: Tue, 30 Jan 2024 09:13:16 -0600
Subject: [PATCH 615/718] Vectorize in-copy packing/copying for SGEMM - 4X
 faster.

---
 kernel/power/KERNEL.POWER10         |   2 +-
 kernel/power/KERNEL.POWER8          |   7 +-
 kernel/power/KERNEL.POWER9          |   2 +-
 kernel/power/sgemm_ncopy_16_power.c | 482 ++++++++++++++++++++++++++++
 4 files changed, 487 insertions(+), 6 deletions(-)
 create mode 100755 kernel/power/sgemm_ncopy_16_power.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 9047c714c..c84cd91d2 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -25,7 +25,7 @@ ZTRMMKERNEL	= zgemm_kernel_power10.S
 endif
 
 SGEMMKERNEL    =  sgemm_kernel_power10.c
-SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+SGEMMINCOPY    = sgemm_ncopy_16_power.c
 SGEMMITCOPY    = sgemm_tcopy_16_power8.S
 SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
 SGEMMOTCOPY    = sgemm_tcopy_8_power8.S
diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 2b8e65948..36222348a 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -1,11 +1,9 @@
 # Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM
 ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
 SGEMMKERNEL    =  gemm_kernel_power6.S
-SGEMMINCOPY    =
 SGEMMITCOPY    =
 SGEMMONCOPY    =  gemm_ncopy_4.S
 SGEMMOTCOPY    =  gemm_tcopy_4.S
-SGEMMINCOPYOBJ =
 SGEMMITCOPYOBJ =
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
@@ -50,11 +48,9 @@ CTRMMKERNEL	= ctrmm_kernel_8x4_power8.S
 ZTRMMKERNEL	= ztrmm_kernel_8x2_power8.S
 
 SGEMMKERNEL    =  sgemm_kernel_16x8_power8.S
-SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
 SGEMMITCOPY    = sgemm_tcopy_16_power8.S
 SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
 SGEMMOTCOPY    = sgemm_tcopy_8_power8.S
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
@@ -90,6 +86,9 @@ ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
 ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
 
+SGEMMINCOPY    = sgemm_ncopy_16_power.c
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+
 STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
 STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index b6b102b3e..7d007d1a2 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -13,7 +13,7 @@ CTRMMKERNEL	= cgemm_kernel_power9.S
 ZTRMMKERNEL	= zgemm_kernel_power9.S
 
 SGEMMKERNEL    =  sgemm_kernel_power9.S
-SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+SGEMMINCOPY    = sgemm_ncopy_16_power.c
 SGEMMITCOPY    = sgemm_tcopy_16_power8.S
 SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
 SGEMMOTCOPY    = sgemm_tcopy_8_power8.S
diff --git a/kernel/power/sgemm_ncopy_16_power.c b/kernel/power/sgemm_ncopy_16_power.c
new file mode 100755
index 000000000..babe1376e
--- /dev/null
+++ b/kernel/power/sgemm_ncopy_16_power.c
@@ -0,0 +1,482 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <altivec.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+  BLASLONG i, j;
+
+  IFLOAT *aoffset;
+  IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+  IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+  IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
+  IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
+
+  IFLOAT *boffset;
+  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+  IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
+  IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+  IFLOAT ctemp17,  ctemp19 ;
+  IFLOAT ctemp21,  ctemp23 ;
+  IFLOAT ctemp25,  ctemp27 ;
+  IFLOAT ctemp29,  ctemp31 ;
+
+  aoffset = a;
+  boffset = b;
+  j = (n >> 4);
+  if (j > 0){
+    do{
+      aoffset1  = aoffset;
+      aoffset2  = aoffset1  + lda;
+      aoffset3  = aoffset2  + lda;
+      aoffset4  = aoffset3  + lda;
+      aoffset5  = aoffset4  + lda;
+      aoffset6  = aoffset5  + lda;
+      aoffset7  = aoffset6  + lda;
+      aoffset8  = aoffset7  + lda;
+      aoffset9  = aoffset8  + lda;
+      aoffset10 = aoffset9  + lda;
+      aoffset11 = aoffset10 + lda;
+      aoffset12 = aoffset11 + lda;
+      aoffset13 = aoffset12 + lda;
+      aoffset14 = aoffset13 + lda;
+      aoffset15 = aoffset14 + lda;
+      aoffset16 = aoffset15 + lda;
+      aoffset += 16 * lda;
+      i = (m >> 2);
+      if (i > 0){
+	vector float c1, c2, c3, c4, c5, c6, c7, c8;
+	vector float c9, c10, c11, c12, c13, c14, c15, c16;
+	vector float t1, t2, t3, t4, t5, t6, t7, t8;
+	vector float t9, t10, t11, t12;
+	do{
+	   c1 = vec_xl(0, aoffset1);
+	   c2 = vec_xl(0, aoffset2);
+	   c3 = vec_xl(0, aoffset3);
+	   c4 = vec_xl(0, aoffset4);
+	   c5 = vec_xl(0, aoffset5);
+	   c6 = vec_xl(0, aoffset6);
+	   c7 = vec_xl(0, aoffset7);
+	   c8 = vec_xl(0, aoffset8);
+	   c9 = vec_xl(0, aoffset9);
+	   c10 = vec_xl(0, aoffset10);
+	   c11 = vec_xl(0, aoffset11);
+	   c12 = vec_xl(0, aoffset12);
+	   c13 = vec_xl(0, aoffset13);
+	   c14 = vec_xl(0, aoffset14);
+	   c15 = vec_xl(0, aoffset15);
+	   c16 = vec_xl(0, aoffset16);
+
+           t1  = vec_mergeh(c1, c2);
+           t2  = vec_mergeh(c3, c4);
+           t3  = vec_mergeh(c5, c6);
+           t4  = vec_mergeh(c7, c8);
+           t9  = vec_mergeh(c9, c10);
+           t10  = vec_mergeh(c11, c12);
+           t11  = vec_mergeh(c13, c14);
+           t12  = vec_mergeh(c15, c16);
+
+	   t5 = vec_xxpermdi(t1, t2, 0b00);
+           t6 = vec_xxpermdi(t3, t4, 0b00);
+	   t7 = vec_xxpermdi(t9, t10, 0b00);
+	   t8 = vec_xxpermdi(t11, t12, 0b00);
+
+	   vec_xst(t5, 0, boffset);
+	   vec_xst(t6, 0, boffset+4);
+	   vec_xst(t7, 0, boffset+8);
+	   vec_xst(t8, 0, boffset+12);
+	   t5 = vec_xxpermdi(t1, t2, 0b11);
+	   t6 = vec_xxpermdi(t3, t4, 0b11);
+	   t7 = vec_xxpermdi(t9, t10, 0b11);
+	   t8 = vec_xxpermdi(t11, t12, 0b11);
+	   vec_xst(t5, 0, boffset+16);
+	   vec_xst(t6, 0, boffset+20);
+	   vec_xst(t7, 0, boffset+24);
+	   vec_xst(t8, 0, boffset+28);
+
+           t1  = vec_mergel(c1, c2);
+           t2  = vec_mergel(c3, c4);
+           t3  = vec_mergel(c5, c6);
+           t4  = vec_mergel(c7, c8);
+           t9  = vec_mergel(c9, c10);
+           t10  = vec_mergel(c11, c12);
+           t11  = vec_mergel(c13, c14);
+           t12  = vec_mergel(c15, c16);
+  	   t5 = vec_xxpermdi(t1, t2, 0b00);
+	   t6 = vec_xxpermdi(t3, t4, 0b00);
+	   t7 = vec_xxpermdi(t9, t10, 0b00);
+	   t8 = vec_xxpermdi(t11, t12, 0b00);
+	   vec_xst(t5, 0, boffset+32);
+	   vec_xst(t6, 0, boffset+36);
+	   vec_xst(t7, 0, boffset+40);
+	   vec_xst(t8, 0, boffset+44);
+
+	   t5 = vec_xxpermdi(t1, t2, 0b11);
+	   t6 = vec_xxpermdi(t3, t4, 0b11);
+	   t7 = vec_xxpermdi(t9, t10, 0b11);
+	   t8 = vec_xxpermdi(t11, t12, 0b11);
+	   vec_xst(t5, 0, boffset+48);
+	   vec_xst(t6, 0, boffset+52);
+	   vec_xst(t7, 0, boffset+56);
+	   vec_xst(t8, 0, boffset+60);
+
+	  aoffset1 +=  4;
+	  aoffset2 +=  4;
+	  aoffset3 +=  4;
+	  aoffset4 +=  4;
+	  aoffset5 +=  4;
+	  aoffset6 +=  4;
+	  aoffset7 +=  4;
+	  aoffset8 +=  4;
+
+	  aoffset9  +=  4;
+	  aoffset10 +=  4;
+	  aoffset11 +=  4;
+	  aoffset12 +=  4;
+	  aoffset13 +=  4;
+	  aoffset14 +=  4;
+	  aoffset15 +=  4;
+	  aoffset16 +=  4;
+	  boffset   += 64;
+
+	  i --;
+	}while(i > 0);
+      }
+      i = (m & 3);
+      if (i > 0){
+        do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp03 = *(aoffset2 +  0);
+	ctemp05 = *(aoffset3 +  0);
+	ctemp07 = *(aoffset4 +  0);
+	ctemp09 = *(aoffset5 +  0);
+	ctemp11 = *(aoffset6 +  0);
+	ctemp13 = *(aoffset7 +  0);
+	ctemp15 = *(aoffset8 +  0);
+
+	ctemp17 = *(aoffset9 +  0);
+	ctemp19 = *(aoffset10 +  0);
+	ctemp21 = *(aoffset11 +  0);
+	ctemp23 = *(aoffset12 +  0);
+	ctemp25 = *(aoffset13 +  0);
+	ctemp27 = *(aoffset14 +  0);
+	ctemp29 = *(aoffset15 +  0);
+	ctemp31 = *(aoffset16 +  0);
+	*(boffset +  0) = ctemp01;
+	*(boffset +  1) = ctemp03;
+	*(boffset +  2) = ctemp05;
+	*(boffset +  3) = ctemp07;
+	*(boffset +  4) = ctemp09;
+	*(boffset +  5) = ctemp11;
+	*(boffset +  6) = ctemp13;
+	*(boffset +  7) = ctemp15;
+
+	*(boffset +  8) = ctemp17;
+	*(boffset +  9) = ctemp19;
+	*(boffset + 10) = ctemp21;
+	*(boffset + 11) = ctemp23;
+	*(boffset + 12) = ctemp25;
+	*(boffset + 13) = ctemp27;
+	*(boffset + 14) = ctemp29;
+	*(boffset + 15) = ctemp31;
+	  aoffset1+=1;
+	  aoffset2+=1;
+	  aoffset3+=1;
+	  aoffset4+=1;
+	  aoffset5+=1;
+	  aoffset6+=1;
+	  aoffset7+=1;
+	  aoffset8+=1;
+	  aoffset9+=1;
+	  aoffset10+=1;
+	  aoffset11+=1;
+	  aoffset12+=1;
+	  aoffset13+=1;
+	  aoffset14+=1;
+	  aoffset15+=1;
+	  aoffset16+=1;
+	boffset  += 16;
+        i --;
+        }while(i > 0);
+      }
+      j--;
+    }while(j > 0);
+  } /* end of if(j > 0) */
+
+  if (n & 8){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset1  + lda;
+    aoffset3  = aoffset2  + lda;
+    aoffset4  = aoffset3  + lda;
+    aoffset5  = aoffset4  + lda;
+    aoffset6  = aoffset5  + lda;
+    aoffset7  = aoffset6  + lda;
+    aoffset8  = aoffset7  + lda;
+    aoffset += 8 * lda;
+
+    i = (m >> 2);
+    if (i > 0){
+      vector float c1, c2, c3, c4, c5, c6, c7, c8;
+      vector float t1, t2, t3, t4, t5, t6, t7, t8;
+      do{
+        c1 = vec_xl(0, aoffset1);
+        c2 = vec_xl(0, aoffset2);
+        c3 = vec_xl(0, aoffset3);
+        c4 = vec_xl(0, aoffset4);
+        c5 = vec_xl(0, aoffset5);
+        c6 = vec_xl(0, aoffset6);
+        c7 = vec_xl(0, aoffset7);
+        c8 = vec_xl(0, aoffset8);
+
+        t1  = vec_mergeh(c1, c2);
+        t2  = vec_mergeh(c3, c4);
+        t3  = vec_mergeh(c5, c6);
+        t4  = vec_mergeh(c7, c8);
+
+        t5 = vec_xxpermdi(t1, t2, 0b00);
+        t6 = vec_xxpermdi(t3, t4, 0b00);
+        t7 = vec_xxpermdi(t1, t2, 0b11);
+        t8 = vec_xxpermdi(t3, t4, 0b11);
+
+        vec_xst(t5, 0, boffset);
+        vec_xst(t6, 0, boffset+4);
+        vec_xst(t7, 0, boffset+8);
+        vec_xst(t8, 0, boffset+12);
+
+        t1  = vec_mergel(c1, c2);
+        t2  = vec_mergel(c3, c4);
+        t3  = vec_mergel(c5, c6);
+        t4  = vec_mergel(c7, c8);
+
+        t5 = vec_xxpermdi(t1, t2, 0b00);
+        t6 = vec_xxpermdi(t3, t4, 0b00);
+        t7 = vec_xxpermdi(t1, t2, 0b11);
+        t8 = vec_xxpermdi(t3, t4, 0b11);
+
+        vec_xst(t5, 0, boffset+16);
+        vec_xst(t6, 0, boffset+20);
+        vec_xst(t7, 0, boffset+24);
+        vec_xst(t8, 0, boffset+28);
+
+        aoffset1 +=  4;
+        aoffset2 +=  4;
+        aoffset3 +=  4;
+        aoffset4 +=  4;
+        aoffset5 +=  4;
+        aoffset6 +=  4;
+        aoffset7 +=  4;
+        aoffset8 +=  4;
+
+        boffset   += 32;
+        i--;
+      }while(i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0) {
+      do {
+        ctemp01 = *(aoffset1 +  0);
+        ctemp03 = *(aoffset2 +  0);
+        ctemp05 = *(aoffset3 +  0);
+        ctemp07 = *(aoffset4 +  0);
+        ctemp09 = *(aoffset5 +  0);
+        ctemp11 = *(aoffset6 +  0);
+        ctemp13 = *(aoffset7 +  0);
+        ctemp15 = *(aoffset8 +  0);
+
+        *(boffset +  0) = ctemp01;
+        *(boffset +  1) = ctemp03;
+        *(boffset +  2) = ctemp05;
+        *(boffset +  3) = ctemp07;
+        *(boffset +  4) = ctemp09;
+        *(boffset +  5) = ctemp11;
+        *(boffset +  6) = ctemp13;
+        *(boffset +  7) = ctemp15;
+
+        aoffset1+=1;
+        aoffset2+=1;
+        aoffset3+=1;
+        aoffset4+=1;
+        aoffset5+=1;
+        aoffset6+=1;
+        aoffset7+=1;
+        aoffset8+=1;
+
+        boffset   += 8;
+        i--;
+      } while (i > 0);
+    }
+  }
+
+  if (n & 4){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset1  + lda;
+    aoffset3  = aoffset2  + lda;
+    aoffset4  = aoffset3  + lda;
+    aoffset += 4 * lda;
+
+    i = (m >> 2);
+    if (i > 0){
+      vector float c1, c2, c3, c4;
+      vector float t1, t2, t3, t4;
+      do{
+        c1 = vec_xl(0, aoffset1);
+        c2 = vec_xl(0, aoffset2);
+        c3 = vec_xl(0, aoffset3);
+        c4 = vec_xl(0, aoffset4);
+
+        t1  = vec_mergeh(c1, c2);
+        t2  = vec_mergeh(c3, c4);
+
+        t3 = vec_xxpermdi(t1, t2, 0b00);
+        t4 = vec_xxpermdi(t1, t2, 0b11);
+
+        vec_xst(t3, 0, boffset);
+        vec_xst(t4, 0, boffset+4);
+
+        t1  = vec_mergel(c1, c2);
+        t2  = vec_mergel(c3, c4);
+
+        t3 = vec_xxpermdi(t1, t2, 0b00);
+        t4 = vec_xxpermdi(t1, t2, 0b11);
+
+        vec_xst(t3, 0, boffset+8);
+        vec_xst(t4, 0, boffset+12);
+
+        aoffset1 +=  4;
+        aoffset2 +=  4;
+        aoffset3 +=  4;
+        aoffset4 +=  4;
+
+        boffset   += 16;
+        i--;
+      }while(i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0) {
+      do {
+        ctemp01 = *(aoffset1 +  0);
+        ctemp03 = *(aoffset2 +  0);
+        ctemp05 = *(aoffset3 +  0);
+        ctemp07 = *(aoffset4 +  0);
+
+        *(boffset +  0) = ctemp01;
+        *(boffset +  1) = ctemp03;
+        *(boffset +  2) = ctemp05;
+        *(boffset +  3) = ctemp07;
+
+        aoffset1+=1;
+        aoffset2+=1;
+        aoffset3+=1;
+        aoffset4+=1;
+
+        boffset   += 4;
+        i--;
+      } while (i > 0);
+    }
+  }
+
+  if (n & 2){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset1  + lda;
+    aoffset += 2 * lda;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset2 +  0);
+	ctemp04 = *(aoffset2 +  1);
+
+	*(boffset +  0) = ctemp01;
+	*(boffset +  1) = ctemp03;
+	*(boffset +  2) = ctemp02;
+	*(boffset +  3) = ctemp04;
+
+	aoffset1 +=  2;
+	aoffset2 +=  2;
+	boffset   += 4;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+      ctemp03 = *(aoffset2 +  0);
+
+      *(boffset +  0) = ctemp01;
+      *(boffset +  1) = ctemp03;
+      boffset   += 2;
+    }
+  }
+
+  if (n & 1){
+    aoffset1  = aoffset;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+
+	*(boffset +  0) = ctemp01;
+	*(boffset +  1) = ctemp02;
+
+	aoffset1 +=  2;
+	boffset   += 2;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+
+      *(boffset +  0) = ctemp01;
+      // boffset   += 1;
+    }
+  }
+
+  return 0;
+}

From 969601a1dcfdc4c44174346b7c752fa338f00737 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Wed, 31 Jan 2024 11:20:25 +0800
Subject: [PATCH 616/718] X86_64: Fixed bug in zscal

Fixed handling of NAN and INF arguments when
inc is greater than 1.
---
 kernel/x86_64/zscal.c | 91 ++++++++++++++++++++++++-------------------
 1 file changed, 50 insertions(+), 41 deletions(-)

diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c
index 66c8a0d2b..bc79c0caf 100644
--- a/kernel/x86_64/zscal.c
+++ b/kernel/x86_64/zscal.c
@@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x )
 
 	for( i=0; i<n; i+=4 )
 	{
-		t0 = da_r *x[0] - da_i *x[1];	
-		t1 = da_r *x[2] - da_i *x[3];	
-		t2 = da_r *x[4] - da_i *x[5];	
-		t3 = da_r *x[6] - da_i *x[7];	
+		t0 = da_r *x[0] - da_i *x[1];
+		t1 = da_r *x[2] - da_i *x[3];
+		t2 = da_r *x[4] - da_i *x[5];
+		t3 = da_r *x[6] - da_i *x[7];
 
 		x[1] = da_r * x[1] + da_i * x[0];
 		x[3] = da_r * x[3] + da_i * x[2];
 		x[5] = da_r * x[5] + da_i * x[4];
 		x[7] = da_r * x[7] + da_i * x[6];
-		
+
 		x[0] = t0;
 		x[2] = t1;
 		x[4] = t2;
@@ -99,16 +99,16 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x )
 
 	for( i=0; i<n; i+=4 )
 	{
-		t0 =  - da_i *x[1];	
-		t1 =  - da_i *x[3];	
-		t2 =  - da_i *x[5];	
-		t3 =  - da_i *x[7];	
+		t0 =  - da_i *x[1];
+		t1 =  - da_i *x[3];
+		t2 =  - da_i *x[5];
+		t3 =  - da_i *x[7];
 
 		x[1] =  da_i * x[0];
 		x[3] =  da_i * x[2];
 		x[5] =  da_i * x[4];
 		x[7] =  da_i * x[6];
-		
+
 		x[0] = t0;
 		x[2] = t1;
 		x[4] = t2;
@@ -129,16 +129,16 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x )
 
 	for( i=0; i<n; i+=4 )
 	{
-		t0 = da_r *x[0];	
-		t1 = da_r *x[2];	
-		t2 = da_r *x[4];	
-		t3 = da_r *x[6];	
+		t0 = da_r *x[0];
+		t1 = da_r *x[2];
+		t2 = da_r *x[4];
+		t3 = da_r *x[6];
 
 		x[1] = da_r * x[1];
 		x[3] = da_r * x[3];
 		x[5] = da_r * x[5];
 		x[7] = da_r * x[7];
-		
+
 		x[0] = t0;
 		x[2] = t1;
 		x[4] = t2;
@@ -157,14 +157,14 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x )
 	BLASLONG i;
 	for( i=0; i<n; i+=4 )
 	{
-		x[0] = 0.0;	
-		x[1] = 0.0;	
-		x[2] = 0.0;	
-		x[3] = 0.0;	
-		x[4] = 0.0;	
-		x[5] = 0.0;	
-		x[6] = 0.0;	
-		x[7] = 0.0;	
+		x[0] = 0.0;
+		x[1] = 0.0;
+		x[2] = 0.0;
+		x[3] = 0.0;
+		x[4] = 0.0;
+		x[5] = 0.0;
+		x[6] = 0.0;
+		x[7] = 0.0;
 		x+=8;
 	}
 
@@ -186,10 +186,10 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
 
 	for ( i=0; i<n; i+=4 )
 	{
-		t0 = da_r * x[0]         - da_i *x[1];	
-		t1 = da_r * x[inc_x]     - da_i *x[inc_x  + 1];	
-		t2 = da_r * x[inc_x2]    - da_i *x[inc_x2 + 1];	
-		t3 = da_r * x[inc_x3]    - da_i *x[inc_x3 + 1];	
+		t0 = da_r * x[0]         - da_i *x[1];
+		t1 = da_r * x[inc_x]     - da_i *x[inc_x  + 1];
+		t2 = da_r * x[inc_x2]    - da_i *x[inc_x2 + 1];
+		t3 = da_r * x[inc_x3]    - da_i *x[inc_x3 + 1];
 
 		x[1]               = da_i * x[0]       + da_r * x[1];
 		x[inc_x  +1]       = da_i * x[inc_x]   + da_r * x[inc_x  +1];
@@ -228,7 +228,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 			{
 				while(j < n1)
 				{
-			
+
 					x[i]=0.0;
 					x[i+1]=0.0;
 					x[i+inc_x]=0.0;
@@ -240,7 +240,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 				while(j < n)
 				{
-			
+
 					x[i]=0.0;
 					x[i+1]=0.0;
 					i += inc_x ;
@@ -253,11 +253,17 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 			{
 				while(j < n1)
 				{
-			
-					temp0        = -da_i * x[i+1];
+
+					if (isnan(x[i]) || isinf(x[i]))
+						temp0	= NAN;
+					else
+						temp0   = -da_i * x[i+1];
 					x[i+1]       =  da_i * x[i];
 					x[i]         =  temp0;
-					temp1        = -da_i * x[i+1+inc_x];
+					if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]))
+						temp1	= NAN;
+					else
+						temp1   = -da_i * x[i+1+inc_x];
 					x[i+1+inc_x] =  da_i * x[i+inc_x];
 					x[i+inc_x]   =  temp1;
 					i += 2*inc_x ;
@@ -267,8 +273,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 				while(j < n)
 				{
-			
-					temp0        = -da_i * x[i+1];
+
+					if (isnan(x[i]) || isinf(x[i]))
+						temp0	= NAN;
+					else
+						temp0   = -da_i * x[i+1];
 					x[i+1]       =  da_i * x[i];
 					x[i]         =  temp0;
 					i += inc_x ;
@@ -291,7 +300,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 				while(j < n1)
 				{
-			
+
 					temp0        =  da_r * x[i];
 					x[i+1]       =  da_r * x[i+1];
 					x[i]         =  temp0;
@@ -305,7 +314,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 				while(j < n)
 				{
-			
+
 					temp0        =  da_r * x[i];
 					x[i+1]       =  da_r * x[i+1];
 					x[i]         =  temp0;
@@ -368,7 +377,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 		}
 		i = n1 << 1;
 		j = n1;
-	
+
 	if ( da_r == 0.0 || da_r != da_r )
 	{
 		if ( da_i == 0.0 )
@@ -385,7 +394,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 			}
 
 		}
-		else if (da_r < -FLT_MAX || da_r > FLT_MAX) { 
+		else if (da_r < -FLT_MAX || da_r > FLT_MAX) {
 			while(j < n)
 			{
 					x[i]= NAN;
@@ -404,7 +413,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 				if (x[i] < -FLT_MAX || x[i] > FLT_MAX)
 					temp0 = NAN;
 				x[i+1]       =  da_i * x[i];
-				if ( x[i] == x[i]) //preserve NaN 
+				if ( x[i] == x[i]) //preserve NaN
 				  x[i]         =  temp0;
 				i += 2 ;
 				j++;
@@ -420,7 +429,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 		{
 				while(j < n)
 				{
-			
+
 					temp0        =  da_r * x[i];
 					x[i+1]       =  da_r * x[i+1];
 					x[i]         =  temp0;
@@ -442,7 +451,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 			}
 
-		}		
+		}
 
 	}
 

From 1a6fdb035308370c08c740da279b769615594980 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 31 Jan 2024 15:57:57 +0100
Subject: [PATCH 617/718] Add prototypes for extensions ?AMIN/?AMAX and
 CAXPYC/ZAXPYC

---
 cblas.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/cblas.h b/cblas.h
index ade2fca3a..3b74e25ee 100644
--- a/cblas.h
+++ b/cblas.h
@@ -101,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
 CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 
+float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
+double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
+float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
+double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
 CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
@@ -116,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 
+void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+
 void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
@@ -290,7 +303,6 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
 void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 
-
 void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                  OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,

From b54cda849096ade35bd2f69341e3d02fa1543512 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 31 Jan 2024 16:00:52 +0100
Subject: [PATCH 618/718] Unify creation of CBLAS interfaces for ?AMIN/?AMAX
 and C/ZAXPYC between gmake and cmake builds

---
 interface/CMakeLists.txt |  2 ++
 interface/Makefile       | 49 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt
index 4e082928b..ed19b556a 100644
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -130,6 +130,8 @@ endif ()
 foreach (float_type ${FLOAT_TYPES})
 
   if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
+    GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})
+
     GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
     GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
     GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})
diff --git a/interface/Makefile b/interface/Makefile
index 78335357b..99859cbf5 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -270,7 +270,8 @@ CSBLAS1OBJS   = \
 	cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
 	cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
 	cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
-	cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
+	cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
+	cblas_samin.$(SUFFIX)
 
 CSBLAS2OBJS   = \
 	cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@@ -295,7 +296,8 @@ CDBLAS1OBJS   = \
 	cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
 	cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
 	cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
-	cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
+	cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
+	cblas_damin.$(SUFFIX)
 
 CDBLAS2OBJS   = \
 	cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@@ -315,7 +317,7 @@ CCBLAS1OBJS   = \
 	cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
 	cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
 	cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
-	cblas_caxpby.$(SUFFIX) \
+	cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
 	cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
 
 CCBLAS2OBJS   = \
@@ -340,12 +342,12 @@ CXERBLAOBJ = \
 
 CZBLAS1OBJS   = \
 	cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX)  cblas_zaxpy.$(SUFFIX) \
-	cblas_zcopy.$(SUFFIX) \
+	cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
 	cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
 	cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
 	cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
 	cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
-	cblas_zaxpby.$(SUFFIX) \
+	cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
 	cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
 
 
@@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
 cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
 	$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
 
+cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
 cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 
@@ -1627,6 +1653,19 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
 cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 
+cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
+cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
+cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
+sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c
+	$(CC) $(CFLAGS) -c $< -o $(@F)
+
+dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c
 cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 

From a7d004e820f1ccbc9f61b4b1353ccdb04f208690 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 31 Jan 2024 17:55:42 +0100
Subject: [PATCH 619/718] Fix CBLAS prototype

---
 interface/max.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/interface/max.c b/interface/max.c
index f05977448..6c7d32bd9 100644
--- a/interface/max.c
+++ b/interface/max.c
@@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
 
 #else
 
+#ifdef COMPLEX
+FLOAT CNAME(blasint n, void *vx, blasint incx){
+  FLOAT *x = (FLOAT*) vx;
+#else
 FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
-
+#endif
+  
   FLOAT ret;
 
   PRINT_DEBUG_CNAME;

From 47bd06476312598eea694f19a00a9191041b1586 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 31 Jan 2024 20:49:43 +0100
Subject: [PATCH 620/718] Fix names in build rules

---
 interface/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index 99859cbf5..ad4a0fb89 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -1541,10 +1541,10 @@ cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
 cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
 
-cblas_camax.$(SUFFIX) cblas_camax.$(PSUFFIX) : max.c
+cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
 
-cblas_zamax.$(SUFFIX) cblas_zamax.$(PSUFFIX) : max.c
+cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
 
 cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
@@ -1553,10 +1553,10 @@ cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
 cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
 
-cblas_camin.$(SUFFIX) cblas_camin.$(PSUFFIX) : max.c
+cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
 
-cblas_zamin.$(SUFFIX) cblas_zamin.$(PSUFFIX) : max.c
+cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
 
 cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c

From 42cb567f0f9f6c8ef27558e5b61251b0805aae6d Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Wed, 31 Jan 2024 13:24:28 -0800
Subject: [PATCH 621/718] more cleanup

---
 driver/others/blas_server_win32.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 68dde584b..ee6d08f8c 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -72,7 +72,10 @@ static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
 static volatile int thread_target;	// target num of live threads, volatile for cross-thread reads
 
-static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
+//
+//
+//
+static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
 
       if (!(mode & BLAS_COMPLEX)) {
 #ifdef EXPRECISION
@@ -195,8 +198,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
       }
 }
 
-// This is a main routine of threads. Each thread waits until job is
-// queued.
+//
+// This is a main routine of threads. Each thread waits until job is queued.
+//
 static DWORD WINAPI blas_thread_server(void *arg) {
 
   /* Thread identifier */
@@ -488,7 +492,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue) {
 
 // Shutdown procedure, but user don't have to call this routine. The
 // kernel automatically kill threads.
-int BLASFUNC(blas_thread_shutdown)(void){
+int BLASFUNC(blas_thread_shutdown)(void) {
 
   int i;
 
@@ -563,7 +567,7 @@ void goto_set_num_threads(int num_threads)
 
 		thread_target = num_threads;
 
-		//increased_threads = 1;
+		  //increased_threads = 1;
 	    if (!blas_server_avail) {
 			// create the kickoff Event
 			kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);

From 61c8e19f95dfefe3e586d6d83f7432b7d51299fa Mon Sep 17 00:00:00 2001
From: Chip Kerchner <chip.kerchner@ibm.com>
Date: Wed, 31 Jan 2024 15:27:50 -0600
Subject: [PATCH 622/718] Fix Makefile to support OpenMP on AIX for xlc (clang)
 with xlf.

---
 ctest/Makefile |  3 +++
 test/Makefile  |  3 +++
 utest/Makefile | 10 ++++++++++
 3 files changed, 16 insertions(+)

diff --git a/ctest/Makefile b/ctest/Makefile
index af5b34a36..ad960b35a 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM)
 ifeq ($(C_COMPILER), GCC)
 CEXTRALIB += -lgomp
 endif
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB += -lomp
+endif
 endif
 endif
 
diff --git a/test/Makefile b/test/Makefile
index 56acf1c5b..5a4694ce6 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -276,6 +276,9 @@ ifeq ($(F_COMPILER), IBM)
 ifeq ($(C_COMPILER), GCC)
 CEXTRALIB += -lgomp
 endif
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB += -lomp
+endif
 endif
 endif
 
diff --git a/utest/Makefile b/utest/Makefile
index 8acaa3ea9..ac58d6f12 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -45,8 +45,18 @@ endif
 
 all : run_test
 
+ifeq ($(OSNAME), AIX)
+ifeq ($(USE_OPENMP), 1)
+$(UTESTBIN): $(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB)
+else
+$(UTESTBIN): $(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
+endif
+else
 $(UTESTBIN): $(OBJS)
 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
+endif
 
 run_test: $(UTESTBIN)
 ifneq ($(CROSS), 1)

From a3b0ef6596d51ecfb59b0a2f6a7b0d59bc4f18b4 Mon Sep 17 00:00:00 2001
From: Sergei Lewis <slewis@rivosinc.com>
Date: Thu, 1 Feb 2024 10:26:02 +0000
Subject: [PATCH 623/718] Restore riscv64 fixes from develop branch: dot
 product double precision accumulation, zscal NaN handling

---
 Makefile.prebuild             |  1 +
 kernel/riscv64/dot.c          | 10 ++++
 kernel/riscv64/zscal_rvv.c    | 90 ++++++-----------------------------
 kernel/riscv64/zscal_vector.c | 79 +-----------------------------
 4 files changed, 26 insertions(+), 154 deletions(-)

diff --git a/Makefile.prebuild b/Makefile.prebuild
index b44b50039..b7d695a75 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -57,6 +57,7 @@ endif
 
 ifeq ($(TARGET), CK860FV)
 TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
+endif
 
 ifeq ($(TARGET), x280)
 TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c
index bf55998ca..8ad493a2b 100644
--- a/kernel/riscv64/dot.c
+++ b/kernel/riscv64/dot.c
@@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
+
+#if defined(DSDOT)
 	double dot = 0.0 ;
+#else
+	FLOAT  dot = 0.0 ;
+#endif
 
 	if ( n < 1 )  return(dot);
 
 	while(i < n)
 	{
 
+#if defined(DSDOT)
+		dot += (double) y[iy] * (double) x[ix] ;
+#else
 		dot += y[iy] * x[ix] ;
+#endif
+
 		ix  += inc_x ;
 		iy  += inc_y ;
 		i++ ;
diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c
index 2586c6036..ae79d9f9d 100644
--- a/kernel/riscv64/zscal_rvv.c
+++ b/kernel/riscv64/zscal_rvv.c
@@ -69,49 +69,26 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
     size_t vlmax = VSETVL_MAX;
     FLOAT_VX2_T vx2;
 
-    if(da_r == 0.0 && da_i == 0.0) {
+    if(inc_x == 1) {
 
-        vr = VFMVVF_FLOAT(0.0, vlmax);
-        vi = VFMVVF_FLOAT(0.0, vlmax);
-
-        if(inc_x == 1) {
-
-            for (size_t vl; n > 0; n -= vl, x += vl*2) {
-                vl = VSETVL(n);
-                vx2 = VSET_VX2(vx2, 0, vr);
-                vx2 = VSET_VX2(vx2, 1, vi);
-                VSSEG_FLOAT(x, vx2, vl);
-            }
-
-        } else {
-
-            for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
-                vl = VSETVL(n);
-                vx2 = VSET_VX2(vx2, 0, vr);
-                vx2 = VSET_VX2(vx2, 1, vi);
-                VSSSEG_FLOAT(x, stride_x, vx2, vl);
-            }
-        }
-
-    } else if(da_r == 0.0) {
-
-        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
             vl = VSETVL(n);
-            
-            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            vx2 = VLSEG_FLOAT(x, vl);
             vr = VGET_VX2(vx2, 0);
             vi = VGET_VX2(vx2, 1);
 
-            vt = VFMULVF_FLOAT(vi, -da_i, vl);
-            vi = VFMULVF_FLOAT(vr, da_i, vl);
+            vt = VFMULVF_FLOAT(vr, da_r, vl);
+            vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
+            vi = VFMULVF_FLOAT(vi, da_r, vl);
+            vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
 
             vx2 = VSET_VX2(vx2, 0, vt);
             vx2 = VSET_VX2(vx2, 1, vi);
-
-            VSSSEG_FLOAT(x, stride_x, vx2, vl);
+            VSSEG_FLOAT(x, vx2, vl);
         }
 
-    } else if(da_i == 0.0) {
+    } else {
 
         for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
             vl = VSETVL(n);
@@ -120,54 +97,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
             vr = VGET_VX2(vx2, 0);
             vi = VGET_VX2(vx2, 1);
 
-            vr = VFMULVF_FLOAT(vr, da_r, vl);
+            vt = VFMULVF_FLOAT(vr, da_r, vl);
+            vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
             vi = VFMULVF_FLOAT(vi, da_r, vl);
+            vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
 
-            vx2 = VSET_VX2(vx2, 0, vr);
+            vx2 = VSET_VX2(vx2, 0, vt);
             vx2 = VSET_VX2(vx2, 1, vi);
             VSSSEG_FLOAT(x, stride_x, vx2, vl);
         }
-
-    } else {
-
-        if(inc_x == 1) {
-
-            for (size_t vl; n > 0; n -= vl, x += vl*2) {
-                vl = VSETVL(n);
-
-                vx2 = VLSEG_FLOAT(x, vl);
-                vr = VGET_VX2(vx2, 0);
-                vi = VGET_VX2(vx2, 1);
-
-                vt = VFMULVF_FLOAT(vr, da_r, vl);
-                vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
-                vi = VFMULVF_FLOAT(vi, da_r, vl);
-                vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
-
-                vx2 = VSET_VX2(vx2, 0, vt);
-                vx2 = VSET_VX2(vx2, 1, vi);
-                VSSEG_FLOAT(x, vx2, vl);
-            }
-
-        } else {
-
-            for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
-                vl = VSETVL(n);
-
-                vx2 = VLSSEG_FLOAT(x, stride_x, vl);
-                vr = VGET_VX2(vx2, 0);
-                vi = VGET_VX2(vx2, 1);
-
-                vt = VFMULVF_FLOAT(vr, da_r, vl);
-                vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
-                vi = VFMULVF_FLOAT(vi, da_r, vl);
-                vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
-
-                vx2 = VSET_VX2(vx2, 0, vt);
-                vx2 = VSET_VX2(vx2, 1, vi);
-                VSSSEG_FLOAT(x, stride_x, vx2, vl);
-            }
-        }
     }
 
     return(0);
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 2034aafaa..536bbdf73 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 
         unsigned int gvl = 0;
         FLOAT_V_T vt, v0, v1;
-        if(da_r == 0.0 && da_i == 0.0){
-                gvl = VSETVL(n);
-                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-                BLASLONG inc_xv = inc_x * 2 * gvl;
-                vt = VFMVVF_FLOAT(0.0, gvl);
-                for(i=0,j=0; i < n/(gvl*2); i++){
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl);
-
-                        j += gvl*2;
-                        ix += inc_xv*2;
-                }
-                for(; j < n; ){
-                        gvl = VSETVL(n-j);
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
-                        j += gvl;
-                        ix += inc_x * 2 * gvl;
-                }
-        }else if(da_r == 0.0){
-                gvl = VSETVL(n);
-                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-                BLASLONG inc_xv = inc_x * 2 * gvl;
-                for(i=0,j=0; i < n/gvl; i++){
-                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-                        vt = VFMULVF_FLOAT(v1, -da_i, gvl);
-                        v1 = VFMULVF_FLOAT(v0, da_i, gvl);
-
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-
-                        j += gvl;
-                        ix += inc_xv;
-                }
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-                        vt = VFMULVF_FLOAT(v1, -da_i, gvl);
-                        v1 = VFMULVF_FLOAT(v0, da_i, gvl);
-
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-                }
-        }else if(da_i == 0.0){
-                gvl = VSETVL(n);
-                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-                BLASLONG inc_xv = inc_x * 2 * gvl;
-                for(i=0,j=0; i < n/gvl; i++){
-                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-                        vt = VFMULVF_FLOAT(v0, da_r, gvl);
-                        v1 = VFMULVF_FLOAT(v1, da_r, gvl);
-
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-
-                        j += gvl;
-                        ix += inc_xv;
-                }
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-                        vt = VFMULVF_FLOAT(v0, da_r, gvl);
-                        v1 = VFMULVF_FLOAT(v1, da_r, gvl);
-
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-                }
-        }else{
+        {
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * 2 * gvl;

From 2bb7ea64a197c04ce36927530394db5400242d20 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <chip.kerchner@ibm.com>
Date: Thu, 1 Feb 2024 08:11:43 -0600
Subject: [PATCH 624/718] Only vectorize 64-bit version for Power8.

---
 kernel/power/KERNEL.POWER8 | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 36222348a..700a68e44 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -1,9 +1,11 @@
 # Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM
 ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1)
 SGEMMKERNEL    =  gemm_kernel_power6.S
+SGEMMINCOPY    =
 SGEMMITCOPY    =
 SGEMMONCOPY    =  gemm_ncopy_4.S
 SGEMMOTCOPY    =  gemm_tcopy_4.S
+SGEMMINCOPYOBJ =
 SGEMMITCOPYOBJ =
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
@@ -48,9 +50,11 @@ CTRMMKERNEL	= ctrmm_kernel_8x4_power8.S
 ZTRMMKERNEL	= ztrmm_kernel_8x2_power8.S
 
 SGEMMKERNEL    =  sgemm_kernel_16x8_power8.S
+SGEMMINCOPY    = sgemm_ncopy_16_power.c
 SGEMMITCOPY    = sgemm_tcopy_16_power8.S
 SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
 SGEMMOTCOPY    = sgemm_tcopy_8_power8.S
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
@@ -86,9 +90,6 @@ ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
 ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-SGEMMINCOPY    = sgemm_ncopy_16_power.c
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-
 STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
 STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c

From ddac75e0efef0deeb8ea189e69909371db13f0ef Mon Sep 17 00:00:00 2001
From: austinpagan <fossum@us.ibm.com>
Date: Thu, 1 Feb 2024 12:24:25 -0600
Subject: [PATCH 625/718] Adding .C versions of CGEMM and ZGEMM

---
 kernel/power/KERNEL.POWER10         |   24 +-
 kernel/power/cgemm_kernel_power10.c | 1154 +++++++++++++++++++++++++++
 kernel/power/zgemm_kernel_power10.c |  761 ++++++++++++++++++
 3 files changed, 1931 insertions(+), 8 deletions(-)
 create mode 100644 kernel/power/cgemm_kernel_power10.c
 create mode 100644 kernel/power/zgemm_kernel_power10.c

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 9047c714c..5f49b9c46 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -17,11 +17,15 @@ SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
 STRMMKERNEL	= sgemm_kernel_power10.c
 DTRMMKERNEL	= dgemm_kernel_power10.c
 ifeq ($(OSNAME), AIX)
-CTRMMKERNEL     = ctrmm_kernel_8x4_power8.S
-ZTRMMKERNEL     = ztrmm_kernel_8x2_power8.S
+#CTRMMKERNEL     = ctrmm_kernel_8x4_power8.S
+#ZTRMMKERNEL     = ztrmm_kernel_8x2_power8.S
+CTRMMKERNEL	= cgemm_kernel_power10.c
+ZTRMMKERNEL	= zgemm_kernel_power10.c
 else
-CTRMMKERNEL	= cgemm_kernel_power10.S
-ZTRMMKERNEL	= zgemm_kernel_power10.S
+#CTRMMKERNEL	= cgemm_kernel_power10.S
+#ZTRMMKERNEL	= zgemm_kernel_power10.S
+CTRMMKERNEL	= cgemm_kernel_power10.c
+ZTRMMKERNEL	= zgemm_kernel_power10.c
 endif
 
 SGEMMKERNEL    =  sgemm_kernel_power10.c
@@ -65,9 +69,11 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
 DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
 
 ifeq ($(OSNAME), AIX)
-CGEMMKERNEL    = cgemm_kernel_8x4_power8.S
+#CGEMMKERNEL    = cgemm_kernel_8x4_power8.S
+CGEMMKERNEL    = cgemm_kernel_power10.c
 else
-CGEMMKERNEL    = cgemm_kernel_power10.S
+#CGEMMKERNEL    = cgemm_kernel_power10.S
+CGEMMKERNEL    = cgemm_kernel_power10.c
 endif
 #CGEMMKERNEL     = cgemm_kernel_8x4_power8.S
 CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
@@ -84,9 +90,11 @@ CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
 CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 
 ifeq ($(OSNAME), AIX)
-ZGEMMKERNEL    = zgemm_kernel_8x2_power8.S
+#ZGEMMKERNEL    = zgemm_kernel_8x2_power8.S
+ZGEMMKERNEL    = zgemm_kernel_power10.c
 else
-ZGEMMKERNEL    = zgemm_kernel_power10.S
+#ZGEMMKERNEL    = zgemm_kernel_power10.S
+ZGEMMKERNEL    = zgemm_kernel_power10.c
 endif
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c
new file mode 100644
index 000000000..279c83aec
--- /dev/null
+++ b/kernel/power/cgemm_kernel_power10.c
@@ -0,0 +1,1154 @@
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char  vec_t;
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
+
+#define SET_ACC_ZERO() \
+          __builtin_mma_xxsetaccz (&acc0); \
+          __builtin_mma_xxsetaccz (&acc1); \
+          __builtin_mma_xxsetaccz (&acc2); \
+          __builtin_mma_xxsetaccz (&acc3); \
+          __builtin_mma_xxsetaccz (&acc4); \
+          __builtin_mma_xxsetaccz (&acc5); \
+          __builtin_mma_xxsetaccz (&acc6); \
+          __builtin_mma_xxsetaccz (&acc7);
+
+#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  =  _arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag +=  _arbi + _aibr; }
+#endif
+
+#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr + _aibi; _imag  = -_arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
+#endif
+
+#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr + _aibi; _imag  =  _arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag +=  _arbi - _aibr; }
+#endif
+
+#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) 
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  = -_arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
+#endif
+
+#if defined (TRMMKERNEL)
+#define A_OP =
+#else
+#define A_OP +=
+#endif
+
+#define BUILTIN_MMA_DISASSEMBLE_ACC_8                                    \
+          __builtin_mma_disassemble_acc ((void *)result,      &acc0);    \
+          __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1);    \
+          __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2);    \
+          __builtin_mma_disassemble_acc ((void *)&result[12], &acc3);    \
+          __builtin_mma_disassemble_acc ((void *)&result[16], &acc4);    \
+          __builtin_mma_disassemble_acc ((void *)&result[20], &acc5);    \
+          __builtin_mma_disassemble_acc ((void *)&result[24], &acc6);    \
+          __builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
+
+#define COMP_MUL_1                                                       \
+          COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])   
+
+#define COMP_MAC_1(_offset) {                                            \
+          FLOAT *_ro = &res[_offset];                                    \
+          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+}
+
+#define COMP_MUL_2A                                                      \
+          COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])     \
+          COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6])
+
+#define COMP_MAC_2A(_offset) {                                           \
+          FLOAT *_ro = &res[_offset];                                    \
+          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+          COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6])     \
+}
+
+#define COMP_MUL_2B                                                      \
+          COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])     \
+          COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12])
+
+#define COMP_MAC_2B(_offset) {                                           \
+          FLOAT *_ro = &res[_offset];                                    \
+          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+          COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+}
+
+#define COMP_MUL_4A(_offset) {                                           \
+          FLOAT *_ro = &res[_offset];                                    \
+          COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+          COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+          COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20])     \
+          COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28])     \
+}
+
+#define COMP_MAC_4A(_offset) {                                           \
+          FLOAT *_ro = &res[_offset];                                    \
+          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+          COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+          COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20])     \
+          COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28])     \
+}
+
+#define COMP_MUL_4B(_offset) {                                           \
+          FLOAT *_ro = &res[_offset];                                    \
+          COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+          COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+          COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6])     \
+          COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14])     \
+}
+
+#define COMP_MAC_4B(_offset) {                                           \
+          FLOAT *_ro = &res[_offset];                                    \
+          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+          COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+          COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6])     \
+          COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14])     \
+}
+
+
+#define SAVE_ACC_COMPLEX_11                                              \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+          COMP_MUL_1                                                     \
+          COMP_MAC_1(16)                                                 \
+          COMP_MAC_1(32)                                                 \
+          COMP_MAC_1(48)                                                 \
+          COMP_MAC_1(64)                                                 \
+          COMP_MAC_1(80)                                                 \
+          COMP_MAC_1(96)                                                 \
+          COMP_MAC_1(112)                                                \
+	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_12                                              \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+          COMP_MUL_2A                                                    \
+          COMP_MAC_2A(16)                                                \
+          COMP_MAC_2A(32)                                                \
+          COMP_MAC_2A(48)                                                \
+          COMP_MAC_2A(64)                                                \
+          COMP_MAC_2A(80)                                                \
+          COMP_MAC_2A(96)                                                \
+          COMP_MAC_2A(112)                                               \
+	  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+	  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+	  CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+	  CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_1                                            \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+          COMP_MUL_2B                                                    \
+          COMP_MAC_2B(16)                                                \
+          COMP_MAC_2B(32)                                                \
+          COMP_MAC_2B(48)                                                \
+          COMP_MAC_2B(64)                                                \
+          COMP_MAC_2B(80)                                                \
+          COMP_MAC_2B(96)                                                \
+          COMP_MAC_2B(112)                                               \
+	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
+	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
+	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_2                                            \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+          COMP_MUL_4A(0)                                                 \
+          COMP_MAC_4A(32)                                                \
+          COMP_MAC_4A(64)                                                \
+          COMP_MAC_4A(96)                                                \
+	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
+	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
+	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                  \
+	  CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                  \
+	  CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                  \
+	  CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                  \
+	  CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_4                                            \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+          COMP_MUL_4A(0)                                                 \
+          COMP_MAC_4A(64)                                                \
+	  CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
+	  CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
+	  CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
+	  CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
+	  CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
+	  CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
+	  CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
+	  CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;                 \
+          COMP_MUL_4A(32)                                                \
+          COMP_MAC_4A(96)                                                \
+	  CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
+	  CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
+	  CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
+	  CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
+	  CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
+	  CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
+	  CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
+	  CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_22_4                                        \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                              \
+          COMP_MUL_4B(0)                                             \
+	  CO[ 0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+	  CO[ 1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+	  CO[ 2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+	  CO[ 3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+	  CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+	  CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+	  CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+	  CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+          COMP_MUL_4B(16)                                            \
+	  CO[ 4]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+	  CO[ 5]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+	  CO[ 6]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+	  CO[ 7]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+	  CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+	  CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+	  CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+	  CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+          COMP_MUL_4B(32)                                            \
+	  CO[ 8]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+	  CO[ 9]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+	  CO[10]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+	  CO[11]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+	  CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+	  CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+	  CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+	  CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+          COMP_MUL_4B(48)                                            \
+	  CO[12]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+	  CO[13]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+	  CO[14]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+	  CO[15]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+	  CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+	  CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+	  CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+	  CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+
+#define SAVE_ACC_COMPLEX_22_2                                          \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
+          COMP_MUL_4B(0)                                               \
+	  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+	  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+	  CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+	  CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+	  CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+	  CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+	  CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+	  CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;          \
+          COMP_MUL_4B(16)                                              \
+	  CO[4]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+	  CO[5]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+	  CO[6]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+	  CO[7]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+	  CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+	  CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+	  CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+	  CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+
+#define SAVE_ACC_COMPLEX_22_1                                          \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
+          COMP_MUL_4B(0)                                               \
+	  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+	  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+	  CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+	  CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+	  CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+	  CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+	  CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+	  CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+
+#define SAVE_ACC_COMPLEX_24_ALL \
+      __builtin_mma_disassemble_acc ((void *)result, &acc0);            \
+      __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4);      \
+      __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1);      \
+      __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc5);     \
+      __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc2);     \
+      __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc6);     \
+      __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc3);     \
+      __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);     \
+      COMP_MUL(tr[ 0], res[  0], res[  5], ti[ 0], res[  1], res[  4])  \
+      COMP_MUL(tr[ 1], res[  8], res[ 13], ti[ 1], res[  9], res[ 12])  \
+      COMP_MUL(tr[ 2], res[  2], res[  7], ti[ 2], res[  3], res[  6])  \
+      COMP_MUL(tr[ 3], res[ 10], res[ 15], ti[ 3], res[ 11], res[ 14])  \
+      COMP_MUL(tr[ 4], res[ 16], res[ 21], ti[ 4], res[ 17], res[ 20])  \
+      COMP_MUL(tr[ 5], res[ 24], res[ 29], ti[ 5], res[ 25], res[ 28])  \
+      COMP_MUL(tr[ 6], res[ 18], res[ 23], ti[ 6], res[ 19], res[ 22])  \
+      COMP_MUL(tr[ 7], res[ 26], res[ 31], ti[ 7], res[ 27], res[ 30])  \
+      COMP_MUL(tr[ 8], res[ 32], res[ 37], ti[ 8], res[ 33], res[ 36])  \
+      COMP_MUL(tr[ 9], res[ 40], res[ 45], ti[ 9], res[ 41], res[ 44])  \
+      COMP_MUL(tr[10], res[ 34], res[ 39], ti[10], res[ 35], res[ 38])  \
+      COMP_MUL(tr[11], res[ 42], res[ 47], ti[11], res[ 43], res[ 46])  \
+      COMP_MUL(tr[12], res[ 48], res[ 53], ti[12], res[ 49], res[ 52])  \
+      COMP_MUL(tr[13], res[ 56], res[ 61], ti[13], res[ 57], res[ 60])  \
+      COMP_MUL(tr[14], res[ 50], res[ 55], ti[14], res[ 51], res[ 54])  \
+      COMP_MUL(tr[15], res[ 58], res[ 63], ti[15], res[ 59], res[ 62])  \
+      COMP_MUL(tr[16], res[ 64], res[ 69], ti[16], res[ 65], res[ 68])  \
+      COMP_MUL(tr[17], res[ 72], res[ 77], ti[17], res[ 73], res[ 76])  \
+      COMP_MUL(tr[18], res[ 66], res[ 71], ti[18], res[ 67], res[ 70])  \
+      COMP_MUL(tr[19], res[ 74], res[ 79], ti[19], res[ 75], res[ 78])  \
+      COMP_MUL(tr[20], res[ 80], res[ 85], ti[20], res[ 81], res[ 84])  \
+      COMP_MUL(tr[21], res[ 88], res[ 93], ti[21], res[ 89], res[ 92])  \
+      COMP_MUL(tr[22], res[ 82], res[ 87], ti[22], res[ 83], res[ 86])  \
+      COMP_MUL(tr[23], res[ 90], res[ 95], ti[23], res[ 91], res[ 94])  \
+      COMP_MUL(tr[24], res[ 96], res[101], ti[24], res[ 97], res[100])  \
+      COMP_MUL(tr[25], res[104], res[109], ti[25], res[105], res[108])  \
+      COMP_MUL(tr[26], res[ 98], res[103], ti[26], res[ 99], res[102])  \
+      COMP_MUL(tr[27], res[106], res[111], ti[27], res[107], res[110])  \
+      COMP_MUL(tr[28], res[112], res[117], ti[28], res[113], res[116])  \
+      COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124])  \
+      COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118])  \
+      COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126])  \
+	  CO[       0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i;            \
+	  CO[       1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i;            \
+	  CO[       2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i;            \
+	  CO[       3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i;            \
+	  CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i;            \
+	  CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i;            \
+	  CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i;            \
+	  CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i;            \
+	  CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i;            \
+	  CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i;            \
+	  CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i;            \
+	  CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i;            \
+	  CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i;            \
+	  CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i;            \
+	  CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i;            \
+	  CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i;            \
+	  CO[       4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;            \
+	  CO[       5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;            \
+	  CO[       6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;            \
+	  CO[       7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;            \
+	  CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;            \
+	  CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;            \
+	  CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;            \
+	  CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;            \
+	  CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i;            \
+	  CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i;            \
+	  CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i;            \
+	  CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i;            \
+	  CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i;            \
+	  CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i;            \
+	  CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i;            \
+	  CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i;            \
+	  CO[       8] A_OP tr[16] * alpha_r - ti[16] * alpha_i;            \
+	  CO[       9] A_OP ti[16] * alpha_r + tr[16] * alpha_i;            \
+	  CO[      10] A_OP tr[17] * alpha_r - ti[17] * alpha_i;            \
+	  CO[      11] A_OP ti[17] * alpha_r + tr[17] * alpha_i;            \
+	  CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i;            \
+	  CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i;            \
+	  CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i;            \
+	  CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i;            \
+	  CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i;            \
+	  CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i;            \
+	  CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i;            \
+	  CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i;            \
+	  CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i;            \
+	  CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i;            \
+	  CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i;            \
+	  CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i;            \
+	  CO[      12] A_OP tr[24] * alpha_r - ti[24] * alpha_i;            \
+	  CO[      13] A_OP ti[24] * alpha_r + tr[24] * alpha_i;            \
+	  CO[      14] A_OP tr[25] * alpha_r - ti[25] * alpha_i;            \
+	  CO[      15] A_OP ti[25] * alpha_r + tr[25] * alpha_i;            \
+	  CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i;            \
+	  CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i;            \
+	  CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i;            \
+	  CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i;            \
+	  CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i;            \
+	  CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i;            \
+	  CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i;            \
+	  CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i;            \
+	  CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i;            \
+	  CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i;            \
+	  CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i;            \
+	  CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; 
+
+#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI)                            \
+          __builtin_mma_disassemble_acc ((void *)result, ACC1);        \
+          __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2);  \
+          COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4])       \
+          COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12])     \
+          COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6])       \
+          COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14])   \
+          COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20])   \
+          COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28])   \
+          COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22])   \
+          COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30])   \
+	  CO[CI+0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+	  CO[CI+1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+	  CO[CI+2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+	  CO[CI+3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+	  CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+	  CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+	  CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+	  CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+	  CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i;       \
+	  CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i;       \
+	  CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i;       \
+	  CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i;       \
+	  CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i;       \
+	  CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i;       \
+	  CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i;       \
+	  CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i;         
+
+#define SAVE_ACC_COMPLEX_14                                              \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+          COMP_MUL(tr[0], res[  0], res[  5], ti[0], res[  1], res[  4]) \
+          COMP_MUL(tr[1], res[  2], res[  7], ti[1], res[  3], res[  6]) \
+          COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \
+          COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \
+          COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \
+          COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \
+          COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \
+          COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \
+          COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \
+          COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \
+          COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \
+          COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \
+          COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \
+          COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \
+          COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \
+          COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \
+	  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+	  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+	  CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+	  CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
+	  CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;            \
+	  CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;            \
+	  CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i;            \
+	  CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+#define REFRESH_TEMP_BK(x, y) \
+            temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) \
+            temp = off + x;
+#else
+#define REFRESH_TEMP_BK(x, y) \
+            temp = off + y;
+#endif
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_POINTERS(x, y) \
+	  BO = B; \
+          REFRESH_TEMP_BK(x, y)
+#else
+#define REFRESH_POINTERS(x, y) \
+          AO += off * (2*x); \
+          BO = B + off * (2*y); \
+          REFRESH_TEMP_BK(x, y)
+#endif
+
+#ifdef LEFT
+#define REFRESH_OFF(x) \
+            off += x;
+#else
+#define REFRESH_OFF(x)
+#endif
+
+#ifdef LEFT
+#define UPDATE_TEMP(x, y) \
+            temp -= x;
+#else
+#define UPDATE_TEMP(x, y) \
+            temp -= y;
+#endif
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_TMP_AFTER_SAVE(x, y) \
+            temp = k - off; \
+            UPDATE_TEMP(x, y) \
+            AO += temp * (2*x); \
+            BO += temp * (2*y);
+#else
+#define REFRESH_TMP_AFTER_SAVE(x, y)
+#endif
+
+#define REFRESH_AFTER_SAVE(x,y) \
+        REFRESH_TMP_AFTER_SAVE(x, y) \
+	REFRESH_OFF(x)
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+int
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
+       FLOAT * C, BLASLONG ldc
+#ifdef TRMMKERNEL
+       , BLASLONG offset
+#endif
+  )
+{
+  BLASLONG i1, i, l, temp;
+  FLOAT *AO, *BO, *CO;
+#if defined(TRMMKERNEL)
+  BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+  off = -offset;
+#endif
+
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+
+  v4sf_t result[32];
+  FLOAT *res, tr[64], ti[64];
+  res = (FLOAT *) result;
+
+  for (i1 = 0; i1 < (n >> 2); i1++)
+    {
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc << 3;
+
+      for (i = 0; i < (m >> 3); i++)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (8, 4);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<4];
+              vec_t rowB1 = *(vec_t *) & BO[l<<3];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+              __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2);
+              __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2);
+              __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2);
+              __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2);
+	    }
+      SAVE_ACC_COMPLEX_24_ALL
+	  CO += 16;
+	  AO += temp << 4;
+	  BO += temp << 3;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (8, 4)
+#endif
+	}
+      if (m & 4)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (4, 4);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~1)); l+=2)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<3];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+              vec_t rowB1 = *(vec_t *) & BO[l<<3];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2);
+              __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2);
+              __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3);
+              __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+            }
+          for (l = (temp & (~1)); l < temp; ++l)
+            {
+              vec_t rowA1 = *(vec_t *) & AO[l<<3];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+              vec_t rowB1 = *(vec_t *) & BO[l<<3];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2);
+              __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2);
+            }
+          SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0)
+          SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4)
+	  CO += 8;
+	  AO += temp << 3;
+	  BO += temp << 3;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (4, 4)
+#endif
+	}
+      if (m & 2)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (2, 4);
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~3)); l+=4)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<2];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+              vec_t rowB1 = *(vec_t *) & BO[l<<3];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+              __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4);
+              __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5);
+              __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6);
+              __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7);
+              __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8);
+	    }
+	  for (l = (temp & (~3)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<2];
+              vec_t rowB1 = *(vec_t *) & BO[l<<3];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+	    }
+          SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0)
+	  CO += 4;
+	  AO += temp << 2;
+	  BO += temp << 3;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (2, 4)
+#endif
+	}
+      if (m & 1)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (1, 4)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~3)); l+=4)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<1];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+              vec_t rowB1 = *(vec_t *) & BO[l<<3];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+              __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3);
+              __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4);
+              __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5);
+              __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6);
+              __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7);
+              __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8);
+	    }
+	  for (l = (temp & (~3)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<1];
+              vec_t rowB1 = *(vec_t *) & BO[l<<3];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+	    }
+          SAVE_ACC_COMPLEX_14
+	  CO += 2;
+	  AO += temp << 1;
+	  BO += temp << 3;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (1, 4)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 4;			// number of values in A
+#endif
+
+      B += k << 3;
+    }
+
+  if (n & 2)
+    {
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc << 2;
+
+      for (i = 0; i < (m >> 3); i++)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (8, 2)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~1)); l+=2)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<4];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+              vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16];
+              vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20];
+              vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24];
+              vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28];
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+              __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2);
+              __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2);
+              __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2);
+              __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2);
+	    }
+	  for (l = (temp & (~1)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<4];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_22_4
+	  AO += temp << 4;
+	  BO += temp << 2;
+	  CO += 16;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (8, 2)
+#endif
+	}
+      if (m & 4)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (4, 2)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~3)); l+=4)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<3];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+              vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16];
+              vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20];
+              vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24];
+              vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28];
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2);
+              __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2);
+              __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3);
+              __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3);
+              __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4);
+              __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4);
+	    }
+	  for (l = (temp & (~3)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<3];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_22_2
+	  AO += temp << 3;
+	  BO += temp << 2;
+	  CO += 8;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (4, 2)
+#endif
+	} if (m & 2)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (2, 2)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~7)); l+=8)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<2];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+              vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16];
+              vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20];
+              vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24];
+              vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28];
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2);
+              __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3);
+              __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4);
+              __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5);
+              __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6);
+              __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7);
+              __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8);
+	    }
+	  for (l = (temp & (~7)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<2];
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_22_1
+	  AO += temp << 2;
+	  BO += temp << 2;
+	  CO += 4;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (2, 2)
+#endif
+	}
+      if (m & 1)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (1, 2)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  // RIP OUT MMA STUFF!
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~7)); l+=8)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<1];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+              vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8];
+              vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10];
+              vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12];
+              vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14];
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+	    }
+	  for (l = (temp & (~7)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<1];
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_12
+	  AO += temp<<1;
+	  BO += temp<<2;
+	  CO += 2;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (1, 2)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 2;			// number of values in A
+#endif
+      B += k << 2;
+    }
+
+  if (n & 1)
+    {
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc << 1;
+
+      for (i = 0; i < (m >> 3); i++)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (8, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~1)); l+=2)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<4];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+              vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16];
+              vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20];
+              vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24];
+              vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28];
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2);
+              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2);
+              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2);
+              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2);
+	    }
+	  for (l = (temp & (~1)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<4];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_21_4
+	  AO += temp << 4;
+	  BO += temp << 1;
+	  CO += 16;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (8, 1)
+#endif
+	}
+      if (m & 4)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (4, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~3)); l+=4)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<3];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+              vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16];
+              vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20];
+              vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24];
+              vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28];
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2);
+              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3);
+              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3);
+              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4);
+              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4);
+	    }
+	  for (l = (temp & (~3)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<3];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_21_2
+	  AO += temp << 3;
+	  BO += temp << 1;
+	  CO += 8;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (4, 1)
+#endif
+	}
+      if (m & 2)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (2, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  // RIP OUT MMA STUFF!
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~7)); l+=8)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<2];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+              vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16];
+              vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20];
+              vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24];
+              vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28];
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+	    }
+	  for (l = (temp & (~7)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<2];
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_21_1
+	  AO += temp << 2;
+	  BO += temp << 1;
+	  CO += 4;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (2, 1)
+#endif
+	}
+      if (m & 1)
+	{
+#if defined(TRMMKERNEL)
+	  REFRESH_POINTERS (1, 1)
+#else
+	  BO = B;
+	  temp = k;
+#endif
+	  // RIP OUT MMA STUFF!
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~7)); l+=8)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<1];
+              vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+              vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+              vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+              vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8];
+              vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10];
+              vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12];
+              vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14];
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+	    }
+	  for (l = (temp & (~7)); l < temp; ++l)
+	    {
+              vec_t rowA1 = *(vec_t *) & AO[l<<1];
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_11
+	  AO += temp<<1;
+	  BO += temp<<1;
+	  CO += 2;
+#if defined(TRMMKERNEL)
+	  REFRESH_AFTER_SAVE (1, 1)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 1;			// number of values in A
+#endif
+      B += k << 1;
+    }
+  return 0;
+}
diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c
new file mode 100644
index 000000000..e4e609067
--- /dev/null
+++ b/kernel/power/zgemm_kernel_power10.c
@@ -0,0 +1,761 @@
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char  vec_t;
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+
+#define SET_ACC_ZERO() \
+          __builtin_mma_xxsetaccz (&acc0); \
+          __builtin_mma_xxsetaccz (&acc1); \
+          __builtin_mma_xxsetaccz (&acc2); \
+          __builtin_mma_xxsetaccz (&acc3); \
+          __builtin_mma_xxsetaccz (&acc4); \
+          __builtin_mma_xxsetaccz (&acc5); \
+          __builtin_mma_xxsetaccz (&acc6); \
+          __builtin_mma_xxsetaccz (&acc7);
+
+#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  =  _arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag +=  _arbi + _aibr; }
+#endif
+
+#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr + _aibi; _imag  = -_arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
+#endif
+
+#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr + _aibi; _imag  =  _arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag +=  _arbi - _aibr; }
+#endif
+
+#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) 
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  = -_arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
+#endif
+
+#if defined(TRMMKERNEL) 
+#define A_OP =
+#else
+#define A_OP +=
+#endif
+
+#define BUILTIN_MMA_DISASSEMBLE_ACC_8                                 \
+          __builtin_mma_disassemble_acc ((void *)result, &acc0);      \
+          __builtin_mma_disassemble_acc ((void *)&result[4], &acc1);  \
+          __builtin_mma_disassemble_acc ((void *)&result[8], &acc2);  \
+          __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \
+          __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \
+          __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \
+          __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \
+          __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); 
+
+#define SAVE_ACC_COMPLEX_11                                           \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+          COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])  \
+          COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+          COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])  \
+          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+          COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])  \
+          COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+          COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])  \
+	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_12                                           \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+          COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10])  \
+          COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+          COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26])  \
+          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+          COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42])  \
+          COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+          COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58])  \
+	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+	  CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;         \
+	  CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_1                                         \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+          COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+          COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])  \
+          COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14])  \
+          COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+          COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])  \
+          COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])  \
+          COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30])  \
+          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+          COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+          COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])  \
+          COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46])  \
+          COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+          COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])  \
+          COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])  \
+          COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62])  \
+	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
+	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_2                                         \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+          COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+          COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])  \
+          COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+          COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+          COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])  \
+          COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26])  \
+          COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30])  \
+          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+          COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+          COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])  \
+          COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])  \
+          COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+          COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])  \
+          COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58])  \
+          COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62])  \
+	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
+	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;               \
+	  CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;               \
+	  CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;               \
+	  CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;               \
+	  CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_4                                         \
+          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+          COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+          COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])  \
+          COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+          COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18])  \
+          COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22])  \
+          COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26])  \
+          COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30])  \
+          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+          COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+          COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])  \
+          COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])  \
+          COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50])  \
+          COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54])  \
+          COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58])  \
+          COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62])  \
+	  CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;              \
+	  CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;              \
+	  CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;              \
+	  CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;              \
+	  CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;              \
+	  CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;              \
+	  CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;              \
+	  CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;              \
+	  CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;              \
+	  CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;              \
+	  CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;              \
+	  CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;              \
+	  CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;              \
+	  CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;              \
+	  CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;              \
+	  CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_22_1                                          \
+          __builtin_mma_disassemble_acc ((void *)result, &acc0);       \
+          __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \
+          COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])       \
+          COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])       \
+          COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])     \
+          COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] )  \
+	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
+	  CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;      \
+	  CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;      \
+	  CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;      \
+	  CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI)                         \
+          __builtin_mma_disassemble_acc ((void *)result, ACC1);       \
+          __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \
+          COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])      \
+          COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])      \
+          COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])    \
+          COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+	  CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+	  CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+	  CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+	  CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
+	  CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;      \
+	  CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;      \
+	  CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;      \
+	  CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+#define REFRESH_TEMP_BK(x, y) \
+            temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) \
+            temp = off + x;
+#else
+#define REFRESH_TEMP_BK(x, y) \
+            temp = off + y;
+#endif
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_POINTERS(x, y)  \
+          BO = B;               \
+          REFRESH_TEMP_BK(x, y)
+#else
+#define REFRESH_POINTERS(x, y)  \
+          AO += off * (2*x);    \
+          BO = B + off * (2*y); \
+          REFRESH_TEMP_BK(x, y)
+#endif
+
+#ifdef LEFT
+#define REFRESH_OFF(x) \
+            off += x;
+#else
+#define REFRESH_OFF(x)
+#endif
+
+#ifdef LEFT
+#define UPDATE_TEMP(x, y) \
+            temp -= x;
+#else
+#define UPDATE_TEMP(x, y) \
+            temp -= y;
+#endif
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_TMP_AFTER_SAVE(x, y) \
+            temp = k - off; \
+            UPDATE_TEMP(x, y) \
+            AO += temp * (2*x); \
+            BO += temp * (2*y);
+#else
+#define REFRESH_TMP_AFTER_SAVE(x, y)
+#endif
+
+#define REFRESH_AFTER_SAVE(x,y) \
+        REFRESH_TMP_AFTER_SAVE(x, y) \
+        REFRESH_OFF(x)
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+int
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
+       FLOAT * C, BLASLONG ldc
+#ifdef TRMMKERNEL
+       , BLASLONG offset
+#endif
+  )
+{
+  BLASLONG i1, i, l, temp;
+  FLOAT *AO, *BO, *CO;
+#if defined(TRMMKERNEL)
+  BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+  off = -offset;
+#endif
+  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+
+  v4sf_t result[32];
+  FLOAT *res, tr[16], ti[16];
+  res = (FLOAT *) result;
+
+  for (i1 = 0; i1 < (n >> 1); i1++)
+    {
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc<<2;
+      for (i = 0; i < (m >> 3); i++)
+	{
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (8, 2)
+#else
+          BO = B;
+          temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < temp; ++l)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+              __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
+              __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
+              __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
+              __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
+	    }
+        __builtin_mma_disassemble_acc ((void *)result,        &acc0);
+        __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
+        __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
+        __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
+        __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
+        __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
+        __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
+        __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
+        COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
+        COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
+        COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
+        COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
+        COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
+        COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
+        COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
+        COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
+        COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
+        COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
+        COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
+        COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
+        COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
+        COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
+        COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
+        COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
+        CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
+        CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+        CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
+        CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+        CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
+        CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
+        CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
+        CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+        CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
+        CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
+        CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
+        CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
+        CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
+        CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
+        CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
+        CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
+        CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
+        CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
+        CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
+        CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
+        CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
+        CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
+        CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
+        CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
+        CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
+        CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
+        CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
+        CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
+        CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
+        CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
+        CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
+        CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
+
+	  AO += temp << 4;
+	  BO += temp << 2;
+	  CO += 16;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (8, 2)
+#endif
+	}
+      if (m & 4)
+	{
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (4, 2)
+#else
+          BO = B;
+          temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~1)); l+=2)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
+              __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
+              __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
+              __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
+              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
+              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+	    }
+	  for (l = (temp & (~1)); l < temp; ++l)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
+              __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
+	    }
+          SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
+          SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
+	  AO += temp << 3;
+	  BO += temp << 2;
+	  CO += 8;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (4, 2)
+#endif
+	}
+      if (m & 2)
+	{
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (2, 2)
+#else
+          BO = B;
+          temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~3)); l+=4)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+              __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
+              __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
+              __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
+              __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
+              __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
+	    }
+	  for (l = (temp & (~3)); l < temp; ++l)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+	    }
+          SAVE_ACC_COMPLEX_22_1
+	  AO += temp << 2;
+	  BO += temp << 2;
+	  CO += 4;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (2, 2)
+#endif
+	}
+      if (m & 1)
+	{
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (1, 2)
+#else
+          BO = B;
+          temp = k;
+#endif
+	  // RIP OUT MMA STUFF!
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~3)); l+=4)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+              __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
+              __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
+              __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
+              __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
+              __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
+	    }
+	  for (l = (temp & (~3)); l < temp; ++l)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<2];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+	    }
+          SAVE_ACC_COMPLEX_12                
+	  AO += temp << 1;
+	  BO += temp << 2;
+	  CO += 2;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (1, 2)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 2;                 // number of values in A
+#endif
+      B += k << 2;
+    }
+  if (n & 1)
+    {
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc<<1;
+      for (i = 0; i < (m >> 3); i++)
+	{
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (8, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~1)); l+=2)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+              __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
+              __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
+              __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
+              __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+              __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
+              __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
+              __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
+              __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
+	    }
+	  for (l = (temp & (~1)); l < temp; ++l)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_21_4
+
+	  AO += temp << 4;
+	  BO += temp << 1;
+	  CO += 16;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (8, 1)
+#endif
+	}
+      if (m & 4)
+	{
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (4, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~3)); l+=4)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
+              __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
+              __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
+              __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
+              __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
+              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
+              __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
+              __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
+              __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
+              __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
+	    }
+	  for (l = (temp & (~3)); l < temp; ++l)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_21_2
+	  AO += temp << 3;
+	  BO += temp << 1;
+	  CO += 8;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (4, 1)
+#endif
+	} if (m & 2)
+	{
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (2, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~7)); l+=8)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
+              __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
+              __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
+              __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
+              __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
+              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
+              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+              __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
+              __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
+              __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
+              __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
+	    }
+	  for (l = (temp & (~7)); l < temp; ++l)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_21_1
+	  AO += temp << 2;
+	  BO += temp << 1;
+	  CO += 4;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (2, 1)
+#endif
+	}
+      if (m & 1)
+	{
+#if defined(TRMMKERNEL)
+          REFRESH_POINTERS (1, 1)
+#else
+          BO = B;
+          temp = k;
+#endif
+	  // RIP OUT MMA STUFF!
+          SET_ACC_ZERO()
+	  for (l = 0; l < (temp & (~7)); l+=8)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
+              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
+              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
+              __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
+              __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
+              __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
+              __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+              vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+              vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+              vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+              vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
+              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
+              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+              __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
+              __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
+              __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
+              __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
+	    }
+	  for (l = (temp & (~7)); l < temp; ++l)
+	    {
+              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+              vec_t rowB1 = *(vec_t *) & BO[l<<1];
+              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+	    }
+          SAVE_ACC_COMPLEX_11       
+	  AO += temp << 1;
+	  BO += temp << 1;
+	  CO += 2;
+#if defined(TRMMKERNEL)
+          REFRESH_AFTER_SAVE (1, 1)
+#endif
+	}
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 1;                 // number of values in A
+#endif
+      B += k << 1;
+    }
+  return 0;
+}

From 87ba528d8b12d8debf77b5b952b7c31d2ba1bd08 Mon Sep 17 00:00:00 2001
From: austinpagan <fossum@us.ibm.com>
Date: Thu, 1 Feb 2024 18:46:07 -0600
Subject: [PATCH 626/718] Changed C files to straighten out indentation. 
 Removed commented lines from other file.

---
 kernel/power/KERNEL.POWER10         |   21 -
 kernel/power/cgemm_kernel_power10.c | 1725 +++++++++++++--------------
 kernel/power/zgemm_kernel_power10.c | 1161 +++++++++---------
 3 files changed, 1411 insertions(+), 1496 deletions(-)

diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 5f49b9c46..4e408e121 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -16,17 +16,8 @@ SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 STRMMKERNEL	= sgemm_kernel_power10.c
 DTRMMKERNEL	= dgemm_kernel_power10.c
-ifeq ($(OSNAME), AIX)
-#CTRMMKERNEL     = ctrmm_kernel_8x4_power8.S
-#ZTRMMKERNEL     = ztrmm_kernel_8x2_power8.S
-CTRMMKERNEL	= cgemm_kernel_power10.c
-ZTRMMKERNEL	= zgemm_kernel_power10.c
-else
-#CTRMMKERNEL	= cgemm_kernel_power10.S
-#ZTRMMKERNEL	= zgemm_kernel_power10.S
 CTRMMKERNEL	= cgemm_kernel_power10.c
 ZTRMMKERNEL	= zgemm_kernel_power10.c
-endif
 
 SGEMMKERNEL    =  sgemm_kernel_power10.c
 SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
@@ -68,13 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
 DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
 DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
 
-ifeq ($(OSNAME), AIX)
-#CGEMMKERNEL    = cgemm_kernel_8x4_power8.S
 CGEMMKERNEL    = cgemm_kernel_power10.c
-else
-#CGEMMKERNEL    = cgemm_kernel_power10.S
-CGEMMKERNEL    = cgemm_kernel_power10.c
-endif
 #CGEMMKERNEL     = cgemm_kernel_8x4_power8.S
 CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
 ifeq ($(OSNAME), AIX)
@@ -89,13 +74,7 @@ CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
 CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 
-ifeq ($(OSNAME), AIX)
-#ZGEMMKERNEL    = zgemm_kernel_8x2_power8.S
 ZGEMMKERNEL    = zgemm_kernel_power10.c
-else
-#ZGEMMKERNEL    = zgemm_kernel_power10.S
-ZGEMMKERNEL    = zgemm_kernel_power10.c
-endif
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
 ZGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c
index 279c83aec..233768cef 100644
--- a/kernel/power/cgemm_kernel_power10.c
+++ b/kernel/power/cgemm_kernel_power10.c
@@ -31,15 +31,15 @@ typedef __vector unsigned char  vec_t;
 typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
 typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
 
-#define SET_ACC_ZERO() \
-          __builtin_mma_xxsetaccz (&acc0); \
-          __builtin_mma_xxsetaccz (&acc1); \
-          __builtin_mma_xxsetaccz (&acc2); \
-          __builtin_mma_xxsetaccz (&acc3); \
-          __builtin_mma_xxsetaccz (&acc4); \
-          __builtin_mma_xxsetaccz (&acc5); \
-          __builtin_mma_xxsetaccz (&acc6); \
-          __builtin_mma_xxsetaccz (&acc7);
+#define SET_ACC_ZERO()                 \
+      __builtin_mma_xxsetaccz (&acc0); \
+      __builtin_mma_xxsetaccz (&acc1); \
+      __builtin_mma_xxsetaccz (&acc2); \
+      __builtin_mma_xxsetaccz (&acc3); \
+      __builtin_mma_xxsetaccz (&acc4); \
+      __builtin_mma_xxsetaccz (&acc5); \
+      __builtin_mma_xxsetaccz (&acc6); \
+      __builtin_mma_xxsetaccz (&acc7);
 
 #if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
 #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  =  _arbi + _aibr; }
@@ -67,231 +67,231 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
 #define A_OP +=
 #endif
 
-#define BUILTIN_MMA_DISASSEMBLE_ACC_8                                    \
-          __builtin_mma_disassemble_acc ((void *)result,      &acc0);    \
-          __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1);    \
-          __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2);    \
-          __builtin_mma_disassemble_acc ((void *)&result[12], &acc3);    \
-          __builtin_mma_disassemble_acc ((void *)&result[16], &acc4);    \
-          __builtin_mma_disassemble_acc ((void *)&result[20], &acc5);    \
-          __builtin_mma_disassemble_acc ((void *)&result[24], &acc6);    \
-          __builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
+#define BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
+      __builtin_mma_disassemble_acc ((void *)result,      &acc0);    \
+      __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1);    \
+      __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2);    \
+      __builtin_mma_disassemble_acc ((void *)&result[12], &acc3);    \
+      __builtin_mma_disassemble_acc ((void *)&result[16], &acc4);    \
+      __builtin_mma_disassemble_acc ((void *)&result[20], &acc5);    \
+      __builtin_mma_disassemble_acc ((void *)&result[24], &acc6);    \
+      __builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
 
-#define COMP_MUL_1                                                       \
-          COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])   
+#define COMP_MUL_1                                                   \
+      COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])   
 
-#define COMP_MAC_1(_offset) {                                            \
-          FLOAT *_ro = &res[_offset];                                    \
-          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+#define COMP_MAC_1(_offset) {                                        \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
 }
 
-#define COMP_MUL_2A                                                      \
-          COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])     \
-          COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6])
+#define COMP_MUL_2A                                                  \
+      COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])     \
+      COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6])
 
-#define COMP_MAC_2A(_offset) {                                           \
-          FLOAT *_ro = &res[_offset];                                    \
-          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
-          COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6])     \
+#define COMP_MAC_2A(_offset) {                                       \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6])     \
 }
 
-#define COMP_MUL_2B                                                      \
-          COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])     \
-          COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12])
+#define COMP_MUL_2B                                                  \
+      COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])     \
+      COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12])
 
-#define COMP_MAC_2B(_offset) {                                           \
-          FLOAT *_ro = &res[_offset];                                    \
-          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
-          COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+#define COMP_MAC_2B(_offset) {                                       \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
 }
 
-#define COMP_MUL_4A(_offset) {                                           \
-          FLOAT *_ro = &res[_offset];                                    \
-          COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
-          COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
-          COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20])     \
-          COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28])     \
+#define COMP_MUL_4A(_offset) {                                       \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+      COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20])     \
+      COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28])     \
 }
 
-#define COMP_MAC_4A(_offset) {                                           \
-          FLOAT *_ro = &res[_offset];                                    \
-          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
-          COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
-          COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20])     \
-          COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28])     \
+#define COMP_MAC_4A(_offset) {                                       \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+      COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20])     \
+      COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28])     \
 }
 
-#define COMP_MUL_4B(_offset) {                                           \
-          FLOAT *_ro = &res[_offset];                                    \
-          COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
-          COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
-          COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6])     \
-          COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14])     \
+#define COMP_MUL_4B(_offset) {                                       \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+      COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6])     \
+      COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14])     \
 }
 
-#define COMP_MAC_4B(_offset) {                                           \
-          FLOAT *_ro = &res[_offset];                                    \
-          COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
-          COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
-          COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6])     \
-          COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14])     \
+#define COMP_MAC_4B(_offset) {                                       \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+      COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6])     \
+      COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14])     \
 }
 
 
-#define SAVE_ACC_COMPLEX_11                                              \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
-          COMP_MUL_1                                                     \
-          COMP_MAC_1(16)                                                 \
-          COMP_MAC_1(32)                                                 \
-          COMP_MAC_1(48)                                                 \
-          COMP_MAC_1(64)                                                 \
-          COMP_MAC_1(80)                                                 \
-          COMP_MAC_1(96)                                                 \
-          COMP_MAC_1(112)                                                \
-	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
-	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+#define SAVE_ACC_COMPLEX_11                                          \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_1                                                     \
+      COMP_MAC_1(16)                                                 \
+      COMP_MAC_1(32)                                                 \
+      COMP_MAC_1(48)                                                 \
+      COMP_MAC_1(64)                                                 \
+      COMP_MAC_1(80)                                                 \
+      COMP_MAC_1(96)                                                 \
+      COMP_MAC_1(112)                                                \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_12                                              \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
-          COMP_MUL_2A                                                    \
-          COMP_MAC_2A(16)                                                \
-          COMP_MAC_2A(32)                                                \
-          COMP_MAC_2A(48)                                                \
-          COMP_MAC_2A(64)                                                \
-          COMP_MAC_2A(80)                                                \
-          COMP_MAC_2A(96)                                                \
-          COMP_MAC_2A(112)                                               \
-	  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
-	  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
-	  CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
-	  CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+#define SAVE_ACC_COMPLEX_12                                          \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_2A                                                    \
+      COMP_MAC_2A(16)                                                \
+      COMP_MAC_2A(32)                                                \
+      COMP_MAC_2A(48)                                                \
+      COMP_MAC_2A(64)                                                \
+      COMP_MAC_2A(80)                                                \
+      COMP_MAC_2A(96)                                                \
+      COMP_MAC_2A(112)                                               \
+      CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+      CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+      CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+      CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_21_1                                            \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
-          COMP_MUL_2B                                                    \
-          COMP_MAC_2B(16)                                                \
-          COMP_MAC_2B(32)                                                \
-          COMP_MAC_2B(48)                                                \
-          COMP_MAC_2B(64)                                                \
-          COMP_MAC_2B(80)                                                \
-          COMP_MAC_2B(96)                                                \
-          COMP_MAC_2B(112)                                               \
-	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
-	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
-	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
-	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+#define SAVE_ACC_COMPLEX_21_1                                        \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_2B                                                    \
+      COMP_MAC_2B(16)                                                \
+      COMP_MAC_2B(32)                                                \
+      COMP_MAC_2B(48)                                                \
+      COMP_MAC_2B(64)                                                \
+      COMP_MAC_2B(80)                                                \
+      COMP_MAC_2B(96)                                                \
+      COMP_MAC_2B(112)                                               \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_21_2                                            \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
-          COMP_MUL_4A(0)                                                 \
-          COMP_MAC_4A(32)                                                \
-          COMP_MAC_4A(64)                                                \
-          COMP_MAC_4A(96)                                                \
-	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
-	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
-	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
-	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                  \
-	  CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                  \
-	  CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                  \
-	  CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                  \
-	  CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+#define SAVE_ACC_COMPLEX_21_2                                        \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_4A(0)                                                 \
+      COMP_MAC_4A(32)                                                \
+      COMP_MAC_4A(64)                                                \
+      COMP_MAC_4A(96)                                                \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                  \
+      CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                  \
+      CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                  \
+      CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                  \
+      CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_21_4                                            \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
-          COMP_MUL_4A(0)                                                 \
-          COMP_MAC_4A(64)                                                \
-	  CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
-	  CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
-	  CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
-	  CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
-	  CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
-	  CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
-	  CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
-	  CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;                 \
-          COMP_MUL_4A(32)                                                \
-          COMP_MAC_4A(96)                                                \
-	  CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
-	  CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
-	  CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
-	  CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
-	  CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
-	  CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
-	  CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
-	  CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+#define SAVE_ACC_COMPLEX_21_4                                        \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_4A(0)                                                 \
+      COMP_MAC_4A(64)                                                \
+      CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
+      CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
+      CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
+      CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
+      CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
+      CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
+      CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
+      CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;                 \
+      COMP_MUL_4A(32)                                                \
+      COMP_MAC_4A(96)                                                \
+      CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
+      CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
+      CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
+      CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
+      CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
+      CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
+      CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
+      CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_22_4                                        \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                              \
-          COMP_MUL_4B(0)                                             \
-	  CO[ 0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
-	  CO[ 1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
-	  CO[ 2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
-	  CO[ 3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
-	  CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
-	  CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
-	  CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
-	  CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
-          COMP_MUL_4B(16)                                            \
-	  CO[ 4]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
-	  CO[ 5]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
-	  CO[ 6]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
-	  CO[ 7]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
-	  CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
-	  CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
-	  CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
-	  CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
-          COMP_MUL_4B(32)                                            \
-	  CO[ 8]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
-	  CO[ 9]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
-	  CO[10]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
-	  CO[11]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
-	  CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
-	  CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
-	  CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
-	  CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
-          COMP_MUL_4B(48)                                            \
-	  CO[12]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
-	  CO[13]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
-	  CO[14]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
-	  CO[15]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
-	  CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
-	  CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
-	  CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
-	  CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+#define SAVE_ACC_COMPLEX_22_4                                    \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                              \
+      COMP_MUL_4B(0)                                             \
+      CO[ 0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[ 1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[ 2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[ 3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+      COMP_MUL_4B(16)                                            \
+      CO[ 4]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[ 5]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[ 6]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[ 7]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+      COMP_MUL_4B(32)                                            \
+      CO[ 8]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[ 9]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[10]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[11]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+      COMP_MUL_4B(48)                                            \
+      CO[12]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[13]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[14]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[15]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
 
-#define SAVE_ACC_COMPLEX_22_2                                          \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
-          COMP_MUL_4B(0)                                               \
-	  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
-	  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
-	  CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
-	  CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
-	  CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
-	  CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
-	  CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
-	  CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;          \
-          COMP_MUL_4B(16)                                              \
-	  CO[4]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
-	  CO[5]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
-	  CO[6]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
-	  CO[7]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
-	  CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
-	  CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
-	  CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
-	  CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+#define SAVE_ACC_COMPLEX_22_2                                      \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
+      COMP_MUL_4B(0)                                               \
+      CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+      CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+      CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+      CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+      CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+      CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+      CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+      CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;          \
+      COMP_MUL_4B(16)                                              \
+      CO[4]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+      CO[5]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+      CO[6]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+      CO[7]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+      CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+      CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+      CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+      CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
 
-#define SAVE_ACC_COMPLEX_22_1                                          \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
-          COMP_MUL_4B(0)                                               \
-	  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
-	  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
-	  CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
-	  CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
-	  CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
-	  CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
-	  CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
-	  CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+#define SAVE_ACC_COMPLEX_22_1                                      \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
+      COMP_MUL_4B(0)                                               \
+      CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+      CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+      CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+      CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+      CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+      CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+      CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+      CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
 
-#define SAVE_ACC_COMPLEX_24_ALL \
+#define SAVE_ACC_COMPLEX_24_ALL                                         \
       __builtin_mma_disassemble_acc ((void *)result, &acc0);            \
       __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4);      \
       __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1);      \
@@ -332,205 +332,205 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
       COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124])  \
       COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118])  \
       COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126])  \
-	  CO[       0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i;            \
-	  CO[       1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i;            \
-	  CO[       2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i;            \
-	  CO[       3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i;            \
-	  CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i;            \
-	  CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i;            \
-	  CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i;            \
-	  CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i;            \
-	  CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i;            \
-	  CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i;            \
-	  CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i;            \
-	  CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i;            \
-	  CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i;            \
-	  CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i;            \
-	  CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i;            \
-	  CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i;            \
-	  CO[       4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;            \
-	  CO[       5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;            \
-	  CO[       6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;            \
-	  CO[       7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;            \
-	  CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;            \
-	  CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;            \
-	  CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;            \
-	  CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;            \
-	  CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i;            \
-	  CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i;            \
-	  CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i;            \
-	  CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i;            \
-	  CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i;            \
-	  CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i;            \
-	  CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i;            \
-	  CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i;            \
-	  CO[       8] A_OP tr[16] * alpha_r - ti[16] * alpha_i;            \
-	  CO[       9] A_OP ti[16] * alpha_r + tr[16] * alpha_i;            \
-	  CO[      10] A_OP tr[17] * alpha_r - ti[17] * alpha_i;            \
-	  CO[      11] A_OP ti[17] * alpha_r + tr[17] * alpha_i;            \
-	  CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i;            \
-	  CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i;            \
-	  CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i;            \
-	  CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i;            \
-	  CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i;            \
-	  CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i;            \
-	  CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i;            \
-	  CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i;            \
-	  CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i;            \
-	  CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i;            \
-	  CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i;            \
-	  CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i;            \
-	  CO[      12] A_OP tr[24] * alpha_r - ti[24] * alpha_i;            \
-	  CO[      13] A_OP ti[24] * alpha_r + tr[24] * alpha_i;            \
-	  CO[      14] A_OP tr[25] * alpha_r - ti[25] * alpha_i;            \
-	  CO[      15] A_OP ti[25] * alpha_r + tr[25] * alpha_i;            \
-	  CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i;            \
-	  CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i;            \
-	  CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i;            \
-	  CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i;            \
-	  CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i;            \
-	  CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i;            \
-	  CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i;            \
-	  CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i;            \
-	  CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i;            \
-	  CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i;            \
-	  CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i;            \
-	  CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; 
+      CO[       0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i;            \
+      CO[       1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i;            \
+      CO[       2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i;            \
+      CO[       3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i;            \
+      CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i;            \
+      CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i;            \
+      CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i;            \
+      CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i;            \
+      CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i;            \
+      CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i;            \
+      CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i;            \
+      CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i;            \
+      CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i;            \
+      CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i;            \
+      CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i;            \
+      CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i;            \
+      CO[       4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;            \
+      CO[       5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;            \
+      CO[       6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;            \
+      CO[       7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;            \
+      CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;            \
+      CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;            \
+      CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;            \
+      CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;            \
+      CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i;            \
+      CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i;            \
+      CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i;            \
+      CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i;            \
+      CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i;            \
+      CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i;            \
+      CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i;            \
+      CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i;            \
+      CO[       8] A_OP tr[16] * alpha_r - ti[16] * alpha_i;            \
+      CO[       9] A_OP ti[16] * alpha_r + tr[16] * alpha_i;            \
+      CO[      10] A_OP tr[17] * alpha_r - ti[17] * alpha_i;            \
+      CO[      11] A_OP ti[17] * alpha_r + tr[17] * alpha_i;            \
+      CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i;            \
+      CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i;            \
+      CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i;            \
+      CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i;            \
+      CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i;            \
+      CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i;            \
+      CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i;            \
+      CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i;            \
+      CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i;            \
+      CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i;            \
+      CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i;            \
+      CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i;            \
+      CO[      12] A_OP tr[24] * alpha_r - ti[24] * alpha_i;            \
+      CO[      13] A_OP ti[24] * alpha_r + tr[24] * alpha_i;            \
+      CO[      14] A_OP tr[25] * alpha_r - ti[25] * alpha_i;            \
+      CO[      15] A_OP ti[25] * alpha_r + tr[25] * alpha_i;            \
+      CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i;            \
+      CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i;            \
+      CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i;            \
+      CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i;            \
+      CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i;            \
+      CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i;            \
+      CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i;            \
+      CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i;            \
+      CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i;            \
+      CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i;            \
+      CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i;            \
+      CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; 
 
-#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI)                            \
-          __builtin_mma_disassemble_acc ((void *)result, ACC1);        \
-          __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2);  \
-          COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4])       \
-          COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12])     \
-          COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6])       \
-          COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14])   \
-          COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20])   \
-          COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28])   \
-          COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22])   \
-          COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30])   \
-	  CO[CI+0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
-	  CO[CI+1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
-	  CO[CI+2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
-	  CO[CI+3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
-	  CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
-	  CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
-	  CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
-	  CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
-	  CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i;       \
-	  CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i;       \
-	  CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i;       \
-	  CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i;       \
-	  CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i;       \
-	  CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i;       \
-	  CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i;       \
-	  CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i;         
+#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI)                        \
+      __builtin_mma_disassemble_acc ((void *)result, ACC1);        \
+      __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2);  \
+      COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4])       \
+      COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12])     \
+      COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6])       \
+      COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14])   \
+      COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20])   \
+      COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28])   \
+      COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22])   \
+      COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30])   \
+      CO[CI+0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[CI+1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[CI+2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[CI+3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+      CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i;       \
+      CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i;       \
+      CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i;       \
+      CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i;       \
+      CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i;       \
+      CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i;       \
+      CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i;       \
+      CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i;         
 
-#define SAVE_ACC_COMPLEX_14                                              \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
-          COMP_MUL(tr[0], res[  0], res[  5], ti[0], res[  1], res[  4]) \
-          COMP_MUL(tr[1], res[  2], res[  7], ti[1], res[  3], res[  6]) \
-          COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \
-          COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \
-          COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \
-          COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \
-          COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \
-          COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \
-          COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \
-          COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \
-          COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \
-          COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \
-          COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \
-          COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \
-          COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \
-          COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \
-	  CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
-	  CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
-	  CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
-	  CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
-	  CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;            \
-	  CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;            \
-	  CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i;            \
-	  CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+#define SAVE_ACC_COMPLEX_14                                          \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL(tr[0], res[  0], res[  5], ti[0], res[  1], res[  4]) \
+      COMP_MUL(tr[1], res[  2], res[  7], ti[1], res[  3], res[  6]) \
+      COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \
+      COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \
+      COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \
+      COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \
+      COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \
+      COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \
+      COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \
+      COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \
+      COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \
+      COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \
+      COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \
+      COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \
+      COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \
+      COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \
+      CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+      CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+      CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+      CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
+      CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;            \
+      CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;            \
+      CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i;            \
+      CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
 
 #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
 
 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
 #define REFRESH_TEMP_BK(x, y) \
-            temp = k - off;
+      temp = k - off;
 #elif defined(LEFT)
 #define REFRESH_TEMP_BK(x, y) \
-            temp = off + x;
+      temp = off + x;
 #else
 #define REFRESH_TEMP_BK(x, y) \
-            temp = off + y;
+      temp = off + y;
 #endif
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 #define REFRESH_POINTERS(x, y) \
-	  BO = B; \
-          REFRESH_TEMP_BK(x, y)
+      BO = B; \
+      REFRESH_TEMP_BK(x, y)
 #else
 #define REFRESH_POINTERS(x, y) \
-          AO += off * (2*x); \
-          BO = B + off * (2*y); \
-          REFRESH_TEMP_BK(x, y)
+      AO += off * (2*x); \
+      BO = B + off * (2*y); \
+      REFRESH_TEMP_BK(x, y)
 #endif
 
 #ifdef LEFT
 #define REFRESH_OFF(x) \
-            off += x;
+      off += x;
 #else
 #define REFRESH_OFF(x)
 #endif
 
 #ifdef LEFT
 #define UPDATE_TEMP(x, y) \
-            temp -= x;
+      temp -= x;
 #else
 #define UPDATE_TEMP(x, y) \
-            temp -= y;
+      temp -= y;
 #endif
 
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 #define REFRESH_TMP_AFTER_SAVE(x, y) \
-            temp = k - off; \
-            UPDATE_TEMP(x, y) \
-            AO += temp * (2*x); \
-            BO += temp * (2*y);
+      temp = k - off;                \
+      UPDATE_TEMP(x, y)              \
+      AO += temp * (2*x);            \
+      BO += temp * (2*y);
 #else
 #define REFRESH_TMP_AFTER_SAVE(x, y)
 #endif
 
-#define REFRESH_AFTER_SAVE(x,y) \
-        REFRESH_TMP_AFTER_SAVE(x, y) \
-	REFRESH_OFF(x)
+#define REFRESH_AFTER_SAVE(x,y)      \
+      REFRESH_TMP_AFTER_SAVE(x, y)   \
+      REFRESH_OFF(x)
 /*************************************************************************************
 * GEMM Kernel
 *************************************************************************************/
 int
-CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
-       FLOAT * C, BLASLONG ldc
 #ifdef TRMMKERNEL
-       , BLASLONG offset
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, 
+       FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) 
+#else
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, 
+       FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) 
 #endif
-  )
 {
-  BLASLONG i1, i, l, temp;
-  FLOAT *AO, *BO, *CO;
+   BLASLONG i1, i, l, temp;
+   FLOAT *AO, *BO, *CO;
 #if defined(TRMMKERNEL)
-  BLASLONG off;
+   BLASLONG off;
 #endif
 #if defined(TRMMKERNEL) && !defined(LEFT)
-  off = -offset;
+   off = -offset;
 #endif
 
-  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+   __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
 
-  v4sf_t result[32];
-  FLOAT *res, tr[64], ti[64];
-  res = (FLOAT *) result;
+   v4sf_t result[32];
+   FLOAT *res, tr[64], ti[64];
+   res = (FLOAT *) result;
 
-  for (i1 = 0; i1 < (n >> 2); i1++)
-    {
+   for (i1 = 0; i1 < (n >> 2); i1++) {
 #if defined(TRMMKERNEL) && defined(LEFT)
       off = offset;
 #endif
@@ -538,193 +538,181 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
       CO = C;
       C += ldc << 3;
 
-      for (i = 0; i < (m >> 3); i++)
-	{
+      for (i = 0; i < (m >> 3); i++) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (8, 4);
+         REFRESH_POINTERS (8, 4);
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<4];
-              vec_t rowB1 = *(vec_t *) & BO[l<<3];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
-              __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2);
-              __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2);
-              __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2);
-              __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2);
-	    }
+         SET_ACC_ZERO()
+         for (l = 0; l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2);
+            __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2);
+         }
       SAVE_ACC_COMPLEX_24_ALL
-	  CO += 16;
-	  AO += temp << 4;
-	  BO += temp << 3;
+         CO += 16;
+         AO += temp << 4;
+         BO += temp << 3;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (8, 4)
+         REFRESH_AFTER_SAVE (8, 4)
 #endif
-	}
-      if (m & 4)
-	{
+      }
+      if (m & 4) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (4, 4);
+         REFRESH_POINTERS (4, 4);
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~1)); l+=2)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<3];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
-              vec_t rowB1 = *(vec_t *) & BO[l<<3];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2);
-              __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2);
-              __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3);
-              __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
-            }
-          for (l = (temp & (~1)); l < temp; ++l)
-            {
-              vec_t rowA1 = *(vec_t *) & AO[l<<3];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
-              vec_t rowB1 = *(vec_t *) & BO[l<<3];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2);
-              __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2);
-            }
-          SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0)
-          SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4)
-	  CO += 8;
-	  AO += temp << 3;
-	  BO += temp << 3;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2);
+         }
+         SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0)
+         SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4)
+         CO += 8;
+         AO += temp << 3;
+         BO += temp << 3;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (4, 4)
+         REFRESH_AFTER_SAVE (4, 4)
 #endif
-	}
-      if (m & 2)
-	{
+      }
+      if (m & 2) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (2, 4);
+         REFRESH_POINTERS (2, 4);
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~3)); l+=4)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<2];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
-              vec_t rowB1 = *(vec_t *) & BO[l<<3];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
-              __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4);
-              __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5);
-              __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6);
-              __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7);
-              __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8);
-	    }
-	  for (l = (temp & (~3)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<2];
-              vec_t rowB1 = *(vec_t *) & BO[l<<3];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
-	    }
-          SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0)
-	  CO += 4;
-	  AO += temp << 2;
-	  BO += temp << 3;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4);
+            __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5);
+            __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6);
+            __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7);
+            __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+         }
+         SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0)
+         CO += 4;
+         AO += temp << 2;
+         BO += temp << 3;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (2, 4)
+         REFRESH_AFTER_SAVE (2, 4)
 #endif
-	}
-      if (m & 1)
-	{
+      }
+      if (m & 1) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (1, 4)
+         REFRESH_POINTERS (1, 4)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~3)); l+=4)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<1];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
-              vec_t rowB1 = *(vec_t *) & BO[l<<3];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
-              __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3);
-              __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4);
-              __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5);
-              __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6);
-              __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7);
-              __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8);
-	    }
-	  for (l = (temp & (~3)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<1];
-              vec_t rowB1 = *(vec_t *) & BO[l<<3];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
-	    }
-          SAVE_ACC_COMPLEX_14
-	  CO += 2;
-	  AO += temp << 1;
-	  BO += temp << 3;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3);
+            __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4);
+            __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5);
+            __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6);
+            __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7);
+            __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+         }
+         SAVE_ACC_COMPLEX_14
+         CO += 2;
+         AO += temp << 1;
+         BO += temp << 3;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (1, 4)
+         REFRESH_AFTER_SAVE (1, 4)
 #endif
-	}
+      }
 #if defined(TRMMKERNEL) && !defined(LEFT)
-      off += 4;			// number of values in A
+      off += 4;                        // number of values in A
 #endif
 
       B += k << 3;
-    }
+   }
 
-  if (n & 2)
-    {
+   if (n & 2) {
 #if defined(TRMMKERNEL) && defined(LEFT)
       off = offset;
 #endif
@@ -732,212 +720,199 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
       CO = C;
       C += ldc << 2;
 
-      for (i = 0; i < (m >> 3); i++)
-	{
+      for (i = 0; i < (m >> 3); i++) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (8, 2)
+         REFRESH_POINTERS (8, 2)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~1)); l+=2)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<4];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
-              vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16];
-              vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20];
-              vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24];
-              vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28];
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
-              __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2);
-              __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2);
-              __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2);
-              __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2);
-	    }
-	  for (l = (temp & (~1)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<4];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_22_4
-	  AO += temp << 4;
-	  BO += temp << 2;
-	  CO += 16;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2);
+            __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2);
+            __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+         }
+         SAVE_ACC_COMPLEX_22_4
+         AO += temp << 4;
+         BO += temp << 2;
+         CO += 16;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (8, 2)
+         REFRESH_AFTER_SAVE (8, 2)
 #endif
-	}
-      if (m & 4)
-	{
+      }
+      if (m & 4) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (4, 2)
+         REFRESH_POINTERS (4, 2)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~3)); l+=4)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<3];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
-              vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16];
-              vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20];
-              vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24];
-              vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28];
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2);
-              __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2);
-              __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3);
-              __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3);
-              __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4);
-              __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4);
-	    }
-	  for (l = (temp & (~3)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<3];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_22_2
-	  AO += temp << 3;
-	  BO += temp << 2;
-	  CO += 8;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2);
+            __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2);
+            __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3);
+            __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3);
+            __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4);
+            __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+         }
+         SAVE_ACC_COMPLEX_22_2
+         AO += temp << 3;
+         BO += temp << 2;
+         CO += 8;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (4, 2)
+         REFRESH_AFTER_SAVE (4, 2)
 #endif
-	} if (m & 2)
-	{
+      } 
+      if (m & 2) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (2, 2)
+         REFRESH_POINTERS (2, 2)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~7)); l+=8)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<2];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
-              vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16];
-              vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20];
-              vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24];
-              vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28];
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2);
-              __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3);
-              __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4);
-              __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5);
-              __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6);
-              __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7);
-              __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8);
-	    }
-	  for (l = (temp & (~7)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<2];
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_22_1
-	  AO += temp << 2;
-	  BO += temp << 2;
-	  CO += 4;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4);
+            __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5);
+            __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6);
+            __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7);
+            __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_22_1
+         AO += temp << 2;
+         BO += temp << 2;
+         CO += 4;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (2, 2)
+         REFRESH_AFTER_SAVE (2, 2)
 #endif
-	}
-      if (m & 1)
-	{
+      }
+      if (m & 1) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (1, 2)
+         REFRESH_POINTERS (1, 2)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-	  // RIP OUT MMA STUFF!
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~7)); l+=8)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<1];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
-              vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8];
-              vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10];
-              vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12];
-              vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14];
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
-              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
-              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
-              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
-              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
-	    }
-	  for (l = (temp & (~7)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<1];
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_12
-	  AO += temp<<1;
-	  BO += temp<<2;
-	  CO += 2;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_12
+         AO += temp<<1;
+         BO += temp<<2;
+         CO += 2;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (1, 2)
+         REFRESH_AFTER_SAVE (1, 2)
 #endif
-	}
+      }
 #if defined(TRMMKERNEL) && !defined(LEFT)
-      off += 2;			// number of values in A
+      off += 2;                        // number of values in A
 #endif
       B += k << 2;
-    }
+   }
 
-  if (n & 1)
-    {
+   if (n & 1) {
 #if defined(TRMMKERNEL) && defined(LEFT)
       off = offset;
 #endif
@@ -945,210 +920,196 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT *
       CO = C;
       C += ldc << 1;
 
-      for (i = 0; i < (m >> 3); i++)
-	{
+      for (i = 0; i < (m >> 3); i++) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (8, 1)
+         REFRESH_POINTERS (8, 1)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~1)); l+=2)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<4];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
-              vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16];
-              vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20];
-              vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24];
-              vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28];
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
-              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2);
-              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2);
-              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2);
-              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2);
-	    }
-	  for (l = (temp & (~1)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<4];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_21_4
-	  AO += temp << 4;
-	  BO += temp << 1;
-	  CO += 16;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_4
+         AO += temp << 4;
+         BO += temp << 1;
+         CO += 16;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (8, 1)
+         REFRESH_AFTER_SAVE (8, 1)
 #endif
-	}
-      if (m & 4)
-	{
+      }
+      if (m & 4) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (4, 1)
+         REFRESH_POINTERS (4, 1)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~3)); l+=4)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<3];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
-              vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16];
-              vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20];
-              vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24];
-              vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28];
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2);
-              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3);
-              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3);
-              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4);
-              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4);
-	    }
-	  for (l = (temp & (~3)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<3];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_21_2
-	  AO += temp << 3;
-	  BO += temp << 1;
-	  CO += 8;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_2
+         AO += temp << 3;
+         BO += temp << 1;
+         CO += 8;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (4, 1)
+         REFRESH_AFTER_SAVE (4, 1)
 #endif
-	}
-      if (m & 2)
-	{
+      }
+      if (m & 2) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (2, 1)
+         REFRESH_POINTERS (2, 1)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-	  // RIP OUT MMA STUFF!
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~7)); l+=8)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<2];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
-              vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16];
-              vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20];
-              vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24];
-              vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28];
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
-              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
-              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
-              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
-              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
-	    }
-	  for (l = (temp & (~7)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<2];
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_21_1
-	  AO += temp << 2;
-	  BO += temp << 1;
-	  CO += 4;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_1
+         AO += temp << 2;
+         BO += temp << 1;
+         CO += 4;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (2, 1)
+         REFRESH_AFTER_SAVE (2, 1)
 #endif
-	}
-      if (m & 1)
-	{
+      }
+      if (m & 1) {
 #if defined(TRMMKERNEL)
-	  REFRESH_POINTERS (1, 1)
+         REFRESH_POINTERS (1, 1)
 #else
-	  BO = B;
-	  temp = k;
+         BO = B;
+         temp = k;
 #endif
-	  // RIP OUT MMA STUFF!
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~7)); l+=8)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<1];
-              vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
-              vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
-              vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
-              vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8];
-              vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10];
-              vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12];
-              vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14];
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
-              __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
-              __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
-              __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
-              __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
-              __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
-              __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
-	    }
-	  for (l = (temp & (~7)); l < temp; ++l)
-	    {
-              vec_t rowA1 = *(vec_t *) & AO[l<<1];
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_11
-	  AO += temp<<1;
-	  BO += temp<<1;
-	  CO += 2;
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) {
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_11
+         AO += temp<<1;
+         BO += temp<<1;
+         CO += 2;
 #if defined(TRMMKERNEL)
-	  REFRESH_AFTER_SAVE (1, 1)
+         REFRESH_AFTER_SAVE (1, 1)
 #endif
-	}
+      }
 #if defined(TRMMKERNEL) && !defined(LEFT)
-      off += 1;			// number of values in A
+      off += 1;                        // number of values in A
 #endif
       B += k << 1;
-    }
-  return 0;
+   }
+   return 0;
 }
diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c
index e4e609067..370d12af3 100644
--- a/kernel/power/zgemm_kernel_power10.c
+++ b/kernel/power/zgemm_kernel_power10.c
@@ -30,15 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 typedef __vector unsigned char  vec_t;
 typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
 
-#define SET_ACC_ZERO() \
-          __builtin_mma_xxsetaccz (&acc0); \
-          __builtin_mma_xxsetaccz (&acc1); \
-          __builtin_mma_xxsetaccz (&acc2); \
-          __builtin_mma_xxsetaccz (&acc3); \
-          __builtin_mma_xxsetaccz (&acc4); \
-          __builtin_mma_xxsetaccz (&acc5); \
-          __builtin_mma_xxsetaccz (&acc6); \
-          __builtin_mma_xxsetaccz (&acc7);
+#define SET_ACC_ZERO()                 \
+      __builtin_mma_xxsetaccz (&acc0); \
+      __builtin_mma_xxsetaccz (&acc1); \
+      __builtin_mma_xxsetaccz (&acc2); \
+      __builtin_mma_xxsetaccz (&acc3); \
+      __builtin_mma_xxsetaccz (&acc4); \
+      __builtin_mma_xxsetaccz (&acc5); \
+      __builtin_mma_xxsetaccz (&acc6); \
+      __builtin_mma_xxsetaccz (&acc7);
 
 #if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
 #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  =  _arbi + _aibr; }
@@ -66,696 +66,671 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
 #define A_OP +=
 #endif
 
-#define BUILTIN_MMA_DISASSEMBLE_ACC_8                                 \
-          __builtin_mma_disassemble_acc ((void *)result, &acc0);      \
-          __builtin_mma_disassemble_acc ((void *)&result[4], &acc1);  \
-          __builtin_mma_disassemble_acc ((void *)&result[8], &acc2);  \
-          __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \
-          __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \
-          __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \
-          __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \
-          __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); 
+#define BUILTIN_MMA_DISASSEMBLE_ACC_8                             \
+      __builtin_mma_disassemble_acc ((void *)result, &acc0);      \
+      __builtin_mma_disassemble_acc ((void *)&result[4], &acc1);  \
+      __builtin_mma_disassemble_acc ((void *)&result[8], &acc2);  \
+      __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \
+      __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \
+      __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \
+      __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \
+      __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); 
 
-#define SAVE_ACC_COMPLEX_11                                           \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
-          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
-          COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])  \
-          COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
-          COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])  \
-          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
-          COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])  \
-          COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
-          COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])  \
-	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
-	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+#define SAVE_ACC_COMPLEX_11                                       \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])  \
+      COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+      COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])  \
+      COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+      COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_12                                           \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
-          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
-          COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10])  \
-          COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
-          COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26])  \
-          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
-          COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42])  \
-          COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
-          COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58])  \
-	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
-	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
-	  CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;         \
-	  CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+#define SAVE_ACC_COMPLEX_12                                       \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10])  \
+      COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+      COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42])  \
+      COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+      COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58])  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+      CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;         \
+      CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_21_1                                         \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
-          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
-          COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
-          COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])  \
-          COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14])  \
-          COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
-          COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])  \
-          COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])  \
-          COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30])  \
-          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
-          COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
-          COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])  \
-          COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46])  \
-          COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
-          COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])  \
-          COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])  \
-          COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62])  \
-	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
-	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
-	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
-	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+#define SAVE_ACC_COMPLEX_21_1                                     \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+      COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])  \
+      COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14])  \
+      COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+      COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])  \
+      COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])  \
+      COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+      COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])  \
+      COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46])  \
+      COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+      COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])  \
+      COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])  \
+      COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62])  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_21_2                                         \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
-          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
-          COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
-          COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])  \
-          COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
-          COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
-          COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])  \
-          COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26])  \
-          COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30])  \
-          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
-          COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
-          COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])  \
-          COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])  \
-          COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
-          COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])  \
-          COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58])  \
-          COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62])  \
-	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
-	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
-	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
-	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;               \
-	  CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;               \
-	  CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;               \
-	  CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;               \
-	  CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+#define SAVE_ACC_COMPLEX_21_2                                     \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+      COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])  \
+      COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+      COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+      COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])  \
+      COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26])  \
+      COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+      COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])  \
+      COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])  \
+      COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+      COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])  \
+      COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58])  \
+      COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62])  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;               \
+      CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;               \
+      CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;               \
+      CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;               \
+      CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_21_4                                         \
-          BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
-          COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
-          COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
-          COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])  \
-          COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
-          COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18])  \
-          COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22])  \
-          COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26])  \
-          COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30])  \
-          COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
-          COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
-          COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])  \
-          COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])  \
-          COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50])  \
-          COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54])  \
-          COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58])  \
-          COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62])  \
-	  CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;              \
-	  CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;              \
-	  CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;              \
-	  CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;              \
-	  CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;              \
-	  CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;              \
-	  CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;              \
-	  CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;              \
-	  CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;              \
-	  CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;              \
-	  CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;              \
-	  CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;              \
-	  CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;              \
-	  CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;              \
-	  CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;              \
-	  CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
+#define SAVE_ACC_COMPLEX_21_4                                     \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+      COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])  \
+      COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+      COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18])  \
+      COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22])  \
+      COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26])  \
+      COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+      COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])  \
+      COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])  \
+      COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50])  \
+      COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54])  \
+      COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58])  \
+      COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62])  \
+      CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;              \
+      CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;              \
+      CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;              \
+      CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;              \
+      CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;              \
+      CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;              \
+      CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;              \
+      CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;              \
+      CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;              \
+      CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;              \
+      CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;              \
+      CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;              \
+      CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;              \
+      CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;              \
+      CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;              \
+      CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_22_1                                          \
-          __builtin_mma_disassemble_acc ((void *)result, &acc0);       \
-          __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \
-          COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])       \
-          COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])       \
-          COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])     \
-          COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] )  \
-	  CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
-	  CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
-	  CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
-	  CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
-	  CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;      \
-	  CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;      \
-	  CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;      \
-	  CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+#define SAVE_ACC_COMPLEX_22_1                                      \
+      __builtin_mma_disassemble_acc ((void *)result, &acc0);       \
+      __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \
+      COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])       \
+      COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])       \
+      COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])     \
+      COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] )  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                \
+      CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+      CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+      CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+      CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
 
-#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI)                         \
-          __builtin_mma_disassemble_acc ((void *)result, ACC1);       \
-          __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \
-          COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])      \
-          COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])      \
-          COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])    \
-          COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
-	  CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
-	  CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
-	  CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
-	  CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
-	  CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;      \
-	  CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;      \
-	  CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;      \
-	  CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI)                     \
+      __builtin_mma_disassemble_acc ((void *)result, ACC1);       \
+      __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \
+      COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])      \
+      COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])      \
+      COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])    \
+      COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+      CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+      CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+      CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+      CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
+      CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;      \
+      CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;      \
+      CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;      \
+      CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
 
 #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
 
 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
 #define REFRESH_TEMP_BK(x, y) \
-            temp = k - off;
+      temp = k - off;
 #elif defined(LEFT)
 #define REFRESH_TEMP_BK(x, y) \
-            temp = off + x;
+      temp = off + x;
 #else
 #define REFRESH_TEMP_BK(x, y) \
-            temp = off + y;
+      temp = off + y;
 #endif
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 #define REFRESH_POINTERS(x, y)  \
-          BO = B;               \
-          REFRESH_TEMP_BK(x, y)
+      BO = B;                   \
+      REFRESH_TEMP_BK(x, y)
 #else
 #define REFRESH_POINTERS(x, y)  \
-          AO += off * (2*x);    \
-          BO = B + off * (2*y); \
-          REFRESH_TEMP_BK(x, y)
+      AO += off * (2*x);        \
+      BO = B + off * (2*y);     \
+      REFRESH_TEMP_BK(x, y)
 #endif
 
 #ifdef LEFT
 #define REFRESH_OFF(x) \
-            off += x;
+      off += x;
 #else
 #define REFRESH_OFF(x)
 #endif
 
 #ifdef LEFT
 #define UPDATE_TEMP(x, y) \
-            temp -= x;
+      temp -= x;
 #else
 #define UPDATE_TEMP(x, y) \
-            temp -= y;
+      temp -= y;
 #endif
 
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
 #define REFRESH_TMP_AFTER_SAVE(x, y) \
-            temp = k - off; \
-            UPDATE_TEMP(x, y) \
-            AO += temp * (2*x); \
-            BO += temp * (2*y);
+      temp = k - off;                \
+      UPDATE_TEMP(x, y)              \
+      AO += temp * (2*x);            \
+      BO += temp * (2*y);
 #else
 #define REFRESH_TMP_AFTER_SAVE(x, y)
 #endif
 
-#define REFRESH_AFTER_SAVE(x,y) \
-        REFRESH_TMP_AFTER_SAVE(x, y) \
-        REFRESH_OFF(x)
+#define REFRESH_AFTER_SAVE(x,y)    \
+      REFRESH_TMP_AFTER_SAVE(x, y) \
+      REFRESH_OFF(x)
 /*************************************************************************************
 * GEMM Kernel
 *************************************************************************************/
 int
-CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B,
-       FLOAT * C, BLASLONG ldc
 #ifdef TRMMKERNEL
-       , BLASLONG offset
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, 
+       FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
+#else
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, 
+       FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
 #endif
-  )
 {
-  BLASLONG i1, i, l, temp;
-  FLOAT *AO, *BO, *CO;
+   BLASLONG i1, i, l, temp;
+   FLOAT *AO, *BO, *CO;
 #if defined(TRMMKERNEL)
-  BLASLONG off;
+   BLASLONG off;
 #endif
 #if defined(TRMMKERNEL) && !defined(LEFT)
-  off = -offset;
+   off = -offset;
 #endif
-  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+   __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
 
-  v4sf_t result[32];
-  FLOAT *res, tr[16], ti[16];
-  res = (FLOAT *) result;
+   v4sf_t result[32];
+   FLOAT *res, tr[16], ti[16];
+   res = (FLOAT *) result;
 
-  for (i1 = 0; i1 < (n >> 1); i1++)
-    {
+   for (i1 = 0; i1 < (n >> 1); i1++) {
 #if defined(TRMMKERNEL) && defined(LEFT)
       off = offset;
 #endif
       AO = A;
       CO = C;
       C += ldc<<2;
-      for (i = 0; i < (m >> 3); i++)
-	{
+      for (i = 0; i < (m >> 3); i++) {
 #if defined(TRMMKERNEL)
-          REFRESH_POINTERS (8, 2)
+         REFRESH_POINTERS (8, 2)
 #else
-          BO = B;
-          temp = k;
-#endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < temp; ++l)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
-              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
-              __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
-              __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
-              __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
-              __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
-	    }
-        __builtin_mma_disassemble_acc ((void *)result,        &acc0);
-        __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
-        __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
-        __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
-        __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
-        __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
-        __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
-        __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
-        COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
-        COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
-        COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
-        COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
-        COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
-        COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
-        COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
-        COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
-        COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
-        COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
-        COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
-        COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
-        COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
-        COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
-        COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
-        COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
-        CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
-        CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
-        CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
-        CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
-        CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
-        CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
-        CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
-        CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
-        CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
-        CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
-        CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
-        CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
-        CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
-        CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
-        CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
-        CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
-        CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
-        CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
-        CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
-        CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
-        CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
-        CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
-        CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
-        CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
-        CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
-        CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
-        CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
-        CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
-        CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
-        CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
-        CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
-        CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
+            __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
+            __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
+         }
+         __builtin_mma_disassemble_acc ((void *)result,        &acc0);
+         __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
+         __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
+         __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
+         __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
+         __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
+         __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
+         __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
+         COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
+         COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
+         COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
+         COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
+         COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
+         COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
+         COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
+         COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
+         COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
+         COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
+         COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
+         COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
+         COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
+         COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
+         COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
+         COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
+         CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
+         CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+         CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
+         CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+         CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
+         CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
+         CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
+         CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+         CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
+         CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
+         CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
+         CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
+         CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
+         CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
+         CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
+         CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
+         CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
+         CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
+         CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
+         CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
+         CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
+         CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
+         CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
+         CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
+         CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
+         CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
+         CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
+         CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
+         CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
+         CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
+         CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
+         CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
 
-	  AO += temp << 4;
-	  BO += temp << 2;
-	  CO += 16;
+         AO += temp << 4;
+         BO += temp << 2;
+         CO += 16;
 #if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (8, 2)
+         REFRESH_AFTER_SAVE (8, 2)
 #endif
-	}
-      if (m & 4)
-	{
+      }
+      if (m & 4) {
 #if defined(TRMMKERNEL)
-          REFRESH_POINTERS (4, 2)
+         REFRESH_POINTERS (4, 2)
 #else
-          BO = B;
-          temp = k;
-#endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~1)); l+=2)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
-              __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
-              __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
-              __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
-              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
-              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
-	    }
-	  for (l = (temp & (~1)); l < temp; ++l)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
-              __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
-	    }
-          SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
-          SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
-	  AO += temp << 3;
-	  BO += temp << 2;
-	  CO += 8;
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
+            __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
+            __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
+         }
+         SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
+         SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
+         AO += temp << 3;
+         BO += temp << 2;
+         CO += 8;
 #if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (4, 2)
+         REFRESH_AFTER_SAVE (4, 2)
 #endif
-	}
-      if (m & 2)
-	{
+      }
+      if (m & 2) {
 #if defined(TRMMKERNEL)
-          REFRESH_POINTERS (2, 2)
+         REFRESH_POINTERS (2, 2)
 #else
-          BO = B;
-          temp = k;
-#endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~3)); l+=4)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
-              __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
-              __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
-              __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
-              __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
-              __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
-	    }
-	  for (l = (temp & (~3)); l < temp; ++l)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
-	    }
-          SAVE_ACC_COMPLEX_22_1
-	  AO += temp << 2;
-	  BO += temp << 2;
-	  CO += 4;
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
+            __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
+            __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
+            __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
+            __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+         }
+         SAVE_ACC_COMPLEX_22_1
+         AO += temp << 2;
+         BO += temp << 2;
+         CO += 4;
 #if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (2, 2)
+         REFRESH_AFTER_SAVE (2, 2)
 #endif
-	}
-      if (m & 1)
-	{
+      }
+      if (m & 1) {
 #if defined(TRMMKERNEL)
-          REFRESH_POINTERS (1, 2)
+         REFRESH_POINTERS (1, 2)
 #else
-          BO = B;
-          temp = k;
-#endif
-	  // RIP OUT MMA STUFF!
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~3)); l+=4)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
-              __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
-              __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
-              __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
-              __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
-              __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
-	    }
-	  for (l = (temp & (~3)); l < temp; ++l)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<2];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
-	    }
-          SAVE_ACC_COMPLEX_12                
-	  AO += temp << 1;
-	  BO += temp << 2;
-	  CO += 2;
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
+            __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
+            __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
+            __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
+            __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+         }
+         SAVE_ACC_COMPLEX_12                
+         AO += temp << 1;
+         BO += temp << 2;
+         CO += 2;
 #if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (1, 2)
+         REFRESH_AFTER_SAVE (1, 2)
 #endif
-	}
+      }
 #if defined(TRMMKERNEL) && !defined(LEFT)
       off += 2;                 // number of values in A
 #endif
       B += k << 2;
-    }
-  if (n & 1)
-    {
+   }
+   if (n & 1) {
 #if defined(TRMMKERNEL) && defined(LEFT)
       off = offset;
 #endif
       AO = A;
       CO = C;
       C += ldc<<1;
-      for (i = 0; i < (m >> 3); i++)
-	{
+      for (i = 0; i < (m >> 3); i++) {
 #if defined(TRMMKERNEL)
-          REFRESH_POINTERS (8, 1)
+         REFRESH_POINTERS (8, 1)
 #else
-          BO = B;
-          temp = k;
-#endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~1)); l+=2)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
-              __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
-              __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
-              __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
-              __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
-              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
-              __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
-              __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
-              __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
-              __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
-	    }
-	  for (l = (temp & (~1)); l < temp; ++l)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
-              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_21_4
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+            __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
+            __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
+            __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
+            __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
+            __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
+            __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
+            __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_4
 
-	  AO += temp << 4;
-	  BO += temp << 1;
-	  CO += 16;
+         AO += temp << 4;
+         BO += temp << 1;
+         CO += 16;
 #if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (8, 1)
+         REFRESH_AFTER_SAVE (8, 1)
 #endif
-	}
-      if (m & 4)
-	{
+      }
+      if (m & 4) {
 #if defined(TRMMKERNEL)
-          REFRESH_POINTERS (4, 1)
+         REFRESH_POINTERS (4, 1)
 #else
-          BO = B;
-          temp = k;
-#endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~3)); l+=4)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
-              __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
-              __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
-              __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
-              __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
-              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
-              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
-              __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
-              __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
-              __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
-              __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
-	    }
-	  for (l = (temp & (~3)); l < temp; ++l)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_21_2
-	  AO += temp << 3;
-	  BO += temp << 1;
-	  CO += 8;
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
+            __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
+            __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
+            __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
+            __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
+            __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
+            __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
+            __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
+            __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_2
+         AO += temp << 3;
+         BO += temp << 1;
+         CO += 8;
 #if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (4, 1)
+         REFRESH_AFTER_SAVE (4, 1)
 #endif
-	} if (m & 2)
-	{
+      } 
+      if (m & 2) {
 #if defined(TRMMKERNEL)
-          REFRESH_POINTERS (2, 1)
+         REFRESH_POINTERS (2, 1)
 #else
-          BO = B;
-          temp = k;
-#endif
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~7)); l+=8)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
-              __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
-              __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
-              __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
-              __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
-              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
-              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
-              __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
-              __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
-              __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
-              __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
-	    }
-	  for (l = (temp & (~7)); l < temp; ++l)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_21_1
-	  AO += temp << 2;
-	  BO += temp << 1;
-	  CO += 4;
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
+            __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
+            __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
+            __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
+            __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_1
+         AO += temp << 2;
+         BO += temp << 1;
+         CO += 4;
 #if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (2, 1)
+         REFRESH_AFTER_SAVE (2, 1)
 #endif
-	}
-      if (m & 1)
-	{
+      }
+      if (m & 1) {
 #if defined(TRMMKERNEL)
-          REFRESH_POINTERS (1, 1)
+         REFRESH_POINTERS (1, 1)
 #else
-          BO = B;
-          temp = k;
-#endif
-	  // RIP OUT MMA STUFF!
-          SET_ACC_ZERO()
-	  for (l = 0; l < (temp & (~7)); l+=8)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
-              __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
-              __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
-              __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
-              __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
-              __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
-              __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
-              __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
-              vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
-              vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
-              vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
-              vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
-              vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
-              vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-              __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
-              __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
-              __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
-              __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
-              __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
-              __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
-              __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
-	    }
-	  for (l = (temp & (~7)); l < temp; ++l)
-	    {
-              __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
-              vec_t rowB1 = *(vec_t *) & BO[l<<1];
-              __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
-	    }
-          SAVE_ACC_COMPLEX_11       
-	  AO += temp << 1;
-	  BO += temp << 1;
-	  CO += 2;
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
+            __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
+            __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
+            __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
+            __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_11       
+         AO += temp << 1;
+         BO += temp << 1;
+         CO += 2;
 #if defined(TRMMKERNEL)
-          REFRESH_AFTER_SAVE (1, 1)
+         REFRESH_AFTER_SAVE (1, 1)
 #endif
-	}
+      }
 #if defined(TRMMKERNEL) && !defined(LEFT)
       off += 1;                 // number of values in A
 #endif
       B += k << 1;
-    }
-  return 0;
+   }
+   return 0;
 }

From f68e9989c47d2643fd72986afc59f6a8d7d07486 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Fri, 2 Feb 2024 12:26:23 +0300
Subject: [PATCH 627/718] Remove zero rows/columns matcopy tests

---
 utest/test_extensions/test_cimatcopy.c | 32 --------------------------
 utest/test_extensions/test_comatcopy.c | 32 --------------------------
 utest/test_extensions/test_dimatcopy.c | 32 --------------------------
 utest/test_extensions/test_domatcopy.c | 32 --------------------------
 utest/test_extensions/test_simatcopy.c | 32 --------------------------
 utest/test_extensions/test_somatcopy.c | 32 --------------------------
 utest/test_extensions/test_zimatcopy.c | 32 --------------------------
 utest/test_extensions/test_zomatcopy.c | 32 --------------------------
 8 files changed, 256 deletions(-)

diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c
index 800f8a2d1..a4b1e30ac 100644
--- a/utest/test_extensions/test_cimatcopy.c
+++ b/utest/test_extensions/test_cimatcopy.c
@@ -714,38 +714,6 @@ CTEST(cimatcopy, xerbla_invalid_trans)
     ASSERT_EQUAL(TRUE, passed);
 }
 
-/**
- * Test error function for an invalid param m.
- * Must be positive.
- */
-CTEST(cimatcopy, xerbla_invalid_rows)
-{
-    blasint m = 0, n = 100;
-    blasint lda_src = 0, lda_dst = 100;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 3;
-
-    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
-/**
- * Test error function for an invalid param n.
- * Must be positive.
- */
-CTEST(cimatcopy, xerbla_invalid_cols)
-{
-    blasint m = 100, n = 0;
-    blasint lda_src = 100, lda_dst = 0;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 4;
-
-    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
 /**
  * Test error function for an invalid param lda_src.
  * If matrices are stored using row major layout, 
diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c
index 8a3d5ee7b..71663406a 100644
--- a/utest/test_extensions/test_comatcopy.c
+++ b/utest/test_extensions/test_comatcopy.c
@@ -524,38 +524,6 @@ CTEST(comatcopy, xerbla_invalid_trans)
     ASSERT_EQUAL(TRUE, passed);
 }
 
-/**
- * Test error function for an invalid param m.
- * Must be positive.
- */
-CTEST(comatcopy, xerbla_invalid_rows)
-{
-    blasint m = 0, n = 100;
-    blasint lda = 0, ldb = 100;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 3;
-
-    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
-/**
- * Test error function for an invalid param n.
- * Must be positive.
- */
-CTEST(comatcopy, xerbla_invalid_cols)
-{
-    blasint m = 100, n = 0;
-    blasint lda = 100, ldb = 0;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 4;
-
-    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
 /**
  * Test error function for an invalid param lda.
  * If matrices are stored using row major layout, 
diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c
index 4debb50e8..d2a16bbbf 100644
--- a/utest/test_extensions/test_dimatcopy.c
+++ b/utest/test_extensions/test_dimatcopy.c
@@ -811,38 +811,6 @@ CTEST(dimatcopy, xerbla_invalid_trans)
     ASSERT_EQUAL(TRUE, passed);
 }
 
-/**
- * Test error function for an invalid param m.
- * Must be positive.
- */
-CTEST(dimatcopy, xerbla_invalid_rows)
-{
-    blasint m = 0, n = 100;
-    blasint lda_src = 0, lda_dst = 100;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 3;
-
-    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
-/**
- * Test error function for an invalid param n.
- * Must be positive.
- */
-CTEST(dimatcopy, xerbla_invalid_cols)
-{
-    blasint m = 100, n = 0;
-    blasint lda_src = 100, lda_dst = 0;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 4;
-
-    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
 /**
  * Test error function for an invalid param lda_src.
  * If matrices are stored using row major layout, 
diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c
index f692e8784..e60b9c83d 100644
--- a/utest/test_extensions/test_domatcopy.c
+++ b/utest/test_extensions/test_domatcopy.c
@@ -536,38 +536,6 @@ CTEST(domatcopy, xerbla_invalid_trans)
     ASSERT_EQUAL(TRUE, passed);
 }
 
-/**
- * Test error function for an invalid param m.
- * Must be positive.
- */
-CTEST(domatcopy, xerbla_invalid_rows)
-{
-    blasint m = 0, n = 100;
-    blasint lda = 0, ldb = 100;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 3;
-
-    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
-/**
- * Test error function for an invalid param n.
- * Must be positive.
- */
-CTEST(domatcopy, xerbla_invalid_cols)
-{
-    blasint m = 100, n = 0;
-    blasint lda = 100, ldb = 0;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 4;
-
-    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
 /**
  * Test error function for an invalid param lda.
  * If matrices are stored using row major layout,
diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c
index 0d9c44e73..cf14d360c 100644
--- a/utest/test_extensions/test_simatcopy.c
+++ b/utest/test_extensions/test_simatcopy.c
@@ -811,38 +811,6 @@ CTEST(simatcopy, xerbla_invalid_trans)
     ASSERT_EQUAL(TRUE, passed);
 }
 
-/**
- * Test error function for an invalid param m.
- * Must be positive.
- */
-CTEST(simatcopy, xerbla_invalid_rows)
-{
-    blasint m = 0, n = 100;
-    blasint lda_src = 0, lda_dst = 100;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 3;
-
-    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
-/**
- * Test error function for an invalid param n.
- * Must be positive.
- */
-CTEST(simatcopy, xerbla_invalid_cols)
-{
-    blasint m = 100, n = 0;
-    blasint lda_src = 100, lda_dst = 0;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 4;
-
-    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
 /**
  * Test error function for an invalid param lda_src.
  * If matrices are stored using row major layout, 
diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c
index c75bbc75e..b53c7cae5 100644
--- a/utest/test_extensions/test_somatcopy.c
+++ b/utest/test_extensions/test_somatcopy.c
@@ -536,38 +536,6 @@ CTEST(somatcopy, xerbla_invalid_trans)
     ASSERT_EQUAL(TRUE, passed);
 }
 
-/**
- * Test error function for an invalid param m.
- * Must be positive.
- */
-CTEST(somatcopy, xerbla_invalid_rows)
-{
-    blasint m = 0, n = 100;
-    blasint lda = 0, ldb = 100;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 3;
-
-    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
-/**
- * Test error function for an invalid param n.
- * Must be positive.
- */
-CTEST(somatcopy, xerbla_invalid_cols)
-{
-    blasint m = 100, n = 0;
-    blasint lda = 100, ldb = 0;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 4;
-
-    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
 /**
  * Test error function for an invalid param lda.
  * If matrices are stored using row major layout,
diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c
index 6461ce88f..8376bc493 100644
--- a/utest/test_extensions/test_zimatcopy.c
+++ b/utest/test_extensions/test_zimatcopy.c
@@ -714,38 +714,6 @@ CTEST(zimatcopy, xerbla_invalid_trans)
     ASSERT_EQUAL(TRUE, passed);
 }
 
-/**
- * Test error function for an invalid param m.
- * Must be positive.
- */
-CTEST(zimatcopy, xerbla_invalid_rows)
-{
-    blasint m = 0, n = 100;
-    blasint lda_src = 0, lda_dst = 100;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 3;
-
-    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
-/**
- * Test error function for an invalid param n.
- * Must be positive.
- */
-CTEST(zimatcopy, xerbla_invalid_cols)
-{
-    blasint m = 100, n = 0;
-    blasint lda_src = 100, lda_dst = 0;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 4;
-
-    int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
 /**
  * Test error function for an invalid param lda_src.
  * If matrices are stored using row major layout, 
diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c
index 8df3dd80f..495831c56 100644
--- a/utest/test_extensions/test_zomatcopy.c
+++ b/utest/test_extensions/test_zomatcopy.c
@@ -541,38 +541,6 @@ CTEST(zomatcopy, xerbla_invalid_trans)
     ASSERT_EQUAL(TRUE, passed);
 }
 
-/**
-* Test error function for an invalid param m.
-* Must be positive.
-*/
-CTEST(zomatcopy, xerbla_invalid_rows)
-{
-    blasint m = 0, n = 100;
-    blasint lda = 0, ldb = 100;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 3;
-
-    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
-/**
-* Test error function for an invalid param n.
-* Must be positive.
-*/
-CTEST(zomatcopy, xerbla_invalid_cols)
-{
-    blasint m = 100, n = 0;
-    blasint lda = 100, ldb = 0;
-    char order = 'C';
-    char trans = 'T';
-    int expected_info = 4;
-
-    int passed = check_badargs(order, trans, m, n, lda, ldb, expected_info);
-    ASSERT_EQUAL(TRUE, passed);
-}
-
 /**
 * Test error function for an invalid param lda.
 * If matrices are stored using row major layout, 

From 441339104f5056ba8d68343667511de683d69ca6 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Fri, 2 Feb 2024 13:49:39 +0300
Subject: [PATCH 628/718] fix test ext cmake build

---
 utest/CMakeLists.txt | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index d78701707..c090ed511 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -22,6 +22,7 @@ endif ()
 set(DIR_EXT test_extensions)
 set(OpenBLAS_utest_ext_src
 utest_main.c
+${DIR_EXT}/common.c
 ${DIR_EXT}/xerbla.c
 ${DIR_EXT}/test_isamin.c 
 ${DIR_EXT}/test_idamin.c 
@@ -39,14 +40,14 @@ ${DIR_EXT}/test_scamax.c
 ${DIR_EXT}/test_dzamax.c
 ${DIR_EXT}/test_zrotg.c
 ${DIR_EXT}/test_crotg.c
-$(DIR_EXT)/test_drotmg.c
-$(DIR_EXT)/test_srotmg.c
-$(DIR_EXT)/test_zscal.c
-$(DIR_EXT)/test_cscal.c
-$(DIR_EXT)/test_domatcopy.c
-$(DIR_EXT)/test_somatcopy.c
-$(DIR_EXT)/test_zomatcopy.c
-$(DIR_EXT)/test_comatcopy.c
+${DIR_EXT}/test_drotmg.c
+${DIR_EXT}/test_srotmg.c
+${DIR_EXT}/test_zscal.c
+${DIR_EXT}/test_cscal.c
+${DIR_EXT}/test_domatcopy.c
+${DIR_EXT}/test_somatcopy.c
+${DIR_EXT}/test_zomatcopy.c
+${DIR_EXT}/test_comatcopy.c
 ${DIR_EXT}/test_simatcopy.c
 ${DIR_EXT}/test_dimatcopy.c
 ${DIR_EXT}/test_cimatcopy.c
@@ -59,12 +60,12 @@ ${DIR_EXT}/test_saxpby.c
 ${DIR_EXT}/test_daxpby.c
 ${DIR_EXT}/test_caxpby.c
 ${DIR_EXT}/test_zaxpby.c
-${DIR_EXT}/test_caxpyc.c
-${DIR_EXT}/test_zaxpyc.c
-$(DIR_EXT)/test_cgemv_t.c
-$(DIR_EXT)/test_zgemv_t.c
-$(DIR_EXT)/test_cgemv_n.c
-$(DIR_EXT)/test_zgemv_n.c
+# ${DIR_EXT}/test_caxpyc.c
+# ${DIR_EXT}/test_zaxpyc.c
+${DIR_EXT}/test_cgemv_t.c
+${DIR_EXT}/test_zgemv_t.c
+${DIR_EXT}/test_cgemv_n.c
+${DIR_EXT}/test_zgemv_n.c
 ${DIR_EXT}/test_crot.c
 ${DIR_EXT}/test_zrot.c
 ${DIR_EXT}/test_cgbmv.c
@@ -75,10 +76,10 @@ ${DIR_EXT}/test_cgemmt.c
 ${DIR_EXT}/test_zgemmt.c
 ${DIR_EXT}/test_ztrmv.c
 ${DIR_EXT}/test_ctrmv.c
-$(DIR_EXT)/test_ztrsv.c
-$(DIR_EXT)/test_ctrsv.c
-$(DIR_EXT)/test_zgemm.c
-$(DIR_EXT)/test_cgemm.c
+${DIR_EXT}/test_ztrsv.c
+${DIR_EXT}/test_ctrsv.c
+${DIR_EXT}/test_zgemm.c
+${DIR_EXT}/test_cgemm.c
 )
 
 # crashing on travis cl with an error code suggesting resource not found

From b6949ce74c8a22f4ec2b710ac084411f6b6560b0 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Fri, 2 Feb 2024 14:42:27 +0300
Subject: [PATCH 629/718] add axpyc to cmake build

---
 utest/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index c090ed511..67dc489d1 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -60,8 +60,8 @@ ${DIR_EXT}/test_saxpby.c
 ${DIR_EXT}/test_daxpby.c
 ${DIR_EXT}/test_caxpby.c
 ${DIR_EXT}/test_zaxpby.c
-# ${DIR_EXT}/test_caxpyc.c
-# ${DIR_EXT}/test_zaxpyc.c
+${DIR_EXT}/test_caxpyc.c
+${DIR_EXT}/test_zaxpyc.c
 ${DIR_EXT}/test_cgemv_t.c
 ${DIR_EXT}/test_zgemv_t.c
 ${DIR_EXT}/test_cgemv_n.c

From 4d8dee508ce415743d61042111a9d50660b2b8bb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 4 Feb 2024 01:05:03 +0100
Subject: [PATCH 630/718] temporarily disable the CAXPY/ZAXPY kernels

---
 kernel/riscv64/KERNEL.C910V | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V
index 2798a870e..066329390 100644
--- a/kernel/riscv64/KERNEL.C910V
+++ b/kernel/riscv64/KERNEL.C910V
@@ -42,8 +42,8 @@ ZSUMKERNEL  = ../arm/zsum.c
 
 SAXPYKERNEL  = axpy_vector.c
 DAXPYKERNEL  = axpy_vector.c
-CAXPYKERNEL  = zaxpy_vector.c
-ZAXPYKERNEL  = zaxpy_vector.c
+CAXPYKERNEL  = zaxpy.c
+ZAXPYKERNEL  = zaxpy.c
 
 SAXPBYKERNEL  = axpby_vector.c
 DAXPBYKERNEL  = axpby_vector.c

From 68d354814f9f846338e1988c4f609c8add419012 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 4 Feb 2024 01:14:22 +0100
Subject: [PATCH 631/718] Fix incompatible pointer type in BFLOAT16 mode

---
 interface/gemmt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/interface/gemmt.c b/interface/gemmt.c
index 01dec0c35..8fd8089d0 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -527,7 +527,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 			// for alignment
 			buffer_size = (buffer_size + 3) & ~3;
-			STACK_ALLOC(buffer_size, FLOAT, buffer);
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);
 
 #ifdef SMP
 
@@ -616,7 +616,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 			// for alignment
 			buffer_size = (buffer_size + 3) & ~3;
-			STACK_ALLOC(buffer_size, FLOAT, buffer);
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);
 
 #ifdef SMP
 
@@ -666,4 +666,4 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 	IDEBUG_END;
 
 	return;
-}
\ No newline at end of file
+}

From 3597827c932b90be8916f5fbaa27dd8db818ee9e Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Thu, 1 Feb 2024 16:33:58 +0800
Subject: [PATCH 632/718] utest: add axpby

---
 utest/CMakeLists.txt |   1 +
 utest/Makefile       |   2 +-
 utest/test_axpby.c   | 320 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 322 insertions(+), 1 deletion(-)
 create mode 100644 utest/test_axpby.c

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 41829bd22..edfcfb7cf 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -17,6 +17,7 @@ else ()
     test_swap.c
     test_zscal.c
     test_amin.c
+    test_axpby.c
   )
 endif ()
 
diff --git a/utest/Makefile b/utest/Makefile
index 8acaa3ea9..867f8c88c 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -12,7 +12,7 @@ UTESTBIN=openblas_utest
 include $(TOPDIR)/Makefile.system
 
 OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \
-     test_amin.o
+     test_amin.o test_axpby.o
 #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o
 
 ifneq ($(NO_LAPACK), 1)
diff --git a/utest/test_axpby.c b/utest/test_axpby.c
new file mode 100644
index 000000000..37ba8ad14
--- /dev/null
+++ b/utest/test_axpby.c
@@ -0,0 +1,320 @@
+/*****************************************************************************
+Copyright (c) 2011-2024, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+#ifdef BUILD_SINGLE
+CTEST(axpby, saxpby_inc_0)
+{
+    blasint i;
+    blasint N = 9, incX = 0, incY = 0;
+    float alpha = 1.0, beta = 2.0;
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    for(i = 0; i < N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, saxpby_inc_1)
+{
+    blasint i;
+    blasint N = 9, incX = 1, incY = 1;
+    float alpha = 0.25, beta = 0.75;
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 };
+
+    for(i = 0; i < N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, saxpby_inc_2)
+{
+    blasint i;
+    blasint N = 9, incX = 2, incY = 2;
+    float alpha = 0.25, beta = 0.75;
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00,
+                   9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00,
+                   7.75, 10.00 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+#endif
+
+#ifdef BUILD_DOUBLE
+CTEST(axpby, daxpby_inc_0)
+{
+    blasint i;
+    blasint N = 9, incX = 0, incY = 0;
+    double alpha = 1.0, beta  = 2.0;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    for(i = 0; i < N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, daxpby_inc_1)
+{
+    blasint i;
+    blasint N = 9, incX = 1, incY = 1;
+    double alpha = 0.25, beta = 0.75;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 };
+
+    for(i = 0; i < N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, daxpby_inc_2)
+{
+    blasint i;
+    blasint N = 9, incX = 2, incY = 2;
+    double alpha = 0.25, beta = 0.75;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00,
+                    9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00,
+                    7.75, 10.00 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+#endif
+
+#ifdef BUILD_COMPLEX
+CTEST(axpby, caxpby_inc_0)
+{
+    blasint i;
+    blasint N = 9, incX = 0, incY = 0;
+    float alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 };
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0,
+                   10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, caxpby_inc_1)
+{
+    blasint i;
+    blasint N = 9, incX = 1, incY = 1;
+    float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5,
+                   8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, caxpby_inc_2)
+{
+    blasint i;
+    blasint N = 9, incX = 2, incY = 2;
+    float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(caxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0,
+                   11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5,
+                   2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0,
+                   2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 };
+
+    for(i = 0; i < 4 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+#endif
+
+#ifdef BUILD_COMPLEX16
+CTEST(axpby, zaxpby_inc_0)
+{
+    blasint i;
+    blasint N = 9, incX = 0, incY = 0;
+    double alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 };
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0,
+                   10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, zaxpby_inc_1)
+{
+    blasint i;
+    blasint N = 9, incX = 1, incY = 1;
+    double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5,
+                   8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 };
+
+    for(i = 0; i < 2 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, zaxpby_inc_2)
+{
+    blasint i;
+    blasint N = 9, incX = 2, incY = 2;
+    double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(zaxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0,
+                   11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5,
+                   2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0,
+                   2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 };
+
+    for(i = 0; i < 4 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+#endif

From 1e1f487dc7e95c7e5a1d4234147439a5f4bca070 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Thu, 1 Feb 2024 19:57:05 +0800
Subject: [PATCH 633/718] LoongArch64: Fixed {s/d}axpby

---
 kernel/loongarch64/axpby_lasx.S | 9 +++++++--
 kernel/loongarch64/axpby_lsx.S  | 9 +++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/kernel/loongarch64/axpby_lasx.S b/kernel/loongarch64/axpby_lasx.S
index f1d99cd3b..7a246ca5c 100644
--- a/kernel/loongarch64/axpby_lasx.S
+++ b/kernel/loongarch64/axpby_lasx.S
@@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PROLOGUE
 
     bge $r0, N, .L999
-    li.d TEMP, 1
     movgr2fr.d a1, $r0
     ffint.s.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
     MTG  t1, ALPHA
@@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvreplgr2vr.w VXB, t2
     xvreplgr2vr.w VXZ, t3
 #endif
+    // If incx == 0 || incy == 0, do one by one
+    and TEMP, INCX, INCY
+    or  I,    N,    N
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
diff --git a/kernel/loongarch64/axpby_lsx.S b/kernel/loongarch64/axpby_lsx.S
index 45154c262..e50d4cdcc 100644
--- a/kernel/loongarch64/axpby_lsx.S
+++ b/kernel/loongarch64/axpby_lsx.S
@@ -57,10 +57,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     PROLOGUE
 
     bge $r0, N, .L999
-    li.d TEMP, 1
     movgr2fr.d a1, $r0
     ffint.s.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
     slli.d  INCX, INCX, BASE_SHIFT
     slli.d  INCY, INCY, BASE_SHIFT
     MTG  t1, ALPHA
@@ -75,6 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vreplgr2vr.w VXB, t2
     vreplgr2vr.w VXZ, t3
 #endif
+    // If incx == 0 || incy == 0, do one by one
+    and TEMP, INCX, INCY
+    or  I,    N,    N
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
     srai.d I, N, 3
     bne INCX, TEMP, .L20
     bne INCY, TEMP, .L12 // INCX==1 and INCY!=1

From 7bc93d95a1d72fe733e43aa1bea64796b123dcbb Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Wed, 24 Jan 2024 16:11:45 +0800
Subject: [PATCH 634/718] LoongArch64: Opt {c/z}axpby

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |    2 +
 kernel/loongarch64/KERNEL.LOONGSON3R5    |    2 +
 kernel/loongarch64/caxpby_lasx.S         | 1046 ++++++++++++++++++++++
 kernel/loongarch64/caxpby_lsx.S          | 1029 +++++++++++++++++++++
 4 files changed, 2079 insertions(+)
 create mode 100644 kernel/loongarch64/caxpby_lasx.S
 create mode 100644 kernel/loongarch64/caxpby_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index e27ce3bee..f4ab495e6 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -58,6 +58,8 @@ ZAXPYKERNEL =  caxpy_lsx.S
 
 SAXPBYKERNEL = axpby_lsx.S
 DAXPBYKERNEL = axpby_lsx.S
+CAXPBYKERNEL = caxpby_lsx.S
+ZAXPBYKERNEL = caxpby_lsx.S
 
 SSUMKERNEL  =  sum_lsx.S
 DSUMKERNEL  =  sum_lsx.S
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index f4429cfba..bd85fab01 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -58,6 +58,8 @@ ZAXPYKERNEL =  caxpy_lasx.S
 
 SAXPBYKERNEL = axpby_lasx.S
 DAXPBYKERNEL = axpby_lasx.S
+CAXPBYKERNEL = caxpby_lasx.S
+ZAXPBYKERNEL = caxpby_lasx.S
 
 SSUMKERNEL  =  sum_lasx.S
 DSUMKERNEL  =  sum_lasx.S
diff --git a/kernel/loongarch64/caxpby_lasx.S b/kernel/loongarch64/caxpby_lasx.S
new file mode 100644
index 000000000..c5802092e
--- /dev/null
+++ b/kernel/loongarch64/caxpby_lasx.S
@@ -0,0 +1,1046 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r5
+#define INCX   $r6
+#define BETAR  $f2
+#define BETAI  $f3
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXAR   $xr23
+#define VXAI   $xr19
+#define VXBR   $xr14
+#define VXBI   $xr13
+#define VXZ    $xr12
+#define x1     $xr18
+#define x2     $xr17
+#define x3     $xr16
+#define x4     $xr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    movgr2fr.d a1, $r0
+    FFINT a1, a1
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG t1, ALPHAR
+    MTG t2, ALPHAI
+    MTG t3, BETAR
+    MTG t4, BETAI
+#ifdef DOUBLE
+    xvreplgr2vr.d VXAR, t1
+    xvreplgr2vr.d VXAI, t2
+    xvreplgr2vr.d VXBR, t3
+    xvreplgr2vr.d VXBI, t4
+#else
+    xvreplgr2vr.w VXAR, t1
+    xvreplgr2vr.w VXAI, t2
+    xvreplgr2vr.w VXBR, t3
+    xvreplgr2vr.w VXBI, t4
+#endif
+    xvxor.v VXZ, VXZ, VXZ
+    // If incx == 0 || incy == 0, do one by one
+    and TEMP, INCX, INCY
+    or  I,    N,    N
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+#ifdef DOUBLE
+    srai.d I, N, 2
+#else
+    srai.d I, N, 3
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    CMPEQ $fcc0, BETAR, a1
+    CMPEQ $fcc1, BETAI, a1
+    CMPEQ $fcc2, ALPHAR, a1
+    CMPEQ $fcc3, ALPHAI, a1
+    bceqz $fcc0, .L13
+    bceqz $fcc1, .L13
+    b .L14
+    .align 3
+
+.L13:
+    bceqz $fcc2, .L114
+    bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+
+.L14:
+    bceqz $fcc2, .L112
+    bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L111:  //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvst VXZ, Y, 8 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L112:  //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+#endif
+    XVFMUL x3, VXAI, x2
+    XVFMUL x4, VXAI, x1
+    XVMSUB x3, VXAR, x1, x3
+    XVFMADD x4, VXAR, x2, x4
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+
+.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    xvld VX0, Y, 0 * SIZE
+    xvld VX1, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
+    xvld VX0, Y, 0 * SIZE
+    xvld VX1, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+#endif
+    XVFMUL x3, VXBI, x2
+    XVFMUL x4, VXBI, x1
+    XVMSUB x3, VXBR, x1, x3
+    XVFMADD x4, VXBR, x2, x4
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d X, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+
+.L114:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+#endif
+    XVFMUL VX0, VXAI, x2
+    XVFMUL VX1, VXAI, x1
+    XVFMUL VX2, VXBI, x4
+    XVFMUL VX3, VXBI, x3
+    XVMSUB VX0, VXAR, x1, VX0
+    XVFMADD VX1, VXAR, x2, VX1
+    XVMSUB VX2, VXBR, x3, VX2
+    XVFMADD VX3, VXBR, x4, VX3
+    XVFADD x3, VX0, VX2
+    XVFADD x4, VX1, VX3
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 2
+    xvinsgr2vr.d x4, t4, 2
+
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 1
+    xvinsgr2vr.d x4, t2, 1
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX1, VXAI, x1
+    xvfmul.d VX2, VXBI, x4
+    xvfmul.d VX3, VXBI, x3
+    xvfmsub.d VX0, VXAR, x1, VX0
+    xvfmadd.d VX1, VXAR, x2, VX1
+    xvfmsub.d VX2, VXBR, x3, VX2
+    xvfmadd.d VX3, VXBR, x4, VX3
+    xvfadd.d x3, VX0, VX2
+    xvfadd.d x4, VX1, VX3
+    addi.d  I, I, -1
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    xvld VX1, X, 8 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    add.d Y, Y, INCY
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    XVFMUL VX0, VXAI, x2
+    XVFMUL VX1, VXAI, x1
+    XVFMUL VX2, VXBI, x4
+    XVFMUL VX3, VXBI, x3
+    XVMSUB VX0, VXAR, x1, VX0
+    XVFMADD VX1, VXAR, x2, VX1
+    XVMSUB VX2, VXBR, x3, VX2
+    XVFMADD VX3, VXBR, x4, VX3
+    XVFADD x3, VX0, VX2
+    XVFADD x4, VX1, VX3
+    addi.d  I, I, -1
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 16 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+#endif
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 2
+    xvinsgr2vr.d x2, t4, 2
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 1
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX1, VXAI, x1
+    xvfmul.d VX2, VXBI, x4
+    xvfmul.d VX3, VXBI, x3
+    xvfmsub.d VX0, VXAR, x1, VX0
+    xvfmadd.d VX1, VXAR, x2, VX1
+    xvfmsub.d VX2, VXBR, x3, VX2
+    xvfmadd.d VX3, VXBR, x4, VX3
+    xvfadd.d x3, VX0, VX2
+    xvfadd.d x4, VX1, VX3
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    xvld VX3, Y, 8 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    XVFMUL VX0, VXAI, x2
+    XVFMUL VX1, VXAI, x1
+    XVFMUL VX2, VXBI, x4
+    XVFMUL VX3, VXBI, x3
+    XVMSUB VX0, VXAR, x1, VX0
+    XVFMADD VX1, VXAR, x2, VX1
+    XVMSUB VX2, VXBR, x3, VX2
+    XVFMADD VX3, VXBR, x4, VX3
+    XVFADD x3, VX0, VX2
+    XVFADD x4, VX1, VX3
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+#endif
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    CMPEQ $fcc0, BETAR, a1
+    CMPEQ $fcc1, BETAI, a1
+    CMPEQ $fcc2, ALPHAR, a1
+    CMPEQ $fcc3, ALPHAI, a1
+    bceqz $fcc0, .L23
+    bceqz $fcc1, .L23
+    b .L24
+    .align 3
+
+.L23:
+    bceqz $fcc2, .L224
+    bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L24:
+    bceqz $fcc2, .L222
+    bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L221:  //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    xvstelm.d VXZ, Y, 0, 0
+    xvstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 0
+    xvstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 0
+    xvstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 0
+    xvstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+#else
+    xvstelm.w VXZ, Y, 0, 0
+    xvstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 0
+    xvstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 0
+    xvstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 0
+    xvstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 0
+    xvstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 0
+    xvstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 0
+    xvstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+#endif
+
+.L222:  //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    xvfmul.d x3, VXAI, x2
+    xvfmul.d x4, VXAI, x1
+    xvfmsub.d x3, VXAR, x1, x3
+    xvfmadd.d x4, VXAR, x2, x4
+    addi.d  I, I, -1
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+#else
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+    XVFMUL x3, VXAI, x2
+    XVFMUL x4, VXAI, x1
+    XVMSUB x3, VXAR, x1, x3
+    XVFMADD x4, VXAR, x2, x4
+    addi.d  I, I, -1
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+#endif
+
+.L223:
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d x3, VXBI, x2
+    xvfmul.d x4, VXBI, x1
+    xvfmsub.d x3, VXBR, x1, x3
+    xvfmadd.d x4, VXBR, x2, x4
+
+    addi.d  I, I, -1
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+#else
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d Y, Y, INCY
+
+    XVFMUL x3, VXBI, x2
+    XVFMUL x4, VXBI, x1
+    XVMSUB x3, VXBR, x1, x3
+    XVFMADD x4, VXBR, x2, x4
+    addi.d  I, I, -1
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+#endif
+
+.L224:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX1, VXAI, x1
+    xvfmul.d VX2, VXBI, x4
+    xvfmul.d VX3, VXBI, x3
+    xvfmsub.d VX0, VXAR, x1, VX0
+    xvfmadd.d VX1, VXAR, x2, VX1
+    xvfmsub.d VX2, VXBR, x3, VX2
+    xvfmadd.d VX3, VXBR, x4, VX3
+    xvfadd.d x3, VX0, VX2
+    xvfadd.d x4, VX1, VX3
+    addi.d  I, I, -1
+
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+#else
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+
+    XVFMUL VX0, VXAI, x2
+    XVFMUL VX1, VXAI, x1
+    XVFMUL VX2, VXBI, x4
+    XVFMUL VX3, VXBI, x3
+    XVMSUB VX0, VXAR, x1, VX0
+    XVFMADD VX1, VXAR, x2, VX1
+    XVMSUB VX2, VXBR, x3, VX2
+    XVFMADD VX3, VXBR, x4, VX3
+    XVFADD x3, VX0, VX2
+    XVFADD x4, VX1, VX3
+    addi.d  I, I, -1
+
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+#endif
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    addi.d I, I, -1
+    MUL s1, ALPHAI, a2
+    MUL s2, ALPHAI, a1
+    MUL s3, BETAI, a4
+    MUL s4, BETAI, a3
+    MSUB s1, ALPHAR, a1, s1
+    MADD s2, a2, ALPHAR, s2
+    MSUB s3, BETAR, a3, s3
+    MADD s4, a4, BETAR, s4
+    ADD s3, s3, s1
+    ADD s4, s4, s2
+    ST s3, Y, 0 * SIZE
+    ST s4, Y, 1 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/caxpby_lsx.S b/kernel/loongarch64/caxpby_lsx.S
new file mode 100644
index 000000000..247ae428e
--- /dev/null
+++ b/kernel/loongarch64/caxpby_lsx.S
@@ -0,0 +1,1029 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r5
+#define INCX   $r6
+#define BETAR  $f2
+#define BETAI  $f3
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXAR   $vr23
+#define VXAI   $vr19
+#define VXBR   $vr14
+#define VXBI   $vr13
+#define VXZ    $vr12
+#define x1     $vr18
+#define x2     $vr17
+#define x3     $vr16
+#define x4     $vr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    movgr2fr.d a1, $r0
+#ifdef DOUBLE
+    ffint.d.l a1, a1
+#else
+    ffint.s.l a1, a1
+#endif
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+#ifdef DOUBLE
+    movfr2gr.d t1, ALPHAR
+    vreplgr2vr.d VXAR, t1
+    movfr2gr.d t2, ALPHAI
+    vreplgr2vr.d VXAI, t2
+    movfr2gr.d t3, BETAR
+    vreplgr2vr.d VXBR, t3
+    movfr2gr.d t4, BETAI
+    vreplgr2vr.d VXBI, t4
+#else
+    movfr2gr.s t1, ALPHAR
+    vreplgr2vr.w VXAR, t1
+    movfr2gr.s t2, ALPHAI
+    vreplgr2vr.w VXAI, t2
+    movfr2gr.s t3, BETAR
+    vreplgr2vr.w VXBR, t3
+    movfr2gr.s t4, BETAI
+    vreplgr2vr.w VXBI, t4
+#endif
+    vxor.v VXZ, VXZ, VXZ
+    // If incx == 0 || incy == 0, do one by one
+    and TEMP, INCX, INCY
+    or  I,    N,    N
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+#ifdef DOUBLE
+    fcmp.ceq.d $fcc0, BETAR, a1
+    fcmp.ceq.d $fcc1, BETAI, a1
+    fcmp.ceq.d $fcc2, ALPHAR, a1
+    fcmp.ceq.d $fcc3, ALPHAI, a1
+#else
+    fcmp.ceq.s $fcc0, BETAR, a1
+    fcmp.ceq.s $fcc1, BETAI, a1
+    fcmp.ceq.s $fcc2, ALPHAR, a1
+    fcmp.ceq.s $fcc3, ALPHAI, a1
+#endif
+    bceqz $fcc0, .L13
+    bceqz $fcc1, .L13
+    b .L14
+    .align 3
+
+.L13:
+    bceqz $fcc2, .L114
+    bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+
+.L14:
+    bceqz $fcc2, .L112
+    bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L111:  //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, Y, 2 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    vst VXZ, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+#else
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+#endif
+
+.L112:  //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAI, x2
+    vfmul.d x4, VXAI, x1
+    vfmsub.d x3, VXAR, x1, x3
+    vfmadd.d x4, VXAR, x2, x4
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAI, x2
+    vfmul.d x4, VXAI, x1
+    vfmsub.d x3, VXAR, x1, x3
+    vfmadd.d x4, VXAR, x2, x4
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, VXAI, x2
+    vfmul.s x4, VXAI, x1
+    vfmsub.s x3, VXAR, x1, x3
+    vfmadd.s x4, VXAR, x2, x4
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+#endif
+
+.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    vld VX0, Y, 0 * SIZE
+    vld VX1, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXBI, x2
+    vfmul.d x4, VXBI, x1
+    vfmsub.d x3, VXBR, x1, x3
+    vfmadd.d x4, VXBR, x2, x4
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, Y, 4 * SIZE
+    vld VX1, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXBI, x2
+    vfmul.d x4, VXBI, x1
+    vfmsub.d x3, VXBR, x1, x3
+    vfmadd.d x4, VXBR, x2, x4
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+#else
+    vld VX0, Y, 0 * SIZE
+    vld VX1, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, VXBI, x2
+    vfmul.s x4, VXBI, x1
+    vfmsub.s x3, VXBR, x1, x3
+    vfmadd.s x4, VXBR, x2, x4
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+#endif
+
+.L114:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX1, VXAI, x1
+    vfmul.s VX2, VXBI, x4
+    vfmul.s VX3, VXBI, x3
+    vfmsub.s VX0, VXAR, x1, VX0
+    vfmadd.s VX1, VXAR, x2, VX1
+    vfmsub.s VX2, VXBR, x3, VX2
+    vfmadd.s VX3, VXBR, x4, VX3
+    vfadd.s x3, VX0, VX2
+    vfadd.s x4, VX1, VX3
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+#endif
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX1, VXAI, x1
+    vfmul.s VX2, VXBI, x4
+    vfmul.s VX3, VXBI, x3
+    vfmsub.s VX0, VXAR, x1, VX0
+    vfmadd.s VX1, VXAR, x2, VX1
+    vfmsub.s VX2, VXBR, x3, VX2
+    vfmadd.s VX3, VXBR, x4, VX3
+    vfadd.s x3, VX0, VX2
+    vfadd.s x4, VX1, VX3
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+#endif
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX1, VXAI, x1
+    vfmul.s VX2, VXBI, x4
+    vfmul.s VX3, VXBI, x3
+    vfmsub.s VX0, VXAR, x1, VX0
+    vfmadd.s VX1, VXAR, x2, VX1
+    vfmsub.s VX2, VXBR, x3, VX2
+    vfmadd.s VX3, VXBR, x4, VX3
+    vfadd.s x3, VX0, VX2
+    vfadd.s x4, VX1, VX3
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+#endif
+
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+#ifdef DOUBLE
+    fcmp.ceq.d $fcc0, BETAR, a1
+    fcmp.ceq.d $fcc1, BETAI, a1
+    fcmp.ceq.d $fcc2, ALPHAR, a1
+    fcmp.ceq.d $fcc3, ALPHAI, a1
+#else
+    fcmp.ceq.s $fcc0, BETAR, a1
+    fcmp.ceq.s $fcc1, BETAI, a1
+    fcmp.ceq.s $fcc2, ALPHAR, a1
+    fcmp.ceq.s $fcc3, ALPHAI, a1
+#endif
+    bceqz $fcc0, .L23
+    bceqz $fcc1, .L23
+    b .L24
+    .align 3
+
+.L23:
+    bceqz $fcc2, .L224
+    bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L24:
+    bceqz $fcc2, .L222
+    bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L221:  //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    vstelm.d VXZ, Y, 0, 0
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+#else
+    vstelm.w VXZ, Y, 0, 0
+    vstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 0
+    vstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 0
+    vstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 0
+    vstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+#endif
+
+.L222:  //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d x3, VXAI, x2
+    vfmul.d x4, VXAI, x1
+    vfmsub.d x3, VXAR, x1, x3
+    vfmadd.d x4, VXAR, x2, x4
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vfmul.d x3, VXAI, x2
+    vfmul.d x4, VXAI, x1
+    vfmsub.d x3, VXAR, x1, x3
+    vfmadd.d x4, VXAR, x2, x4
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    vfmul.s x3, VXAI, x2
+    vfmul.s x4, VXAI, x1
+    vfmsub.s x3, VXAR, x1, x3
+    vfmadd.s x4, VXAR, x2, x4
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+#endif
+
+.L223:
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d x3, VXBI, x2
+    vfmul.d x4, VXBI, x1
+    vfmsub.d x3, VXBR, x1, x3
+    vfmadd.d x4, VXBR, x2, x4
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d x3, VXBI, x2
+    vfmul.d x4, VXBI, x1
+    vfmsub.d x3, VXBR, x1, x3
+    vfmadd.d x4, VXBR, x2, x4
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+#else
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s x3, VXBI, x2
+    vfmul.s x4, VXBI, x1
+    vfmsub.s x3, VXBR, x1, x3
+    vfmadd.s x4, VXBR, x2, x4
+
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+#endif
+
+.L224:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX1, VXAI, x1
+    vfmul.s VX2, VXBI, x4
+    vfmul.s VX3, VXBI, x3
+    vfmsub.s VX0, VXAR, x1, VX0
+    vfmadd.s VX1, VXAR, x2, VX1
+    vfmsub.s VX2, VXBR, x3, VX2
+    vfmadd.s VX3, VXBR, x4, VX3
+    vfadd.s x3, VX0, VX2
+    vfadd.s x4, VX1, VX3
+    addi.d  I, I, -1
+
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+#endif
+
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+#ifdef DOUBLE
+    fld.d a1, X, 0 * SIZE
+    fld.d a2, X, 1 * SIZE
+    fld.d a3, Y, 0 * SIZE
+    fld.d a4, Y, 1 * SIZE
+    addi.d I, I, -1
+    fmul.d s1, ALPHAI, a2
+    fmul.d s2, ALPHAI, a1
+    fmul.d s3, BETAI, a4
+    fmul.d s4, BETAI, a3
+    fmsub.d s1, ALPHAR, a1, s1
+    fmadd.d s2, a2, ALPHAR, s2
+    fmsub.d s3, BETAR, a3, s3
+    fmadd.d s4, a4, BETAR, s4
+    fadd.d s3, s3, s1
+    fadd.d s4, s4, s2
+    fst.d s3, Y, 0 * SIZE
+    fst.d s4, Y, 1 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+#else
+    fld.s a1, X, 0 * SIZE
+    fld.s a2, X, 1 * SIZE
+    fld.s a3, Y, 0 * SIZE
+    fld.s a4, Y, 1 * SIZE
+    addi.d I, I, -1
+    fmul.s s1, ALPHAI, a2
+    fmul.s s2, ALPHAI, a1
+    fmul.s s3, BETAI, a4
+    fmul.s s4, BETAI, a3
+    fmsub.s s1, ALPHAR, a1, s1
+    fmadd.s s2, a2, ALPHAR, s2
+    fmsub.s s3, BETAR, a3, s3
+    fmadd.s s4, a4, BETAR, s4
+    fadd.s s3, s3, s1
+    fadd.s s4, s4, s2
+    fst.s s3, Y, 0 * SIZE
+    fst.s s4, Y, 1 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+#endif
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE

From adde7253217a6824da1deff064b46e36c1e259ca Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Sun, 4 Feb 2024 14:43:08 +0800
Subject: [PATCH 635/718] LoongArch64: Fixed {s/d}amin LSX optimization

---
 kernel/loongarch64/amin_lsx.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S
index c3c3f4ae9..47701b6e4 100644
--- a/kernel/loongarch64/amin_lsx.S
+++ b/kernel/loongarch64/amin_lsx.S
@@ -124,7 +124,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 .L13:
     FABS $f0, $f0
-    SUB $f0, $f0, $f0
     jirl $r0,  $r1, 0x0
     .align 3
 

From d02c61e82e82b7f721ecf50dd7717547b6302a19 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 4 Feb 2024 10:01:27 +0100
Subject: [PATCH 636/718] Update lowercase cpunames for RISC-V

---
 cpuid_riscv64.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 5db54f1aa..ff7ba2aad 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -86,7 +86,10 @@ static char *cpuname[] = {
 
 static char *cpuname_lower[] = {
   "riscv64_generic",
-  "c910v"
+  "c910v",
+  "x280",
+  "riscv64_zvl256b",
+  "riscv64_zvl128b"
 };
 
 int detect(void){

From e61d96303d4972181e1d1b26b1f3a88ed9e5de02 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 4 Feb 2024 10:05:20 +0100
Subject: [PATCH 637/718] Fix missing NO_AVX2 fallback for SapphireRapids

---
 driver/others/dynamic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index 69a473060..e3f905265 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -275,6 +275,7 @@ extern gotoblas_t  gotoblas_EXCAVATOR;
 #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
 #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
 #define gotoblas_ZEN gotoblas_SANDYBRIDGE
+#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
 #else
 extern gotoblas_t  gotoblas_HASWELL;
 extern gotoblas_t  gotoblas_ZEN;

From 6d8a273cca96c16873a15b59ea724834aa9b4558 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 4 Feb 2024 22:07:51 +0100
Subject: [PATCH 638/718] Handle zero increment(s) in C910V ?AXPBY (#4483)

* Handle zero increment(s)
---
 kernel/riscv64/axpby_vector.c  | 58 +++++++++++++++++++++++++
 kernel/riscv64/zaxpby_vector.c | 77 ++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)

diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c
index 850fc903e..721aad2b0 100644
--- a/kernel/riscv64/axpby_vector.c
+++ b/kernel/riscv64/axpby_vector.c
@@ -69,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
 
 	BLASLONG stride_x, stride_y, ix = 0, iy = 0;
 
+	if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */
+
+	if ( beta == 0.0 )
+	{
+
+		if ( alpha == 0.0 )
+		{
+			while(i < n)
+			{
+				y[iy] = 0.0 ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+		else
+		{
+			while(i < n)
+			{
+				y[iy] = alpha * x[ix] ;
+				ix += inc_x ;
+				iy += inc_y ;
+				i++ ;
+			}
+
+
+		}
+
+	}
+	else
+	{
+
+		if ( alpha == 0.0 )
+		{
+			while(i < n)
+			{
+				y[iy] =  beta * y[iy] ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+		else
+		{
+			while(i < n)
+			{
+				y[iy] = alpha * x[ix] + beta * y[iy] ;
+				ix += inc_x ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+
+	}
+
+	return(0);
+
+	} else { /* vectorized approach for non-zero increments */
+
         if(beta == 0.0){
                 if(alpha == 0.0){//alpha == 0 && beta == 0
                         if(inc_y == 1){
@@ -381,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
                 }
         }
 	return(0);
+	}
 }
 
diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c
index d5ad974cf..bbf2bbe7d 100644
--- a/kernel/riscv64/zaxpby_vector.c
+++ b/kernel/riscv64/zaxpby_vector.c
@@ -62,6 +62,82 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
         stride_x = inc_x * 2 * sizeof(FLOAT);
         stride_y = inc_y * 2 * sizeof(FLOAT);
 
+	if (inc_x == 0 || inc_y == 0) {
+
+	FLOAT temp;
+	BLASLONG inc_x2, inc_y2;
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	if ( beta_r == 0.0 && beta_i == 0.0)
+	{
+		if ( alpha_r == 0.0 && alpha_i == 0.0 )
+		{
+
+			while(i < n)
+			{
+				y[iy]   = 0.0 ;
+				y[iy+1] = 0.0 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+		}
+		else
+		{
+
+			while(i < n)
+			{
+				y[iy]   = ( alpha_r * x[ix]   - alpha_i * x[ix+1] ) ;
+				y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix]   ) ;
+				ix += inc_x2 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+
+		}
+
+	}
+	else
+	{
+		if ( alpha_r == 0.0 && alpha_i == 0.0 )
+		{
+
+			while(i < n)
+			{
+				temp    = ( beta_r * y[iy]   - beta_i * y[iy+1] ) ;
+				y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy]   ) ;
+				y[iy]   = temp;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+		}
+		else
+		{
+
+			while(i < n)
+			{
+				temp    = ( alpha_r * x[ix]   - alpha_i * x[ix+1] ) + ( beta_r * y[iy]   - beta_i * y[iy+1] ) ;
+				y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix]   ) + ( beta_r * y[iy+1] + beta_i * y[iy]   ) ;
+				y[iy]   = temp;
+				ix += inc_x2 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+
+		}
+
+
+
+	}
+	return(0);
+
+	} else {
+
         if(beta_r == 0.0 && beta_i == 0.0){
                 if(alpha_r == 0.0 && alpha_i == 0.0){
                         if(inc_y == 1){
@@ -191,5 +267,6 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
                 }
         }
 	return(0);
+	}
 }
 

From 479e4af0893f3772eff9a75e89cf28bef91383c3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 5 Feb 2024 15:35:24 +0100
Subject: [PATCH 639/718] Rescale input vector more often to minimize relative
 error (Reference-LAPACK PR 981)

---
 lapack-netlib/SRC/clarfgp.f | 30 ++++++++++--------------------
 lapack-netlib/SRC/zlarfgp.f | 30 ++++++++++--------------------
 2 files changed, 20 insertions(+), 40 deletions(-)

diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f
index 47b5e47b0..980e93612 100644
--- a/lapack-netlib/SRC/clarfgp.f
+++ b/lapack-netlib/SRC/clarfgp.f
@@ -148,33 +148,23 @@
       ALPHR = REAL( ALPHA )
       ALPHI = AIMAG( ALPHA )
 *
-      IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN
 *
 *        H  =  [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0.
 *
-         IF( ALPHI.EQ.ZERO ) THEN
-            IF( ALPHR.GE.ZERO ) THEN
-*              When TAU.eq.ZERO, the vector is special-cased to be
-*              all zeros in the application routines.  We do not need
-*              to clear it.
-               TAU = ZERO
-            ELSE
-*              However, the application routines rely on explicit
-*              zero checks when TAU.ne.ZERO, and we must clear X.
-               TAU = TWO
-               DO J = 1, N-1
-                  X( 1 + (J-1)*INCX ) = ZERO
-               END DO
-               ALPHA = -ALPHA
-            END IF
+         IF( ALPHR.GE.ZERO ) THEN
+*           When TAU.eq.ZERO, the vector is special-cased to be
+*           all zeros in the application routines.  We do not need
+*           to clear it.
+            TAU = ZERO
          ELSE
-*           Only "reflecting" the diagonal entry to be real and non-negative.
-            XNORM = SLAPY2( ALPHR, ALPHI )
-            TAU = CMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM )
+*           However, the application routines rely on explicit
+*           zero checks when TAU.ne.ZERO, and we must clear X.
+            TAU = TWO
             DO J = 1, N-1
                X( 1 + (J-1)*INCX ) = ZERO
             END DO
-            ALPHA = XNORM
+            ALPHA = -ALPHA
          END IF
       ELSE
 *
diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f
index 6c9efb04c..d54f2ea5d 100644
--- a/lapack-netlib/SRC/zlarfgp.f
+++ b/lapack-netlib/SRC/zlarfgp.f
@@ -148,33 +148,23 @@
       ALPHR = DBLE( ALPHA )
       ALPHI = DIMAG( ALPHA )
 *
-      IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN
 *
 *        H  =  [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0.
 *
-         IF( ALPHI.EQ.ZERO ) THEN
-            IF( ALPHR.GE.ZERO ) THEN
-*              When TAU.eq.ZERO, the vector is special-cased to be
-*              all zeros in the application routines.  We do not need
-*              to clear it.
-               TAU = ZERO
-            ELSE
-*              However, the application routines rely on explicit
-*              zero checks when TAU.ne.ZERO, and we must clear X.
-               TAU = TWO
-               DO J = 1, N-1
-                  X( 1 + (J-1)*INCX ) = ZERO
-               END DO
-               ALPHA = -ALPHA
-            END IF
+         IF( ALPHR.GE.ZERO ) THEN
+*           When TAU.eq.ZERO, the vector is special-cased to be
+*           all zeros in the application routines.  We do not need
+*           to clear it.
+            TAU = ZERO
          ELSE
-*           Only "reflecting" the diagonal entry to be real and non-negative.
-            XNORM = DLAPY2( ALPHR, ALPHI )
-            TAU = DCMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM )
+*           However, the application routines rely on explicit
+*           zero checks when TAU.ne.ZERO, and we must clear X.
+            TAU = TWO
             DO J = 1, N-1
                X( 1 + (J-1)*INCX ) = ZERO
             END DO
-            ALPHA = XNORM
+            ALPHA = -ALPHA
          END IF
       ELSE
 *

From 63fbffddf86e2443e7aa4eda4fdad2db84b17568 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 5 Feb 2024 21:44:03 +0100
Subject: [PATCH 640/718] Add option FIXED_LIBNAME to suppress versioning and
 softlinking

---
 Makefile        | 25 +++++++++++++++----------
 Makefile.system |  4 ++++
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index b344abcd2..e595ad99a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,9 @@
 TOPDIR	= .
 include ./Makefile.system
+LNCMD = ln -fs
+ifeq ($(FIXED_LIBNAME), 1)
+LNCMD = true
+endif
 
 BLASDIRS = interface driver/level2 driver/level3 driver/others
 
@@ -134,17 +138,17 @@ shared : libs netlib $(RELA)
 ifneq ($(NO_SHARED), 1)
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
 	@$(MAKE) -C exports so
-	@ln -fs $(LIBSONAME) $(LIBPREFIX).so
-	@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
+	@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
+	@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
 ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
 	@$(MAKE) -C exports so
-	@ln -fs $(LIBSONAME) $(LIBPREFIX).so
+	@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
 endif
 ifeq ($(OSNAME), Darwin)
 	@$(MAKE) -C exports dyn
-	@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
-	@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
+	@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib
+	@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
 endif
 ifeq ($(OSNAME), WINNT)
 	@$(MAKE) -C exports dll
@@ -229,13 +233,13 @@ ifeq ($(INTERFACE64),1)
 endif
 	@echo THELIBNAME=$(LIBNAME) >>  Makefile.conf_last
 	@echo THELIBSONAME=$(LIBSONAME) >>  Makefile.conf_last
-	@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	@touch lib.grd
 
 prof : prof_blas prof_lapack
 
 prof_blas :
-	ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
 	for d in $(SUBDIRS) ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d prof || exit 1 ; \
@@ -246,7 +250,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
 endif
 
 blas :
-	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	for d in $(BLASDIRS) ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d libs || exit 1 ; \
@@ -254,7 +258,7 @@ blas :
 	done
 
 hpl :
-	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	for d in $(BLASDIRS) ../laswp exports ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d $(@F) || exit 1 ; \
@@ -268,7 +272,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
 endif
 
 hpl_p :
-	ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
 	for d in $(SUBDIRS) ../laswp exports ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d $(@F) || exit 1 ; \
@@ -401,6 +405,7 @@ lapack-runtest: lapack-test
 
 blas-test:
 	(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
+
 	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
 	(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
 
diff --git a/Makefile.system b/Makefile.system
index 0088eaff5..0353a4d21 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1704,6 +1704,10 @@ LIBNAME_P	= $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
 endif
 endif
 
+ifeq ($(FIXED_LIBNAME),1)
+	LIBNAME = $(LIBPREFIX).$(LIBSUFFIX)
+	LIBNAME_P = $(LIBPREFIX)_p.$(LIBSUFFIX)
+endif
 
 LIBDLLNAME   = $(LIBPREFIX).dll
 IMPLIBNAME   = lib$(LIBNAMEBASE).dll.a

From 440edfd997a4a383ced5226553c15e50c1c750b0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 5 Feb 2024 21:44:50 +0100
Subject: [PATCH 641/718] Add option to suppress versioning of the internal
 name

---
 exports/Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/exports/Makefile b/exports/Makefile
index 7682f851d..71c112c67 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -132,8 +132,12 @@ libgoto_hpl.def : $(GENSYM)
 	./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 ifeq ($(OSNAME), Darwin)
+ifeq ($(FIXED_LIBNAME),1)
+INTERNALNAME = $(LIBPREFIX).dylib
+else
 INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
 endif
+endif
 
 ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
 $(LIBDYNNAME) : ../$(LIBNAME) osx.def
@@ -169,8 +173,12 @@ INTERNALNAME = $(LIBPREFIX).so
 FEXTRALIB += -lm
 EXTRALIB += -lm
 else
+ifeq ($(FIXED_LIBNAME),1)
+INTERNALNAME = $(LIBPREFIX).so
+else
 INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
+endif
 
 ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
 ../$(LIBSONAME) : ../$(LIBNAME) linktest.c

From fe3da43b7dac7af1be5538f87f707a6073fbc52c Mon Sep 17 00:00:00 2001
From: pengxu <pengxu@loongson.cn>
Date: Tue, 6 Feb 2024 11:49:01 +0800
Subject: [PATCH 642/718] Optimized zgemm kernel 8*4 LASX, 4*4 LSX and cgemm
 kernel 8*4 LSX for LoongArch

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000   |   20 +-
 kernel/loongarch64/KERNEL.LOONGSON3R5      |   10 +-
 kernel/loongarch64/cgemm_kernel_8x4_lsx.S  | 3313 ++++++++++++++++++
 kernel/loongarch64/cgemm_ncopy_4_lsx.S     |  341 ++
 kernel/loongarch64/cgemm_ncopy_8_lsx.S     |  263 ++
 kernel/loongarch64/cgemm_tcopy_4_lsx.S     |  324 ++
 kernel/loongarch64/cgemm_tcopy_8_lsx.S     |  277 ++
 kernel/loongarch64/zgemm_kernel_4x4_lsx.S  | 2316 +++++++++++++
 kernel/loongarch64/zgemm_kernel_8x4_lasx.S | 3545 ++++++++++++++++++++
 kernel/loongarch64/zgemm_ncopy_4_lasx.S    |  320 ++
 kernel/loongarch64/zgemm_ncopy_4_lsx.S     |  332 ++
 kernel/loongarch64/zgemm_ncopy_8_lasx.S    |  263 ++
 kernel/loongarch64/zgemm_tcopy_4_lasx.S    |  302 ++
 kernel/loongarch64/zgemm_tcopy_4_lsx.S     |  355 ++
 kernel/loongarch64/zgemm_tcopy_8_lasx.S    |  268 ++
 param.h                                    |   10 +-
 16 files changed, 12248 insertions(+), 11 deletions(-)
 create mode 100644 kernel/loongarch64/cgemm_kernel_8x4_lsx.S
 create mode 100644 kernel/loongarch64/cgemm_ncopy_4_lsx.S
 create mode 100644 kernel/loongarch64/cgemm_ncopy_8_lsx.S
 create mode 100644 kernel/loongarch64/cgemm_tcopy_4_lsx.S
 create mode 100644 kernel/loongarch64/cgemm_tcopy_8_lsx.S
 create mode 100644 kernel/loongarch64/zgemm_kernel_4x4_lsx.S
 create mode 100644 kernel/loongarch64/zgemm_kernel_8x4_lasx.S
 create mode 100644 kernel/loongarch64/zgemm_ncopy_4_lasx.S
 create mode 100644 kernel/loongarch64/zgemm_ncopy_4_lsx.S
 create mode 100644 kernel/loongarch64/zgemm_ncopy_8_lasx.S
 create mode 100644 kernel/loongarch64/zgemm_tcopy_4_lasx.S
 create mode 100644 kernel/loongarch64/zgemm_tcopy_4_lsx.S
 create mode 100644 kernel/loongarch64/zgemm_tcopy_8_lasx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index f4ab495e6..c7ef44035 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -100,9 +100,13 @@ DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 
-CGEMMKERNEL  = cgemm_kernel_2x2_lsx.S
-CGEMMONCOPY  = cgemm_ncopy_2_lsx.S
-CGEMMOTCOPY  = cgemm_tcopy_2_lsx.S
+CGEMMKERNEL  = cgemm_kernel_8x4_lsx.S
+CGEMMINCOPY  = cgemm_ncopy_8_lsx.S
+CGEMMITCOPY  = cgemm_tcopy_8_lsx.S
+CGEMMONCOPY  = cgemm_ncopy_4_lsx.S
+CGEMMOTCOPY  = cgemm_tcopy_4_lsx.S
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
@@ -111,4 +115,14 @@ CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 
+ZGEMMKERNEL  = zgemm_kernel_4x4_lsx.S
+ZGEMMONCOPY  = zgemm_ncopy_4_lsx.S
+ZGEMMOTCOPY  = zgemm_tcopy_4_lsx.S
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index bd85fab01..17d15656a 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -122,9 +122,13 @@ CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 
-ZGEMMKERNEL  = zgemm_kernel_2x2_lasx.S
-ZGEMMONCOPY  = zgemm_ncopy_2_lasx.S
-ZGEMMOTCOPY  = zgemm_tcopy_2_lasx.S
+ZGEMMKERNEL  = zgemm_kernel_8x4_lasx.S
+ZGEMMINCOPY  = zgemm_ncopy_8_lasx.S
+ZGEMMITCOPY  = zgemm_tcopy_8_lasx.S
+ZGEMMONCOPY  = zgemm_ncopy_4_lasx.S
+ZGEMMOTCOPY  = zgemm_tcopy_4_lasx.S
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
 ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
diff --git a/kernel/loongarch64/cgemm_kernel_8x4_lsx.S b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S
new file mode 100644
index 000000000..1e9fd8524
--- /dev/null
+++ b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S
@@ -0,0 +1,3313 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+#define T2     $r26
+#define T3     $r27
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LSX vectors */
+#define U0     $vr30
+#define U1     $vr31
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+#define D12    $vr28
+#define D13    $vr29
+#define VALPHAR $vr28
+#define VALPHAI $vr29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    vldrepl.w  VALPHAR, $sp, 112
+    vldrepl.w  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   2
+
+    move       J,      $r0
+    srai.d     T0,     N,     2  //bn/4
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/4; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    add.d      C2,     C1,    TL
+    add.d      C3,     C2,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L150
+
+.L11:  /* for(i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+    vxor.v    U8,     U8,   U8
+    vxor.v    U9,     U9,   U9
+    vxor.v    U10,    U10,  U10
+    vxor.v    U11,    U11,  U11
+    vxor.v    U12,    U12,  U12
+    vxor.v    U13,    U13,  U13
+    vxor.v    U14,    U14,  U14
+    vxor.v    U15,    U15,  U15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L13
+    blt        TL,     L,     .L13
+
+.L12:  /* for(k=0; k<temp; k+=1) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+    vld       D3,     B0,    0x10  // b2ri b3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a0r
+    vshuf4i.w  D5,     D0,    0x55  //a0i
+
+    vpackev.w  D6,     D3,    D2
+    vshuf4i.w  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    vpackod.w  D7,     D3,    D2
+    vshuf4i.w  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    vshuf4i.w  D4,     D0,    0xaa  //a1r
+    vshuf4i.w  D5,     D0,    0xff  //a1i
+
+    VMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    VMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    vld       D0,     A0,    0x10  // a2ri a3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a2r
+    vshuf4i.w  D5,     D0,    0x55  //a2i
+
+    VMADD1    U4,     D4,    D6,     U4  //02r 12r 22r 32r
+    VMADD2    U5,     D5,    D6,     U5  //02i 12i 22i 32i
+    VMADD3    U4,     D5,    D7,     U4
+    VMADD4    U5,     D4,    D7,     U5
+
+    vshuf4i.w  D4,     D0,    0xaa  //a3r
+    vshuf4i.w  D5,     D0,    0xff  //a3i
+
+    VMADD1    U6,     D4,    D6,     U6  //03r 13r 23r 33r
+    VMADD2    U7,     D5,    D6,     U7  //03i 13i 23i 33i
+    VMADD3    U6,     D5,    D7,     U6
+    VMADD4    U7,     D4,    D7,     U7
+
+    vld       D0,     A0,    0x20  // a4ri a5ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a4r
+    vshuf4i.w  D5,     D0,    0x55  //a4i
+
+    VMADD1    U8,     D4,    D6,     U8  //04r 14r 24r 34r
+    VMADD2    U9,     D5,    D6,     U9  //04i 14i 24i 34i
+    VMADD3    U8,     D5,    D7,     U8
+    VMADD4    U9,     D4,    D7,     U9
+
+    vshuf4i.w  D4,     D0,    0xaa  //a5r
+    vshuf4i.w  D5,     D0,    0xff  //a5i
+
+    VMADD1    U10,     D4,    D6,     U10  //05r 15r 25r 35r
+    VMADD2    U11,     D5,    D6,     U11  //05i 15i 25i 35i
+    VMADD3    U10,     D5,    D7,     U10
+    VMADD4    U11,     D4,    D7,     U11
+
+    vld       D0,     A0,    0x30  // a6ri a7ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a6r
+    vshuf4i.w  D5,     D0,    0x55  //a6i
+
+    VMADD1    U12,     D4,    D6,     U12  //06r 16r 26r 36r
+    VMADD2    U13,     D5,    D6,     U13  //06i 16i 26i 36i
+    VMADD3    U12,     D5,    D7,     U12
+    VMADD4    U13,     D4,    D7,     U13
+
+    vshuf4i.w  D4,     D0,    0xaa  //a5r
+    vshuf4i.w  D5,     D0,    0xff  //a5i
+
+    VMADD1    U14,     D4,    D6,     U14  //07r 17r 27r 37r
+    VMADD2    U15,     D5,    D6,     U15  //07i 17i 27i 37i
+    VMADD3    U14,     D5,    D7,     U14
+    VMADD4    U15,     D4,    D7,     U15
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L12
+
+.L13:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U0,    VALPHAR
+    vfmul.s      D9,    U1,    VALPHAR
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U2,    VALPHAR
+    vfmul.s      D9,    U3,    VALPHAR
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12 res22 res32
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U4,    VALPHAR
+    vfmul.s      D9,    U5,    VALPHAR
+    VNMSUB      D8,    U5,    VALPHAI, D8
+    VFMADD      D9,    U4,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U6,    VALPHAR
+    vfmul.s      D9,    U7,    VALPHAR
+    VNMSUB      D8,    U7,    VALPHAI, D8
+    VFMADD      D9,    U6,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res04 res14 res24 res34
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U8,    VALPHAR
+    vfmul.s      D9,    U9,    VALPHAR
+    VNMSUB      D8,    U9,    VALPHAI, D8
+    VFMADD      D9,    U8,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res05 res15 res25 res35
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U10,    VALPHAR
+    vfmul.s      D9,    U11,    VALPHAR
+    VNMSUB      D8,    U11,    VALPHAI, D8
+    VFMADD      D9,    U10,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res06 res16 res26 res36
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U12,    VALPHAR
+    vfmul.s      D9,    U13,    VALPHAR
+    VNMSUB      D8,    U13,    VALPHAI, D8
+    VFMADD      D9,    U12,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res07 res17 res27 res37
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U14,    VALPHAR
+    vfmul.s      D9,    U15,    VALPHAR
+    VNMSUB      D8,    U15,    VALPHAI, D8
+    VFMADD      D9,    U14,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U0,    VALPHAR, D8
+    VFMADD      D9,    U1,    VALPHAR, D9
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U2,    VALPHAR, D8
+    VFMADD      D9,    U3,    VALPHAR, D9
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12 res22 res32
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U4,    VALPHAR, D8
+    VFMADD      D9,    U5,    VALPHAR, D9
+    VNMSUB      D8,    U5,    VALPHAI, D8
+    VFMADD      D9,    U4,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U6,    VALPHAR, D8
+    VFMADD      D9,    U7,    VALPHAR, D9
+    VNMSUB      D8,    U7,    VALPHAI, D8
+    VFMADD      D9,    U6,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res04 res14 res24 res34
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U8,    VALPHAR, D8
+    VFMADD      D9,    U9,    VALPHAR, D9
+    VNMSUB      D8,    U9,    VALPHAI, D8
+    VFMADD      D9,    U8,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res05 res15 res25 res35
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U10,    VALPHAR, D8
+    VFMADD      D9,    U11,    VALPHAR, D9
+    VNMSUB      D8,    U11,    VALPHAI, D8
+    VFMADD      D9,    U10,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res06 res16 res26 res36
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U12,    VALPHAR, D8
+    VFMADD      D9,    U13,    VALPHAR, D9
+    VNMSUB      D8,    U13,    VALPHAI, D8
+    VFMADD      D9,    U12,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res07 res17 res27 res37
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U14,    VALPHAR, D8
+    VFMADD      D9,    U15,    VALPHAR, D9
+    VNMSUB      D8,    U15,    VALPHAI, D8
+    VFMADD      D9,    U14,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     4
+    beq        I,      T0,    .L18
+
+.L15:  /* if (bm & 4) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+    vld       D3,     B0,    0x10  // b2ri b3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a0r
+    vshuf4i.w  D5,     D0,    0x55  //a0i
+
+    vpackev.w  D6,     D3,    D2
+    vshuf4i.w  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    vpackod.w  D7,     D3,    D2
+    vshuf4i.w  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    vshuf4i.w  D4,     D0,    0xaa  //a1r
+    vshuf4i.w  D5,     D0,    0xff  //a1i
+
+    VMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    VMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    vld       D0,     A0,    0x10  // a2ri a3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a2r
+    vshuf4i.w  D5,     D0,    0x55  //a2i
+
+    VMADD1    U4,     D4,    D6,     U4  //02r 12r 22r 32r
+    VMADD2    U5,     D5,    D6,     U5  //02i 12i 22i 32i
+    VMADD3    U4,     D5,    D7,     U4
+    VMADD4    U5,     D4,    D7,     U5
+
+    vshuf4i.w  D4,     D0,    0xaa  //a3r
+    vshuf4i.w  D5,     D0,    0xff  //a3i
+
+    VMADD1    U6,     D4,    D6,     U6  //03r 13r 23r 33r
+    VMADD2    U7,     D5,    D6,     U7  //03i 13i 23i 33i
+    VMADD3    U6,     D5,    D7,     U6
+    VMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U0,    VALPHAR
+    vfmul.s      D9,    U1,    VALPHAR
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U2,    VALPHAR
+    vfmul.s      D9,    U3,    VALPHAR
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12 res22 res32
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U4,    VALPHAR
+    vfmul.s      D9,    U5,    VALPHAR
+    VNMSUB      D8,    U5,    VALPHAI, D8
+    VFMADD      D9,    U4,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U6,    VALPHAR
+    vfmul.s      D9,    U7,    VALPHAR
+    VNMSUB      D8,    U7,    VALPHAI, D8
+    VFMADD      D9,    U6,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U0,    VALPHAR, D8
+    VFMADD      D9,    U1,    VALPHAR, D9
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vst       VALPHAR,     C0,    0x00
+    vst       VALPHAI,     C1,    0x00
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U2,    VALPHAR, D8
+    VFMADD      D9,    U3,    VALPHAR, D9
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    // vst       VALPHAR,C0,    0x00
+    // LD        $f15,   C0,    0x00
+    // LD        $f15,   C0,    0x04
+    // LD        $f15,   C0,    0x08
+    // LD        $f15,   C0,    0x0c
+
+    // vst       VALPHAI,C0,    0x00
+    // LD        $f15,   C0,    0x00
+    // LD        $f15,   C0,    0x04
+    // LD        $f15,   C0,    0x08
+    // LD        $f15,   C0,    0x0c
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    // LD        $f15,   C0,    0x00
+    // LD        $f15,   C0,    0x04
+    // LD        $f15,   C0,    0x08
+    // LD        $f15,   C0,    0x0c
+
+    // LD        $f15,   C1,    0x00
+    // LD        $f15,   C1,    0x04
+    // LD        $f15,   C1,    0x08
+    // LD        $f15,   C1,    0x0c
+
+    // LD        $f15,   C2,    0x00
+    // LD        $f15,   C2,    0x04
+    // LD        $f15,   C2,    0x08
+    // LD        $f15,   C2,    0x0c
+
+    // LD        $f15,   C3,    0x00
+    // LD        $f15,   C3,    0x04
+    // LD        $f15,   C3,    0x08
+    // LD        $f15,   C3,    0x0c
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12 res22 res32
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U4,    VALPHAR, D8
+    VFMADD      D9,    U5,    VALPHAR, D9
+    VNMSUB      D8,    U5,    VALPHAI, D8
+    VFMADD      D9,    U4,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U6,    VALPHAR, D8
+    VFMADD      D9,    U7,    VALPHAR, D9
+    VNMSUB      D8,    U7,    VALPHAI, D8
+    VFMADD      D9,    U6,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L18:   /* if (bm & 2) */
+    move       I,      $r0
+    andi       T0,     M,     2
+    beq        I,      T0,    .L183
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L182
+    blt        TL,     L,     .L182
+
+.L181:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+    vld       D3,     B0,    0x10  // b2ri b3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a0r
+    vshuf4i.w  D5,     D0,    0x55  //a0i
+
+    vpackev.w  D6,     D3,    D2
+    vshuf4i.w  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    vpackod.w  D7,     D3,    D2
+    vshuf4i.w  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    vshuf4i.w  D4,     D0,    0xaa  //a1r
+    vshuf4i.w  D5,     D0,    0xff  //a1i
+
+    VMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    VMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L181
+
+.L182:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U0,    VALPHAR
+    vfmul.s      D9,    U1,    VALPHAR
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U2,    VALPHAR
+    vfmul.s      D9,    U3,    VALPHAR
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U0,    VALPHAR, D8
+    VFMADD      D9,    U1,    VALPHAR, D9
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U2,    VALPHAR, D8
+    VFMADD      D9,    U3,    VALPHAR, D9
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L183:   /* if (bm & 1) */
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L186
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x03
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+    MTC        c31,    $r0
+    MTC        c32,    $r0
+    MTC        c41,    $r0
+    MTC        c42,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L185
+    blt        TL,     L,     .L185
+
+.L184:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+    LD         b3,     B0,    0x08        //b1r
+    LD         b4,     B0,    0x0c        //b1i
+    LD         b5,     B0,    0x10        //b2r
+    LD         b6,     B0,    0x14        //b2i
+    LD         b7,     B0,    0x18        //b3r
+    LD         b8,     B0,    0x1c        //b3i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a1,    b3,     c21  //res10r
+    MADD2      c22,    a2,    b3,     c22  //res10i
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    MADD1      c31,    a1,    b5,     c31  //res20r
+    MADD2      c32,    a2,    b5,     c32  //res20i
+    MADD3      c31,    a2,    b6,     c31
+    MADD4      c32,    a1,    b6,     c32
+
+    MADD1      c41,    a1,    b7,     c41  //res30r
+    MADD2      c42,    a2,    b7,     c42  //res30i
+    MADD3      c41,    a2,    b8,     c41
+    MADD4      c42,    a1,    b8,     c42
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L184
+
+.L185:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MUL       a5,     c21,   ALPHA_R
+    MUL       a6,     c22,   ALPHA_R
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    LD         a5,     C2,    0x00    //C2[0]
+    LD         a6,     C2,    0x04    //C2[1]
+
+    MUL       a5,     c31,   ALPHA_R
+    MUL       a6,     c32,   ALPHA_R
+    NMSUB      a5,     c32,   ALPHA_I, a5
+    MADD       a6,     c31,   ALPHA_I, a6
+
+    ST         a5,     C2,    0x00
+    ST         a6,     C2,    0x04
+
+    LD         a5,     C3,    0x00    //C3[0]
+    LD         a6,     C3,    0x04    //C3[1]
+
+    MUL       a5,     c41,   ALPHA_R
+    MUL       a6,     c42,   ALPHA_R
+    NMSUB      a5,     c42,   ALPHA_I, a5
+    MADD       a6,     c41,   ALPHA_I, a6
+
+    ST         a5,     C3,    0x00
+    ST         a6,     C3,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+    addi.d     C2,     C2,    0x08
+    addi.d     C3,     C3,    0x08
+#else
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c21,   ALPHA_R, a5
+    MADD       a6,     c22,   ALPHA_R, a6
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    LD         a5,     C2,    0x00    //C2[0]
+    LD         a6,     C2,    0x04    //C2[1]
+
+    MADD       a5,     c31,   ALPHA_R, a5
+    MADD       a6,     c32,   ALPHA_R, a6
+    NMSUB      a5,     c32,   ALPHA_I, a5
+    MADD       a6,     c31,   ALPHA_I, a6
+
+    ST         a5,     C2,    0x00
+    ST         a6,     C2,    0x04
+
+    LD         a5,     C3,    0x00    //C3[0]
+    LD         a6,     C3,    0x04    //C3[1]
+
+    MADD       a5,     c41,   ALPHA_R, a5
+    MADD       a6,     c42,   ALPHA_R, a6
+    NMSUB      a5,     c42,   ALPHA_I, a5
+    MADD       a6,     c41,   ALPHA_I, a6
+
+    ST         a5,     C3,    0x00
+    ST         a6,     C3,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+    addi.d     C2,     C2,    0x08
+    addi.d     C3,     C3,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    slli.d     C3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+
+.L186:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   4
+#endif
+
+    slli.d     L,      K,     0x05
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x03
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     2
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     2
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&2); j+=2) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L24
+
+.L21:  /* for (i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a0rr a1rr
+    vshuf4i.w  D5,     D0,    0xf5  //a0ii a1ii
+
+    vshuf4i.w  D6,     D2,    0x88  //b0r b1r b0r b1r
+    vshuf4i.w  D7,     D2,    0xdd  //b0i b1i b0i b1i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    vld       D0,     A0,    0x10  // a2ri a3ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a2rr a3rr
+    vshuf4i.w  D5,     D0,    0xf5  //a2ii a3ii
+
+    VMADD1    U2,     D4,    D6,     U2  //02r 12r 03r 13r
+    VMADD2    U3,     D5,    D6,     U3  //02i 12i 03i 13i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    vld       D0,     A0,    0x20  // a4ri a5ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a4rr a5rr
+    vshuf4i.w  D5,     D0,    0xf5  //a4ii a5ii
+
+    VMADD1    U4,     D4,    D6,     U4  //04r 14r 05r 15r
+    VMADD2    U5,     D5,    D6,     U5  //04i 14i 05i 15i
+    VMADD3    U4,     D5,    D7,     U4
+    VMADD4    U5,     D4,    D7,     U5
+
+    vld       D0,     A0,    0x30  // a6ri a7ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a6rr a7rr
+    vshuf4i.w  D5,     D0,    0xf5  //a6ii a7ii
+
+    VMADD1    U6,     D4,    D6,     U6  //06r 16r 07r 17r
+    VMADD2    U7,     D5,    D6,     U7  //06i 16i 07i 17i
+    VMADD3    U6,     D5,    D7,     U6
+    VMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12 res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U2,    VALPHAR
+    vfmul.s      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res04 res14 res05 res15
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U4,    VALPHAR
+    vfmul.s      D3,    U5,    VALPHAR
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res06 res16 res07 res17
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U6,    VALPHAR
+    vfmul.s      D3,    U7,    VALPHAR
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12 res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res04 res14 res05 res15
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U4,    VALPHAR, D2
+    VFMADD      D3,    U5,    VALPHAR, D3
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res06 res16 res07 res17
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U6,    VALPHAR, D2
+    VFMADD      D3,    U7,    VALPHAR, D3
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L280
+
+.L25:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a0rr a1rr
+    vshuf4i.w  D5,     D0,    0xf5  //a0ii a1ii
+
+    vshuf4i.w  D6,     D2,    0x88  //b0r b1r b0r b1r
+    vshuf4i.w  D7,     D2,    0xdd  //b0i b1i b0i b1i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    vld       D0,     A0,    0x10  // a2ri a3ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a2rr a3rr
+    vshuf4i.w  D5,     D0,    0xf5  //a2ii a3ii
+
+    VMADD1    U2,     D4,    D6,     U2  //02r 12r 03r 13r
+    VMADD2    U3,     D5,    D6,     U3  //02i 12i 03i 13i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12 res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U2,    VALPHAR
+    vfmul.s      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12 res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L280:   /* if ( bm & 2 )*/
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L284
+
+.L281:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L283
+    blt        TL,     L,     .L283
+
+.L282:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a0rr a1rr
+    vshuf4i.w  D5,     D0,    0xf5  //a0ii a1ii
+
+    vshuf4i.w  D6,     D2,    0x88  //b0r b1r b0r b1r
+    vshuf4i.w  D7,     D2,    0xdd  //b0i b1i b0i b1i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L282
+
+.L283:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L284:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L288
+
+.L285:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x03
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L287
+    blt        TL,     L,     .L287
+
+.L286:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+    LD         b3,     B0,    0x08        //b1r
+    LD         b4,     B0,    0x0c        //b1i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a1,    b3,     c21  //res10r
+    MADD2      c22,    a2,    b3,     c22  //res10i
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L286
+
+.L287:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C1,    0x00    //C1[0]
+    LD         a8,     C1,    0x04    //C1[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    MUL       a7,     c21,   ALPHA_R
+    MUL       a8,     c22,   ALPHA_R
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    ST         a7,     C1,    0x00
+    ST         a8,     C1,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+#else
+    //res00 res10
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C1,    0x00    //C1[0]
+    LD         a8,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    ST         a7,     C1,    0x00
+    ST         a8,     C1,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L288:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+    slli.d     L,      K,     4
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   2
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     2
+    andi       T0,     N,     2
+    blt        J,      T0,    .L20
+
+.L30:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L999
+
+.L300:  /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L34
+
+.L31:  /* for (i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x03
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L33
+    blt        TL,     L,     .L33
+
+.L32:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D1,     A0,    0x10  // a2ri a3ri
+
+    vldrepl.w D2,     B0,    0x00 //b0r
+    vldrepl.w D3,     B0,    0x04 //b0i
+
+    vpackev.w D4,     D1,    D0
+    vshuf4i.w  D4,     D4,    0xd8  //a0r a1r a2r a3r
+
+    vpackod.w D5,     D1,    D0
+    vshuf4i.w  D5,     D5,    0xd8  //a0i a1i a2i a3i
+
+    VMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r
+    VMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i
+    VMADD3    U0,     D5,    D3,     U0
+    VMADD4    U1,     D4,    D3,     U1
+
+    vld       D0,     A0,    0x20  // a4ri a5ri
+    vld       D1,     A0,    0x30  // a6ri a7ri
+
+    vpackev.w D4,     D1,    D0
+    vshuf4i.w  D4,     D4,    0xd8  //a4r a5r a6r a7r
+
+    vpackod.w D5,     D1,    D0
+    vshuf4i.w  D5,     D5,    0xd8  //a4i a5i a6i a7i
+
+    VMADD1    U2,     D4,    D2,     U2  //04r 05r 06r 07r
+    VMADD2    U3,     D5,    D2,     U3  //04i 05i 06i 07i
+    VMADD3    U2,     D5,    D3,     U2
+    VMADD4    U3,     D4,    D3,     U3
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L32
+
+.L33:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //0 2 4 6
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //1 3 5 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //1 3 5 7
+    vpermi.w  D4,     D2,   0x44 //0 2 1 3
+    vshuf4i.w  D4,     D4,   0xd8 //0 1 2 3
+
+    vand.v    D5,     D3,   D3  //1 3 5 7
+    vpermi.w  D5,     D2,   0xee //4 6 5 7
+    vshuf4i.w  D5,     D5,   0xd8 //4 5 6 7
+
+    vst       D4,     C0,    0x00
+    vst       D5,     C0,    0x10
+
+    //res04 res05 res06 res07
+    vld       D0,     C0,    0x20 //c0: 8 9 10 11
+    vld       D1,     C0,    0x30 //c0: 12 13 14 15
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //8 10 12 14
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //9 11 13 15
+
+    vfmul.s      D2,    U2,    VALPHAR
+    vfmul.s      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //8 10 12 14
+    vpermi.w  D4,     D2,   0x44 //8 10 9 11
+    vshuf4i.w  D4,     D4,   0xd8 //8 9 10 11
+
+    vand.v    D5,     D3,   D3  //9 11 13 15
+    vpermi.w  D5,     D2,   0xee //12 14 13 15
+    vshuf4i.w  D5,     D5,   0xd8 //12 13 14 15
+
+    vst       D4,     C0,    0x20
+    vst       D5,     C0,    0x30
+
+    addi.d     C0,     C0,    0x40
+#else
+    //res00 res01 res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //0 2 4 6
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //1 3 5 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //1 3 5 7
+    vpermi.w  D4,     D2,   0x44 //0 2 1 3
+    vshuf4i.w  D4,     D4,   0xd8 //0 1 2 3
+
+    vand.v    D5,     D3,   D3  //1 3 5 7
+    vpermi.w  D5,     D2,   0xee //4 6 5 7
+    vshuf4i.w  D5,     D5,   0xd8 //4 5 6 7
+
+    vst       D4,     C0,    0x00
+    vst       D5,     C0,    0x10
+
+    //res04 res05 res06 res07
+    vld       D0,     C0,    0x20 //c0: 8 9 10 11
+    vld       D1,     C0,    0x30 //c0: 12 13 14 15
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //8 10 12 14
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //9 11 13 15
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //8 10 12 14
+    vpermi.w  D4,     D2,   0x44 //8 10 9 11
+    vshuf4i.w  D4,     D4,   0xd8 //8 9 10 11
+
+    vand.v    D5,     D3,   D3  //9 11 13 15
+    vpermi.w  D5,     D2,   0xee //12 14 13 15
+    vshuf4i.w  D5,     D5,   0xd8 //12 13 14 15
+
+    vst       D4,     C0,    0x20
+    vst       D5,     C0,    0x30
+
+    addi.d     C0,     C0,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L31
+
+.L34:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L38
+
+.L35:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L37
+    blt        TL,     L,     .L37
+
+.L36:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D1,     A0,    0x10  // a2ri a3ri
+
+    vldrepl.w D2,     B0,    0x00 //b0r
+    vldrepl.w D3,     B0,    0x04 //b0i
+
+    vpackev.w D4,     D1,    D0
+    vshuf4i.w  D4,     D4,    0xd8  //a0r a1r a2r a3r
+
+    vpackod.w D5,     D1,    D0
+    vshuf4i.w  D5,     D5,    0xd8  //a0i a1i a2i a3i
+
+    VMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r
+    VMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i
+    VMADD3    U0,     D5,    D3,     U0
+    VMADD4    U1,     D4,    D3,     U1
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L36
+
+.L37:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //0 2 4 6
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //1 3 5 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //1 3 5 7
+    vpermi.w  D4,     D2,   0x44 //0 2 1 3
+    vshuf4i.w  D4,     D4,   0xd8 //0 1 2 3
+
+    vand.v    D5,     D3,   D3  //1 3 5 7
+    vpermi.w  D5,     D2,   0xee //4 6 5 7
+    vshuf4i.w  D5,     D5,   0xd8 //4 5 6 7
+
+    vst       D4,     C0,    0x00
+    vst       D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#else
+    //res00 res01 res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //0 2 4 6
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //1 3 5 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //1 3 5 7
+    vpermi.w  D4,     D2,   0x44 //0 2 1 3
+    vshuf4i.w  D4,     D4,   0xd8 //0 1 2 3
+
+    vand.v    D5,     D3,   D3  //1 3 5 7
+    vpermi.w  D5,     D2,   0xee //4 6 5 7
+    vshuf4i.w  D5,     D5,   0xd8 //4 5 6 7
+
+    vst       D4,     C0,    0x00
+    vst       D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L38:   /* if ( bm & 2 ) */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L312
+
+.L39:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L311
+    blt        TL,     L,     .L311
+
+.L310:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+    LD         a3,     A0,    0x08        //a1r
+    LD         a4,     A0,    0x0c        //a1i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a3,    b1,     c21  //res10r
+    MADD2      c22,    a4,    b1,     c22  //res10i
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L310
+
+.L311:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C0[2]
+    LD         a8,     C0,    0x0c    //C0[3]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    MUL       a7,     c21,   ALPHA_R
+    MUL       a8,     c22,   ALPHA_R
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+
+    addi.d     C0,     C0,    0x10
+#else
+    //res00 res10
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C0[2]
+    LD         a8,     C0,    0x0c    //C0[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+
+    addi.d     C0,     C0,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L312:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L316
+
+.L313:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x03
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L315
+    blt        TL,     L,     .L315
+
+.L314:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00
+    LD         a2,     A0,    0x04
+
+    LD         b1,     B0,    0x00
+    LD         b2,     B0,    0x04
+
+    MADD1      c11,    a1,    b1,     c11
+    MADD2      c12,    a2,    b1,     c12
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L314
+
+.L315:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+
+    addi.d     C0,     C0,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    addi.d     C0,     C0,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L316:
+    slli.d     L,      K,     3
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L300
+
+.L999:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_4_lsx.S b/kernel/loongarch64/cgemm_ncopy_4_lsx.S
new file mode 100644
index 000000000..bfc712fb2
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_4_lsx.S
@@ -0,0 +1,341 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define TD     $r20
+#define TS     $r11
+#define TL     $r19
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+#define D8     $vr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x02
+    slli.d     TL,   TL,   0x01
+
+    srai.d     J,    N,    0x02
+    beq        J,    ZERO,  .L_N0
+
+.L_J1: /* J-- */
+    move       S1,   TS
+    add.d      S2,   S1,   TL
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+
+    slli.d     T0,   TL,   0x02
+    add.d      TS,   TS,   T0
+
+    srai.d     I,    M,    0x02
+    beq        I,    ZERO,  .L_I3
+
+.L_I1: /* I-- */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+
+    vld       U4,   S3,   0x00
+    vld       U5,   S3,   0x10
+
+    vld       U6,   S4,   0x00
+    vld       U7,   S4,   0x10
+
+    vand.v    D0,   U2,   U2
+    vand.v    D1,   U3,   U3
+    vand.v    D2,   U2,   U2
+    vand.v    D3,   U3,   U3
+    vand.v    D4,   U6,   U6
+    vand.v    D5,   U7,   U7
+    vand.v    D6,   U6,   U6
+    vand.v    D7,   U7,   U7
+
+    vpermi.w  D0,   U0,   0x44
+    vpermi.w  D4,   U4,   0x44
+    vpermi.w  D2,   U0,   0xee
+    vpermi.w  D6,   U4,   0xee
+    vpermi.w  D1,   U1,   0x44
+    vpermi.w  D5,   U5,   0x44
+    vpermi.w  D3,   U1,   0xee
+    vpermi.w  D7,   U5,   0xee
+
+    vst       D0,   TD,   0x00
+    vst       D4,   TD,   0x10
+    vst       D2,   TD,   0x20
+    vst       D6,   TD,   0x30
+    vst       D1,   TD,   0x40
+    vst       D5,   TD,   0x50
+    vst       D3,   TD,   0x60
+    vst       D7,   TD,   0x70
+
+    addi.d     S1,   S1,   0x20   // a_offset
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80  // b_offset
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_I3:  /* if(m&2) */
+    andi       I,    M,    0x02
+    beq        I,    ZERO, .L_II20
+
+    vld       U0,   S1,   0x00
+    vld       U1,   S2,   0x00
+    vld       U2,   S3,   0x00
+    vld       U3,   S4,   0x00
+
+    vand.v    D0,   U1,   U1
+    vand.v    D1,   U1,   U1
+    vand.v    D2,   U3,   U3
+    vand.v    D3,   U3,   U3
+
+    vpermi.w  D0,   U0,   0x44
+    vpermi.w  D2,   U2,   0x44
+    vpermi.w  D1,   U0,   0xee
+    vpermi.w  D3,   U2,   0xee
+
+    vst       D0,   TD,   0x00
+    vst       D2,   TD,   0x10
+    vst       D1,   TD,   0x20
+    vst       D3,   TD,   0x30
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     TD,   TD,   0x40
+
+.L_II20:  /* if(m&1) */
+    andi       I,    M,    0x01
+    beq        I,    ZERO, .L_J0
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+    fst.s      F4,   TD,   0x10
+    fst.s      F5,   TD,   0x14
+    fst.s      F6,   TD,   0x18
+    fst.s      F7,   TD,   0x1c
+
+    addi.d     TD,   TD,   0x20
+
+.L_J0:
+    addi.d    J,     J,    -1
+    blt       ZERO,  J,    .L_J1
+
+.L_N0:  /* if(n&2) */
+    andi       I,     N,   0x02
+    beq        ZERO,  I,   .L_N20
+
+    move       S1,    TS
+    add.d      S2,    S1,   TL
+
+    slli.d     T0,    TL,   0x01
+    add.d      TS,    TS,   T0
+
+    srai.d     I,     M,    0x02
+    beq        ZERO,  I,    .L_N10
+
+.L_N11: /* if(i>0) */
+    vld       U0,    S1,   0x00
+    vld       U1,    S1,   0x10
+    vld       U2,    S2,   0x00
+    vld       U3,    S2,   0x10
+
+    vand.v    D0,    U2,   U2
+    vand.v    D1,    U3,   U3
+    vand.v    D2,    U2,   U2
+    vand.v    D3,    U3,   U3
+
+    vpermi.w  D0,    U0,   0x44
+    vpermi.w  D2,    U0,   0xee
+    vpermi.w  D1,    U1,   0x44
+    vpermi.w  D3,    U1,   0xee
+
+    vst       D0,    TD,   0x00
+    vst       D2,    TD,   0x10
+    vst       D1,    TD,   0x20
+    vst       D3,    TD,   0x30
+
+    addi.d     S1,    S1,   0x20   // a_offset
+    addi.d     S2,    S2,   0x20
+    addi.d     TD,    TD,   0x40   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N11
+
+.L_N10:  /* if(m&2) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N130
+
+    vld       U0,    S1,   0x00
+    vld       U1,    S2,   0x00
+    vand.v    D0,    U1,   U1
+
+    vpermi.w  D0,    U0,   0x44
+    vpermi.w  U1,    U0,   0xee
+
+    vst       D0,    TD,   0x00
+    vst       U1,    TD,   0x10
+
+    addi.d     S1,    S1,   0x10   // a_offset
+    addi.d     S2,    S2,   0x10
+    addi.d     TD,    TD,   0x20   // b_offset
+
+.L_N130:  /* if(m&1) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N20
+
+    fld.s      F0,     S1,   0x00
+    fld.s      F1,     S1,   0x04
+
+    fld.s      F2,     S2,   0x00
+    fld.s      F3,     S2,   0x04
+
+    fst.s      F0,     TD,   0x00
+    fst.s      F1,     TD,   0x04
+    fst.s      F2,     TD,   0x08
+    fst.s      F3,     TD,   0x0c
+
+    addi.d     TD,    TD,   0x10
+
+.L_N20:   /* if(n&1) */
+    andi       I,     N,    0x01
+    beq        I,     ZERO, .L_N00
+
+    move       S1,   TS
+    srai.d     I,    M,    0x02
+
+    beq        I,    ZERO, .L_N30
+
+.L_N21:  /* if(i>0) */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vst       U0,   TD,   0x00
+    vst       U1,   TD,   0x10
+
+    addi.d     S1,   S1,   0x20   // aoffset1
+    addi.d     TD,   TD,   0x20   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N21
+
+.L_N30:  /* if(m&2) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N330
+
+    vld       U0,   S1,   0x00
+
+    vst       U0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x10   // aoffset1
+    addi.d     TD,   TD,   0x10   // b_offset
+
+.L_N330:  /* if(m&1) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N00
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+
+.L_N00:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S
new file mode 100644
index 000000000..87a88e37d
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S
@@ -0,0 +1,263 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+#define D8     $vr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST  //boffset
+    move       TS,   SRC  //aoffset
+
+    slli.d     TL,   LDA,  0x02  //lda
+    slli.d     TL,   TL,   0x01
+
+    slli.d     T0,   TL,   0x03
+    srai.d     J,    N,    0x03  //j
+
+    beq        J,    ZERO, .L_N1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+    add.d      S5,   S4,   TL
+    add.d      S6,   S5,   TL
+    add.d      S7,   S6,   TL
+    add.d      S8,   S7,   TL
+    add.d      TS,   TS,   T0
+
+    beq        I,    ZERO, .L_J11
+
+.L_I1:  /* if(i>0) i--*/
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+    fst.s      F4,   TD,   0x10
+    fst.s      F5,   TD,   0x14
+    fst.s      F6,   TD,   0x18
+    fst.s      F7,   TD,   0x1c
+
+    fld.s      F0,   S5,   0x00
+    fld.s      F1,   S5,   0x04
+    fld.s      F2,   S6,   0x00
+    fld.s      F3,   S6,   0x04
+    fld.s      F4,   S7,   0x00
+    fld.s      F5,   S7,   0x04
+    fld.s      F6,   S8,   0x00
+    fld.s      F7,   S8,   0x04
+
+    fst.s      F0,   TD,   0x20
+    fst.s      F1,   TD,   0x24
+    fst.s      F2,   TD,   0x28
+    fst.s      F3,   TD,   0x2c
+    fst.s      F4,   TD,   0x30
+    fst.s      F5,   TD,   0x34
+    fst.s      F6,   TD,   0x38
+    fst.s      F7,   TD,   0x3c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     S3,   S3,   0x08
+    addi.d     S4,   S4,   0x08
+    addi.d     S5,   S5,   0x08
+    addi.d     S6,   S6,   0x08
+    addi.d     S7,   S7,   0x08
+    addi.d     S8,   S8,   0x08
+    addi.d     TD,   TD,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_J11: /* j--*/
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_J1
+
+.L_N1:  /* if(n&4)*/
+    andi       I,     N,    0x04
+    beq        I,     ZERO, .L_N2
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+    add.d      TS,   S4,   TL
+
+    beq        I,     ZERO, .L_N2
+
+.L_N11:  /* if(i>0)*/
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+    fst.s      F4,   TD,   0x10
+    fst.s      F5,   TD,   0x14
+    fst.s      F6,   TD,   0x18
+    fst.s      F7,   TD,   0x1c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     S3,   S3,   0x08
+    addi.d     S4,   S4,   0x08
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N11
+
+.L_N2:  /* if(n&2)*/
+    andi       I,     N,    0x02
+    beq        I,     ZERO, .L_N3
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M
+    add.d      TS,   S2,   TL
+
+    beq        I,    ZERO, .L_N3
+
+.L_N21:  /* if(i>0)*/
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N21
+
+.L_N3:  /* if(n&2)*/
+    andi       I,    N,    0x01
+    beq        I,    ZERO, .L_N0
+
+    move       S1,   TS
+    move       I,    M
+
+    beq        I,    ZERO, .L_N0
+
+.L_N31:  /* if(i>0)*/
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+
+    addi.d     S1,   S1,   0x08
+    addi.d     TD,   TD,   0x08
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N31
+
+.L_N0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S
new file mode 100644
index 000000000..6d63d62e7
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S
@@ -0,0 +1,324 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16
+#define TS     $r17
+#define TL     $r18
+#define T0     $r19
+#define S8     $r20
+#define S9     $r23
+#define S10    $r11
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01  //lda
+
+    ori        T0,     ZERO,  0x03
+    andn       T0,     N,     T0
+    mul.w      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02
+    add.d      S9,     DST,   T0  //boffset2
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0
+    mul.w      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02
+    add.d      S10,    DST,   T0  //boffset3
+
+    srai.d     J,      M,     0x02  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+    add.d      S3,     S2,    TL
+    add.d      S4,     S3,    TL
+
+    slli.d     T0,     TL,    0x02
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x80
+
+    srai.d     I,      N,     0x02
+
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+
+    vld       U4,   S3,   0x00
+    vld       U5,   S3,   0x10
+
+    vld       U6,   S4,   0x00
+    vld       U7,   S4,   0x10
+
+    vst       U0,   S8,   0x00
+    vst       U1,   S8,   0x10
+    vst       U2,   S8,   0x20
+    vst       U3,   S8,   0x30
+    vst       U4,   S8,   0x40
+    vst       U5,   S8,   0x50
+    vst       U6,   S8,   0x60
+    vst       U7,   S8,   0x70
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    slli.d     T0,   M,    0x05
+    add.d      S8,   S8,   T0
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld       U0,   S1,   0x00
+    vld       U1,   S2,   0x00
+    vld       U2,   S3,   0x00
+    vld       U3,   S4,   0x00
+
+    vst       U0,   S9,   0x00
+    vst       U1,   S9,   0x10
+    vst       U2,   S9,   0x20
+    vst       U3,   S9,   0x30
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     S9,   S9,   0x40
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   S10,   0x00
+    fst.s      F1,   S10,   0x04
+    fst.s      F2,   S10,   0x08
+    fst.s      F3,   S10,   0x0c
+    fst.s      F4,   S10,   0x10
+    fst.s      F5,   S10,   0x14
+    fst.s      F6,   S10,   0x18
+    fst.s      F7,   S10,   0x1c
+
+    addi.d     S10,  S10,   0x20
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&2) */
+    andi       I,      M,    0x02
+    beq        ZERO,   I,    .L_M2
+
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+
+    slli.d     T0,     TL,    0x01
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x40
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+
+    vst       U0,     S8,    0x00
+    vst       U1,     S8,    0x10
+    vst       U2,     S8,    0x20
+    vst       U3,     S8,    0x30
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    slli.d     T0,     M,     0x05
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+
+    vst       U0,     S9,    0x00
+    vst       U1,     S9,    0x10
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S9,     S9,    0x20
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M2
+
+    fld.s      F0,      S1,    0x00
+    fld.s      F1,      S1,    0x04
+
+    fld.s      F2,      S2,    0x00
+    fld.s      F3,      S2,    0x04
+
+    fst.s      F0,      S10,   0x00
+    fst.s      F1,      S10,   0x04
+    fst.s      F2,      S10,   0x08
+    fst.s      F3,      S10,   0x0c
+
+    addi.d     S10,    S10,   0x10
+
+.L_M2:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    move       S1,     TS     //aoffset1
+    move       S8,     TD     //boffset1
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M2N1
+
+.L_M2I1:  /* if(i>0) */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vst       U0,     S8,    0x00
+    vst       U1,     S8,    0x10
+
+    addi.d     S1,     S1,    0x20
+    slli.d     T0,     M,     0x05
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M2I1
+
+.L_M2N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M2N2
+
+    vld       U0,     S1,    0x00
+
+    vst       U0,     S9,    0x00
+
+    addi.d     S1,     S1,    0x10
+
+.L_M2N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    fld.s      F0,     S1,    0x00
+    fld.s      F1,     S1,    0x04
+
+    fst.s      F0,     S10,    0x00
+    fst.s      F1,     S10,    0x04
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S
new file mode 100644
index 000000000..2935bbc07
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S
@@ -0,0 +1,277 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01
+
+    srai.d     J,      N,     0x03  //j
+
+    beq        J,      ZERO,  .L_N1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+    addi.d     TS,     TS,    0x40
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_J1M1
+
+.L_J1I1:  /* if(i>0) i--*/
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vld       U4,     S2,    0x00
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+    vst       U2,     TD,    0x20
+    vst       U3,     TD,    0x30
+    vst       U4,     TD,    0x40
+    vst       U5,     TD,    0x50
+    vst       U6,     TD,    0x60
+    vst       U7,     TD,    0x70
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x80
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_J1I1
+
+.L_J1M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_J0
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+    vst       U2,     TD,    0x20
+    vst       U3,     TD,    0x30
+
+    addi.d     TD,     TD,    0x40
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO,   J,     .L_J1
+
+.L_N1:  /* if(n&4) */
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N2
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+    addi.d     TS,     TS,    0x20
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N1M1
+
+.L_N1I1:   /* if(i>0) i-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+    vst       U2,     TD,    0x20
+    vst       U3,     TD,    0x30
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x40
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N1I1
+
+.L_N1M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N2
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+
+    addi.d     TD,     TD,    0x20
+
+.L_N2:  /* if(n&2) */
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N3
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+    addi.d     TS,     TS,    0x10
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N2M1
+
+.L_N2I1:  /* if(i>0) i-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+
+    addi.d     TD,     TD,    0x20
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N2I1
+
+.L_N2M1:   /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N3
+
+    vld       U0,     S1,    0x00
+
+    vst       U0,     TD,    0x00
+
+    addi.d     TD,     TD,    0x10
+
+.L_N3:   /* if(n&1) */
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N3M1
+
+.L_N3I1:  /* if(i>0) i-- */
+    fld.s      F0,     S1,    0x00
+    fld.s      F1,     S1,    0x04
+
+    fld.s      F2,     S2,    0x00
+    fld.s      F3,     S2,    0x04
+
+    fst.s      F0,     TD,    0x00
+    fst.s      F1,     TD,    0x04
+    fst.s      F2,     TD,    0x08
+    fst.s      F3,     TD,    0x0c
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x10
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N3I1
+
+.L_N3M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    fld.s      F0,     S1,    0x00
+    fld.s      F1,     S1,    0x04
+
+    fst.s      F0,     TD,    0x00
+    fst.s      F1,     TD,    0x04
+
+.L_N0:
+    LDARG      $r23,   $sp,   0
+    addi.d     $sp,    $sp,   8
+    jirl       $r0,    $r1,   0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S
new file mode 100644
index 000000000..6c4841b24
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S
@@ -0,0 +1,2316 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+#define T2     $r26
+#define T3     $r27
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LSX vectors */
+#define U0     $vr30
+#define U1     $vr31
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+#define D12    $vr28
+#define D13    $vr29
+#define VALPHAR $vr28
+#define VALPHAI $vr29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    vldrepl.d  VALPHAR, $sp, 112
+    vldrepl.d  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   BASE_SHIFT
+
+    move       J,      $r0
+    srai.d     T0,     N,     2  //bn/4
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/4; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    add.d      C2,     C1,    TL
+    add.d      C3,     C2,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     2  //bm/4
+    beq        I,      T0,    .L18
+
+.L11:  /* for(i=0; i<bm/4; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x06
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+    vxor.v    U8,     U8,   U8
+    vxor.v    U9,     U9,   U9
+    vxor.v    U10,    U10,  U10
+    vxor.v    U11,    U11,  U11
+    vxor.v    U12,    U12,  U12
+    vxor.v    U13,    U13,  U13
+    vxor.v    U14,    U14,  U14
+    vxor.v    U15,    U15,  U15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L13
+    blt        TL,     L,     .L13
+
+.L12:  /* for(k=0; k<temp; k+=1) */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D3,     B0,    0x20  // b2ri
+    vld       D4,     B0,    0x30  // b3ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    vpackev.d D9,     D4,    D3     //b2r b3r
+    vpackod.d D10,    D4,    D3     //b2i b3i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    VMADD1    U2,     D5,    D9,     U2  //20r 30r
+    VMADD2    U3,     D6,    D9,     U3  //20i 30i
+    VMADD3    U2,     D6,    D10,    U2
+    VMADD4    U3,     D5,    D10,    U3
+
+    vld       D0,     A0,    0x10  // a1ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a1rr
+    vshuf4i.d  D6,     D0,    0x55  //a1ii
+
+    VMADD1    U4,     D5,    D7,     U4  //01r 11r
+    VMADD2    U5,     D6,    D7,     U5  //01i 11i
+    VMADD3    U4,     D6,    D8,     U4
+    VMADD4    U5,     D5,    D8,     U5
+
+    VMADD1    U6,     D5,    D9,     U6  //21r 31r
+    VMADD2    U7,     D6,    D9,     U7  //21i 31i
+    VMADD3    U6,     D6,    D10,    U6
+    VMADD4    U7,     D5,    D10,    U7
+
+    vld       D0,     A0,    0x20  // a2ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a2rr
+    vshuf4i.d  D6,     D0,    0x55  //a2ii
+
+    VMADD1    U8,     D5,    D7,     U8  //02r 12r
+    VMADD2    U9,     D6,    D7,     U9  //02i 12i
+    VMADD3    U8,     D6,    D8,     U8
+    VMADD4    U9,     D5,    D8,     U9
+
+    VMADD1    U10,     D5,    D9,     U10  //22r 32r
+    VMADD2    U11,     D6,    D9,     U11  //22i 32i
+    VMADD3    U10,     D6,    D10,    U10
+    VMADD4    U11,     D5,    D10,    U11
+
+    vld       D0,     A0,    0x30  // a3ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a3rr
+    vshuf4i.d  D6,     D0,    0x55  //a3ii
+
+    VMADD1    U12,     D5,    D7,     U12  //03r 13r
+    VMADD2    U13,     D6,    D7,     U13  //03i 13i
+    VMADD3    U12,     D6,    D8,     U12
+    VMADD4    U13,     D5,    D8,     U13
+
+    VMADD1    U14,     D5,    D9,     U14  //23r 33r
+    VMADD2    U15,     D6,    D9,     U15  //23i 33i
+    VMADD3    U14,     D6,    D10,    U14
+    VMADD4    U15,     D5,    D10,    U15
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L12
+
+.L13:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U4,    VALPHAR
+    vfmul.d      D3,    U5,    VALPHAR
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res21 res31
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U6,    VALPHAR
+    vfmul.d      D3,    U7,    VALPHAR
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U8,    VALPHAR
+    vfmul.d      D3,    U9,    VALPHAR
+    VNMSUB      D2,    U9,    VALPHAI, D2
+    VFMADD      D3,    U8,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res22 res32
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U10,    VALPHAR
+    vfmul.d      D3,    U11,    VALPHAR
+    VNMSUB      D2,    U11,    VALPHAI, D2
+    VFMADD      D3,    U10,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U12,    VALPHAR
+    vfmul.d      D3,    U13,    VALPHAR
+    VNMSUB      D2,    U13,    VALPHAI, D2
+    VFMADD      D3,    U12,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res23 res33
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U14,    VALPHAR
+    vfmul.d      D3,    U15,    VALPHAR
+    VNMSUB      D2,    U15,    VALPHAI, D2
+    VFMADD      D3,    U14,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vst       U0,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U1,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U2,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U3,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U4,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U5,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U6,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U7,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U8,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U9,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U10,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U11,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U12,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U13,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U14,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vst       U15,     C0,    0x00
+    fld.d     $f27,  C0,    0x00
+    fld.d     $f27,  C0,    0x08
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U4,    VALPHAR, D2
+    VFMADD      D3,    U5,    VALPHAR, D3
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res21 res31
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U6,    VALPHAR, D2
+    VFMADD      D3,    U7,    VALPHAR, D3
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U8,    VALPHAR, D2
+    VFMADD      D3,    U9,    VALPHAR, D3
+    VNMSUB      D2,    U9,    VALPHAI, D2
+    VFMADD      D3,    U8,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res22 res32
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U10,    VALPHAR, D2
+    VFMADD      D3,    U11,    VALPHAR, D3
+    VNMSUB      D2,    U11,    VALPHAI, D2
+    VFMADD      D3,    U10,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U12,    VALPHAR, D2
+    VFMADD      D3,    U13,    VALPHAR, D3
+    VNMSUB      D2,    U13,    VALPHAI, D2
+    VFMADD      D3,    U12,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res23 res33
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U14,    VALPHAR, D2
+    VFMADD      D3,    U15,    VALPHAR, D3
+    VNMSUB      D2,    U15,    VALPHAI, D2
+    VFMADD      D3,    U14,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L18:   /* if (bm & 2) */
+    move       I,      $r0
+    andi       T0,     M,     2
+    beq        I,      T0,    .L183
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x06
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L182
+    blt        TL,     L,     .L182
+
+.L181:  /* for (k=0; k<temp; k++) */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D3,     B0,    0x20  // b2ri
+    vld       D4,     B0,    0x30  // b3ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    vpackev.d D9,     D4,    D3     //b2r b3r
+    vpackod.d D10,    D4,    D3     //b2i b3i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    VMADD1    U2,     D5,    D9,     U2  //20r 30r
+    VMADD2    U3,     D6,    D9,     U3  //20i 30i
+    VMADD3    U2,     D6,    D10,    U2
+    VMADD4    U3,     D5,    D10,    U3
+
+    vld       D0,     A0,    0x10  // a1ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a1rr
+    vshuf4i.d  D6,     D0,    0x55  //a1ii
+
+    VMADD1    U4,     D5,    D7,     U4  //01r 11r
+    VMADD2    U5,     D6,    D7,     U5  //01i 11i
+    VMADD3    U4,     D6,    D8,     U4
+    VMADD4    U5,     D5,    D8,     U5
+
+    VMADD1    U6,     D5,    D9,     U6  //21r 31r
+    VMADD2    U7,     D6,    D9,     U7  //21i 31i
+    VMADD3    U6,     D6,    D10,    U6
+    VMADD4    U7,     D5,    D10,    U7
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L181
+
+.L182:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U4,    VALPHAR
+    vfmul.d      D3,    U5,    VALPHAR
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res21 res31
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U6,    VALPHAR
+    vfmul.d      D3,    U7,    VALPHAR
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U4,    VALPHAR, D2
+    VFMADD      D3,    U5,    VALPHAR, D3
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res21 res31
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U6,    VALPHAR, D2
+    VFMADD      D3,    U7,    VALPHAR, D3
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L183:   /* if (bm & 1) */
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L186
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x06
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L185
+    blt        TL,     L,     .L185
+
+.L184:  /* for (k=0; k<temp; k++) */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D3,     B0,    0x20  // b2ri
+    vld       D4,     B0,    0x30  // b3ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    vpackev.d D9,     D4,    D3     //b2r b3r
+    vpackod.d D10,    D4,    D3     //b2i b3i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    VMADD1    U2,     D5,    D9,     U2  //20r 30r
+    VMADD2    U3,     D6,    D9,     U3  //20i 30i
+    VMADD3    U2,     D6,    D10,    U2
+    VMADD4    U3,     D5,    D10,    U3
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L184
+
+.L185:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     C3,     TL,   0x06
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+
+.L186:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   4
+#endif
+
+    slli.d     L,      K,     0x06
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x03
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     2
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     2
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&2); j+=2) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     2  //bm/4
+    beq        I,      T0,    .L280
+
+.L21:  /* for (i=0; i<bm/4; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<temp; k++) */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    vld       D0,     A0,    0x10  // a1ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a1rr
+    vshuf4i.d  D6,     D0,    0x55  //a1ii
+
+    VMADD1    U2,     D5,    D7,     U2  //01r 11r
+    VMADD2    U3,     D6,    D7,     U3  //01i 11i
+    VMADD3    U2,     D6,    D8,     U2
+    VMADD4    U3,     D5,    D8,     U3
+
+    vld       D0,     A0,    0x20  // a2ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a2rr
+    vshuf4i.d  D6,     D0,    0x55  //a2ii
+
+    VMADD1    U4,     D5,    D7,     U4  //02r 12r
+    VMADD2    U5,     D6,    D7,     U5  //02i 12i
+    VMADD3    U4,     D6,    D8,     U4
+    VMADD4    U5,     D5,    D8,     U5
+
+    vld       D0,     A0,    0x30  // a3ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a3rr
+    vshuf4i.d  D6,     D0,    0x55  //a3ii
+
+    VMADD1    U6,     D5,    D7,     U6  //03r 13r
+    VMADD2    U7,     D6,    D7,     U7  //03i 13i
+    VMADD3    U6,     D6,    D8,     U6
+    VMADD4    U7,     D5,    D8,     U7
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U4,    VALPHAR
+    vfmul.d      D3,    U5,    VALPHAR
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U6,    VALPHAR
+    vfmul.d      D3,    U7,    VALPHAR
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U4,    VALPHAR, D2
+    VFMADD      D3,    U5,    VALPHAR, D3
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U6,    VALPHAR, D2
+    VFMADD      D3,    U7,    VALPHAR, D3
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L280:   /* if ( bm & 2 )*/
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L284
+
+.L281:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L283
+    blt        TL,     L,     .L283
+
+.L282:  /* for (k=0; k<temp; k++) */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    vld       D0,     A0,    0x10  // a1ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a1rr
+    vshuf4i.d  D6,     D0,    0x55  //a1ii
+
+    VMADD1    U2,     D5,    D7,     U2  //01r 11r
+    VMADD2    U3,     D6,    D7,     U3  //01i 11i
+    VMADD3    U2,     D6,    D8,     U2
+    VMADD4    U3,     D5,    D8,     U3
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L282
+
+.L283:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L284:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L288
+
+.L285:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L287
+    blt        TL,     L,     .L287
+
+.L286:  /* for (k=0; k<temp; k++) */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L286
+
+.L287:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L288:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+    slli.d     L,      K,     5
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   2
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     2
+    andi       T0,     N,     2
+    blt        J,      T0,    .L20
+
+.L30:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L999
+
+.L300:  /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     2  //bm/4
+    beq        I,      T0,    .L38
+
+.L31:  /* for (i=0; i<bm/4; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L33
+    blt        TL,     L,     .L33
+
+.L32:  /* for (k=0; k<temp; k++) */
+    vld       D1,     B0,    0x00  // b0ri
+
+    vld       D0,     A0,    0x00  // a0ri
+    vld       D2,     A0,    0x10  // a1ri
+
+    vpackev.d  D5,     D2,    D0  //a0r a1r
+    vpackod.d  D6,     D2,    D0  //a0i a1i
+
+    vand.v     D7,     D1,    D1
+    vand.v     D8,     D1,    D1
+    vshuf4i.d  D7,     D1,    0x00  //b0rr
+    vshuf4i.d  D8,     D1,    0x55  //b0ii
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 01r
+    VMADD2    U1,     D6,    D7,     U1  //00i 01i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    vld       D0,     A0,    0x20  // a0ri
+    vld       D2,     A0,    0x30  // a1ri
+
+    vpackev.d  D5,     D2,    D0  //a0r a1r
+    vpackod.d  D6,     D2,    D0  //a0i a1i
+
+    VMADD1    U2,     D5,    D7,     U2  //02r 03r
+    VMADD2    U3,     D6,    D7,     U3  //02i 03i
+    VMADD3    U2,     D6,    D8,     U2
+    VMADD4    U3,     D5,    D8,     U3
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L32
+
+.L33:
+#if defined(TRMMKERNEL)
+    //res00 res01
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+
+    //res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#else
+    //res00 res01
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+
+    //res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L31
+
+.L38:   /* if ( bm & 2 ) */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L312
+
+.L39:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L311
+    blt        TL,     L,     .L311
+
+.L310:  /* for (k=0; k<temp; k++) */
+    vld       D1,     B0,    0x00  // b0ri
+
+    vld       D0,     A0,    0x00  // a0ri
+    vld       D2,     A0,    0x10  // a1ri
+
+    vpackev.d  D5,     D2,    D0  //a0r a1r
+    vpackod.d  D6,     D2,    D0  //a0i a1i
+
+    vand.v     D7,     D1,    D1
+    vand.v     D8,     D1,    D1
+    vshuf4i.d  D7,     D1,    0x00  //b0rr
+    vshuf4i.d  D8,     D1,    0x55  //b0ii
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 01r
+    VMADD2    U1,     D6,    D7,     U1  //00i 01i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L310
+
+.L311:
+#if defined(TRMMKERNEL)
+    //res00 res01
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#else
+    //res00 res01
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L312:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L316
+
+.L313:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L315
+    blt        TL,     L,     .L315
+
+.L314:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00
+    LD         a2,     A0,    0x08
+
+    LD         b1,     B0,    0x00
+    LD         b2,     B0,    0x08
+
+    MADD1      c11,    a1,    b1,     c11
+    MADD2      c12,    a2,    b1,     c12
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L314
+
+.L315:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    addi.d     C0,     C0,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L316:
+    slli.d     L,      K,     4
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L300
+
+.L999:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_kernel_8x4_lasx.S b/kernel/loongarch64/zgemm_kernel_8x4_lasx.S
new file mode 100644
index 000000000..ca90b30f5
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel_8x4_lasx.S
@@ -0,0 +1,3545 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+#define T2     $r26
+#define T3     $r27
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $xr30
+#define U1     $xr31
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define VALPHAR $xr28
+#define VALPHAI $xr29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    xvldrepl.d  VALPHAR, $sp, 112
+    xvldrepl.d  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   BASE_SHIFT
+
+    move       J,      $r0
+    srai.d     T0,     N,     2  //bn/4
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/4; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    add.d      C2,     C1,    TL
+    add.d      C3,     C2,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L150
+
+.L11:  /* for(i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     T3,     OFF,   0x07
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x06
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+    xvxor.v    U8,     U8,   U8
+    xvxor.v    U9,     U9,   U9
+    xvxor.v    U10,    U10,  U10
+    xvxor.v    U11,    U11,  U11
+    xvxor.v    U12,    U12,  U12
+    xvxor.v    U13,    U13,  U13
+    xvxor.v    U14,    U14,  U14
+    xvxor.v    U15,    U15,  U15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L13
+    blt        TL,     L,     .L13
+
+.L12:  /* for(k=0; k<temp; k+=1) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+    xvld       D3,     B0,    0x20  // b2ri b3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a0r
+    xvpermi.d  D5,     D0,    0x55  //a0i
+
+    xvpackev.d D6,     D3,    D2
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    xvpackod.d D7,     D3,    D2
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvpermi.d  D4,     D0,    0xaa  //a1r
+    xvpermi.d  D5,     D0,    0xff  //a1i
+
+    XVMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    XVMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    xvld       D0,     A0,    0x20  // a2ri a3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a2r
+    xvpermi.d  D5,     D0,    0x55  //a2i
+
+    XVMADD1    U4,     D4,    D6,     U4  //02r 12r 22r 32r
+    XVMADD2    U5,     D5,    D6,     U5  //02i 12i 22i 32i
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    xvpermi.d  D4,     D0,    0xaa  //a3r
+    xvpermi.d  D5,     D0,    0xff  //a3i
+
+    XVMADD1    U6,     D4,    D6,     U6  //03r 13r 23r 33r
+    XVMADD2    U7,     D5,    D6,     U7  //03i 13i 23i 33i
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    xvld       D0,     A0,    0x40  // a4ri a5ri
+
+    xvpermi.d  D4,     D0,    0x00  //a4r
+    xvpermi.d  D5,     D0,    0x55  //a4i
+
+    XVMADD1    U8,     D4,    D6,     U8  //04r 14r 24r 34r
+    XVMADD2    U9,     D5,    D6,     U9  //04i 14i 24i 34i
+    XVMADD3    U8,     D5,    D7,     U8
+    XVMADD4    U9,     D4,    D7,     U9
+
+    xvpermi.d  D4,     D0,    0xaa  //a5r
+    xvpermi.d  D5,     D0,    0xff  //a5i
+
+    XVMADD1    U10,     D4,    D6,     U10  //05r 15r 25r 35r
+    XVMADD2    U11,     D5,    D6,     U11  //05i 15i 25i 35i
+    XVMADD3    U10,     D5,    D7,     U10
+    XVMADD4    U11,     D4,    D7,     U11
+
+    xvld       D0,     A0,    0x60  // a6ri a7ri
+
+    xvpermi.d  D4,     D0,    0x00  //a6r
+    xvpermi.d  D5,     D0,    0x55  //a6i
+
+    XVMADD1    U12,     D4,    D6,     U12  //06r 16r 26r 36r
+    XVMADD2    U13,     D5,    D6,     U13  //06i 16i 26i 36i
+    XVMADD3    U12,     D5,    D7,     U12
+    XVMADD4    U13,     D4,    D7,     U13
+
+    xvpermi.d  D4,     D0,    0xaa  //a5r
+    xvpermi.d  D5,     D0,    0xff  //a5i
+
+    XVMADD1    U14,     D4,    D6,     U14  //07r 17r 27r 37r
+    XVMADD2    U15,     D5,    D6,     U15  //07i 17i 27i 37i
+    XVMADD3    U14,     D5,    D7,     U14
+    XVMADD4    U15,     D4,    D7,     U15
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L12
+
+.L13:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U0,    VALPHAR
+    xvfmul.d      D7,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U2,    VALPHAR
+    xvfmul.d      D7,    U3,    VALPHAR
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res02 res12 res22 res32
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U4,    VALPHAR
+    xvfmul.d      D7,    U5,    VALPHAR
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D7,    U4,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U6,    VALPHAR
+    xvfmul.d      D7,    U7,    VALPHAR
+    XVNMSUB      D6,    U7,    VALPHAI, D6
+    XVFMADD      D7,    U6,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res04 res14 res24 res34
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U8,    VALPHAR
+    xvfmul.d      D7,    U9,    VALPHAR
+    XVNMSUB      D6,    U9,    VALPHAI, D6
+    XVFMADD      D7,    U8,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res05 res15 res25 res35
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U10,    VALPHAR
+    xvfmul.d      D7,    U11,    VALPHAR
+    XVNMSUB      D6,    U11,    VALPHAI, D6
+    XVFMADD      D7,    U10,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res06 res16 res26 res36
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U12,    VALPHAR
+    xvfmul.d      D7,    U13,    VALPHAR
+    XVNMSUB      D6,    U13,    VALPHAI, D6
+    XVFMADD      D7,    U12,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res07 res17 res27 res37
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U14,    VALPHAR
+    xvfmul.d      D7,    U15,    VALPHAR
+    XVNMSUB      D6,    U15,    VALPHAI, D6
+    XVFMADD      D7,    U14,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#else
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D7,    U1,    VALPHAR, D7
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U2,    VALPHAR, D6
+    XVFMADD      D7,    U3,    VALPHAR, D7
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res02 res12 res22 res32
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U4,    VALPHAR, D6
+    XVFMADD      D7,    U5,    VALPHAR, D7
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D7,    U4,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U6,    VALPHAR, D6
+    XVFMADD      D7,    U7,    VALPHAR, D7
+    XVNMSUB      D6,    U7,    VALPHAI, D6
+    XVFMADD      D7,    U6,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res04 res14 res24 res34
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U8,    VALPHAR, D6
+    XVFMADD      D7,    U9,    VALPHAR, D7
+    XVNMSUB      D6,    U9,    VALPHAI, D6
+    XVFMADD      D7,    U8,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res05 res15 res25 res35
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U10,    VALPHAR, D6
+    XVFMADD      D7,    U11,    VALPHAR, D7
+    XVNMSUB      D6,    U11,    VALPHAI, D6
+    XVFMADD      D7,    U10,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res06 res16 res26 res36
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U12,    VALPHAR, D6
+    XVFMADD      D7,    U13,    VALPHAR, D7
+    XVNMSUB      D6,    U13,    VALPHAI, D6
+    XVFMADD      D7,    U12,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res07 res17 res27 res37
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U14,    VALPHAR, D6
+    XVFMADD      D7,    U15,    VALPHAR, D7
+    XVNMSUB      D6,    U15,    VALPHAI, D6
+    XVFMADD      D7,    U14,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     4
+    beq        I,      T0,    .L18
+
+.L15:  /* if (bm & 4) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x06
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+    xvld       D3,     B0,    0x20  // b2ri b3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a0r
+    xvpermi.d  D5,     D0,    0x55  //a0i
+
+    xvpackev.d D6,     D3,    D2
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    xvpackod.d D7,     D3,    D2
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvpermi.d  D4,     D0,    0xaa  //a1r
+    xvpermi.d  D5,     D0,    0xff  //a1i
+
+    XVMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    XVMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    xvld       D0,     A0,    0x20  // a2ri a3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a2r
+    xvpermi.d  D5,     D0,    0x55  //a2i
+
+    XVMADD1    U4,     D4,    D6,     U4  //02r 12r 22r 32r
+    XVMADD2    U5,     D5,    D6,     U5  //02i 12i 22i 32i
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    xvpermi.d  D4,     D0,    0xaa  //a3r
+    xvpermi.d  D5,     D0,    0xff  //a3i
+
+    XVMADD1    U6,     D4,    D6,     U6  //03r 13r 23r 33r
+    XVMADD2    U7,     D5,    D6,     U7  //03i 13i 23i 33i
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U0,    VALPHAR
+    xvfmul.d      D7,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U2,    VALPHAR
+    xvfmul.d      D7,    U3,    VALPHAR
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res02 res12 res22 res32
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U4,    VALPHAR
+    xvfmul.d      D7,    U5,    VALPHAR
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D7,    U4,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U6,    VALPHAR
+    xvfmul.d      D7,    U7,    VALPHAR
+    XVNMSUB      D6,    U7,    VALPHAI, D6
+    XVFMADD      D7,    U6,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#else
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D7,    U1,    VALPHAR, D7
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U2,    VALPHAR, D6
+    XVFMADD      D7,    U3,    VALPHAR, D7
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res02 res12 res22 res32
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U4,    VALPHAR, D6
+    XVFMADD      D7,    U5,    VALPHAR, D7
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D7,    U4,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U6,    VALPHAR, D6
+    XVFMADD      D7,    U7,    VALPHAR, D7
+    XVNMSUB      D6,    U7,    VALPHAI, D6
+    XVFMADD      D7,    U6,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L18:   /* if (bm & 2) */
+    move       I,      $r0
+    andi       T0,     M,     2
+    beq        I,      T0,    .L183
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x06
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L182
+    blt        TL,     L,     .L182
+
+.L181:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+    xvld       D3,     B0,    0x20  // b2ri b3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a0r
+    xvpermi.d  D5,     D0,    0x55  //a0i
+
+    xvpackev.d D6,     D3,    D2
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    xvpackod.d D7,     D3,    D2
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvpermi.d  D4,     D0,    0xaa  //a1r
+    xvpermi.d  D5,     D0,    0xff  //a1i
+
+    XVMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    XVMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L181
+
+.L182:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U0,    VALPHAR
+    xvfmul.d      D7,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U2,    VALPHAR
+    xvfmul.d      D7,    U3,    VALPHAR
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#else
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D7,    U1,    VALPHAR, D7
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U2,    VALPHAR, D6
+    XVFMADD      D7,    U3,    VALPHAR, D7
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L183:   /* if (bm & 1) */
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L186
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x06
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L185
+    blt        TL,     L,     .L185
+
+.L184:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+    xvld       D3,     B0,    0x20  // b2ri b3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a0r
+    xvpermi.d  D5,     D0,    0x55  //a0i
+
+    xvpackev.d D6,     D3,    D2
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    xvpackod.d D7,     D3,    D2
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L184
+
+.L185:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U0,    VALPHAR
+    xvfmul.d      D7,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    xvpermi.d  D8,     D10,   0x4e //c1[0] c1[1] c0[0] c0[1]
+    xvpermi.d  D9,     D11,   0x4e //c3[0] c3[1] c2[0] c2[1]
+
+    vst       $vr26,     C0,    0x00
+    vst       $vr24,     C1,    0x00
+    vst       $vr27,     C2,    0x00
+    vst       $vr25,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D7,    U1,    VALPHAR, D7
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    xvpermi.d  D8,     D10,   0x4e //c1[0] c1[1] c0[0] c0[1]
+    xvpermi.d  D9,     D11,   0x4e //c3[0] c3[1] c2[0] c2[1]
+
+    vst       $vr26,     C0,    0x00
+    vst       $vr24,     C1,    0x00
+    vst       $vr27,     C2,    0x00
+    vst       $vr25,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+
+.L186:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   4
+#endif
+
+    slli.d     L,      K,     0x06
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x03
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     2
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     2
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&2); j+=2) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L24
+
+.L21:  /* for (i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr30,     $vr30,   $vr30
+    vxor.v    $vr31,     $vr31,   $vr31
+    vxor.v    $vr2,      $vr2,    $vr2
+    vxor.v    $vr3,      $vr3,    $vr3
+    vxor.v    $vr4,      $vr4,    $vr4
+    vxor.v    $vr5,      $vr5,    $vr5
+    vxor.v    $vr6,      $vr6,    $vr6
+    vxor.v    $vr7,      $vr7,    $vr7
+    vxor.v    $vr8,      $vr8,    $vr8
+    vxor.v    $vr9,      $vr9,    $vr9
+    vxor.v    $vr10,      $vr10,    $vr10
+    vxor.v    $vr11,      $vr11,    $vr11
+    vxor.v    $vr12,      $vr12,    $vr12
+    vxor.v    $vr13,      $vr13,    $vr13
+    vxor.v    $vr14,      $vr14,    $vr14
+    vxor.v    $vr15,      $vr15,    $vr15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri
+
+    vld       $vr18,     B0,    0x00  // b0ri
+    vld       $vr19,     B0,    0x10  // b1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a0rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a0ii
+
+    vand.v    $vr23,     $vr18,    $vr18
+    vshuf4i.d $vr23,     $vr19,    0x08 //b0r b1r
+    vshuf4i.d $vr18,     $vr19,    0x0d //b0i b1i
+
+    VMADD1    $vr30,     $vr21,    $vr23,     $vr30  //00r 10r
+    VMADD2    $vr31,     $vr22,    $vr23,     $vr31  //00i 10i
+    VMADD3    $vr30,     $vr22,    $vr18,     $vr30
+    VMADD4    $vr31,     $vr21,    $vr18,     $vr31
+
+    vld       $vr16,     A0,    0x10  // a1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a1rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a1ii
+
+    VMADD1    $vr2,     $vr21,    $vr23,     $vr2  //01r 11r
+    VMADD2    $vr3,     $vr22,    $vr23,     $vr3  //01i 11i
+    VMADD3    $vr2,     $vr22,    $vr18,     $vr2
+    VMADD4    $vr3,     $vr21,    $vr18,     $vr3
+
+    vld       $vr16,     A0,    0x20  // a2ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a2rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a2ii
+
+    VMADD1    $vr4,     $vr21,    $vr23,     $vr4  //02r 12r
+    VMADD2    $vr5,     $vr22,    $vr23,     $vr5  //02i 12i
+    VMADD3    $vr4,     $vr22,    $vr18,     $vr4
+    VMADD4    $vr5,     $vr21,    $vr18,     $vr5
+
+    vld       $vr16,     A0,    0x30  // a3ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a3rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a3ii
+
+    VMADD1    $vr6,     $vr21,    $vr23,     $vr6  //03r 13r
+    VMADD2    $vr7,     $vr22,    $vr23,     $vr7  //03i 13i
+    VMADD3    $vr6,     $vr22,    $vr18,     $vr6
+    VMADD4    $vr7,     $vr21,    $vr18,     $vr7
+
+    vld       $vr16,     A0,    0x40  // a4ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a4rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a4ii
+
+    VMADD1    $vr8,     $vr21,    $vr23,     $vr8  //04r 14r
+    VMADD2    $vr9,     $vr22,    $vr23,     $vr9  //04i 14i
+    VMADD3    $vr8,     $vr22,    $vr18,     $vr8
+    VMADD4    $vr9,     $vr21,    $vr18,     $vr9
+
+    vld       $vr16,     A0,    0x50  // a5ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a5rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a5ii
+
+    VMADD1    $vr10,     $vr21,    $vr23,     $vr10  //05r 15r
+    VMADD2    $vr11,     $vr22,    $vr23,     $vr11  //05i 15i
+    VMADD3    $vr10,     $vr22,    $vr18,     $vr10
+    VMADD4    $vr11,     $vr21,    $vr18,     $vr11
+
+    vld       $vr16,     A0,    0x60  // a6ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a6rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a6ii
+
+    VMADD1    $vr12,     $vr21,    $vr23,     $vr12  //06r 16r
+    VMADD2    $vr13,     $vr22,    $vr23,     $vr13  //06i 16i
+    VMADD3    $vr12,     $vr22,    $vr18,     $vr12
+    VMADD4    $vr13,     $vr21,    $vr18,     $vr13
+
+    vld       $vr16,     A0,    0x70  // a7ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a7rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a7ii
+
+    VMADD1    $vr14,     $vr21,    $vr23,     $vr14  //07r 17r
+    VMADD2    $vr15,     $vr22,    $vr23,     $vr15  //07i 17i
+    VMADD3    $vr14,     $vr22,    $vr18,     $vr14
+    VMADD4    $vr15,     $vr21,    $vr18,     $vr15
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr30,    $vr28
+    vfmul.d      $vr19,    $vr31,    $vr28
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr19,    $vr30,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr2,    $vr28
+    vfmul.d      $vr19,    $vr3,    $vr28
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr4,    $vr28
+    vfmul.d      $vr19,    $vr5,    $vr28
+    VNMSUB      $vr18,    $vr5,    $vr29, $vr18
+    VFMADD      $vr19,    $vr4,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr6,    $vr28
+    vfmul.d      $vr19,    $vr7,    $vr28
+    VNMSUB      $vr18,    $vr7,    $vr29, $vr18
+    VFMADD      $vr19,    $vr6,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res04 res14
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr8,    $vr28
+    vfmul.d      $vr19,    $vr9,    $vr28
+    VNMSUB      $vr18,    $vr9,    $vr29, $vr18
+    VFMADD      $vr19,    $vr8,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res05 res15
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr10,    $vr28
+    vfmul.d      $vr19,    $vr11,    $vr28
+    VNMSUB      $vr18,    $vr11,    $vr29, $vr18
+    VFMADD      $vr19,    $vr10,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res06 res16
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr12,    $vr28
+    vfmul.d      $vr19,    $vr13,    $vr28
+    VNMSUB      $vr18,    $vr13,    $vr29, $vr18
+    VFMADD      $vr19,    $vr12,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res07 res17
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr14,    $vr28
+    vfmul.d      $vr19,    $vr15,    $vr28
+    VNMSUB      $vr18,    $vr15,    $vr29, $vr18
+    VFMADD      $vr19,    $vr14,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr30,    $vr28, $vr18
+    VFMADD      $vr19,    $vr31,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr19,    $vr30,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr2,    $vr28, $vr18
+    VFMADD      $vr19,    $vr3,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr4,    $vr28, $vr18
+    VFMADD      $vr19,    $vr5,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr5,    $vr29, $vr18
+    VFMADD      $vr19,    $vr4,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr6,    $vr28, $vr18
+    VFMADD      $vr19,    $vr7,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr7,    $vr29, $vr18
+    VFMADD      $vr19,    $vr6,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res04 res14
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr8,    $vr28, $vr18
+    VFMADD      $vr19,    $vr9,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr9,    $vr29, $vr18
+    VFMADD      $vr19,    $vr8,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res05 res15
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr10,    $vr28, $vr18
+    VFMADD      $vr19,    $vr11,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr11,    $vr29, $vr18
+    VFMADD      $vr19,    $vr10,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res06 res16
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr12,    $vr28, $vr18
+    VFMADD      $vr19,    $vr13,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr13,    $vr29, $vr18
+    VFMADD      $vr19,    $vr12,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res07 res17
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr14,    $vr28, $vr18
+    VFMADD      $vr19,    $vr15,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr15,    $vr29, $vr18
+    VFMADD      $vr19,    $vr14,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L280
+
+.L25:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr30,     $vr30,   $vr30
+    vxor.v    $vr31,     $vr31,   $vr31
+    vxor.v    $vr2,      $vr2,    $vr2
+    vxor.v    $vr3,      $vr3,    $vr3
+    vxor.v    $vr4,      $vr4,    $vr4
+    vxor.v    $vr5,      $vr5,    $vr5
+    vxor.v    $vr6,      $vr6,    $vr6
+    vxor.v    $vr7,      $vr7,    $vr7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri
+
+    vld       $vr18,     B0,    0x00  // b0ri
+    vld       $vr19,     B0,    0x10  // b1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a0rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a0ii
+
+    vand.v    $vr23,     $vr18,    $vr18
+    vshuf4i.d $vr23,     $vr19,    0x08 //b0r b1r
+    vshuf4i.d $vr18,     $vr19,    0x0d //b0i b1i
+
+    VMADD1    $vr30,     $vr21,    $vr23,     $vr30  //00r 10r
+    VMADD2    $vr31,     $vr22,    $vr23,     $vr31  //00i 10i
+    VMADD3    $vr30,     $vr22,    $vr18,     $vr30
+    VMADD4    $vr31,     $vr21,    $vr18,     $vr31
+
+    vld       $vr16,     A0,    0x10  // a1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a1rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a1ii
+
+    VMADD1    $vr2,     $vr21,    $vr23,     $vr2  //01r 11r
+    VMADD2    $vr3,     $vr22,    $vr23,     $vr3  //01i 11i
+    VMADD3    $vr2,     $vr22,    $vr18,     $vr2
+    VMADD4    $vr3,     $vr21,    $vr18,     $vr3
+
+    vld       $vr16,     A0,    0x20  // a2ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a2rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a2ii
+
+    VMADD1    $vr4,     $vr21,    $vr23,     $vr4  //02r 12r
+    VMADD2    $vr5,     $vr22,    $vr23,     $vr5  //02i 12i
+    VMADD3    $vr4,     $vr22,    $vr18,     $vr4
+    VMADD4    $vr5,     $vr21,    $vr18,     $vr5
+
+    vld       $vr16,     A0,    0x30  // a3ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a3rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a3ii
+
+    VMADD1    $vr6,     $vr21,    $vr23,     $vr6  //03r 13r
+    VMADD2    $vr7,     $vr22,    $vr23,     $vr7  //03i 13i
+    VMADD3    $vr6,     $vr22,    $vr18,     $vr6
+    VMADD4    $vr7,     $vr21,    $vr18,     $vr7
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr30,    $vr28
+    vfmul.d      $vr19,    $vr31,    $vr28
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr19,    $vr30,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr2,    $vr28
+    vfmul.d      $vr19,    $vr3,    $vr28
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr4,    $vr28
+    vfmul.d      $vr19,    $vr5,    $vr28
+    VNMSUB      $vr18,    $vr5,    $vr29, $vr18
+    VFMADD      $vr19,    $vr4,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr6,    $vr28
+    vfmul.d      $vr19,    $vr7,    $vr28
+    VNMSUB      $vr18,    $vr7,    $vr29, $vr18
+    VFMADD      $vr19,    $vr6,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr30,    $vr28, $vr18
+    VFMADD      $vr19,    $vr31,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr19,    $vr30,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr2,    $vr28, $vr18
+    VFMADD      $vr19,    $vr3,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr4,    $vr28, $vr18
+    VFMADD      $vr19,    $vr5,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr5,    $vr29, $vr18
+    VFMADD      $vr19,    $vr4,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr6,    $vr28, $vr18
+    VFMADD      $vr19,    $vr7,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr7,    $vr29, $vr18
+    VFMADD      $vr19,    $vr6,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L280:   /* if ( bm & 2 )*/
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L284
+
+.L281:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L283
+    blt        TL,     L,     .L283
+
+.L282:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+
+    xvpermi.d  D1,     D0,    0xf5  //a0ii a1ii
+    xvpermi.d  D0,     D0,    0xa0  //a0rr a1rr
+
+    xvpermi.d  D3,     D2,    0xdd  //b0i b1i b0i b1i
+    xvpermi.d  D2,     D2,    0x88  //b0r b1r b0r b1r
+
+    XVMADD1    U0,     D0,    D2,     U0  //00r 10r 01r 11r
+    XVMADD2    U1,     D1,    D2,     U1  //00i 10i 01i 11i
+    XVMADD3    U0,     D1,    D3,     U0
+    XVMADD4    U1,     D0,    D3,     U1
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L282
+
+.L283:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    xvpackev.d D2,     D1,    D0  //0 4 2 6
+    xvpackod.d D3,     D1,    D0  //1 5 3 7
+
+    xvfmul.d      D2,    U0,    VALPHAR
+    xvfmul.d      D3,    U1,    VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvpackev.d D4,     D3,    D2  //0 1 2 3
+    xvpackod.d D5,     D3,    D2  //4 5 6 7
+
+    xvst       D4,     C0,    0x00 //c0: 0 1 2 3
+    xvst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#else
+    //res00 res10 res01 res11
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    xvpackev.d D2,     D1,    D0  //0 4 2 6
+    xvpackod.d D3,     D1,    D0  //1 5 3 7
+
+    XVFMADD      D2,    U0,    VALPHAR, D2
+    XVFMADD      D3,    U1,    VALPHAR, D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvpackev.d D4,     D3,    D2  //0 1 2 3
+    xvpackod.d D5,     D3,    D2  //4 5 6 7
+
+    xvst       D4,     C0,    0x00 //c0: 0 1 2 3
+    xvst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L284:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L288
+
+.L285:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr30,     $vr30,   $vr30
+    vxor.v    $vr31,     $vr31,   $vr31
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L287
+    blt        TL,     L,     .L287
+
+.L286:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri
+
+    vld       $vr18,     B0,    0x00  // b0ri
+    vld       $vr19,     B0,    0x10  // b1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a0rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a0ii
+
+    vand.v    $vr23,     $vr18,    $vr18
+    vshuf4i.d $vr23,     $vr19,    0x08 //b0r b1r
+    vshuf4i.d $vr18,     $vr19,    0x0d //b0i b1i
+
+    VMADD1    $vr30,     $vr21,    $vr23,     $vr30  //00r 10r
+    VMADD2    $vr31,     $vr22,    $vr23,     $vr31  //00i 10i
+    VMADD3    $vr30,     $vr22,    $vr18,     $vr30
+    VMADD4    $vr31,     $vr21,    $vr18,     $vr31
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L286
+
+.L287:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vand.v    $vr18,     $vr16,    $vr16
+    vshuf4i.d $vr18,     $vr17,    0x08 //c0[0] c1[0]
+    vshuf4i.d $vr16,     $vr17,    0x0d //c0[1] c1[1]
+
+    vfmul.d      $vr18,    $vr30,    $vr28
+    vfmul.d      $vr16,    $vr31,    $vr28
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr16,    $vr30,    $vr29, $vr16
+
+    vand.v    $vr19,     $vr18,    $vr18
+    vshuf4i.d $vr19,     $vr16,    0x08 //c0[0] c0[1]
+    vshuf4i.d $vr18,     $vr16,    0x0d //c1[0] c1[1]
+
+    vst       $vr19,     C0,    0x00 //c0: 0 1
+    vst       $vr18,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vand.v    $vr18,     $vr16,    $vr16
+    vshuf4i.d $vr18,     $vr17,    0x08 //c0[0] c1[0]
+    vshuf4i.d $vr16,     $vr17,    0x0d //c0[1] c1[1]
+
+    VFMADD      $vr18,    $vr30,    $vr28, $vr18
+    VFMADD      $vr16,    $vr31,    $vr28, $vr16
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr16,    $vr30,    $vr29, $vr16
+
+    vand.v    $vr19,     $vr18,    $vr18
+    vshuf4i.d $vr19,     $vr16,    0x08 //c0[0] c0[1]
+    vshuf4i.d $vr18,     $vr16,    0x0d //c1[0] c1[1]
+
+    vst       $vr19,     C0,    0x00 //c0: 0 1
+    vst       $vr18,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L288:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+    slli.d     L,      K,     5
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   2
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     2
+    andi       T0,     N,     2
+    blt        J,      T0,    .L20
+
+.L30:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L999
+
+.L300:  /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L34
+
+.L31:  /* for (i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L33
+    blt        TL,     L,     .L33
+
+.L32:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D1,     A0,    0x20  // a2ri a3ri
+
+    xvldrepl.d D2,     B0,    0x00 //b0r
+    xvldrepl.d D3,     B0,    0x08 //b0i
+
+    xvpackev.d D4,     D1,    D0
+    xvpermi.d  D4,     D4,    0xd8  //a0r a1r a2r a3r
+
+    xvpackod.d D5,     D1,    D0
+    xvpermi.d  D5,     D5,    0xd8  //a0i a1i a2i a3i
+
+    XVMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r
+    XVMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i
+    XVMADD3    U0,     D5,    D3,     U0
+    XVMADD4    U1,     D4,    D3,     U1
+
+    xvld       D0,     A0,    0x40  // a4ri a5ri
+    xvld       D1,     A0,    0x60  // a6ri a7ri
+
+    xvpackev.d D4,     D1,    D0
+    xvpermi.d  D4,     D4,    0xd8  //a4r a5r a6r a7r
+
+    xvpackod.d D5,     D1,    D0
+    xvpermi.d  D5,     D5,    0xd8  //a4i a5i a6i a7i
+
+    XVMADD1    U2,     D4,    D2,     U2  //04r 05r 06r 07r
+    XVMADD2    U3,     D5,    D2,     U3  //04i 05i 06i 07i
+    XVMADD3    U2,     D5,    D3,     U2
+    XVMADD4    U3,     D4,    D3,     U3
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L32
+
+.L33:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C0,    0x20 //c0: 4 5 6 7
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //1 3 5 7
+
+    xvfmul.d      D2,    U0,    VALPHAR
+    xvfmul.d      D3,    U1,    VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6
+    xvpermi.q  D4,     D3,   0x02 //0 2 1 3
+    xvpermi.d  D4,     D4,   0xd8 //0 1 2 3
+
+    xvand.v    D5,     D3,   D3  //1 3 5 7
+    xvpermi.q  D5,     D2,   0x31 //4 6 5 7
+    xvpermi.d  D5,     D5,   0xd8 //4 5 6 7
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    //res04 res05 res06 res07
+    xvld       D0,     C0,    0x40 //c0: 8 9 10 11
+    xvld       D1,     C0,    0x60 //c0: 12 13 14 15
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //8 10 12 14
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //9 11 13 15
+
+    xvfmul.d      D2,    U2,    VALPHAR
+    xvfmul.d      D3,    U3,    VALPHAR
+    XVNMSUB      D2,    U3,    VALPHAI, D2
+    XVFMADD      D3,    U2,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //8 10 9 11
+    xvpermi.d  D4,     D4,   0xd8 //8 9 10 11
+
+    xvand.v    D5,     D3,   D3  //9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //12 14 13 15
+    xvpermi.d  D5,     D5,   0xd8 //12 13 14 15
+
+    xvst       D4,     C0,    0x40
+    xvst       D5,     C0,    0x60
+
+    addi.d     C0,     C0,    0x80
+#else
+    //res00 res01 res02 res03
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C0,    0x20 //c0: 4 5 6 7
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //1 3 5 7
+
+    XVFMADD      D2,    U0,    VALPHAR, D2
+    XVFMADD      D3,    U1,    VALPHAR, D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6
+    xvpermi.q  D4,     D3,   0x02 //0 2 1 3
+    xvpermi.d  D4,     D4,   0xd8 //0 1 2 3
+
+    xvand.v    D5,     D3,   D3  //1 3 5 7
+    xvpermi.q  D5,     D2,   0x31 //4 6 5 7
+    xvpermi.d  D5,     D5,   0xd8 //4 5 6 7
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    //res04 res05 res06 res07
+    xvld       D0,     C0,    0x40 //c0: 8 9 10 11
+    xvld       D1,     C0,    0x60 //c0: 12 13 14 15
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //8 10 12 14
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //9 11 13 15
+
+    XVFMADD      D2,    U2,    VALPHAR, D2
+    XVFMADD      D3,    U3,    VALPHAR, D3
+    XVNMSUB      D2,    U3,    VALPHAI, D2
+    XVFMADD      D3,    U2,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //8 10 9 11
+    xvpermi.d  D4,     D4,   0xd8 //8 9 10 11
+
+    xvand.v    D5,     D3,   D3  //9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //12 14 13 15
+    xvpermi.d  D5,     D5,   0xd8 //12 13 14 15
+
+    xvst       D4,     C0,    0x40
+    xvst       D5,     C0,    0x60
+
+    addi.d     C0,     C0,    0x80
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L31
+
+.L34:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L38
+
+.L35:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L37
+    blt        TL,     L,     .L37
+
+.L36:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D1,     A0,    0x20  // a2ri a3ri
+
+    xvldrepl.d D2,     B0,    0x00 //b0r
+    xvldrepl.d D3,     B0,    0x08 //b0i
+
+    xvpackev.d D4,     D1,    D0
+    xvpermi.d  D4,     D4,    0xd8  //a0r a1r a2r a3r
+
+    xvpackod.d D5,     D1,    D0
+    xvpermi.d  D5,     D5,    0xd8  //a0i a1i a2i a3i
+
+    XVMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r
+    XVMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i
+    XVMADD3    U0,     D5,    D3,     U0
+    XVMADD4    U1,     D4,    D3,     U1
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L36
+
+.L37:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C0,    0x20 //c0: 4 5 6 7
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //1 3 5 7
+
+    xvfmul.d      D2,    U0,    VALPHAR
+    xvfmul.d      D3,    U1,    VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6
+    xvpermi.q  D4,     D3,   0x02 //0 2 1 3
+    xvpermi.d  D4,     D4,   0xd8 //0 1 2 3
+
+    xvand.v    D5,     D3,   D3  //1 3 5 7
+    xvpermi.q  D5,     D2,   0x31 //4 6 5 7
+    xvpermi.d  D5,     D5,   0xd8 //4 5 6 7
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#else
+    //res00 res01 res02 res03
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C0,    0x20 //c0: 4 5 6 7
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //1 3 5 7
+
+    XVFMADD      D2,    U0,    VALPHAR, D2
+    XVFMADD      D3,    U1,    VALPHAR, D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6
+    xvpermi.q  D4,     D3,   0x02 //0 2 1 3
+    xvpermi.d  D4,     D4,   0xd8 //0 1 2 3
+
+    xvand.v    D5,     D3,   D3  //1 3 5 7
+    xvpermi.q  D5,     D2,   0x31 //4 6 5 7
+    xvpermi.d  D5,     D5,   0xd8 //4 5 6 7
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L38:   /* if ( bm & 2 ) */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L312
+
+.L39:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr30,     $vr30,   $vr30
+    vxor.v    $vr31,     $vr31,   $vr31
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L311
+    blt        TL,     L,     .L311
+
+.L310:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri
+    vld       $vr17,     A0,    0x10  // a1ri
+
+    //vldrepl.d $vr18,     B0,    0x00 //b0rr
+    //vldrepl.d $vr19,     B0,    0x08 //b0ii
+    vld       $vr18,     B0,    0x00
+    vld       $vr19,     B0,    0x00
+    vshuf4i.d  $vr18,     $vr18,    0x00 //b0rr
+    vshuf4i.d  $vr19,     $vr19,    0x05 //b0ii
+
+    vand.v     $vr20,     $vr16,    $vr16
+    vshuf4i.d  $vr20,     $vr17,    0x08  //a0r a1r
+    vshuf4i.d  $vr16,     $vr17,    0x0d  //a0i a1i
+
+    VMADD1    $vr30,     $vr20,    $vr18,     $vr30  //00r 01r
+    VMADD2    $vr31,     $vr16,    $vr18,     $vr31  //00i 01i
+    VMADD3    $vr30,     $vr16,    $vr19,     $vr30
+    VMADD4    $vr31,     $vr20,    $vr19,     $vr31
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L310
+
+.L311:
+#if defined(TRMMKERNEL)
+    //res00 res01
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C0,    0x10 //c0: 2 3
+
+    vand.v    $vr18,     $vr16,    $vr16
+    vshuf4i.d $vr18,     $vr17,    0x08 //c0[0] c0[2]
+    vshuf4i.d $vr16,     $vr17,    0x0d //c0[1] c0[3]
+
+    vfmul.d      $vr18,    $vr30,    $vr28
+    vfmul.d      $vr16,    $vr31,    $vr28
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr16,    $vr30,    $vr29, $vr16
+
+    vand.v    $vr19,     $vr18,    $vr18
+    vshuf4i.d $vr19,     $vr16,    0x08 //c0[0] c0[1]
+    vshuf4i.d $vr18,     $vr16,    0x0d //c0[2] c0[3]
+
+    vst       $vr19,     C0,    0x00 //c0: 0 1
+    vst       $vr18,     C0,    0x10 //c1: 2 3
+
+    addi.d     C0,     C0,    0x20
+#else
+    //res00 res01
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C0,    0x10 //c0: 2 3
+
+    vand.v    $vr18,     $vr16,    $vr16
+    vshuf4i.d $vr18,     $vr17,    0x08 //c0[0] c0[2]
+    vshuf4i.d $vr16,     $vr17,    0x0d //c0[1] c0[3]
+
+    VFMADD      $vr18,    $vr30,    $vr28, $vr18
+    VFMADD      $vr16,    $vr31,    $vr28, $vr16
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr16,    $vr30,    $vr29, $vr16
+
+    vand.v    $vr19,     $vr18,    $vr18
+    vshuf4i.d $vr19,     $vr16,    0x08 //c0[0] c0[1]
+    vshuf4i.d $vr18,     $vr16,    0x0d //c0[2] c0[3]
+
+    vst       $vr19,     C0,    0x00 //c0: 0 1
+    vst       $vr18,     C0,    0x10 //c1: 2 3
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L312:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L316
+
+.L313:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L315
+    blt        TL,     L,     .L315
+
+.L314:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00
+    LD         a2,     A0,    0x08
+
+    LD         b1,     B0,    0x00
+    LD         b2,     B0,    0x08
+
+    MADD1      c11,    a1,    b1,     c11
+    MADD2      c12,    a2,    b1,     c12
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L314
+
+.L315:
+#if defined(TRMMKERNEL)
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    addi.d     C0,     C0,    0x10
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    addi.d     C0,     C0,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L316:
+    slli.d     L,      K,     4
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L300
+
+.L999:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_ncopy_4_lasx.S b/kernel/loongarch64/zgemm_ncopy_4_lasx.S
new file mode 100644
index 000000000..5d874bcc1
--- /dev/null
+++ b/kernel/loongarch64/zgemm_ncopy_4_lasx.S
@@ -0,0 +1,320 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define TD     $r20
+#define TS     $r11
+#define TL     $r19
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x03
+    slli.d     TL,   TL,   0x01
+
+    srai.d     J,    N,    0x02
+    beq        J,    ZERO,  .L_N0
+
+.L_J1: /* J-- */
+    move       S1,   TS
+    add.d      S2,   S1,   TL
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+
+    slli.d     T0,   TL,   0x02
+    add.d      TS,   TS,   T0
+
+    srai.d     I,    M,    0x02
+    beq        I,    ZERO,  .L_I3
+
+.L_I1: /* I-- */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x20
+    xvld       U2,   S2,   0x00
+    xvld       U3,   S2,   0x20
+    xvld       U4,   S3,   0x00
+    xvld       U5,   S3,   0x20
+    xvld       U6,   S4,   0x00
+    xvld       U7,   S4,   0x20
+
+    xvand.v    D0,   U0,   U0
+    xvand.v    D1,   U1,   U1
+    xvand.v    D2,   U2,   U2
+    xvand.v    D3,   U3,   U3
+    xvand.v    D4,   U4,   U4
+    xvand.v    D5,   U5,   U5
+    xvand.v    D6,   U6,   U6
+    xvand.v    D7,   U7,   U7
+
+    xvpermi.q  D0,   U2,   0x02
+    xvpermi.q  D4,   U6,   0x02
+    xvpermi.q  D2,   U0,   0x31
+    xvpermi.q  D6,   U4,   0x31
+    xvpermi.q  D1,   U3,   0x02
+    xvpermi.q  D5,   U7,   0x02
+    xvpermi.q  D3,   U1,   0x31
+    xvpermi.q  D7,   U5,   0x31
+
+    xvst       D0,   TD,   0x00
+    xvst       D4,   TD,   0x20
+    xvst       D2,   TD,   0x40
+    xvst       D6,   TD,   0x60
+    xvst       D1,   TD,   0x80
+    xvst       D5,   TD,   0xa0
+    xvst       D3,   TD,   0xc0
+    xvst       D7,   TD,   0xe0
+
+    addi.d     S1,   S1,   0x40   // a_offset
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     TD,   TD,   0x100  // b_offset
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_I3:
+    andi       I,    M,    0x02
+    beq        I,    ZERO, .L_II20
+
+.L_II1:  /* if(m&2) */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+
+    xvand.v    D0,   U0,   U0
+    xvand.v    D1,   U1,   U1
+    xvand.v    D2,   U2,   U2
+    xvand.v    D3,   U3,   U3
+
+    xvpermi.q  D0,   U1,   0x02
+    xvpermi.q  D2,   U3,   0x02
+    xvpermi.q  D1,   U0,   0x31
+    xvpermi.q  D3,   U2,   0x31
+
+    xvst       D0,   TD,   0x00
+    xvst       D2,   TD,   0x20
+    xvst       D1,   TD,   0x40
+    xvst       D3,   TD,   0x60
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80
+
+.L_II20:
+    andi       I,    M,    0x01
+    beq        I,    ZERO, .L_J0
+
+.L_II2:  /* if(m&1) */
+    vld        $vr0, S1,   0x00
+    vld        $vr1, S2,   0x00
+    vld        $vr2, S3,   0x00
+    vld        $vr3, S4,   0x00
+
+    vst        $vr0, TD,   0x00
+    vst        $vr1, TD,   0x10
+    vst        $vr2, TD,   0x20
+    vst        $vr3, TD,   0x30
+
+    addi.d     TD,   TD,   0x40
+
+.L_J0:
+    addi.d    J,     J,    -1
+    blt       ZERO,  J,    .L_J1
+
+.L_N0:  /* if(n&2) */
+    andi       I,     N,   0x02
+    beq        ZERO,  I,   .L_N20
+
+    move       S1,    TS
+    add.d      S2,    S1,   TL
+
+    slli.d     T0,    TL,   0x01
+    add.d      TS,    TS,   T0
+
+    srai.d     I,     M,    0x02
+    beq        ZERO,  I,    .L_N10
+
+.L_N11: /* if(i>0) */
+    xvld       U0,    S1,   0x00
+    xvld       U1,    S1,   0x20
+    xvld       U2,    S2,   0x00
+    xvld       U3,    S2,   0x20
+
+    xvand.v    D0,    U0,   U0
+    xvand.v    D1,    U1,   U1
+    xvand.v    D2,    U2,   U2
+    xvand.v    D3,    U3,   U3
+
+    xvpermi.q  D0,    U2,   0x02
+    xvpermi.q  D2,    U0,   0x31
+    xvpermi.q  D1,    U3,   0x02
+    xvpermi.q  D3,    U1,   0x31
+
+    xvst       D0,    TD,   0x00
+    xvst       D2,    TD,   0x20
+    xvst       D1,    TD,   0x40
+    xvst       D3,    TD,   0x60
+
+    addi.d     S1,    S1,   0x40   // a_offset
+    addi.d     S2,    S2,   0x40
+    addi.d     TD,    TD,   0x80   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N11
+
+.L_N10:  /* if(m&2) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N130
+
+    xvld       U0,    S1,   0x00
+    xvld       U1,    S2,   0x00
+    xvand.v    D0,    U0,   U0
+
+    xvpermi.q  D0,    U1,   0x02
+    xvpermi.q  U1,    U0,   0x31
+
+    xvst       D0,    TD,   0x00
+    xvst       U1,    TD,   0x20
+
+    addi.d     S1,    S1,   0x20   // a_offset
+    addi.d     S2,    S2,   0x20
+    addi.d     TD,    TD,   0x40   // b_offset
+
+.L_N130:  /* if(m&1) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N20
+
+    vld       $vr0,    S1,   0x00
+    vld       $vr1,    S2,   0x00
+
+    vst       $vr0,    TD,   0x00
+    vst       $vr1,    TD,   0x10
+
+    addi.d     TD,    TD,   0x20
+
+.L_N20:   /* if(n&1) */
+    andi       I,     N,    0x01
+    beq        I,     ZERO, .L_N00
+
+    move       S1,   TS
+    srai.d     I,    M,    0x02
+
+    beq        I,    ZERO, .L_N30
+
+.L_N21:  /* if(i>0) */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x20
+
+    xvst       U0,   TD,   0x00
+    xvst       U1,   TD,   0x20
+
+    addi.d     S1,   S1,   0x40   // aoffset1
+    addi.d     TD,   TD,   0x40   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N21
+
+.L_N30:  /* if(m&2) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N330
+
+    xvld       U0,   S1,   0x00
+    xvst       U0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x20   // aoffset1
+    addi.d     TD,   TD,   0x20   // b_offset
+
+.L_N330:  /* if(m&1) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N00
+
+    vld       $vr0,   S1,   0x00
+    vst       $vr0,   TD,   0x00
+
+.L_N00:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S
new file mode 100644
index 000000000..203471cbd
--- /dev/null
+++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S
@@ -0,0 +1,332 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define TD     $r20
+#define TS     $r11
+#define TL     $r19
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x03
+    slli.d     TL,   TL,   0x01
+
+    srai.d     J,    N,    0x02
+    beq        J,    ZERO,  .L_N0
+
+.L_J1: /* J-- */
+    move       S1,   TS
+    add.d      S2,   S1,   TL
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+
+    slli.d     T0,   TL,   0x02
+    add.d      TS,   TS,   T0
+
+    srai.d     I,    M,    0x02
+    beq        I,    ZERO,  .L_I3
+
+.L_I1: /* I-- */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+    vld       U2,   S1,   0x20
+    vld       U3,   S1,   0x30
+
+    vld       U4,   S2,   0x00
+    vld       U5,   S2,   0x10
+    vld       U6,   S2,   0x20
+    vld       U7,   S2,   0x30
+
+    vld       U8,   S3,   0x00
+    vld       U9,   S3,   0x10
+    vld       U10,  S3,   0x20
+    vld       U11,  S3,   0x30
+
+    vld       U12,  S4,   0x00
+    vld       U13,  S4,   0x10
+    vld       U14,  S4,   0x20
+    vld       U15,  S4,   0x30
+
+    vst       U0,   TD,   0x00
+    vst       U4,   TD,   0x10
+    vst       U8,   TD,   0x20
+    vst       U12,  TD,   0x30
+
+    vst       U1,   TD,   0x40
+    vst       U5,   TD,   0x50
+    vst       U9,   TD,   0x60
+    vst       U13,  TD,   0x70
+
+    vst       U2,   TD,   0x80
+    vst       U6,   TD,   0x90
+    vst       U10,  TD,   0xa0
+    vst       U14,  TD,   0xb0
+
+    vst       U3,   TD,   0xc0
+    vst       U7,   TD,   0xd0
+    vst       U11,  TD,   0xe0
+    vst       U15,  TD,   0xf0
+
+    addi.d     S1,   S1,   0x40   // a_offset
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     TD,   TD,   0x100  // b_offset
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_I3:  /* if(m&2) */
+    andi       I,    M,    0x02
+    beq        I,    ZERO, .L_II20
+
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+
+    vld       U4,   S3,   0x00
+    vld       U5,   S3,   0x10
+
+    vld       U6,   S4,   0x00
+    vld       U7,   S4,   0x10
+
+    vst       U0,   TD,   0x00
+    vst       U2,   TD,   0x10
+    vst       U4,   TD,   0x20
+    vst       U6,   TD,   0x30
+
+    vst       U1,   TD,   0x40
+    vst       U3,   TD,   0x50
+    vst       U5,   TD,   0x60
+    vst       U7,   TD,   0x70
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80
+
+.L_II20:  /* if(m&1) */
+    andi       I,    M,    0x01
+    beq        I,    ZERO, .L_J0
+
+    vld        U0,   S1,   0x00
+    vld        U1,   S2,   0x00
+    vld        U2,   S3,   0x00
+    vld        U3,   S4,   0x00
+
+    vst        U0,   TD,   0x00
+    vst        U1,   TD,   0x10
+    vst        U2,   TD,   0x20
+    vst        U3,   TD,   0x30
+
+    addi.d     TD,   TD,   0x40
+
+.L_J0:
+    addi.d    J,     J,    -1
+    blt       ZERO,  J,    .L_J1
+
+.L_N0:  /* if(n&2) */
+    andi       I,     N,   0x02
+    beq        ZERO,  I,   .L_N20
+
+    move       S1,    TS
+    add.d      S2,    S1,   TL
+
+    slli.d     T0,    TL,   0x01
+    add.d      TS,    TS,   T0
+
+    srai.d     I,     M,    0x02
+    beq        ZERO,  I,    .L_N10
+
+.L_N11: /* if(i>0) */
+    vld       U0,    S1,   0x00
+    vld       U1,    S1,   0x10
+    vld       U2,    S1,   0x20
+    vld       U3,    S1,   0x30
+
+    vld       U4,    S2,   0x00
+    vld       U5,    S2,   0x10
+    vld       U6,    S2,   0x20
+    vld       U7,    S2,   0x30
+
+    vst       U0,    TD,   0x00
+    vst       U4,    TD,   0x10
+    vst       U1,    TD,   0x20
+    vst       U5,    TD,   0x30
+
+    vst       U2,    TD,   0x40
+    vst       U6,    TD,   0x50
+    vst       U3,    TD,   0x60
+    vst       U7,    TD,   0x70
+
+    addi.d     S1,    S1,   0x40   // a_offset
+    addi.d     S2,    S2,   0x40
+    addi.d     TD,    TD,   0x80   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N11
+
+.L_N10:  /* if(m&2) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N130
+
+    vld       U0,    S1,   0x00
+    vld       U1,    S1,   0x10
+
+    vld       U2,    S2,   0x00
+    vld       U3,    S2,   0x10
+
+    vst       U0,    TD,   0x00
+    vst       U2,    TD,   0x10
+    vst       U1,    TD,   0x20
+    vst       U3,    TD,   0x30
+
+    addi.d     S1,    S1,   0x20   // a_offset
+    addi.d     S2,    S2,   0x20
+    addi.d     TD,    TD,   0x40   // b_offset
+
+.L_N130:  /* if(m&1) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N20
+
+    vld       U0,     S1,   0x00
+    vld       U1,     S2,   0x00
+
+    vst       U0,     TD,   0x00
+    vst       U1,     TD,   0x10
+
+    addi.d     TD,    TD,   0x20
+
+.L_N20:   /* if(n&1) */
+    andi       I,     N,    0x01
+    beq        I,     ZERO, .L_N00
+
+    move       S1,   TS
+    srai.d     I,    M,    0x02
+
+    beq        I,    ZERO, .L_N30
+
+.L_N21:  /* if(i>0) */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+    vld       U2,   S1,   0x20
+    vld       U3,   S1,   0x30
+
+    vst       U0,   TD,   0x00
+    vst       U1,   TD,   0x10
+    vst       U2,   TD,   0x20
+    vst       U3,   TD,   0x30
+
+    addi.d     S1,   S1,   0x40   // aoffset1
+    addi.d     TD,   TD,   0x40   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N21
+
+.L_N30:  /* if(m&2) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N330
+
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vst       U0,   TD,   0x00
+    vst       U1,   TD,   0x10
+
+    addi.d     S1,   S1,   0x20   // aoffset1
+    addi.d     TD,   TD,   0x20   // b_offset
+
+.L_N330:  /* if(m&1) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N00
+
+    vld       U0,   S1,   0x00
+
+    vst       U0,   TD,   0x00
+
+.L_N00:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S
new file mode 100644
index 000000000..7cd8f605b
--- /dev/null
+++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S
@@ -0,0 +1,263 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST  //boffset
+    move       TS,   SRC  //aoffset
+
+    slli.d     TL,   LDA,  0x03  //lda
+    slli.d     TL,   TL,   0x01
+
+    slli.d     T0,   TL,   0x03
+    srai.d     J,    N,    0x03  //j
+
+    beq        J,    ZERO, .L_N1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+    add.d      S5,   S4,   TL
+    add.d      S6,   S5,   TL
+    add.d      S7,   S6,   TL
+    add.d      S8,   S7,   TL
+    add.d      TS,   TS,   T0
+
+    beq        I,    ZERO, .L_J11
+
+.L_I1:  /* if(i>0) i--*/
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S1,   0x08
+    fld.d      F2,   S2,   0x00
+    fld.d      F3,   S2,   0x08
+    fld.d      F4,   S3,   0x00
+    fld.d      F5,   S3,   0x08
+    fld.d      F6,   S4,   0x00
+    fld.d      F7,   S4,   0x08
+
+    fst.d      F0,   TD,   0x00
+    fst.d      F1,   TD,   0x08
+    fst.d      F2,   TD,   0x10
+    fst.d      F3,   TD,   0x18
+    fst.d      F4,   TD,   0x20
+    fst.d      F5,   TD,   0x28
+    fst.d      F6,   TD,   0x30
+    fst.d      F7,   TD,   0x38
+
+    fld.d      F0,   S5,   0x00
+    fld.d      F1,   S5,   0x08
+    fld.d      F2,   S6,   0x00
+    fld.d      F3,   S6,   0x08
+    fld.d      F4,   S7,   0x00
+    fld.d      F5,   S7,   0x08
+    fld.d      F6,   S8,   0x00
+    fld.d      F7,   S8,   0x08
+
+    fst.d      F0,   TD,   0x40
+    fst.d      F1,   TD,   0x48
+    fst.d      F2,   TD,   0x50
+    fst.d      F3,   TD,   0x58
+    fst.d      F4,   TD,   0x60
+    fst.d      F5,   TD,   0x68
+    fst.d      F6,   TD,   0x70
+    fst.d      F7,   TD,   0x78
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     S5,   S5,   0x10
+    addi.d     S6,   S6,   0x10
+    addi.d     S7,   S7,   0x10
+    addi.d     S8,   S8,   0x10
+    addi.d     TD,   TD,   0x80
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_J11: /* j--*/
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_J1
+
+.L_N1:  /* if(n&4)*/
+    andi       I,     N,    0x04
+    beq        I,     ZERO, .L_N2
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+    add.d      TS,   S4,   TL
+
+    beq        I,     ZERO, .L_N2
+
+.L_N11:  /* if(i>0)*/
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S1,   0x08
+    fld.d      F2,   S2,   0x00
+    fld.d      F3,   S2,   0x08
+    fld.d      F4,   S3,   0x00
+    fld.d      F5,   S3,   0x08
+    fld.d      F6,   S4,   0x00
+    fld.d      F7,   S4,   0x08
+
+    fst.d      F0,   TD,   0x00
+    fst.d      F1,   TD,   0x08
+    fst.d      F2,   TD,   0x10
+    fst.d      F3,   TD,   0x18
+    fst.d      F4,   TD,   0x20
+    fst.d      F5,   TD,   0x28
+    fst.d      F6,   TD,   0x30
+    fst.d      F7,   TD,   0x38
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     TD,   TD,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N11
+
+.L_N2:  /* if(n&2)*/
+    andi       I,     N,    0x02
+    beq        I,     ZERO, .L_N3
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M
+    add.d      TS,   S2,   TL
+
+    beq        I,    ZERO, .L_N3
+
+.L_N21:  /* if(i>0)*/
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S1,   0x08
+    fld.d      F2,   S2,   0x00
+    fld.d      F3,   S2,   0x08
+
+    fst.d      F0,   TD,   0x00
+    fst.d      F1,   TD,   0x08
+    fst.d      F2,   TD,   0x10
+    fst.d      F3,   TD,   0x18
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N21
+
+.L_N3:  /* if(n&2)*/
+    andi       I,    N,    0x01
+    beq        I,    ZERO, .L_N0
+
+    move       S1,   TS
+    move       I,    M
+
+    beq        I,    ZERO, .L_N0
+
+.L_N31:  /* if(i>0)*/
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S1,   0x08
+
+    fst.d      F0,   TD,   0x00
+    fst.d      F1,   TD,   0x08
+
+    addi.d     S1,   S1,   0x10
+    addi.d     TD,   TD,   0x10
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N31
+
+.L_N0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S
new file mode 100644
index 000000000..1adee11c5
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S
@@ -0,0 +1,302 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16
+#define TS     $r17
+#define TL     $r18
+#define T0     $r19
+#define S8     $r20
+#define S9     $r23
+#define S10    $r11
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x03  //lda
+    slli.d     TL,     TL,    0x01  //lda
+
+    ori        T0,     ZERO,  0x03
+    andn       T0,     N,     T0
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x03
+    add.d      S9,     DST,   T0  //boffset2
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x03
+    add.d      S10,    DST,   T0  //boffset3
+
+    srai.d     J,      M,     0x02  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+    add.d      S3,     S2,    TL
+    add.d      S4,     S3,    TL
+
+    slli.d     T0,     TL,    0x02
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x100
+
+    srai.d     I,      N,     0x02
+
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x20
+    xvld       U2,   S2,   0x00
+    xvld       U3,   S2,   0x20
+    xvld       U4,   S3,   0x00
+    xvld       U5,   S3,   0x20
+    xvld       U6,   S4,   0x00
+    xvld       U7,   S4,   0x20
+
+    xvst       U0,   S8,   0x00
+    xvst       U1,   S8,   0x20
+    xvst       U2,   S8,   0x40
+    xvst       U3,   S8,   0x60
+    xvst       U4,   S8,   0x80
+    xvst       U5,   S8,   0xa0
+    xvst       U6,   S8,   0xc0
+    xvst       U7,   S8,   0xe0
+
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    slli.d     T0,   M,    0x06
+    add.d      S8,   S8,   T0
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+
+    xvst       U0,   S9,   0x00
+    xvst       U1,   S9,   0x20
+    xvst       U2,   S9,   0x40
+    xvst       U3,   S9,   0x60
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     S9,   S9,   0x80
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    vld        $vr0, S1,   0x00
+    vld        $vr1, S2,   0x00
+    vld        $vr2, S3,   0x00
+    vld        $vr3, S4,   0x00
+
+    vst        $vr0, S10,   0x00
+    vst        $vr1, S10,   0x10
+    vst        $vr2, S10,   0x20
+    vst        $vr3, S10,   0x30
+
+    addi.d     S10,  S10,  0x40
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&2) */
+    andi       I,      M,    0x02
+    beq        ZERO,   I,    .L_M2
+
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+
+    slli.d     T0,     TL,    0x01
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x80
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     S8,   0x00
+    xvst       U1,     S8,   0x20
+    xvst       U2,     S8,   0x40
+    xvst       U3,     S8,   0x60
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    slli.d     T0,     M,     0x06
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    xvst       U0,     S9,    0x00
+    xvst       U1,     S9,    0x20
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S9,     S9,    0x40
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M2
+
+    vld       $vr0,     S1,    0x00
+    vld       $vr1,     S2,    0x00
+
+    vst       $vr0,     S10,    0x00
+    vst       $vr1,     S10,    0x10
+
+    addi.d     S10,    S10,   0x20
+
+.L_M2:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    move       S1,     TS     //aoffset1
+    move       S8,     TD     //boffset1
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M2N1
+
+.L_M2I1:  /* if(i>0) */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+
+    xvst       U0,     S8,    0x00
+    xvst       U1,     S8,    0x20
+
+    addi.d     S1,     S1,    0x40
+    slli.d     T0,     M,     0x06
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M2I1
+
+.L_M2N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M2N2
+
+    xvld       U0,     S1,    0x00
+
+    xvst       U0,     S9,    0x00
+
+    addi.d     S1,     S1,    0x20
+
+.L_M2N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    vld       $vr0,     S1,    0x00
+
+    vst       $vr0,     S10,    0x00
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S
new file mode 100644
index 000000000..954753eaf
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S
@@ -0,0 +1,355 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16
+#define TS     $r17
+#define TL     $r18
+#define T0     $r19
+#define S8     $r20
+#define S9     $r23
+#define S10    $r11
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x03  //lda
+    slli.d     TL,     TL,    0x01  //lda
+
+    ori        T0,     ZERO,  0x03
+    andn       T0,     N,     T0
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x03
+    add.d      S9,     DST,   T0  //boffset2
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x03
+    add.d      S10,    DST,   T0  //boffset3
+
+    srai.d     J,      M,     0x02  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+    add.d      S3,     S2,    TL
+    add.d      S4,     S3,    TL
+
+    slli.d     T0,     TL,    0x02
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x100
+
+    srai.d     I,      N,     0x02
+
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+    vld       U2,   S1,   0x20
+    vld       U3,   S1,   0x30
+
+    vld       U4,   S2,   0x00
+    vld       U5,   S2,   0x10
+    vld       U6,   S2,   0x20
+    vld       U7,   S2,   0x30
+
+    vld       U8,   S3,   0x00
+    vld       U9,   S3,   0x10
+    vld       U10,  S3,   0x20
+    vld       U11,  S3,   0x30
+
+    vld       U12,  S4,   0x00
+    vld       U13,  S4,   0x10
+    vld       U14,  S4,   0x20
+    vld       U15,  S4,   0x30
+
+    vst       U0,   S8,   0x00
+    vst       U1,   S8,   0x10
+    vst       U2,   S8,   0x20
+    vst       U3,   S8,   0x30
+    vst       U4,   S8,   0x40
+    vst       U5,   S8,   0x50
+    vst       U6,   S8,   0x60
+    vst       U7,   S8,   0x70
+
+    vst       U8,   S8,   0x80
+    vst       U9,   S8,   0x90
+    vst       U10,  S8,   0xa0
+    vst       U11,  S8,   0xb0
+    vst       U12,  S8,   0xc0
+    vst       U13,  S8,   0xd0
+    vst       U14,  S8,   0xe0
+    vst       U15,  S8,   0xf0
+
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    slli.d     T0,   M,    0x06
+    add.d      S8,   S8,   T0
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+
+    vld       U4,   S3,   0x00
+    vld       U5,   S3,   0x10
+
+    vld       U6,   S4,   0x00
+    vld       U7,   S4,   0x10
+
+    vst       U0,   S9,   0x00
+    vst       U1,   S9,   0x10
+    vst       U2,   S9,   0x20
+    vst       U3,   S9,   0x30
+
+    vst       U4,   S9,   0x40
+    vst       U5,   S9,   0x50
+    vst       U6,   S9,   0x60
+    vst       U7,   S9,   0x70
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     S9,   S9,   0x80
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    vld        U0,   S1,   0x00
+    vld        U1,   S2,   0x00
+    vld        U2,   S3,   0x00
+    vld        U3,   S4,   0x00
+
+    vst        U0,   S10,   0x00
+    vst        U1,   S10,   0x10
+    vst        U2,   S10,   0x20
+    vst        U3,   S10,   0x30
+
+    addi.d     S10,  S10,  0x40
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&2) */
+    andi       I,      M,    0x02
+    beq        ZERO,   I,    .L_M2
+
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+
+    slli.d     T0,     TL,    0x01
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x80
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vld       U4,     S2,    0x00
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     S8,    0x00
+    vst       U1,     S8,    0x10
+    vst       U2,     S8,    0x20
+    vst       U3,     S8,    0x30
+
+    vst       U4,     S8,    0x40
+    vst       U5,     S8,    0x50
+    vst       U6,     S8,    0x60
+    vst       U7,     S8,    0x70
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    slli.d     T0,     M,     0x06
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+
+    vst       U0,     S9,    0x00
+    vst       U1,     S9,    0x10
+    vst       U2,     S9,    0x20
+    vst       U3,     S9,    0x30
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S9,     S9,    0x40
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M2
+
+    vld       U0,      S1,    0x00
+    vld       U1,      S2,    0x00
+
+    vst       U0,      S10,   0x00
+    vst       U1,      S10,   0x10
+
+    addi.d     S10,    S10,   0x20
+
+.L_M2:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    move       S1,     TS     //aoffset1
+    move       S8,     TD     //boffset1
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M2N1
+
+.L_M2I1:  /* if(i>0) */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vst       U0,     S8,    0x00
+    vst       U1,     S8,    0x10
+    vst       U2,     S8,    0x20
+    vst       U3,     S8,    0x30
+
+    addi.d     S1,     S1,    0x40
+    slli.d     T0,     M,     0x06
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M2I1
+
+.L_M2N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M2N2
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vst       U0,     S9,    0x00
+    vst       U1,     S9,    0x10
+
+    addi.d     S1,     S1,    0x20
+
+.L_M2N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    vld       U0,     S1,    0x00
+
+    vst       U0,     S10,    0x00
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S
new file mode 100644
index 000000000..f7440dc24
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S
@@ -0,0 +1,268 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x03  //lda
+    slli.d     TL,     TL,    0x01
+
+    srai.d     J,      N,     0x03  //j
+
+    beq        J,      ZERO,  .L_N1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+    addi.d     TS,     TS,    0x80
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_J1M1
+
+.L_J1I1:  /* if(i>0) i--*/
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+    xvld       U4,     S2,    0x00
+    xvld       U5,     S2,    0x20
+    xvld       U6,     S2,    0x40
+    xvld       U7,     S2,    0x60
+
+    xvst       U0,     TD,    0x00
+    xvst       U1,     TD,    0x20
+    xvst       U2,     TD,    0x40
+    xvst       U3,     TD,    0x60
+    xvst       U4,     TD,    0x80
+    xvst       U5,     TD,    0xa0
+    xvst       U6,     TD,    0xc0
+    xvst       U7,     TD,    0xe0
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x100
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_J1I1
+
+.L_J1M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_J0
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+
+    xvst       U0,     TD,    0x00
+    xvst       U1,     TD,    0x20
+    xvst       U2,     TD,    0x40
+    xvst       U3,     TD,    0x60
+
+    addi.d     TD,     TD,    0x80
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO,   J,     .L_J1
+
+.L_N1:  /* if(n&4) */
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N2
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+    addi.d     TS,     TS,    0x40
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N1M1
+
+.L_N1I1:   /* if(i>0) i-- */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     TD,    0x00
+    xvst       U1,     TD,    0x20
+    xvst       U2,     TD,    0x40
+    xvst       U3,     TD,    0x60
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x80
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N1I1
+
+.L_N1M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N2
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+
+    xvst       U0,     TD,    0x00
+    xvst       U1,     TD,    0x20
+
+    addi.d     TD,     TD,    0x40
+
+.L_N2:  /* if(n&2) */
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N3
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+    addi.d     TS,     TS,    0x20
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N2M1
+
+.L_N2I1:  /* if(i>0) i-- */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    xvst       U0,     TD,    0x00
+    xvst       U1,     TD,    0x20
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+
+    addi.d     TD,     TD,    0x40
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N2I1
+
+.L_N2M1:   /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N3
+
+    xvld       U0,     S1,    0x00
+
+    xvst       U0,     TD,    0x00
+
+    addi.d     TD,     TD,    0x20
+
+.L_N3:   /* if(n&1) */
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N3M1
+
+.L_N3I1:  /* if(i>0) i-- */
+    vld       $vr0,     S1,    0x00
+    vld       $vr1,     S2,    0x00
+
+    vst       $vr0,     TD,    0x00
+    vst       $vr1,     TD,    0x10
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x20
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N3I1
+
+.L_N3M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    vld       $vr0,     S1,    0x00
+
+    vst       $vr0,     TD,    0x00
+
+.L_N0:
+    LDARG      $r23,   $sp,   0
+    addi.d     $sp,    $sp,   8
+    jirl       $r0,    $r1,   0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/param.h b/param.h
index 003ea396f..5d2e960a2 100644
--- a/param.h
+++ b/param.h
@@ -2854,12 +2854,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define QGEMM_DEFAULT_UNROLL_N 2
 #define CGEMM_DEFAULT_UNROLL_N 2
-#define ZGEMM_DEFAULT_UNROLL_N 2
+#define ZGEMM_DEFAULT_UNROLL_N 4
 #define XGEMM_DEFAULT_UNROLL_N 1
 
 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_M 2
+#define ZGEMM_DEFAULT_UNROLL_M 8
 #define XGEMM_DEFAULT_UNROLL_M 1
 
 #define SGEMM_DEFAULT_P 256
@@ -2891,10 +2891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_UNROLL_M  8
 #define DGEMM_DEFAULT_UNROLL_N  4
 
-#define CGEMM_DEFAULT_UNROLL_M  2
-#define CGEMM_DEFAULT_UNROLL_N  2
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
 
-#define ZGEMM_DEFAULT_UNROLL_M  1
+#define ZGEMM_DEFAULT_UNROLL_M  4
 #define ZGEMM_DEFAULT_UNROLL_N  4
 
 #define SGEMM_DEFAULT_P	128

From d4db6a9f16a5c82bbe1860f591cc731c4d83d7c8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 6 Feb 2024 22:23:47 +0100
Subject: [PATCH 643/718] Separate the interface for SBGEMMT from GEMMT due to
 differences in GEMV arguments

---
 interface/CMakeLists.txt |   1 +
 interface/Makefile       |   8 +-
 interface/gemmt.c        |   3 +-
 interface/sbgemmt.c      | 447 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 452 insertions(+), 7 deletions(-)
 create mode 100644 interface/sbgemmt.c

diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt
index ed19b556a..55374674a 100644
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -119,6 +119,7 @@ endif ()
 if (BUILD_BFLOAT16)
 	GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+	GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
diff --git a/interface/Makefile b/interface/Makefile
index ad4a0fb89..048d679d6 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -1303,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
 ifeq ($(BUILD_BFLOAT16),1)
 sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
-sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
+sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 endif
 
@@ -1662,10 +1662,6 @@ cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
 cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
 	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
 
-sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c
-	$(CC) $(CFLAGS) -c $< -o $(@F)
-
-dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c
 cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 
@@ -1971,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 
 ifeq ($(BUILD_BFLOAT16),1)
-cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
+cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 endif
 
diff --git a/interface/gemmt.c b/interface/gemmt.c
index 8fd8089d0..018deb7fb 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -158,7 +158,8 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 		uplo = 0;
 	if (Uplo == 'L')
 		uplo = 1;
-
+	
+	nrowa = m;
 	if (transa & 1) nrowa = k;
 	nrowb = k;
 #if defined(COMPLEX)
diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c
new file mode 100644
index 000000000..759af4bfb
--- /dev/null
+++ b/interface/sbgemmt.c
@@ -0,0 +1,447 @@
+/*********************************************************************/
+/* Copyright 2024, The OpenBLAS Project.                             */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
+#define SMP_THRESHOLD_MIN 65536.0
+#define ERROR_NAME "SBGEMMT "
+
+#ifndef GEMM_MULTITHREAD_THRESHOLD
+#define GEMM_MULTITHREAD_THRESHOLD 4
+#endif
+
+#ifndef CBLAS
+
+void NAME(char *UPLO, char *TRANSA, char *TRANSB,
+	  blasint * M, blasint * K,
+	  FLOAT * Alpha,
+	  IFLOAT * a, blasint * ldA,
+	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
+{
+
+	blasint m, k;
+	blasint lda, ldb, ldc;
+	int transa, transb, uplo;
+	blasint info;
+
+	char transA, transB, Uplo;
+	blasint nrowa, nrowb;
+	IFLOAT *buffer;
+	IFLOAT *aa, *bb;
+	FLOAT *cc;
+	FLOAT alpha, beta;
+
+	PRINT_DEBUG_NAME;
+
+	m = *M;
+	k = *K;
+
+	alpha = *Alpha;
+	beta = *Beta;
+
+	lda = *ldA;
+	ldb = *ldB;
+	ldc = *ldC;
+
+	transA = *TRANSA;
+	transB = *TRANSB;
+	Uplo = *UPLO;
+	TOUPPER(transA);
+	TOUPPER(transB);
+	TOUPPER(Uplo);
+
+	transa = -1;
+	transb = -1;
+	uplo = -1;
+
+	if (transA == 'N')
+		transa = 0;
+	if (transA == 'T')
+		transa = 1;
+
+	if (transA == 'R')
+		transa = 0;
+	if (transA == 'C')
+		transa = 1;
+
+	if (transB == 'N')
+		transb = 0;
+	if (transB == 'T')
+		transb = 1;
+
+	if (transB == 'R')
+		transb = 0;
+	if (transB == 'C')
+		transb = 1;
+
+	if (Uplo == 'U')
+		uplo = 0;
+	if (Uplo == 'L')
+		uplo = 1;
+	nrowa = m;
+	if (transa & 1) nrowa = k;
+	nrowb = k;
+	if (transb & 1) nrowb = m;
+
+	info = 0;
+
+	if (ldc < MAX(1, m))
+		info = 13;
+	if (ldb < MAX(1, nrowb))
+		info = 10;
+	if (lda < MAX(1, nrowa))
+		info = 8;
+	if (k < 0)
+		info = 5;
+	if (m < 0)
+		info = 4;
+	if (transb < 0)
+		info = 3;
+	if (transa < 0)
+		info = 2;
+	if (uplo < 0)
+		info = 1;
+
+	if (info != 0) {
+		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
+		return;
+	}
+#else
+
+void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
+	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
+	   blasint k,
+	   FLOAT alpha,
+	   IFLOAT * A, blasint LDA,
+	   IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
+{
+	IFLOAT *aa, *bb;
+        FLOAT *cc;
+
+	int transa, transb, uplo;
+	blasint info;
+	blasint lda, ldb;
+	IFLOAT *a, *b;
+	XFLOAT *buffer;
+
+	PRINT_DEBUG_CNAME;
+
+	uplo = -1;
+	transa = -1;
+	transb = -1;
+	info = 0;
+
+	if (order == CblasColMajor) {
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
+		if (TransA == CblasNoTrans)
+			transa = 0;
+		if (TransA == CblasTrans)
+			transa = 1;
+
+		if (TransA == CblasConjNoTrans)
+			transa = 0;
+		if (TransA == CblasConjTrans)
+			transa = 1;
+
+		if (TransB == CblasNoTrans)
+			transb = 0;
+		if (TransB == CblasTrans)
+			transb = 1;
+
+		if (TransB == CblasConjNoTrans)
+			transb = 0;
+		if (TransB == CblasConjTrans)
+			transb = 1;
+
+		a = (void *)A;
+		b = (void *)B;
+		lda = LDA;
+		ldb = LDB;
+
+		info = -1;
+
+		blasint nrowa;
+		blasint nrowb;
+		nrowa = m;
+		if (transa & 1) nrowa = k;
+		nrowb = k;
+		if (transb & 1)  nrowb = m;
+
+		if (ldc < MAX(1, m))
+			info = 13;
+		if (ldb < MAX(1, nrowb))
+			info = 10;
+		if (lda < MAX(1, nrowa))
+			info = 8;
+		if (k < 0)
+			info = 5;
+		if (m < 0)
+			info = 4;
+		if (transb < 0)
+			info = 3;
+		if (transa < 0)
+			info = 2;
+		if (uplo < 0)
+			info = 1;
+	}
+
+	if (order == CblasRowMajor) {
+
+		a = (void *)B;
+		b = (void *)A;
+
+		lda = LDB;
+		ldb = LDA;
+
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
+		if (TransB == CblasNoTrans)
+			transa = 0;
+		if (TransB == CblasTrans)
+			transa = 1;
+
+		if (TransB == CblasConjNoTrans)
+			transa = 0;
+		if (TransB == CblasConjTrans)
+			transa = 1;
+
+		if (TransA == CblasNoTrans)
+			transb = 0;
+		if (TransA == CblasTrans)
+			transb = 1;
+
+		if (TransA == CblasConjNoTrans)
+			transb = 0;
+		if (TransA == CblasConjTrans)
+			transb = 1;
+
+		info = -1;
+
+		blasint ncola; 
+		blasint ncolb;
+
+		ncola = m;
+		if (transa & 1) ncola = k;
+		ncolb = k;
+
+		if (transb & 1) {
+			ncolb = m;
+		}
+
+		if (ldc < MAX(1,m))
+			info = 13;
+		if (ldb < MAX(1, ncolb))
+			info = 8;
+		if (lda < MAX(1, ncola))
+			info = 10;
+		if (k < 0)
+			info = 5;
+		if (m < 0)
+			info = 4;
+		if (transb < 0)
+			info = 2;
+		if (transa < 0)
+			info = 3;
+		if (uplo < 0)
+			info = 1;
+	}
+
+	if (info >= 0) {
+		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
+		return;
+	}
+
+#endif
+	int buffer_size;
+	blasint i, j;
+
+#ifdef SMP
+	int nthreads;
+#endif
+
+
+#ifdef SMP
+	static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *,
+				     BLASLONG, IFLOAT *, BLASLONG, FLOAT,
+				     FLOAT *, BLASLONG, int) = {
+		sbgemv_thread_n, sbgemv_thread_t,
+	};
+#endif
+	int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG,
+		       IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
+	SBGEMV_N, SBGEMV_T,};
+
+
+	if (m == 0)
+		return;
+
+	IDEBUG_START;
+
+	const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
+
+	if (uplo == 1) {
+		for (i = 0; i < m; i++) {
+			j = m - i;
+
+			aa = a + i;
+			bb = b + i * ldb;
+			if (transa & 1) {
+				aa = a + lda * i;
+			}
+			if (transb & 1)
+				bb = b + i;
+			cc = c + i * ldc + i;
+
+#if 0
+			if (beta != ONE)
+				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
+
+			if (alpha == ZERO)
+				continue;
+#endif
+
+			IDEBUG_START;
+
+			buffer_size = j + k + 128 / sizeof(FLOAT);
+#ifdef WINDOWS_ABI
+			buffer_size += 160 / sizeof(FLOAT);
+#endif
+			// for alignment
+			buffer_size = (buffer_size + 3) & ~3;
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);
+
+#ifdef SMP
+
+			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
+				nthreads = 1;
+			else
+				nthreads = num_cpu_avail(2);
+
+			if (nthreads == 1) {
+#endif
+
+				if (!(transa & 1))
+				(gemv[(int)transa]) (j, k, alpha, aa, lda,
+						     bb, incb, beta, cc, 1);
+				else
+				(gemv[(int)transa]) (k, j, alpha, aa, lda,
+						     bb, incb, beta, cc, 1);
+
+#ifdef SMP
+			} else {
+				if (!(transa & 1))
+				(gemv_thread[(int)transa]) (j, k, alpha, aa,
+							    lda, bb, incb, beta, cc,
+							    1, nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, aa,
+							    lda, bb, incb, beta, cc,
+							    1, nthreads);
+
+			}
+#endif
+
+			STACK_FREE(buffer);
+		}
+	} else {
+
+		for (i = 0; i < m; i++) {
+			j = i + 1;
+
+			bb = b + i * ldb;
+			if (transb & 1) {
+				bb = b + i;
+			}
+			cc = c + i * ldc;
+
+#if 0
+			if (beta != ONE)
+				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
+
+			if (alpha == ZERO)
+				continue;
+#endif
+			IDEBUG_START;
+
+			buffer_size = j + k + 128 / sizeof(FLOAT);
+#ifdef WINDOWS_ABI
+			buffer_size += 160 / sizeof(FLOAT);
+#endif
+			// for alignment
+			buffer_size = (buffer_size + 3) & ~3;
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);
+
+#ifdef SMP
+
+			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
+				nthreads = 1;
+			else
+				nthreads = num_cpu_avail(2);
+
+			if (nthreads == 1) {
+#endif
+
+				if (!(transa & 1))
+				(gemv[(int)transa]) (j, k, alpha, a, lda, bb,
+						     incb, beta, cc, 1);
+				else
+				(gemv[(int)transa]) (k, j, alpha, a, lda, bb,
+						     incb, beta, cc, 1);
+
+#ifdef SMP
+			} else {
+				if (!(transa & 1))
+				(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
+							    bb, incb, beta, cc, 1,
+							    nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
+							    bb, incb, beta, cc, 1,
+							    nthreads);
+			}
+#endif
+
+			STACK_FREE(buffer);
+		}
+	}
+
+	IDEBUG_END;
+
+	return;
+}

From 1ed69ea1c05d091ec4f38358939aad441b0b2d4a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 6 Feb 2024 23:35:12 +0100
Subject: [PATCH 644/718] improve naming

---
 Makefile.system | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 0353a4d21..49bd1cbed 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1705,8 +1705,8 @@ endif
 endif
 
 ifeq ($(FIXED_LIBNAME),1)
-	LIBNAME = $(LIBPREFIX).$(LIBSUFFIX)
-	LIBNAME_P = $(LIBPREFIX)_p.$(LIBSUFFIX)
+	LIBNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
+	LIBNAME_P = $(LIBPREFIX)$(LISOBNAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
 endif
 
 LIBDLLNAME   = $(LIBPREFIX).dll

From 9ef10ffa496b919c25aedbb4aa2fdb930901475a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 6 Feb 2024 23:38:19 +0100
Subject: [PATCH 645/718] Handle prefixed and suffixed libnames, optionally
 suppress softlinking

---
 Makefile.install | 24 ++++++++++++++----------
 openblas.pc.in   |  2 +-
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/Makefile.install b/Makefile.install
index 81f959177..e7d129cce 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -2,10 +2,12 @@ TOPDIR	= .
 export GOTOBLAS_MAKEFILE = 1
 -include $(TOPDIR)/Makefile.conf_last
 include ./Makefile.system
+LNCMD = ln -fs
 
 ifdef THELIBNAME
 LIBNAME=$(THELIBNAME)
 LIBSONAME=$(THELIBSONAME)
+LNCMD = true
 endif
 ifeq ($(INTERFACE64),1)
 USE_64BITINT=1
@@ -99,7 +101,7 @@ ifneq ($(NO_STATIC),1)
 	@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
 	@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 endif
 #for install shared library
 ifneq ($(NO_SHARED),1)
@@ -107,21 +109,21 @@ ifneq ($(NO_SHARED),1)
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
 	@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
 
 ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
 	@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
 endif
 ifeq ($(OSNAME), Darwin)
 	@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
-	ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
+	$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
+	$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
 endif
 ifeq ($(OSNAME), WINNT)
 	@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@@ -149,15 +151,15 @@ ifneq ($(NO_STATIC),1)
 	@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
 	@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 endif
 #for install shared library
 ifneq ($(NO_SHARED),1)
 	@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
 	@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
 
 endif
@@ -170,6 +172,8 @@ endif
 
 	@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
 	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
+	@echo 'libprefix='$(LIBPREFIX) >> "$(PKGFILE)"
+	@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
 	@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
 	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
 	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@@ -186,7 +190,7 @@ endif
 ifneq ($(NO_SHARED),1)
 #ifeq logical or
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
-	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
+	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
 	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
diff --git a/openblas.pc.in b/openblas.pc.in
index 8ad6e8bee..6c27c462b 100644
--- a/openblas.pc.in
+++ b/openblas.pc.in
@@ -2,6 +2,6 @@ Name: openblas
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: ${version}
 URL: https://github.com/xianyi/OpenBLAS
-Libs: -L${libdir} -lopenblas${libsuffix}
+Libs: -L${libdir} -l$(libprefix}openblas${libnamesuffix}
 Libs.private: ${extralib}
 Cflags: -I${includedir}

From 25b300bbeebb03f781a94fe0a655f59cd20094a7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 6 Feb 2024 23:40:01 +0100
Subject: [PATCH 646/718] improve internal names

---
 exports/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/exports/Makefile b/exports/Makefile
index 71c112c67..238623b04 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -133,7 +133,7 @@ libgoto_hpl.def : $(GENSYM)
 
 ifeq ($(OSNAME), Darwin)
 ifeq ($(FIXED_LIBNAME),1)
-INTERNALNAME = $(LIBPREFIX).dylib
+INTERNALNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).dylib
 else
 INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
 endif
@@ -174,7 +174,7 @@ FEXTRALIB += -lm
 EXTRALIB += -lm
 else
 ifeq ($(FIXED_LIBNAME),1)
-INTERNALNAME = $(LIBPREFIX).so
+INTERNALNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).so
 else
 INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif

From fb99fc2e6e4ec8ecdcfffe1ca1aeb787464d2825 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 7 Feb 2024 13:42:08 +0100
Subject: [PATCH 647/718] fix type conversion warnings

---
 test/compare_sgemm_sbgemm.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c
index cf808b56d..4afa8bf93 100644
--- a/test/compare_sgemm_sbgemm.c
+++ b/test/compare_sgemm_sbgemm.c
@@ -81,6 +81,16 @@ float16to32 (bfloat16_bits f16)
   return f32.v;
 }
 
+float
+float32to16 (float32_bits f32)
+{
+  bfloat16_bits f16;
+  f16.bits.s = f32.bits.s;
+  f16.bits.e = f32.bits.e;
+  f16.bits.m = (uint32_t) f32.bits.m >> 16;
+  return f32.v;
+}
+
 int
 main (int argc, char *argv[])
 {
@@ -108,16 +118,16 @@ main (int argc, char *argv[])
 	      A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
 	      B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
 	      C[j * k + i] = 0;
-	      AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16;
-	      BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16;
+	      AA[j * k + i].v = float32to16( A[j * k + i] );
+	      BB[j * k + i].v = float32to16( B[j * k + i] );
 	      CC[j * k + i] = 0;
 	      DD[j * k + i] = 0;
 	    }
 	}
       SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
 	     &m, B, &k, &beta, C, &m);
-      SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA,
-	      &m, BB, &k, &beta, CC, &m);
+      SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA,
+	      &m, (bfloat16*)BB, &k, &beta, CC, &m);
       for (i = 0; i < n; i++)
 	for (j = 0; j < m; j++)
 	    if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0)

From 08ce6b1c1c6468c26607353d516e0dc8c009f58e Mon Sep 17 00:00:00 2001
From: Chip Kerchner <chip.kerchner@ibm.com>
Date: Wed, 7 Feb 2024 07:54:06 -0600
Subject: [PATCH 648/718] Add missing CPU ID definitions for old versions of
 AIX.

---
 driver/others/dynamic_power.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 0454f186c..16320dc40 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -43,6 +43,13 @@ char *gotoblas_corename(void) {
 #define CPU_POWER9   9
 #define CPU_POWER10 10
 
+#ifndef POWER_9
+#define POWER_9         0x20000         /* 9 class CPU */
+#endif
+#ifndef POWER_10
+#define POWER_10        0x40000         /* 10 class CPU */
+#endif
+
 #ifdef _AIX
 #include <sys/systemcfg.h>
 

From e9f480111e1d5b6f69c8053f79375b0a4242712f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 7 Feb 2024 19:57:18 +0100
Subject: [PATCH 649/718] fix sbgemm bfloat16 conversion errors introduced in
 PR 4488

---
 test/compare_sgemm_sbgemm.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c
index 4afa8bf93..bc74233ab 100644
--- a/test/compare_sgemm_sbgemm.c
+++ b/test/compare_sgemm_sbgemm.c
@@ -81,16 +81,6 @@ float16to32 (bfloat16_bits f16)
   return f32.v;
 }
 
-float
-float32to16 (float32_bits f32)
-{
-  bfloat16_bits f16;
-  f16.bits.s = f32.bits.s;
-  f16.bits.e = f32.bits.e;
-  f16.bits.m = (uint32_t) f32.bits.m >> 16;
-  return f32.v;
-}
-
 int
 main (int argc, char *argv[])
 {
@@ -110,6 +100,8 @@ main (int argc, char *argv[])
       float C[m * n];
       bfloat16_bits AA[m * k], BB[k * n];
       float DD[m * n], CC[m * n];
+      bfloat16 atmp,btmp;
+      blasint one=1;
 
       for (j = 0; j < m; j++)
 	{
@@ -118,8 +110,10 @@ main (int argc, char *argv[])
 	      A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
 	      B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
 	      C[j * k + i] = 0;
-	      AA[j * k + i].v = float32to16( A[j * k + i] );
-	      BB[j * k + i].v = float32to16( B[j * k + i] );
+	      sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one);
+	      sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one);
+	      AA[j * k + i].v = atmp;
+	      BB[j * k + i].v = btmp;
 	      CC[j * k + i] = 0;
 	      DD[j * k + i] = 0;
 	    }

From ff10e6b6dc30e247eb0cabd00de610b48c615c91 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Thu, 8 Feb 2024 00:19:54 +0300
Subject: [PATCH 650/718] Fix zero step tests

---
 utest/test_extensions/test_damin.c  | 4 ++--
 utest/test_extensions/test_dzamax.c | 7 +++----
 utest/test_extensions/test_dzamin.c | 6 +++---
 utest/test_extensions/test_idamin.c | 4 ++--
 utest/test_extensions/test_isamin.c | 4 ++--
 utest/test_extensions/test_samin.c  | 6 +++---
 utest/test_extensions/test_scamax.c | 7 +++----
 utest/test_extensions/test_scamin.c | 6 +++---
 8 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c
index d492343ed..fdd2bc658 100644
--- a/utest/test_extensions/test_damin.c
+++ b/utest/test_extensions/test_damin.c
@@ -60,12 +60,12 @@ CTEST(damin, step_zero){
    blasint i;
    blasint N = ELEMENTS, inc = 0;
    double x[ELEMENTS];
-   for (i = 0; i < N  * inc; i ++) {
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
    x[8] = 0.0;
    double amin = BLASFUNC(damin)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+   ASSERT_DBL_NEAR_TOL(x[0], amin, DOUBLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_dzamax.c b/utest/test_extensions/test_dzamax.c
index edea3de8f..bdb3a4f18 100644
--- a/utest/test_extensions/test_dzamax.c
+++ b/utest/test_extensions/test_dzamax.c
@@ -59,13 +59,12 @@ CTEST(dzamax, bad_args_N_0){
 CTEST(dzamax, step_zero){
    blasint i;
    blasint N = ELEMENTS * 2, inc = 0;
-   double x[ELEMENTS];
-   for (i = 0; i < N  * inc * 2; i ++) {
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
-   x[8] = 0.0;
    double amax = BLASFUNC(dzamax)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL(0.0, amax, DOUBLE_EPS);
+   ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amax, DOUBLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_dzamin.c b/utest/test_extensions/test_dzamin.c
index 916eede92..9fcf87b7b 100644
--- a/utest/test_extensions/test_dzamin.c
+++ b/utest/test_extensions/test_dzamin.c
@@ -59,13 +59,13 @@ CTEST(dzamin, bad_args_N_0){
 CTEST(dzamin, step_zero){
    blasint i;
    blasint N = ELEMENTS * 2, inc = 0;
-   double x[ELEMENTS];
-   for (i = 0; i < N  * inc * 2; i ++) {
+   double x[ELEMENTS * 2];
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
    x[8] = 0.0;
    double amin = BLASFUNC(dzamin)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL(0.0, amin, DOUBLE_EPS);
+   ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amin, DOUBLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c
index 9f099f666..6a7ed9d1e 100644
--- a/utest/test_extensions/test_idamin.c
+++ b/utest/test_extensions/test_idamin.c
@@ -62,7 +62,7 @@ CTEST(idamin, step_zero){
    blasint i;
    blasint N = ELEMENTS, inc = 0;
    double x[ELEMENTS];
-   for (i = 0; i < N  * inc; i ++) {
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
    x[8] = 0.0;
@@ -435,7 +435,7 @@ CTEST(idamin, c_api_step_zero){
    blasint i;
    blasint N = ELEMENTS, inc = 0;
    double x[ELEMENTS];
-   for (i = 0; i < N  * inc; i ++) {
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
    x[8] = 0.0;
diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c
index df8dead07..4ff235b83 100644
--- a/utest/test_extensions/test_isamin.c
+++ b/utest/test_extensions/test_isamin.c
@@ -62,7 +62,7 @@ CTEST(isamin, step_zero){
    blasint i;
    blasint N = ELEMENTS, inc = 0;
    float x[ELEMENTS];
-   for (i = 0; i < N  * inc; i ++) {
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
    x[8] = 0.0f;
@@ -435,7 +435,7 @@ CTEST(isamin, c_api_step_zero){
    blasint i;
    blasint N = ELEMENTS, inc = 0;
    float x[ELEMENTS];
-   for (i = 0; i < N  * inc; i ++) {
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
    x[8] = 0.0f;
diff --git a/utest/test_extensions/test_samin.c b/utest/test_extensions/test_samin.c
index 5c747a0f6..2e3a73797 100644
--- a/utest/test_extensions/test_samin.c
+++ b/utest/test_extensions/test_samin.c
@@ -60,12 +60,12 @@ CTEST(samin, step_zero){
    blasint i;
    blasint N = ELEMENTS, inc = 0;
    float x[ELEMENTS];
-   for (i = 0; i < N  * inc; i ++) {
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
-   x[8] = 0.0f;
+   x[8] = 0.0;
    float amin = BLASFUNC(samin)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+   ASSERT_DBL_NEAR_TOL(x[0], amin, SINGLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_scamax.c b/utest/test_extensions/test_scamax.c
index 39d7201ff..0f49ebfad 100644
--- a/utest/test_extensions/test_scamax.c
+++ b/utest/test_extensions/test_scamax.c
@@ -59,13 +59,12 @@ CTEST(scamax, bad_args_N_0){
 CTEST(scamax, step_zero){
    blasint i;
    blasint N = ELEMENTS * 2, inc = 0;
-   float x[ELEMENTS];
-   for (i = 0; i < N  * inc * 2; i ++) {
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
-   x[8] = 0.0f;
    float amax = BLASFUNC(scamax)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL(0.0f, amax, SINGLE_EPS);
+   ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amax, SINGLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_scamin.c b/utest/test_extensions/test_scamin.c
index 4baa23184..0f0414a1c 100644
--- a/utest/test_extensions/test_scamin.c
+++ b/utest/test_extensions/test_scamin.c
@@ -59,13 +59,13 @@ CTEST(scamin, bad_args_N_0){
 CTEST(scamin, step_zero){
    blasint i;
    blasint N = ELEMENTS * 2, inc = 0;
-   float x[ELEMENTS];
-   for (i = 0; i < N  * inc * 2; i ++) {
+   float x[ELEMENTS * 2];
+   for (i = 0; i < N; i ++) {
       x[i] = i + 1000;
    }
    x[8] = 0.0f;
    float amin = BLASFUNC(scamin)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL(0.0f, amin, SINGLE_EPS);
+   ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amin, SINGLE_EPS);
 }
 
 /**

From ec5cfe3bc8e7fb4fae09e961ea6169e01cd21fa3 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Thu, 8 Feb 2024 00:21:38 +0300
Subject: [PATCH 651/718] Fix invalid tests

---
 utest/test_extensions/test_cgeadd.c    | 6 +++---
 utest/test_extensions/test_dgeadd.c    | 6 +++---
 utest/test_extensions/test_dimatcopy.c | 8 ++++----
 utest/test_extensions/test_sgeadd.c    | 6 +++---
 utest/test_extensions/test_simatcopy.c | 8 ++++----
 utest/test_extensions/test_zgeadd.c    | 6 +++---
 6 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/utest/test_extensions/test_cgeadd.c b/utest/test_extensions/test_cgeadd.c
index 0cf6cbf87..9b87ad9f3 100644
--- a/utest/test_extensions/test_cgeadd.c
+++ b/utest/test_extensions/test_cgeadd.c
@@ -349,7 +349,7 @@ CTEST(cgeadd, xerbla_lda_invalid)
     blasint lda = INVALID;
     blasint ldc = 1;
 
-    int expected_info = 6;
+    int expected_info = 5;
 
     int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -682,7 +682,7 @@ CTEST(cgeadd, c_api_xerbla_n_invalid_row_major)
     blasint lda = 1;
     blasint ldc = 1;
 
-    int expected_info = 1;
+    int expected_info = 2;
 
     int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -730,7 +730,7 @@ CTEST(cgeadd, c_api_xerbla_m_invalid_row_major)
     blasint lda = 1;
     blasint ldc = 1;
 
-    int expected_info = 2;
+    int expected_info = 1;
 
     int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
diff --git a/utest/test_extensions/test_dgeadd.c b/utest/test_extensions/test_dgeadd.c
index 4654c51a3..8f93a842e 100644
--- a/utest/test_extensions/test_dgeadd.c
+++ b/utest/test_extensions/test_dgeadd.c
@@ -346,7 +346,7 @@ CTEST(dgeadd, xerbla_lda_invalid)
     blasint lda = INVALID;
     blasint ldc = 1;
 
-    int expected_info = 6;
+    int expected_info = 5;
 
     int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -680,7 +680,7 @@ CTEST(dgeadd, c_api_xerbla_n_invalid_row_major)
     blasint lda = 1;
     blasint ldc = 1;
 
-    int expected_info = 1;
+    int expected_info = 2;
 
     int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -728,7 +728,7 @@ CTEST(dgeadd, c_api_xerbla_m_invalid_row_major)
     blasint lda = 1;
     blasint ldc = 1;
 
-    int expected_info = 2;
+    int expected_info = 1;
 
     int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c
index d2a16bbbf..811c356b3 100644
--- a/utest/test_extensions/test_dimatcopy.c
+++ b/utest/test_extensions/test_dimatcopy.c
@@ -856,7 +856,7 @@ CTEST(dimatcopy, xerbla_rowmajor_notrans_invalid_ldb)
     blasint lda_src = 100, lda_dst = 50;
     char order = 'R';
     char trans = 'N';
-    int expected_info = 9;
+    int expected_info = 8;
 
     int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -873,7 +873,7 @@ CTEST(dimatcopy, xerbla_rowmajor_trans_invalid_ldb)
     blasint lda_src = 100, lda_dst = 50;
     char order = 'R';
     char trans = 'T';
-    int expected_info = 9;
+    int expected_info = 8;
 
     int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -890,7 +890,7 @@ CTEST(dimatcopy, xerbla_colmajor_notrans_invalid_ldb)
     blasint lda_src = 100, lda_dst = 50;
     char order = 'C';
     char trans = 'N';
-    int expected_info = 9;
+    int expected_info = 8;
 
     int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -907,7 +907,7 @@ CTEST(dimatcopy, xerbla_colmajor_trans_invalid_ldb)
     blasint lda_src = 100, lda_dst = 50;
     char order = 'C';
     char trans = 'T';
-    int expected_info = 9;
+    int expected_info = 8;
 
     int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
     ASSERT_EQUAL(TRUE, passed);
diff --git a/utest/test_extensions/test_sgeadd.c b/utest/test_extensions/test_sgeadd.c
index b42ce9c0e..171132b9d 100644
--- a/utest/test_extensions/test_sgeadd.c
+++ b/utest/test_extensions/test_sgeadd.c
@@ -349,7 +349,7 @@ CTEST(sgeadd, xerbla_lda_invalid)
     blasint lda = INVALID;
     blasint ldc = 1;
 
-    int expected_info = 6;
+    int expected_info = 5;
 
     int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -682,7 +682,7 @@ CTEST(sgeadd, c_api_xerbla_n_invalid_row_major)
     blasint lda = 1;
     blasint ldc = 1;
 
-    int expected_info = 1;
+    int expected_info = 2;
 
     int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -730,7 +730,7 @@ CTEST(sgeadd, c_api_xerbla_m_invalid_row_major)
     blasint lda = 1;
     blasint ldc = 1;
 
-    int expected_info = 2;
+    int expected_info = 1;
 
     int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c
index cf14d360c..ba388596d 100644
--- a/utest/test_extensions/test_simatcopy.c
+++ b/utest/test_extensions/test_simatcopy.c
@@ -856,7 +856,7 @@ CTEST(simatcopy, xerbla_rowmajor_notrans_invalid_ldb)
     blasint lda_src = 100, lda_dst = 50;
     char order = 'R';
     char trans = 'N';
-    int expected_info = 9;
+    int expected_info = 8;
 
     int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -873,7 +873,7 @@ CTEST(simatcopy, xerbla_rowmajor_trans_invalid_ldb)
     blasint lda_src = 100, lda_dst = 50;
     char order = 'R';
     char trans = 'T';
-    int expected_info = 9;
+    int expected_info = 8;
 
     int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -890,7 +890,7 @@ CTEST(simatcopy, xerbla_colmajor_notrans_invalid_ldb)
     blasint lda_src = 100, lda_dst = 50;
     char order = 'C';
     char trans = 'N';
-    int expected_info = 9;
+    int expected_info = 8;
 
     int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -907,7 +907,7 @@ CTEST(simatcopy, xerbla_colmajor_trans_invalid_ldb)
     blasint lda_src = 100, lda_dst = 50;
     char order = 'C';
     char trans = 'T';
-    int expected_info = 9;
+    int expected_info = 8;
 
     int passed = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
     ASSERT_EQUAL(TRUE, passed);
diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c
index e50f86de0..7496ccf88 100644
--- a/utest/test_extensions/test_zgeadd.c
+++ b/utest/test_extensions/test_zgeadd.c
@@ -349,7 +349,7 @@ CTEST(zgeadd, xerbla_lda_invalid)
     blasint lda = INVALID;
     blasint ldc = 1;
 
-    int expected_info = 6;
+    int expected_info = 5;
 
     int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -682,7 +682,7 @@ CTEST(zgeadd, c_api_xerbla_n_invalid_row_major)
     blasint lda = 1;
     blasint ldc = 1;
 
-    int expected_info = 1;
+    int expected_info = 2;
 
     int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);
@@ -730,7 +730,7 @@ CTEST(zgeadd, c_api_xerbla_m_invalid_row_major)
     blasint lda = 1;
     blasint ldc = 1;
 
-    int expected_info = 2;
+    int expected_info = 1;
 
     int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
     ASSERT_EQUAL(TRUE, passed);

From cfabc48190bb3ac1b5c6ace9ee560477394054c8 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Thu, 8 Feb 2024 00:22:15 +0300
Subject: [PATCH 652/718] Update rotg tests

---
 utest/test_extensions/test_crotg.c | 24 ++++++++++++------------
 utest/test_extensions/test_zrotg.c | 16 ++++++++--------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/utest/test_extensions/test_crotg.c b/utest/test_extensions/test_crotg.c
index 9db7dc7d3..84875ccf7 100644
--- a/utest/test_extensions/test_crotg.c
+++ b/utest/test_extensions/test_crotg.c
@@ -48,10 +48,10 @@ CTEST(crotg, zero_a)
     float sc;
     BLASFUNC(crotg)(sa, sb, &sc, ss);
     ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0f, ss[0], SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.70711f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, sa[1], SINGLE_EPS);
 }
 
 /**
@@ -83,8 +83,8 @@ CTEST(crotg, zero_real)
     float ss[2];
     float sc;
     BLASFUNC(crotg)(sa, sb, &sc, ss);
-    ASSERT_DBL_NEAR_TOL(0.7071f, sc, SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(0.7071f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS);
     ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
     ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS);
     ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS);
@@ -174,10 +174,10 @@ CTEST(crotg, c_api_zero_a)
     float sc;
     cblas_crotg(sa, sb, &sc, ss);
     ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0f, ss[0], SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.70711f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, sa[1], SINGLE_EPS);
 }
 
 /**
@@ -209,8 +209,8 @@ CTEST(crotg, c_api_zero_real)
     float ss[2];
     float sc;
     cblas_crotg(sa, sb, &sc, ss);
-    ASSERT_DBL_NEAR_TOL(0.7071f, sc, SINGLE_EPS);
-    ASSERT_DBL_NEAR_TOL(0.7071f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS);
     ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
     ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS);
     ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS);
diff --git a/utest/test_extensions/test_zrotg.c b/utest/test_extensions/test_zrotg.c
index 310121422..1de95447d 100644
--- a/utest/test_extensions/test_zrotg.c
+++ b/utest/test_extensions/test_zrotg.c
@@ -48,10 +48,10 @@ CTEST(zrotg, zero_a)
     double sc;
     BLASFUNC(zrotg)(sa, sb, &sc, ss);
     ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0, ss[0], DOUBLE_EPS);
-    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70710678118655, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.70710678118655, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421356237310, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, sa[1], DOUBLE_EPS);
 }
 
 /**
@@ -174,10 +174,10 @@ CTEST(zrotg, c_api_zero_a)
     double sc;
     cblas_zrotg(sa, sb, &sc, ss);
     ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0, ss[0], DOUBLE_EPS);
-    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS);
-    ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70710678118655, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.70710678118655, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421356237310, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, sa[1], DOUBLE_EPS);
 }
 
 /**

From b3fa16345d83b723b8984b78dc6a2bb5d9f3d479 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 8 Feb 2024 13:15:34 +0100
Subject: [PATCH 653/718] fix prototype for c/zaxpby

---
 common_interface.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common_interface.h b/common_interface.h
index 61a82c306..5a2e1654c 100644
--- a/common_interface.h
+++ b/common_interface.h
@@ -773,8 +773,8 @@ xdouble   BLASFUNC(qlamc3)(xdouble *, xdouble *);
 
 void    BLASFUNC(saxpby) (blasint *, float  *, float  *, blasint *, float *, float  *, blasint *);
 void    BLASFUNC(daxpby) (blasint *, double  *, double  *, blasint *, double *, double  *, blasint *);
-void    BLASFUNC(caxpby) (blasint *, float  *, float  *, blasint *, float *, float  *, blasint *);
-void    BLASFUNC(zaxpby) (blasint *, double  *, double  *, blasint *, double *, double  *, blasint *);
+void    BLASFUNC(caxpby) (blasint *, void  *, float  *, blasint *, void *, float  *, blasint *);
+void    BLASFUNC(zaxpby) (blasint *, void  *, double *, blasint *, void *, double  *, blasint *);
 
 void    BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, double  *, blasint *);

From 500ac4de5e20596d5cd797d745db97dd0a62ff86 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 8 Feb 2024 13:18:34 +0100
Subject: [PATCH 654/718] fix incompatible pointer types

---
 interface/zaxpby.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/interface/zaxpby.c b/interface/zaxpby.c
index 3a4db7403..e5065270d 100644
--- a/interface/zaxpby.c
+++ b/interface/zaxpby.c
@@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef CBLAS
 
-void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
+void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY)
 {
 
   blasint n    = *N;
   blasint incx = *INCX;
   blasint incy = *INCY;
+  FLOAT* ALPHA = (FLOAT*) VALPHA;
+  FLOAT* BETA = (FLOAT*) VBETA;
 
 #else
 

From ac6b4b7aa472799245c497ab3caa6e77c0e80852 Mon Sep 17 00:00:00 2001
From: Chip Kerchner <chip.kerchner@ibm.com>
Date: Thu, 8 Feb 2024 08:56:30 -0600
Subject: [PATCH 655/718] Make sure CPU ID works for all POWER_10 conditions

---
 driver/others/dynamic_power.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 16320dc40..cd5e88922 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -69,7 +69,7 @@ static int cpuid(void)
     else if (arch == POWER_9) return CPU_POWER9;
 #endif
 #ifdef POWER_10
-    else if (arch == POWER_10) return CPU_POWER10;
+    else if (arch >= POWER_10) return CPU_POWER10;
 #endif
     return CPU_UNKNOWN;
 }

From d408ecedba8cc1d9b799b89df6996bdcd996d5fc Mon Sep 17 00:00:00 2001
From: Chip Kerchner <chip.kerchner@ibm.com>
Date: Thu, 8 Feb 2024 12:17:18 -0600
Subject: [PATCH 656/718] Add environment variable to display coretype for
 dynamic arch.

---
 driver/others/dynamic_power.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index cd5e88922..4c1f4a26e 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -339,6 +339,9 @@ void gotoblas_dynamic_init(void) {
 	if (gotoblas && gotoblas -> init) {
 		strncpy(coren,gotoblas_corename(),20);
 		sprintf(coremsg, "Core: %s\n",coren);
+		if (getenv("GET_OPENBLAS_CORETYPE")) {
+			fprintf(stderr, "%s", coremsg);
+		}
 		openblas_warning(2, coremsg);
 		gotoblas -> init();
 	} else {

From 83bec5135534c94fe1790aa5795620a7cdeaa916 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 8 Feb 2024 21:23:48 +0100
Subject: [PATCH 657/718] Update with recently added CBLAS interfaces and
 LAPACK/LAPACKE functions

---
 exports/gensymbol | 85 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/exports/gensymbol b/exports/gensymbol
index 704eab06f..93edf2400 100755
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -60,6 +60,7 @@ cblasobjsc="
     cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
     cblas_scnrm2 cblas_scasum cblas_cgemmt
     cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
+    cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
     "
 cblasobjsd="
     cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
@@ -69,6 +70,7 @@ cblasobjsd="
     cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
     cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
     cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
+    cblas_damax  cblas_damin
     "
 
 cblasobjss="
@@ -80,6 +82,7 @@ cblasobjss="
     cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
     cblas_strsv cblas_sgeadd cblas_sgemmt
     cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
+    cblas_samax cblas_samin
     "
 
 cblasobjsz="
@@ -91,6 +94,7 @@ cblasobjsz="
     cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
     cblas_zaxpby cblas_zgeadd cblas_zgemmt
     cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
+    cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
 "
 
 cblasobjs="cblas_xerbla"
@@ -861,6 +865,51 @@ lapackobjs2z="$lapackobjs2z
     zgedmd
     zgedmdq
     "
+
+#functions added post 3.11
+
+lapackobjs2c="$lapackobjs2c
+    claqp2rk
+    claqp3rk
+    claqz0
+    claqz1
+    claqz2
+    claqz3
+    clatrs3
+    ctrsyl3
+    "
+lapackobjs2d="$lapackobjs2d
+    dgelqs
+    dgelst
+    dgeqp3rk
+    dgeqrs
+    dlaqp2rk
+    dlaqp3rk
+    dlaqz0
+    dlaqz1
+    dlaqz2
+    dlaqz3
+    dlaqz4
+    dlarmm
+    dlatrs3
+    dtrsyl3
+    "
+lapackobjs2z="$lapackobjs2z
+    zgelqs
+    zgelst
+    zgeqp3rk
+    zgeqrs
+    zlaqp2rk
+    zlaqp3rk
+    zlaqz0
+    zlaqz1
+    zlaqz2
+    zlaqz3
+    zlatrs3
+    zrscl
+    ztrsyl3
+"
+
 lapack_extendedprecision_objs="
     zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
     dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
@@ -1622,6 +1671,14 @@ lapackeobjsc="
     LAPACKE_cgetsqrhrt_work
     LAPACKE_cungtsqr_row
     LAPACKE_cungtsqr_row_work
+    LAPACKE_clangb
+    LAPACKE_clangb_work
+    LAPACKE_ctrsyl3
+    LAPACKE_ctrsyl3_work
+    LAPACKE_ctz_nancheck
+    LAPACKE_ctz_trans
+    LAPACKE_cunhr_col
+    LAPACKE_cunhr_col_work
 "
 
 lapackeobjsd="
@@ -2239,6 +2296,14 @@ lapackeobjsd="
     LAPACKE_dgetsqrhrt_work
     LAPACKE_dorgtsqr_row
     LAPACKE_dorgtsqr_row_work
+    LAPACKE_dlangb
+    LAPACKE_dlangb_work
+    LAPACKE_dorhr_col
+    LAPACKE_dorhr_col_work
+    LAPACKE_dtrsyl3
+    LAPACKE_dtrsyl3_work
+    LAPACKE_dtz_nancheck
+    LAPACKE_dtz_trans
 "
 
 lapackeobjss="
@@ -2848,6 +2913,14 @@ lapackeobjss="
     LAPACKE_sgetsqrhrt_work
     LAPACKE_sorgtsqr_row
     LAPACKE_sorgtsqr_row_work
+    LAPACKE_slangb
+    LAPACKE_slangb_work
+    LAPACKE_sorhr_col
+    LAPACKE_sorhr_col_work
+    LAPACKE_strsyl3
+    LAPACKE_strsyl3_work
+    LAPACKE_stz_nancheck
+    LAPACKE_stz_trans
 "
 
 lapackeobjsz="
@@ -3515,6 +3588,14 @@ lapackeobjsz="
     LAPACKE_zgetsqrhrt_work
     LAPACKE_zungtsqr_row
     LAPACKE_zungtsqr_row_work
+    LAPACKE_zlangb
+    LAPACKE_zlangb_work
+    LAPACKE_ztrsyl3
+    LAPACKE_ztrsyl3_work
+    LAPACKE_ztz_nancheck
+    LAPACKE_ztz_trans
+    LAPACKE_zunhr_col
+    LAPACKE_zunhr_col_work
 "
 ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
 ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
@@ -3616,6 +3697,7 @@ lapack_embeded_underscore_objs_s="
     ssysv_aa_2stage ssytrf_aa_2stage
     ssytrs_aa_2stage
     slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
+    slarfb_gett
 "
 lapack_embeded_underscore_objs_c="
     chetf2_rook chetrf_rook chetri_rook
@@ -3641,6 +3723,7 @@ lapack_embeded_underscore_objs_c="
     csysv_aa_2stage csytrf_aa_2stage
     csytrs_aa_2stage
     claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
+    clarfb_gett
 "
 lapack_embeded_underscore_objs_d="
     dlasyf_rook
@@ -3658,6 +3741,7 @@ lapack_embeded_underscore_objs_d="
      dsysv_aa_2stage
     dsytrf_aa_2stage dsytrs_aa_2stage
     dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
+    dlarfb_gett
 "
 lapack_embeded_underscore_objs_z="
     zhetf2_rook zhetrf_rook zhetri_rook
@@ -3682,6 +3766,7 @@ lapack_embeded_underscore_objs_z="
     zhetrs_aa_2stage zsysv_aa_2stage
     zsytrf_aa_2stage zsytrs_aa_2stage
     zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
+    zlarfb_gett
 "
 
 dirname=`pwd -P`/../lapack-netlib

From 98c56a7314dbc0032152b7658c73c203124963f9 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Thu, 8 Feb 2024 13:50:15 -0800
Subject: [PATCH 658/718] more cleanup

---
 driver/others/blas_server_win32.c | 35 ++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index ee6d08f8c..89ce9e656 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -73,7 +73,7 @@ static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
 static volatile int thread_target;	// target num of live threads, volatile for cross-thread reads
 
 //
-//
+// Legacy code path
 //
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
 
@@ -369,11 +369,11 @@ int blas_thread_init(void) {
   return 0;
 }
 
-/*
-   User can call one of two routines.
-     exec_blas_async ... immediately returns after jobs are queued.
-     exec_blas       ... returns after jobs are finished.
-*/
+//
+//   User can call one of two routines.
+//     exec_blas_async ... immediately returns after jobs are queued.
+//     exec_blas       ... returns after jobs are finished.
+//
 int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
 
 #if defined(SMP_SERVER)
@@ -471,27 +471,32 @@ int exec_blas(BLASLONG num, blas_queue_t *queue) {
 
   if ((num <= 0) || (queue == NULL)) return 0;
 
-  if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
+  if ((num > 1) && queue -> next) 
+    exec_blas_async(1, queue -> next);
 
   routine = queue -> routine;
 
   if (queue -> mode & BLAS_LEGACY) {
     legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
-  } else
+  } else {
     if (queue -> mode & BLAS_PTHREAD) {
       void (*pthreadcompat)(void *) = queue -> routine;
       (pthreadcompat)(queue -> args);
     } else
       (routine)(queue -> args, queue -> range_m, queue -> range_n,
-		queue -> sa, queue -> sb, 0);
+    		queue -> sa, queue -> sb, 0);
+  }
 
-  if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
+  if ((num > 1) && queue -> next) 
+    exec_blas_async_wait(num - 1, queue -> next);
 
   return 0;
 }
 
+//
 // Shutdown procedure, but user don't have to call this routine. The
 // kernel automatically kill threads.
+//
 int BLASFUNC(blas_thread_shutdown)(void) {
 
   int i;
@@ -502,7 +507,7 @@ int BLASFUNC(blas_thread_shutdown)(void) {
 
   if (blas_server_avail) {
 
-    for(i = 0; i < blas_num_threads - 1; i++) {
+    for (i = 0; i < blas_num_threads - 1; i++) {
       // Could also just use WaitForMultipleObjects
       DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
 
@@ -524,6 +529,9 @@ int BLASFUNC(blas_thread_shutdown)(void) {
   return 0;
 }
 
+//
+// Legacy function to set numbef of threads
+//
 void goto_set_num_threads(int num_threads)
 {
 	long i;
@@ -577,7 +585,7 @@ void goto_set_num_threads(int num_threads)
 			blas_server_avail = 1;
 		}
 
-		for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
+		for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
 			//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
 
 			blas_threads[i] = CreateThread(NULL, 0,
@@ -593,6 +601,9 @@ void goto_set_num_threads(int num_threads)
 	blas_cpu_number  = num_threads;
 }
 
+//
+// Openblas function to set thread count
+//
 void openblas_set_num_threads(int num)
 {
 	goto_set_num_threads(num);

From 93872f4681430a6ae9d857d8044d36ba98d09bf2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 8 Feb 2024 23:02:09 +0100
Subject: [PATCH 659/718] drop the ?laqz? symbols for now (not translatable by
 f2c)

---
 exports/gensymbol | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index 93edf2400..226035842 100755
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -871,13 +871,14 @@ lapackobjs2z="$lapackobjs2z
 lapackobjs2c="$lapackobjs2c
     claqp2rk
     claqp3rk
-    claqz0
-    claqz1
-    claqz2
-    claqz3
-    clatrs3
     ctrsyl3
     "
+#    claqz0
+#    claqz1
+#    claqz2
+#    claqz3
+#    clatrs3
+
 lapackobjs2d="$lapackobjs2d
     dgelqs
     dgelst
@@ -885,15 +886,16 @@ lapackobjs2d="$lapackobjs2d
     dgeqrs
     dlaqp2rk
     dlaqp3rk
-    dlaqz0
-    dlaqz1
-    dlaqz2
-    dlaqz3
-    dlaqz4
     dlarmm
     dlatrs3
     dtrsyl3
     "
+#    dlaqz0
+#    dlaqz1
+#    dlaqz2
+#    dlaqz3
+#    dlaqz4
+
 lapackobjs2z="$lapackobjs2z
     zgelqs
     zgelst
@@ -901,14 +903,14 @@ lapackobjs2z="$lapackobjs2z
     zgeqrs
     zlaqp2rk
     zlaqp3rk
-    zlaqz0
-    zlaqz1
-    zlaqz2
-    zlaqz3
     zlatrs3
     zrscl
     ztrsyl3
-"
+    "
+#    zlaqz0
+#    zlaqz1
+#    zlaqz2
+#    zlaqz3
 
 lapack_extendedprecision_objs="
     zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx

From ff1523163f71e1ad9f1e4a2c8548416f213e7cb1 Mon Sep 17 00:00:00 2001
From: Sergei Lewis <slewis@rivosinc.com>
Date: Fri, 9 Feb 2024 12:59:14 +0000
Subject: [PATCH 660/718] Fix axpy test hangs when n==0. Reenable zaxpy_vector
 kernel for C910V.

---
 kernel/riscv64/KERNEL.C910V   | 4 ++--
 kernel/riscv64/axpy_vector.c  | 2 +-
 kernel/riscv64/zaxpy_vector.c | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V
index 066329390..2798a870e 100644
--- a/kernel/riscv64/KERNEL.C910V
+++ b/kernel/riscv64/KERNEL.C910V
@@ -42,8 +42,8 @@ ZSUMKERNEL  = ../arm/zsum.c
 
 SAXPYKERNEL  = axpy_vector.c
 DAXPYKERNEL  = axpy_vector.c
-CAXPYKERNEL  = zaxpy.c
-ZAXPYKERNEL  = zaxpy.c
+CAXPYKERNEL  = zaxpy_vector.c
+ZAXPYKERNEL  = zaxpy_vector.c
 
 SAXPBYKERNEL  = axpby_vector.c
 DAXPBYKERNEL  = axpby_vector.c
diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c
index e99ca8542..6dffe5f09 100644
--- a/kernel/riscv64/axpy_vector.c
+++ b/kernel/riscv64/axpy_vector.c
@@ -65,7 +65,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 	FLOAT_V_T vy0, vy1;
 	BLASLONG stride_x, stride_y;
 
-	if (n < 0)  return(0);
+	if (n <= 0)  return(0);
 	if (da == 0.0) return(0);
 
 	if (inc_x == 1 && inc_y == 1) {
diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
index d19e51118..1e766c5f4 100644
--- a/kernel/riscv64/zaxpy_vector.c
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -45,10 +45,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
-	BLASLONG i = 0, j = 0;
-	BLASLONG ix = 0,iy = 0;
-	if(n < 0) return(0);
-	if(da_r == 0.0 && da_i == 0.0) return(0);
+        BLASLONG i = 0, j = 0;
+        BLASLONG ix = 0,iy = 0;
+        if(n <= 0) return(0);
+        if(da_r == 0.0 && da_i == 0.0) return(0);
         unsigned int gvl = 0;
         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);

From bb96e466aee3a969bfb767d347262977bc918658 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 9 Feb 2024 15:50:11 +0100
Subject: [PATCH 661/718] Introduce LIBNAMEPREFIX to avoid messing with the
 internal LIBPREFIX

---
 Makefile.install |  4 ++--
 Makefile.rule    |  5 +++++
 Makefile.system  | 12 ++++++++----
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/Makefile.install b/Makefile.install
index e7d129cce..2b00db4be 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -172,7 +172,7 @@ endif
 
 	@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
 	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
-	@echo 'libprefix='$(LIBPREFIX) >> "$(PKGFILE)"
+	@echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)"
 	@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
 	@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
 	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
@@ -190,7 +190,7 @@ endif
 ifneq ($(NO_SHARED),1)
 #ifeq logical or
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
-	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
+	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(LIBNAMESUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
 	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
diff --git a/Makefile.rule b/Makefile.rule
index 8dbf5eab6..02ec739fd 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -5,6 +5,11 @@
 # This library's version
 VERSION = 0.3.26.dev
 
+# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
+# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
+# 
+# LIBNAMEPREFIX = scipy
+
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
 # is libopenblas_$(LIBNAMESUFFIX).so.0.
diff --git a/Makefile.system b/Makefile.system
index 49bd1cbed..233663ef9 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1516,6 +1516,10 @@ ifndef LIBSONAMEBASE
 LIBSONAMEBASE = openblas
 endif
 
+ifndef LIBNAMEPREFIX
+LIBNAMEPREFIX =
+endif
+
 ifndef LIBNAMESUFFIX
 LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
 else
@@ -1523,9 +1527,9 @@ LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
 endif
 
 ifeq ($(OSNAME), CYGWIN_NT)
-LIBPREFIX = cyg$(LIBNAMEBASE)
+LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE)
 else
-LIBPREFIX = lib$(LIBNAMEBASE)
+LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE)
 endif
 
 KERNELDIR	= $(TOPDIR)/kernel/$(ARCH)
@@ -1705,8 +1709,8 @@ endif
 endif
 
 ifeq ($(FIXED_LIBNAME),1)
-	LIBNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
-	LIBNAME_P = $(LIBPREFIX)$(LISOBNAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
+	LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
+	LIBNAME_P = lib$(LIBNAMEPREFIX)$(LISOBNAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
 endif
 
 LIBDLLNAME   = $(LIBPREFIX).dll

From 10ea3fb7421b4beeddbaff9e759cdcf873c4b369 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 9 Feb 2024 17:09:55 +0100
Subject: [PATCH 662/718] fix duplication of name parts

---
 exports/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/exports/Makefile b/exports/Makefile
index 238623b04..e8dea0364 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -133,7 +133,7 @@ libgoto_hpl.def : $(GENSYM)
 
 ifeq ($(OSNAME), Darwin)
 ifeq ($(FIXED_LIBNAME),1)
-INTERNALNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).dylib
+INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib
 else
 INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
 endif
@@ -174,7 +174,7 @@ FEXTRALIB += -lm
 EXTRALIB += -lm
 else
 ifeq ($(FIXED_LIBNAME),1)
-INTERNALNAME = $(LIBPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).so
+INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so
 else
 INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif

From 4c554bd527cc3b8ed0c160cee457e23bfe442343 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Sat, 10 Feb 2024 00:46:52 +0300
Subject: [PATCH 663/718] check abs zero inc

---
 utest/test_extensions/test_damin.c  | 4 ++--
 utest/test_extensions/test_dzamax.c | 4 ++--
 utest/test_extensions/test_dzamin.c | 4 ++--
 utest/test_extensions/test_samin.c  | 4 ++--
 utest/test_extensions/test_scamax.c | 4 ++--
 utest/test_extensions/test_scamin.c | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c
index fdd2bc658..736921fa3 100644
--- a/utest/test_extensions/test_damin.c
+++ b/utest/test_extensions/test_damin.c
@@ -61,11 +61,11 @@ CTEST(damin, step_zero){
    blasint N = ELEMENTS, inc = 0;
    double x[ELEMENTS];
    for (i = 0; i < N; i ++) {
-      x[i] = i + 1000;
+      x[i] = i - 1000;
    }
    x[8] = 0.0;
    double amin = BLASFUNC(damin)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL(x[0], amin, DOUBLE_EPS);
+   ASSERT_DBL_NEAR_TOL(fabs(x[0]), amin, DOUBLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_dzamax.c b/utest/test_extensions/test_dzamax.c
index bdb3a4f18..7bc0200c9 100644
--- a/utest/test_extensions/test_dzamax.c
+++ b/utest/test_extensions/test_dzamax.c
@@ -61,10 +61,10 @@ CTEST(dzamax, step_zero){
    blasint N = ELEMENTS * 2, inc = 0;
    double x[ELEMENTS * 2];
    for (i = 0; i < N; i ++) {
-      x[i] = i + 1000;
+      x[i] = i - 1000;
    }
    double amax = BLASFUNC(dzamax)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amax, DOUBLE_EPS);
+   ASSERT_DBL_NEAR_TOL((fabs(x[0]) + fabs(x[1])), amax, DOUBLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_dzamin.c b/utest/test_extensions/test_dzamin.c
index 9fcf87b7b..549881fdc 100644
--- a/utest/test_extensions/test_dzamin.c
+++ b/utest/test_extensions/test_dzamin.c
@@ -61,11 +61,11 @@ CTEST(dzamin, step_zero){
    blasint N = ELEMENTS * 2, inc = 0;
    double x[ELEMENTS * 2];
    for (i = 0; i < N; i ++) {
-      x[i] = i + 1000;
+      x[i] = i - 1000;
    }
    x[8] = 0.0;
    double amin = BLASFUNC(dzamin)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amin, DOUBLE_EPS);
+   ASSERT_DBL_NEAR_TOL((fabs(x[0]) + fabs(x[1])), amin, DOUBLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_samin.c b/utest/test_extensions/test_samin.c
index 2e3a73797..fd34d462a 100644
--- a/utest/test_extensions/test_samin.c
+++ b/utest/test_extensions/test_samin.c
@@ -61,11 +61,11 @@ CTEST(samin, step_zero){
    blasint N = ELEMENTS, inc = 0;
    float x[ELEMENTS];
    for (i = 0; i < N; i ++) {
-      x[i] = i + 1000;
+      x[i] = i - 1000;
    }
    x[8] = 0.0;
    float amin = BLASFUNC(samin)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL(x[0], amin, SINGLE_EPS);
+   ASSERT_DBL_NEAR_TOL(fabsf(x[0]), amin, SINGLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_scamax.c b/utest/test_extensions/test_scamax.c
index 0f49ebfad..8c214ddff 100644
--- a/utest/test_extensions/test_scamax.c
+++ b/utest/test_extensions/test_scamax.c
@@ -61,10 +61,10 @@ CTEST(scamax, step_zero){
    blasint N = ELEMENTS * 2, inc = 0;
    float x[ELEMENTS * 2];
    for (i = 0; i < N; i ++) {
-      x[i] = i + 1000;
+      x[i] = i - 1000;
    }
    float amax = BLASFUNC(scamax)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amax, SINGLE_EPS);
+   ASSERT_DBL_NEAR_TOL((fabsf(x[0]) + fabsf(x[1])), amax, SINGLE_EPS);
 }
 
 /**
diff --git a/utest/test_extensions/test_scamin.c b/utest/test_extensions/test_scamin.c
index 0f0414a1c..507548f2a 100644
--- a/utest/test_extensions/test_scamin.c
+++ b/utest/test_extensions/test_scamin.c
@@ -61,11 +61,11 @@ CTEST(scamin, step_zero){
    blasint N = ELEMENTS * 2, inc = 0;
    float x[ELEMENTS * 2];
    for (i = 0; i < N; i ++) {
-      x[i] = i + 1000;
+      x[i] = i - 1000;
    }
    x[8] = 0.0f;
    float amin = BLASFUNC(scamin)(&N, x, &inc);
-   ASSERT_DBL_NEAR_TOL((x[0] + x[1]), amin, SINGLE_EPS);
+   ASSERT_DBL_NEAR_TOL((fabsf(x[0]) + fabsf(x[1])), amin, SINGLE_EPS);
 }
 
 /**

From 5e9ead09ac03f27f7906576d4ec16cb80dc9fb4d Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Sat, 10 Feb 2024 00:47:25 +0300
Subject: [PATCH 664/718] fix info return

---
 interface/geadd.c  | 4 ++--
 interface/zgeadd.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/interface/geadd.c b/interface/geadd.c
index 3a0ea015d..a2e6d1081 100644
--- a/interface/geadd.c
+++ b/interface/geadd.c
@@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order,
 
     if (ldc < MAX(1, m))  info = 8;
     if (lda < MAX(1, m))  info = 5;
-    if (n < 0)		  info = 2;
-    if (m < 0)		  info = 1;
+    if (n < 0)		  info = 1;
+    if (m < 0)		  info = 2;
   }
 
   if (info >= 0) {
diff --git a/interface/zgeadd.c b/interface/zgeadd.c
index 7124cf230..de71f27b8 100644
--- a/interface/zgeadd.c
+++ b/interface/zgeadd.c
@@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
   info = 0;
 
 
-  if (lda < MAX(1, m))	info = 6;
+  if (lda < MAX(1, m))	info = 5;
   if (ldc < MAX(1, m))	info = 8;
 
   if (n < 0)		info = 2;
@@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order,
 
     if (ldc < MAX(1, m))  info = 8;
     if (lda < MAX(1, m))  info = 5;
-    if (n < 0)		  info = 2;
-    if (m < 0)		  info = 1;
+    if (n < 0)		  info = 1;
+    if (m < 0)		  info = 2;
   }
 
   if (info >= 0) {

From c6f30fd4146258dcdfc7cf1322ad1e0b4f88e1ac Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Sat, 10 Feb 2024 00:48:07 +0300
Subject: [PATCH 665/718] check for zero inc

---
 interface/max.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/interface/max.c b/interface/max.c
index 6c7d32bd9..7817601b9 100644
--- a/interface/max.c
+++ b/interface/max.c
@@ -46,6 +46,12 @@
 
 #ifdef USE_ABS
 
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
 #ifndef USE_MIN
 
 /* ABS & MAX */
@@ -92,6 +98,8 @@
 
 #else
 
+#define ABS
+
 #ifndef USE_MIN
 
 /* MAX */
@@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
 
   if (n <= 0) return 0;
 
+#ifndef COMPLEX
+  if (incx == 0) return (ABS(*x));
+#else
+  if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
+#endif
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
@@ -158,6 +172,12 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
 
   if (n <= 0) return 0;
 
+#ifndef COMPLEX
+  if (incx == 0) return (ABS(*x));
+#else
+  if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
+#endif
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();

From 7e9b1c08074d1e88f446fc5861f44c8844e93d30 Mon Sep 17 00:00:00 2001
From: kseniyazaytseva <kseniya.zaytseva@syntacore.com>
Date: Sat, 10 Feb 2024 00:49:42 +0300
Subject: [PATCH 666/718] fix uninitialized data usage

---
 interface/zrotg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/zrotg.c b/interface/zrotg.c
index ea73352dd..8acc3c9b2 100644
--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@@ -102,7 +102,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
 		    if (ada >= h *safmin) {
 			*C = sqrt(ada/h);
 			*R = *DA / *C;
-			*(R+1) = *(DA+1) / *(C+1);
+			*(R+1) = *(DA+1) / *C;
 			rtmax *= 2.;
 			if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
 				*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
@@ -115,7 +115,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
 		        *C = ada / adahsq;
 		        if (*C >= safmin) {
 			    *R = *DA / *C;
-			    *(R+1) = *(DA+1) / *(C+1);
+			    *(R+1) = *(DA+1) / *C;
 			} else {
 			    *R = *DA * (h / adahsq);
 			    *(R+1) = *(DA+1) * (h / adahsq);

From 8698f9e37f63bd0a57b8182aaecb46cac2adf620 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Sat, 10 Feb 2024 19:12:16 +0100
Subject: [PATCH 667/718] Adding basic support of benchmarks into CMake for
 single, double, single complex and double complex cases. Each benchmarking
 target has a suffix to identify the data type, for example
 ./benchmark_gemm3m_COMPLEX_DOUBLE is a gemm3m.c source compiled with COMPLEX
 and DOUBLE macros defined

---
 CMakeLists.txt | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a1e4b271..059035ac3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -452,6 +452,34 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
   endif()
 endif()
 
+if (BUILD_TESTING)
+  find_package(OpenMP REQUIRED)
+  file(GLOB SOURCES "benchmark/*.c")
+  foreach(source ${SOURCES})
+    get_filename_component(name ${source} NAME_WE)
+    if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
+        set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
+      foreach(define ${defines})
+        set(target_name "benchmark_${name}")
+        if (NOT "${define}" STREQUAL "DEFAULT")
+          string(JOIN "_" define_str ${define})
+          set(target_name "${target_name}_${define_str}")
+        endif()
+        if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
+          add_executable(${target_name} ${source})
+          target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+          target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
+          if (NOT "${define}" STREQUAL "DEFAULT")
+            target_compile_definitions(${target_name} PRIVATE ${define})
+          endif()
+        endif()
+      endforeach()
+    endif()
+  endforeach()
+endif()
 
 
 # Install project

From d0f5dc763be1783874c5fe5c84c0eef90e73a6d1 Mon Sep 17 00:00:00 2001
From: Dmitry Mikushin <dmitry@kernelgen.org>
Date: Mon, 12 Feb 2024 02:18:03 +0100
Subject: [PATCH 668/718] Adding USE_GEMM3M macro to kernel targets, so that
 the *gemm3m functions and parameters can be included into the gotoblas
 structure. Fixes #4500

---
 kernel/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index 60314eedb..74e6760c2 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -1349,6 +1349,9 @@ endif ()
     set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}")
     get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES)
     set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}")
+    if (USE_GEMM3M)
+      target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M)
+    endif()
 endfunction ()
 
 

From 10548a0460d0b6abd160e69cd7ca727d41681584 Mon Sep 17 00:00:00 2001
From: Mark Seminatore <mseminatore@users.noreply.github.com>
Date: Mon, 12 Feb 2024 10:22:12 -0800
Subject: [PATCH 669/718] update contributors

---
 CONTRIBUTORS.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 493747052..8f7abc5f8 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -218,4 +218,6 @@ In chronological order:
   * [2022-08] Fix building from sources for QNX
 
 * Mark Seminatore <https://github.com/mseminatore>
-  * [2023-11-09] Improve Windows threading performance scaling
\ No newline at end of file
+  * [2023-11-09] Improve Windows threading performance scaling
+  * [2024-02-09] Introduce MT_TRACE facility and improve code consistency
+  
\ No newline at end of file

From a28afac791853e93c8462199d7ce18694ccc0a2c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 15 Feb 2024 11:48:33 +0100
Subject: [PATCH 670/718] Add FIXED_LIBNAME, LIBNAMEPREFIX and LIBNAMESUFFIX

---
 CMakeLists.txt | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a1e4b271..a4440ee1b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,6 +40,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni
 
 option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
 
+option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF)
+
+set(LIBNAMEPREFIX "" CACHE STRING  "Add a prefix to the openblas part of the library name" )
+set(LIBNAMESUFFIX "" CACHE STRING  "Add a suffix after the openblas part of the library name" )
+
 if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
   option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
 else()
@@ -96,7 +101,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil
 include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
 include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
 
-set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
+set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
 
 set(BLASDIRS interface driver/level2 driver/level3 driver/others)
 
@@ -336,11 +341,12 @@ endif()
     add_subdirectory(cpp_thread_test)
   endif()
 
+if (NOT FIXED_LIBNAME)
 set_target_properties(${OpenBLAS_LIBS} PROPERTIES
   VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
   SOVERSION ${OpenBLAS_MAJOR_VERSION}
 )
-
+endif()
 if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
   if (NOT MSVC)
     target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")

From ffbfc3c692c44029eda64a5bc2cc1292bac5572e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 15 Feb 2024 12:16:34 +0100
Subject: [PATCH 671/718] Add libname prefix and suffix

---
 cmake/openblas.pc.in | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in
index 11e5606e5..9526d2df6 100644
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@@ -1,4 +1,6 @@
 libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+libnameprefix=@LIBNAMEPREFIX@
+libnamesuffix=@LIBNAMESUFFIX@
 libsuffix=@SUFFIX64_UNDERSCORE@
 includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 
@@ -7,5 +9,5 @@ Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: @OpenBLAS_VERSION@
 URL: https://github.com/OpenMathLib/OpenBLAS
-Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} 
+Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} 
 Cflags: -I${includedir}

From a0e3f77e0bdbaab44ac86076e15ad15c65c15106 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 15 Feb 2024 12:17:38 +0100
Subject: [PATCH 672/718] add FIXED_LIBNAME, PREFIX and SUFFIX

---
 cmake/system.cmake | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index bc87f7b44..95f34e9f8 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -501,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH
 endif()
 endif()
 endif()
+  
+set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas")
+
 if (DEFINED LIBNAMESUFFIX)
-  set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
-else ()
-  set(LIBPREFIX "libopenblas")
+  set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}")
 endif ()
 
 if (NOT DEFINED SYMBOLPREFIX)
@@ -679,6 +680,10 @@ else ()
   endif ()
 endif ()
 
+if (DEFINED FIXED_LIBNAME)
+  set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}")
+  set (LIBNAME "${LIBPREFIX}_p.${LIBSUFFIX}")
+endif()
 
 set(LIBDLLNAME "${LIBPREFIX}.dll")
 set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")

From 3120f12e767f05cf59e53e903c6a2a2ca9440b07 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 15 Feb 2024 14:16:20 +0100
Subject: [PATCH 673/718] allow for more pre- and suffixes in the name of the
 openblas library

---
 lapack-netlib/TESTING/EIG/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt
index e7236677a..d9c34fe98 100644
--- a/lapack-netlib/TESTING/EIG/CMakeLists.txt
+++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt
@@ -106,7 +106,7 @@ set(ZDMDEIGTST zchkdmd.f90)
 
 macro(add_eig_executable name)
   add_executable(${name} ${ARGN})
-  target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
 #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 

From c90979d8ef1aa69f7993efabb1b652d96d0f286c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 15 Feb 2024 14:17:11 +0100
Subject: [PATCH 674/718] allow for more pre- and suffixes in the name of the
 openblas library

---
 lapack-netlib/TESTING/LIN/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index 143fd0597..95baa3122 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -239,7 +239,7 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
 
 macro(add_lin_executable name)
   add_executable(${name} ${ARGN})
-  target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
 #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 

From ca6b4961e4a5d2804372769f7d04cf5d7d86968e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 15 Feb 2024 14:31:11 +0100
Subject: [PATCH 675/718] updates to fix option conflicts and config file
 generation

---
 Makefile.install | 4 +++-
 Makefile.system  | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Makefile.install b/Makefile.install
index 2b00db4be..b2bc5aa41 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -7,6 +7,8 @@ LNCMD = ln -fs
 ifdef THELIBNAME
 LIBNAME=$(THELIBNAME)
 LIBSONAME=$(THELIBSONAME)
+endif
+ifeq ($(FIXED_LIBNAME), 1)
 LNCMD = true
 endif
 ifeq ($(INTERFACE64),1)
@@ -190,7 +192,7 @@ endif
 ifneq ($(NO_SHARED),1)
 #ifeq logical or
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
-	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(LIBNAMESUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
+	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
 	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
diff --git a/Makefile.system b/Makefile.system
index 233663ef9..2be7024c8 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1523,7 +1523,7 @@ endif
 ifndef LIBNAMESUFFIX
 LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
 else
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
+LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX)
 endif
 
 ifeq ($(OSNAME), CYGWIN_NT)

From ba17758c02134acb327a4b71202be6be15e36dbd Mon Sep 17 00:00:00 2001
From: Sergei Lewis <slewis@rivosinc.com>
Date: Fri, 16 Feb 2024 15:58:02 +0000
Subject: [PATCH 676/718] fix axpy implementations where y has a stride of 0

---
 kernel/riscv64/axpy_rvv.c    | 26 ++++++++++++++++++++++++--
 kernel/riscv64/axpy_vector.c | 24 +++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c
index 8bc2f30de..2d5293f76 100644
--- a/kernel/riscv64/axpy_rvv.c
+++ b/kernel/riscv64/axpy_rvv.c
@@ -30,19 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m8(n)
 #define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_M1_T            vfloat32m1_t
 #define VLEV_FLOAT              __riscv_vle32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
 #define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSEV_FLOAT_M1           __riscv_vse32_v_f32m1
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m8(n)
 #define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_M1_T            vfloat64m1_t
 #define VLEV_FLOAT              __riscv_vle64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
 #define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSEV_FLOAT_M1           __riscv_vse64_v_f64m1
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@@ -76,7 +86,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
             VSEV_FLOAT(y, vy, vl);
         }
 
-    } else if (1 == inc_x) {
+    } else if (1 == inc_x && 0 != inc_y) {
 
         BLASLONG stride_y = inc_y * sizeof(FLOAT);
 
@@ -89,8 +99,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
             VSSEV_FLOAT(y, stride_y, vy, vl);
         }
 
-    } else {
+    } else if( 0 == inc_y ) {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        size_t in_vl = VSETVL(n);
+        vy = VFMVVF_FLOAT( y[0], in_vl );
 
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+        }
+        FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( 0.0f, 1 );
+        vres = VFREDSUMVS_FLOAT( vy, vres, in_vl );
+        VSEV_FLOAT_M1(y, vres, 1);
+    } else {
         BLASLONG stride_x = inc_x * sizeof(FLOAT);
         BLASLONG stride_y = inc_y * sizeof(FLOAT);
 
diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c
index 6dffe5f09..c77a18afa 100644
--- a/kernel/riscv64/axpy_vector.c
+++ b/kernel/riscv64/axpy_vector.c
@@ -51,11 +51,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,    	ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_M1_T    JOIN(vfloat,    	ELEN,   m1,   _t,     _)
 #define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
 #define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
 #define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
 #define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
 #define VFMACCVF_FLOAT  JOIN(RISCV_RVV(vfmacc),    _vf_f, 	ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
@@ -123,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 			VSEV_FLOAT(&y[j], vy0, gvl);
 			j += gvl;
 		}
-        }else if(inc_x == 1){
+	} else if (1 == inc_x && 0 != inc_y) {
 		stride_y = inc_y * sizeof(FLOAT);
                 gvl = VSETVL(n);
                 if(gvl <= n/2){
@@ -151,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 			VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
 			j += gvl;
 		}
+	} else if( 0 == inc_y ) {
+	        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+	        size_t in_vl = VSETVL(n);
+	        vy0 = VFMVVF_FLOAT( y[0], in_vl );
+
+	        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+	            vl = VSETVL(n);
+	            vx0 = VLSEV_FLOAT(x, stride_x, vl);
+	            vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl);
+	        }
+	        FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 );
+	        v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl );
+	        y[0] = EXTRACT_FLOAT(v_res);
 	}else{
 		stride_x = inc_x * sizeof(FLOAT);
 		stride_y = inc_y * sizeof(FLOAT);

From 461ecabb2249fd598b325a91d2b4dfccbc90a824 Mon Sep 17 00:00:00 2001
From: Sergei Lewis <slewis@rivosinc.com>
Date: Fri, 16 Feb 2024 11:33:28 +0000
Subject: [PATCH 677/718] add RISCV64_ZVL128B and RISCV64_ZVL256B targets to CI
 flows and to README.md

---
 .github/workflows/riscv64_vector.yml | 253 +++++++++++++++++++++++++++
 README.md                            |  10 ++
 common_riscv64.h                     |   2 +-
 3 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/riscv64_vector.yml

diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml
new file mode 100644
index 000000000..dd6fe9ca8
--- /dev/null
+++ b/.github/workflows/riscv64_vector.yml
@@ -0,0 +1,253 @@
+name: riscv64 zvl256b qemu test
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: ubuntu-latest
+    env:
+      triple: riscv64-unknown-linux-gnu
+      riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
+      riscv_gnu_toolchain_version: 13.2.0
+      riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: RISCV64_ZVL128B
+            opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
+            qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
+          - target: RISCV64_ZVL256B
+            opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
+            qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: install build deps
+        run: |
+          sudo apt-get update
+          sudo apt-get install autoconf automake autotools-dev ninja-build make \
+          libgomp1-riscv64-cross ccache
+          wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
+          tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: build OpenBLAS libs
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
+
+      - name: build OpenBLAS tests
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='${triple}-gcc' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
+
+      - name: build lapack-netlib tests
+        working-directory: ./lapack-netlib/TESTING
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='${triple}-gcc' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
+            LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
+            LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
+            LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \
+
+      - name: OpenBLAS tests
+        shell: bash
+        run: |
+          export PATH="/opt/riscv/bin:$PATH"
+          export QEMU_CPU=${{ matrix.qemu_cpu }}
+          rm -rf ./test_out
+          mkdir -p ./test_out
+          run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \
+            echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \
+            if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \ 
+            else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \
+            RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \
+          }
+          run_test test cblat1 &
+          run_test test cblat2 cblat2.dat &
+          run_test test cblat3 cblat3.dat &
+          run_test test dblat1 &
+          run_test test dblat2 dblat2.dat &
+          run_test test dblat3 dblat3.dat &
+          run_test test sblat1 &
+          run_test test sblat2 sblat2.dat &
+          run_test test sblat3 sblat3.dat &
+          run_test test zblat1 &
+          run_test test zblat2 zblat2.dat &
+          run_test test zblat3 zblat3.dat &
+          run_test ctest xccblat1 &
+          run_test ctest xccblat2 cin2 &
+          run_test ctest xccblat3 cin3 &
+          run_test ctest xdcblat1 &
+          run_test ctest xdcblat2 din2 &
+          run_test ctest xdcblat3 din3 &
+          run_test ctest xscblat1 &
+          run_test ctest xscblat2 sin2 &
+          run_test ctest xscblat3 sin3 &
+          run_test ctest xzcblat1 &
+          run_test ctest xzcblat2 zin2 &
+          run_test ctest xzcblat3 zin3 &
+          wait
+          while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
+          if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
+
+      - name: netlib tests
+        shell: bash
+        run: |
+          : # these take a very long time
+          echo "Skipping netlib tests in CI"
+          exit 0
+          : # comment out exit above to enable the tests
+          : # probably we want to identify a subset to run in CI
+          export PATH="/opt/riscv/bin:$PATH"
+          export QEMU_CPU=${{ matrix.qemu_cpu }}
+          rm -rf ./test_out
+          mkdir -p ./test_out
+          run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \
+            echo "$4" >> $OUTPUT; \
+            echo "$CMD" >> $OUTPUT; \
+            qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \
+            RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \
+            if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \
+            if grep -q rror $OUTPUT | grep -v -q "passed" | grep -v "largest error" ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \
+          }
+          run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines"  &
+          run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines"  &
+          run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines"  &
+          run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines"  &
+          run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines"  &
+          run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines"  &
+          run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines"  &
+          run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines"  &
+          run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines"  &
+          run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines"  &
+          run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines"  &
+          run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines"  &
+          run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver"  &
+          run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines"  &
+          run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix"  &
+          run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix"  &
+          run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices"  &
+          run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices"  &
+          run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines"  &
+          run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines"  &
+          run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines"  &
+          run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines"  &
+          run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver"  &
+          run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines"  &
+          run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix"  &
+          run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix"  &
+          run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices"  &
+          run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices"  &
+          run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines"  &
+          run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines"  &
+          run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines"  &
+          run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines"  &
+          run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver"  &
+          run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines"  &
+          run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix"  &
+          run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix"  &
+          run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices"  &
+          run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices"  &
+          run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines"  &
+          run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines"  &
+          run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines"  &
+          run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines"  &
+          run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver"  &
+          run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines"  &
+          run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix"  &
+          run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix"  &
+          run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices"  &
+          run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices"  &
+          run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines"  &
+          run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines"  &
+          run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines"  &
+          wait
+          while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ;  done < <(grep -lZ FAIL ./test_out/*)
+          python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
+          TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
+          NUMERICAL_ERRORS=-1
+          OTHER_ERRORS=-1
+          . <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary
+          if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
+          if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
diff --git a/README.md b/README.md
index 2f0a0da4c..43f390db0 100644
--- a/README.md
+++ b/README.md
@@ -203,6 +203,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
   ```
 
+- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
+e.g.:
+  ```sh
+make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
+    BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
+    AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
+    LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
+    HOSTCC=gcc HOSTFC=gfortran -j
+  ```
+
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
diff --git a/common_riscv64.h b/common_riscv64.h
index ab3bfa25a..eccfc644f 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS
 
-#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
+#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
 # include <riscv_vector.h>
 #endif
 

From 4787a55c64a17d80020c370c3e439f362979c83e Mon Sep 17 00:00:00 2001
From: pengxu <pengxu@loongson.cn>
Date: Tue, 20 Feb 2024 20:41:45 +0800
Subject: [PATCH 678/718] Optimized cgemm kernel 16x4 LASX for LoongArch

---
 kernel/generic/zhemm_ltcopy_16.c            | 1170 ++++++
 kernel/generic/zhemm_utcopy_16.c            | 1168 ++++++
 kernel/generic/zneg_tcopy_16.c              |  587 +++
 kernel/generic/zsymm_lcopy_16.c             |  333 ++
 kernel/generic/zsymm_ucopy_16.c             |  332 ++
 kernel/generic/ztrmm_lncopy_16.c            | 2310 ++++++++++++
 kernel/generic/ztrmm_ltcopy_16.c            | 2313 ++++++++++++
 kernel/generic/ztrmm_uncopy_16.c            | 2316 ++++++++++++
 kernel/generic/ztrmm_utcopy_16.c            | 2318 ++++++++++++
 kernel/generic/ztrsm_lncopy_16.c            |  308 ++
 kernel/generic/ztrsm_ltcopy_16.c            |  264 ++
 kernel/generic/ztrsm_uncopy_16.c            |  313 ++
 kernel/generic/ztrsm_utcopy_16.c            |  261 ++
 kernel/loongarch64/KERNEL.LOONGSON3R5       |   10 +-
 kernel/loongarch64/cgemm_kernel_16x4_lasx.S | 3757 +++++++++++++++++++
 kernel/loongarch64/cgemm_ncopy_16_lasx.S    |  691 ++++
 kernel/loongarch64/cgemm_ncopy_4_lasx.S     |  325 ++
 kernel/loongarch64/cgemm_tcopy_16_lasx.S    |  741 ++++
 kernel/loongarch64/cgemm_tcopy_4_lasx.S     |  306 ++
 param.h                                     |   12 +-
 20 files changed, 19828 insertions(+), 7 deletions(-)
 create mode 100644 kernel/generic/zhemm_ltcopy_16.c
 create mode 100644 kernel/generic/zhemm_utcopy_16.c
 create mode 100644 kernel/generic/zneg_tcopy_16.c
 create mode 100644 kernel/generic/zsymm_lcopy_16.c
 create mode 100644 kernel/generic/zsymm_ucopy_16.c
 create mode 100644 kernel/generic/ztrmm_lncopy_16.c
 create mode 100644 kernel/generic/ztrmm_ltcopy_16.c
 create mode 100644 kernel/generic/ztrmm_uncopy_16.c
 create mode 100644 kernel/generic/ztrmm_utcopy_16.c
 create mode 100644 kernel/generic/ztrsm_lncopy_16.c
 create mode 100644 kernel/generic/ztrsm_ltcopy_16.c
 create mode 100644 kernel/generic/ztrsm_uncopy_16.c
 create mode 100644 kernel/generic/ztrsm_utcopy_16.c
 create mode 100644 kernel/loongarch64/cgemm_kernel_16x4_lasx.S
 create mode 100644 kernel/loongarch64/cgemm_ncopy_16_lasx.S
 create mode 100644 kernel/loongarch64/cgemm_ncopy_4_lasx.S
 create mode 100644 kernel/loongarch64/cgemm_tcopy_16_lasx.S
 create mode 100644 kernel/loongarch64/cgemm_tcopy_4_lasx.S

diff --git a/kernel/generic/zhemm_ltcopy_16.c b/kernel/generic/zhemm_ltcopy_16.c
new file mode 100644
index 000000000..8797891ea
--- /dev/null
+++ b/kernel/generic/zhemm_ltcopy_16.c
@@ -0,0 +1,1170 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js, offset;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
+  FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
+
+  FLOAT *ao1, *ao2,  *ao3,  *ao4,  *ao5,  *ao6,  *ao7,  *ao8;
+  FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16;
+
+  lda *= 2;
+
+  js = (n >> 4);
+  while (js > 0){
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+    if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
+    if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
+    if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
+    if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
+	if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda;
+    if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda;
+    if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda;
+    if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda;
+    if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda;
+    if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda;
+    if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda;
+    if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+	  data17 = *(ao9 + 0);
+      data18 = *(ao9 + 1);
+      data19 = *(ao10 + 0);
+      data20 = *(ao10 + 1);
+      data21 = *(ao11 + 0);
+      data22 = *(ao11 + 1);
+      data23 = *(ao12 + 0);
+      data24 = *(ao12 + 1);
+      data25 = *(ao13 + 0);
+      data26 = *(ao13 + 1);
+      data27 = *(ao14 + 0);
+      data28 = *(ao14 + 1);
+      data29 = *(ao15 + 0);
+      data30 = *(ao15 + 1);
+      data31 = *(ao16 + 0);
+      data32 = *(ao16 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+      if (offset >  -4) ao5 += lda; else ao5 += 2;
+      if (offset >  -5) ao6 += lda; else ao6 += 2;
+      if (offset >  -6) ao7 += lda; else ao7 += 2;
+      if (offset >  -7) ao8 += lda; else ao8 += 2;
+	  if (offset >  -8) ao9 += lda; else ao9 += 2;
+      if (offset >  -9) ao10 += lda; else ao10 += 2;
+      if (offset >  -10) ao11 += lda; else ao11 += 2;
+      if (offset >  -11) ao12 += lda; else ao12 += 2;
+      if (offset >  -12) ao13 += lda; else ao13 += 2;
+      if (offset >  -13) ao14 += lda; else ao14 += 2;
+      if (offset >  -14) ao15 += lda; else ao15 += 2;
+      if (offset >  -15) ao16 += lda; else ao16 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+	b[ 2] = data03;
+	b[ 3] = data04;
+	b[ 4] = data05;
+	b[ 5] = data06;
+	b[ 6] = data07;
+	b[ 7] = data08;
+	b[ 8] = data09;
+	b[ 9] = data10;
+	b[10] = data11;
+	b[11] = data12;
+	b[12] = data13;
+	b[13] = data14;
+	b[14] = data15;
+	b[15] = data16;
+	b[16] = data17;
+	b[17] = data18;
+	b[18] = data19;
+	b[19] = data20;
+	b[20] = data21;
+	b[21] = data22;
+	b[22] = data23;
+	b[23] = data24;
+	b[24] = data25;
+	b[25] = data26;
+	b[26] = data27;
+	b[27] = data28;
+	b[28] = data29;
+	b[29] = data30;
+	b[30] = data31;
+	b[31] = data32;
+      } else
+	if (offset < -15) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	  b[ 2] = data03;
+	  b[ 3] = -data04;
+	  b[ 4] = data05;
+	  b[ 5] = -data06;
+	  b[ 6] = data07;
+	  b[ 7] = -data08;
+	  b[ 8] = data09;
+	  b[ 9] = -data10;
+	  b[10] = data11;
+	  b[11] = -data12;
+	  b[12] = data13;
+	  b[13] = -data14;
+	  b[14] = data15;
+	  b[15] = -data16;
+	b[16] = data17;
+	b[17] = -data18;
+	b[18] = data19;
+	b[19] = -data20;
+	b[20] = data21;
+	b[21] = -data22;
+	b[22] = data23;
+	b[23] = -data24;
+	b[24] = data25;
+	b[25] = -data26;
+	b[26] = data27;
+	b[27] = -data28;
+	b[28] = data29;
+	b[29] = -data30;
+	b[30] = data31;
+	b[31] = -data32;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -4 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = ZERO;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -5 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = ZERO;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -6 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = ZERO;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -7 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = ZERO;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -8 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = ZERO;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -9 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = ZERO;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -10 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = ZERO;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -11 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = ZERO;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -12 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = ZERO;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -13 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = ZERO;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -14 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = ZERO;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -15 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 32;
+
+      offset --;
+      i --;
+    }
+
+    posX += 16;
+    js --;
+  }
+
+  if (n & 8) {
+	offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+    if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
+    if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
+    if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
+    if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+      if (offset >  -4) ao5 += lda; else ao5 += 2;
+      if (offset >  -5) ao6 += lda; else ao6 += 2;
+      if (offset >  -6) ao7 += lda; else ao7 += 2;
+      if (offset >  -7) ao8 += lda; else ao8 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+	b[ 2] = data03;
+	b[ 3] = data04;
+	b[ 4] = data05;
+	b[ 5] = data06;
+	b[ 6] = data07;
+	b[ 7] = data08;
+	b[ 8] = data09;
+	b[ 9] = data10;
+	b[10] = data11;
+	b[11] = data12;
+	b[12] = data13;
+	b[13] = data14;
+	b[14] = data15;
+	b[15] = data16;
+      } else
+	if (offset < -7) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	  b[ 2] = data03;
+	  b[ 3] = -data04;
+	  b[ 4] = data05;
+	  b[ 5] = -data06;
+	  b[ 6] = data07;
+	  b[ 7] = -data08;
+	  b[ 8] = data09;
+	  b[ 9] = -data10;
+	  b[10] = data11;
+	  b[11] = -data12;
+	  b[12] = data13;
+	  b[13] = -data14;
+	  b[14] = data15;
+	  b[15] = -data16;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -4 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = ZERO;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -5 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = ZERO;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -6 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = ZERO;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -7 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 16;
+
+      offset --;
+      i --;
+    }
+
+    posX += 8;
+  }
+
+  if (n & 4) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+	b[ 2] = data03;
+	b[ 3] = data04;
+	b[ 4] = data05;
+	b[ 5] = data06;
+	b[ 6] = data07;
+	b[ 7] = data08;
+      } else
+	if (offset < -3) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	  b[ 2] = data03;
+	  b[ 3] = -data04;
+	  b[ 4] = data05;
+	  b[ 5] = -data06;
+	  b[ 6] = data07;
+	  b[ 7] = -data08;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 8;
+
+      offset --;
+      i --;
+    }
+
+    posX += 4;
+  }
+
+  if (n & 2) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+	b[ 2] = data03;
+	b[ 3] = data04;
+      } else
+	if (offset < -1) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	  b[ 2] = data03;
+	  b[ 3] = -data04;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 4;
+
+      offset --;
+      i --;
+    }
+
+    posX += 2;
+
+  }
+
+  if (n & 1) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+      } else
+	if (offset < 0) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	} else {
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	}
+
+      b += 2;
+
+      offset --;
+      i --;
+    }
+
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zhemm_utcopy_16.c b/kernel/generic/zhemm_utcopy_16.c
new file mode 100644
index 000000000..822483a83
--- /dev/null
+++ b/kernel/generic/zhemm_utcopy_16.c
@@ -0,0 +1,1168 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js, offset;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
+  FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
+
+  FLOAT *ao1, *ao2,  *ao3,  *ao4,  *ao5,  *ao6,  *ao7,  *ao8;
+  FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16;
+
+  lda *= 2;
+
+  js = (n >> 4);
+  while (js > 0){
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+    if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
+    if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
+    if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
+    if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
+	if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda;
+    if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda;
+    if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda;
+    if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda;
+    if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda;
+    if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda;
+    if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda;
+    if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+	  data17 = *(ao9 + 0);
+      data18 = *(ao9 + 1);
+      data19 = *(ao10 + 0);
+      data20 = *(ao10 + 1);
+      data21 = *(ao11 + 0);
+      data22 = *(ao11 + 1);
+      data23 = *(ao12 + 0);
+      data24 = *(ao12 + 1);
+      data25 = *(ao13 + 0);
+      data26 = *(ao13 + 1);
+      data27 = *(ao14 + 0);
+      data28 = *(ao14 + 1);
+      data29 = *(ao15 + 0);
+      data30 = *(ao15 + 1);
+      data31 = *(ao16 + 0);
+      data32 = *(ao16 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+      if (offset >  -4) ao5 += 2; else ao5 += lda;
+      if (offset >  -5) ao6 += 2; else ao6 += lda;
+      if (offset >  -6) ao7 += 2; else ao7 += lda;
+      if (offset >  -7) ao8 += 2; else ao8 += lda;
+	  if (offset >  -8) ao9 += 2; else ao9 += lda;
+      if (offset >  -9) ao10 += 2; else ao10 += lda;
+      if (offset >  -10) ao11 += 2; else ao11 += lda;
+      if (offset >  -11) ao12 += 2; else ao12 += lda;
+      if (offset >  -12) ao13 += 2; else ao13 += lda;
+      if (offset >  -13) ao14 += 2; else ao14 += lda;
+      if (offset >  -14) ao15 += 2; else ao15 += lda;
+      if (offset >  -15) ao16 += 2; else ao16 += lda;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+	b[ 2] = data03;
+	b[ 3] = -data04;
+	b[ 4] = data05;
+	b[ 5] = -data06;
+	b[ 6] = data07;
+	b[ 7] = -data08;
+	b[ 8] = data09;
+	b[ 9] = -data10;
+	b[10] = data11;
+	b[11] = -data12;
+	b[12] = data13;
+	b[13] = -data14;
+	b[14] = data15;
+	b[15] = -data16;
+	b[16] = data17;
+	b[17] = -data18;
+	b[18] = data19;
+	b[19] = -data20;
+	b[20] = data21;
+	b[21] = -data22;
+	b[22] = data23;
+	b[23] = -data24;
+	b[24] = data25;
+	b[25] = -data26;
+	b[26] = data27;
+	b[27] = -data28;
+	b[28] = data29;
+	b[29] = -data30;
+	b[30] = data31;
+	b[31] = -data32;
+      } else
+	if (offset < -15) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	  b[ 2] = data03;
+	  b[ 3] = data04;
+	  b[ 4] = data05;
+	  b[ 5] = data06;
+	  b[ 6] = data07;
+	  b[ 7] = data08;
+	  b[ 8] = data09;
+	  b[ 9] = data10;
+	  b[10] = data11;
+	  b[11] = data12;
+	  b[12] = data13;
+	  b[13] = data14;
+	  b[14] = data15;
+	  b[15] = data16;
+	  b[16] = data17;
+	b[17] = data18;
+	b[18] = data19;
+	b[19] = data20;
+	b[20] = data21;
+	b[21] = data22;
+	b[22] = data23;
+	b[23] = data24;
+	b[24] = data25;
+	b[25] = data26;
+	b[26] = data27;
+	b[27] = data28;
+	b[28] = data29;
+	b[29] = data30;
+	b[30] = data31;
+	b[31] = data32;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -4 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = ZERO;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -5 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = ZERO;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -6 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = ZERO;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -7 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = ZERO;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -8 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = ZERO;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -9 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = ZERO;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -10 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = ZERO;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -11 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = ZERO;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -12 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = ZERO;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -13 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = ZERO;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -14 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = ZERO;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -15 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 32;
+
+      offset --;
+      i --;
+    }
+
+    posX += 16;
+    js --;
+  }
+
+  if (n & 8) {
+	offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+    if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
+    if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
+    if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
+    if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
+
+    i     = m;
+
+	while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+
+	  if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+      if (offset >  -4) ao5 += 2; else ao5 += lda;
+      if (offset >  -5) ao6 += 2; else ao6 += lda;
+      if (offset >  -6) ao7 += 2; else ao7 += lda;
+      if (offset >  -7) ao8 += 2; else ao8 += lda;
+
+	  if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+	b[ 2] = data03;
+	b[ 3] = -data04;
+	b[ 4] = data05;
+	b[ 5] = -data06;
+	b[ 6] = data07;
+	b[ 7] = -data08;
+	b[ 8] = data09;
+	b[ 9] = -data10;
+	b[10] = data11;
+	b[11] = -data12;
+	b[12] = data13;
+	b[13] = -data14;
+	b[14] = data15;
+	b[15] = -data16;
+      } else
+	if (offset < -7) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	  b[ 2] = data03;
+	  b[ 3] = data04;
+	  b[ 4] = data05;
+	  b[ 5] = data06;
+	  b[ 6] = data07;
+	  b[ 7] = data08;
+	  b[ 8] = data09;
+	  b[ 9] = data10;
+	  b[10] = data11;
+	  b[11] = data12;
+	  b[12] = data13;
+	  b[13] = data14;
+	  b[14] = data15;
+	  b[15] = data16;
+	} else {
+	switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -4 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = ZERO;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -5 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = ZERO;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -6 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = ZERO;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -7 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = ZERO;
+	    break;
+		}
+	}
+
+      b += 16;
+
+      offset --;
+      i --;
+    }
+
+    posX += 8;
+  }
+
+  if (n & 4) {
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+	b[ 2] = data03;
+	b[ 3] = -data04;
+	b[ 4] = data05;
+	b[ 5] = -data06;
+	b[ 6] = data07;
+	b[ 7] = -data08;
+      } else
+	if (offset < -3) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	  b[ 2] = data03;
+	  b[ 3] = data04;
+	  b[ 4] = data05;
+	  b[ 5] = data06;
+	  b[ 6] = data07;
+	  b[ 7] = data08;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 8;
+
+      offset --;
+      i --;
+    }
+
+    posX += 4;
+  }
+
+  if (n & 2) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+	b[ 2] = data03;
+	b[ 3] = -data04;
+      } else
+	if (offset < -1) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	  b[ 2] = data03;
+	  b[ 3] = data04;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 4;
+
+      offset --;
+      i --;
+    }
+
+    posX += 2;
+  }
+
+  if (n & 1) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+      } else
+	if (offset < 0) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	} else {
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	}
+
+      b += 2;
+
+      offset --;
+      i --;
+    }
+
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zneg_tcopy_16.c b/kernel/generic/zneg_tcopy_16.c
new file mode 100644
index 000000000..50f5a3d37
--- /dev/null
+++ b/kernel/generic/zneg_tcopy_16.c
@@ -0,0 +1,587 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+
+  BLASLONG i, j;
+
+  FLOAT *aoffset;
+  FLOAT *aoffset1, *aoffset2;
+
+  FLOAT *boffset;
+
+  FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+  FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
+  FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+  FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
+  FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
+  FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
+  FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
+
+  FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
+  FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
+  FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
+  FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
+  FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
+  FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
+  FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
+  FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
+
+  aoffset   = a;
+  boffset   = b;
+  lda *= 2;
+
+#if 0
+  fprintf(stderr, "M = %d N = %d\n", m, n);
+#endif
+
+  j = (n >> 4);
+  if (j > 0){
+    do{
+      aoffset1  = aoffset;
+      aoffset2  = aoffset + lda;
+      aoffset += 32;
+
+      i = (m >> 1);
+      if (i > 0){
+	do{
+	  ctemp01 = *(aoffset1 +  0);
+	  ctemp02 = *(aoffset1 +  1);
+	  ctemp03 = *(aoffset1 +  2);
+	  ctemp04 = *(aoffset1 +  3);
+	  ctemp05 = *(aoffset1 +  4);
+	  ctemp06 = *(aoffset1 +  5);
+	  ctemp07 = *(aoffset1 +  6);
+	  ctemp08 = *(aoffset1 +  7);
+	  ctemp09 = *(aoffset1 +  8);
+	  ctemp10 = *(aoffset1 +  9);
+	  ctemp11 = *(aoffset1 + 10);
+	  ctemp12 = *(aoffset1 + 11);
+	  ctemp13 = *(aoffset1 + 12);
+	  ctemp14 = *(aoffset1 + 13);
+	  ctemp15 = *(aoffset1 + 14);
+	  ctemp16 = *(aoffset1 + 15);
+	  ctemp17 = *(aoffset1 + 16);
+	  ctemp18 = *(aoffset1 + 17);
+	  ctemp19 = *(aoffset1 + 18);
+	  ctemp20 = *(aoffset1 + 19);
+	  ctemp21 = *(aoffset1 + 20);
+	  ctemp22 = *(aoffset1 + 21);
+	  ctemp23 = *(aoffset1 + 22);
+	  ctemp24 = *(aoffset1 + 23);
+	  ctemp25 = *(aoffset1 + 24);
+	  ctemp26 = *(aoffset1 + 25);
+	  ctemp27 = *(aoffset1 + 26);
+	  ctemp28 = *(aoffset1 + 27);
+	  ctemp29 = *(aoffset1 + 28);
+	  ctemp30 = *(aoffset1 + 29);
+	  ctemp31 = *(aoffset1 + 30);
+	  ctemp32 = *(aoffset1 + 31);
+
+	  ctemp33 = *(aoffset2 +  0);
+	  ctemp34 = *(aoffset2 +  1);
+	  ctemp35 = *(aoffset2 +  2);
+	  ctemp36 = *(aoffset2 +  3);
+	  ctemp37 = *(aoffset2 +  4);
+	  ctemp38 = *(aoffset2 +  5);
+	  ctemp39 = *(aoffset2 +  6);
+	  ctemp40 = *(aoffset2 +  7);
+	  ctemp41 = *(aoffset2 +  8);
+	  ctemp42 = *(aoffset2 +  9);
+	  ctemp43 = *(aoffset2 + 10);
+	  ctemp44 = *(aoffset2 + 11);
+	  ctemp45 = *(aoffset2 + 12);
+	  ctemp46 = *(aoffset2 + 13);
+	  ctemp47 = *(aoffset2 + 14);
+	  ctemp48 = *(aoffset2 + 15);
+	  ctemp49 = *(aoffset2 + 16);
+	  ctemp50 = *(aoffset2 + 17);
+	  ctemp51 = *(aoffset2 + 18);
+	  ctemp52 = *(aoffset2 + 19);
+	  ctemp53 = *(aoffset2 + 20);
+	  ctemp54 = *(aoffset2 + 21);
+	  ctemp55 = *(aoffset2 + 22);
+	  ctemp56 = *(aoffset2 + 23);
+	  ctemp57 = *(aoffset2 + 24);
+	  ctemp58 = *(aoffset2 + 25);
+	  ctemp59 = *(aoffset2 + 26);
+	  ctemp60 = *(aoffset2 + 27);
+	  ctemp61 = *(aoffset2 + 28);
+	  ctemp62 = *(aoffset2 + 29);
+	  ctemp63 = *(aoffset2 + 30);
+	  ctemp64 = *(aoffset2 + 31);
+
+	  *(boffset +  0) = -ctemp01;
+	  *(boffset +  1) = -ctemp02;
+	  *(boffset +  2) = -ctemp03;
+	  *(boffset +  3) = -ctemp04;
+	  *(boffset +  4) = -ctemp05;
+	  *(boffset +  5) = -ctemp06;
+	  *(boffset +  6) = -ctemp07;
+	  *(boffset +  7) = -ctemp08;
+
+	  *(boffset +  8) = -ctemp09;
+	  *(boffset +  9) = -ctemp10;
+	  *(boffset + 10) = -ctemp11;
+	  *(boffset + 11) = -ctemp12;
+	  *(boffset + 12) = -ctemp13;
+	  *(boffset + 13) = -ctemp14;
+	  *(boffset + 14) = -ctemp15;
+	  *(boffset + 15) = -ctemp16;
+
+	  *(boffset + 16) = -ctemp17;
+	  *(boffset + 17) = -ctemp18;
+	  *(boffset + 18) = -ctemp19;
+	  *(boffset + 19) = -ctemp20;
+	  *(boffset + 20) = -ctemp21;
+	  *(boffset + 21) = -ctemp22;
+	  *(boffset + 22) = -ctemp23;
+	  *(boffset + 23) = -ctemp24;
+
+	  *(boffset + 24) = -ctemp25;
+	  *(boffset + 25) = -ctemp26;
+	  *(boffset + 26) = -ctemp27;
+	  *(boffset + 27) = -ctemp28;
+	  *(boffset + 28) = -ctemp29;
+	  *(boffset + 29) = -ctemp30;
+	  *(boffset + 30) = -ctemp31;
+	  *(boffset + 31) = -ctemp32;
+
+	  *(boffset + 32) = -ctemp33;
+	  *(boffset + 33) = -ctemp34;
+	  *(boffset + 34) = -ctemp35;
+	  *(boffset + 35) = -ctemp36;
+	  *(boffset + 36) = -ctemp37;
+	  *(boffset + 37) = -ctemp38;
+	  *(boffset + 38) = -ctemp39;
+	  *(boffset + 39) = -ctemp40;
+
+	  *(boffset + 40) = -ctemp41;
+	  *(boffset + 41) = -ctemp42;
+	  *(boffset + 42) = -ctemp43;
+	  *(boffset + 43) = -ctemp44;
+	  *(boffset + 44) = -ctemp45;
+	  *(boffset + 45) = -ctemp46;
+	  *(boffset + 46) = -ctemp47;
+	  *(boffset + 47) = -ctemp48;
+
+	  *(boffset + 48) = -ctemp49;
+	  *(boffset + 49) = -ctemp50;
+	  *(boffset + 50) = -ctemp51;
+	  *(boffset + 51) = -ctemp52;
+	  *(boffset + 52) = -ctemp53;
+	  *(boffset + 53) = -ctemp54;
+	  *(boffset + 54) = -ctemp55;
+	  *(boffset + 55) = -ctemp56;
+
+	  *(boffset + 56) = -ctemp57;
+	  *(boffset + 57) = -ctemp58;
+	  *(boffset + 58) = -ctemp59;
+	  *(boffset + 59) = -ctemp60;
+	  *(boffset + 60) = -ctemp61;
+	  *(boffset + 61) = -ctemp62;
+	  *(boffset + 62) = -ctemp63;
+	  *(boffset + 63) = -ctemp64;
+
+	  aoffset1 +=  2 * lda;
+	  aoffset2 +=  2 * lda;
+	  boffset   += 64;
+
+	  i --;
+	}while(i > 0);
+      }
+
+      if (m & 1){
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset1 +  2);
+	ctemp04 = *(aoffset1 +  3);
+	ctemp05 = *(aoffset1 +  4);
+	ctemp06 = *(aoffset1 +  5);
+	ctemp07 = *(aoffset1 +  6);
+	ctemp08 = *(aoffset1 +  7);
+	ctemp09 = *(aoffset1 +  8);
+	ctemp10 = *(aoffset1 +  9);
+	ctemp11 = *(aoffset1 + 10);
+	ctemp12 = *(aoffset1 + 11);
+	ctemp13 = *(aoffset1 + 12);
+	ctemp14 = *(aoffset1 + 13);
+	ctemp15 = *(aoffset1 + 14);
+	ctemp16 = *(aoffset1 + 15);
+	ctemp17 = *(aoffset1 + 16);
+	ctemp18 = *(aoffset1 + 17);
+	ctemp19 = *(aoffset1 + 18);
+	ctemp20 = *(aoffset1 + 19);
+	ctemp21 = *(aoffset1 + 20);
+	ctemp22 = *(aoffset1 + 21);
+	ctemp23 = *(aoffset1 + 22);
+	ctemp24 = *(aoffset1 + 23);
+	ctemp25 = *(aoffset1 + 24);
+	ctemp26 = *(aoffset1 + 25);
+	ctemp27 = *(aoffset1 + 26);
+	ctemp28 = *(aoffset1 + 27);
+	ctemp29 = *(aoffset1 + 28);
+	ctemp30 = *(aoffset1 + 29);
+	ctemp31 = *(aoffset1 + 30);
+	ctemp32 = *(aoffset1 + 31);
+
+	*(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+	*(boffset +  4) = -ctemp05;
+	*(boffset +  5) = -ctemp06;
+	*(boffset +  6) = -ctemp07;
+	*(boffset +  7) = -ctemp08;
+
+	*(boffset +  8) = -ctemp09;
+	*(boffset +  9) = -ctemp10;
+	*(boffset + 10) = -ctemp11;
+	*(boffset + 11) = -ctemp12;
+	*(boffset + 12) = -ctemp13;
+	*(boffset + 13) = -ctemp14;
+	*(boffset + 14) = -ctemp15;
+	*(boffset + 15) = -ctemp16;
+
+	*(boffset + 16) = -ctemp17;
+	*(boffset + 17) = -ctemp18;
+	*(boffset + 18) = -ctemp19;
+	*(boffset + 19) = -ctemp20;
+	*(boffset + 20) = -ctemp21;
+	*(boffset + 21) = -ctemp22;
+	*(boffset + 22) = -ctemp23;
+	*(boffset + 23) = -ctemp24;
+
+	*(boffset + 24) = -ctemp25;
+	*(boffset + 25) = -ctemp26;
+	*(boffset + 26) = -ctemp27;
+	*(boffset + 27) = -ctemp28;
+	*(boffset + 28) = -ctemp29;
+	*(boffset + 29) = -ctemp30;
+	*(boffset + 30) = -ctemp31;
+	*(boffset + 31) = -ctemp32;
+
+	boffset   += 32;
+      }
+
+      j--;
+    }while(j > 0);
+  } /* end of if(j > 0) */
+
+  if (n & 8){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset + lda;
+    aoffset += 16;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	  ctemp01 = *(aoffset1 +  0);
+	  ctemp02 = *(aoffset1 +  1);
+	  ctemp03 = *(aoffset1 +  2);
+	  ctemp04 = *(aoffset1 +  3);
+	  ctemp05 = *(aoffset1 +  4);
+	  ctemp06 = *(aoffset1 +  5);
+	  ctemp07 = *(aoffset1 +  6);
+	  ctemp08 = *(aoffset1 +  7);
+	  ctemp09 = *(aoffset1 +  8);
+	  ctemp10 = *(aoffset1 +  9);
+	  ctemp11 = *(aoffset1 + 10);
+	  ctemp12 = *(aoffset1 + 11);
+	  ctemp13 = *(aoffset1 + 12);
+	  ctemp14 = *(aoffset1 + 13);
+	  ctemp15 = *(aoffset1 + 14);
+	  ctemp16 = *(aoffset1 + 15);
+
+	  ctemp17 = *(aoffset2 +  0);
+	  ctemp18 = *(aoffset2 +  1);
+	  ctemp19 = *(aoffset2 +  2);
+	  ctemp20 = *(aoffset2 +  3);
+	  ctemp21 = *(aoffset2 +  4);
+	  ctemp22 = *(aoffset2 +  5);
+	  ctemp23 = *(aoffset2 +  6);
+	  ctemp24 = *(aoffset2 +  7);
+	  ctemp25 = *(aoffset2 +  8);
+	  ctemp26 = *(aoffset2 +  9);
+	  ctemp27 = *(aoffset2 + 10);
+	  ctemp28 = *(aoffset2 + 11);
+	  ctemp29 = *(aoffset2 + 12);
+	  ctemp30 = *(aoffset2 + 13);
+	  ctemp31 = *(aoffset2 + 14);
+	  ctemp32 = *(aoffset2 + 15);
+
+	  *(boffset +  0) = -ctemp01;
+	  *(boffset +  1) = -ctemp02;
+	  *(boffset +  2) = -ctemp03;
+	  *(boffset +  3) = -ctemp04;
+	  *(boffset +  4) = -ctemp05;
+	  *(boffset +  5) = -ctemp06;
+	  *(boffset +  6) = -ctemp07;
+	  *(boffset +  7) = -ctemp08;
+
+	  *(boffset +  8) = -ctemp09;
+	  *(boffset +  9) = -ctemp10;
+	  *(boffset + 10) = -ctemp11;
+	  *(boffset + 11) = -ctemp12;
+	  *(boffset + 12) = -ctemp13;
+	  *(boffset + 13) = -ctemp14;
+	  *(boffset + 14) = -ctemp15;
+	  *(boffset + 15) = -ctemp16;
+
+	  *(boffset + 16) = -ctemp17;
+	  *(boffset + 17) = -ctemp18;
+	  *(boffset + 18) = -ctemp19;
+	  *(boffset + 19) = -ctemp20;
+	  *(boffset + 20) = -ctemp21;
+	  *(boffset + 21) = -ctemp22;
+	  *(boffset + 22) = -ctemp23;
+	  *(boffset + 23) = -ctemp24;
+
+	  *(boffset + 24) = -ctemp25;
+	  *(boffset + 25) = -ctemp26;
+	  *(boffset + 26) = -ctemp27;
+	  *(boffset + 27) = -ctemp28;
+	  *(boffset + 28) = -ctemp29;
+	  *(boffset + 29) = -ctemp30;
+	  *(boffset + 30) = -ctemp31;
+	  *(boffset + 31) = -ctemp32;
+
+	aoffset1 +=  2 * lda;
+	aoffset2 +=  2 * lda;
+	boffset   += 32;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+    ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset1 +  2);
+	ctemp04 = *(aoffset1 +  3);
+	ctemp05 = *(aoffset1 +  4);
+	ctemp06 = *(aoffset1 +  5);
+	ctemp07 = *(aoffset1 +  6);
+	ctemp08 = *(aoffset1 +  7);
+	ctemp09 = *(aoffset1 +  8);
+	ctemp10 = *(aoffset1 +  9);
+	ctemp11 = *(aoffset1 + 10);
+	ctemp12 = *(aoffset1 + 11);
+	ctemp13 = *(aoffset1 + 12);
+	ctemp14 = *(aoffset1 + 13);
+	ctemp15 = *(aoffset1 + 14);
+	ctemp16 = *(aoffset1 + 15);
+
+    *(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+	*(boffset +  4) = -ctemp05;
+	*(boffset +  5) = -ctemp06;
+	*(boffset +  6) = -ctemp07;
+	*(boffset +  7) = -ctemp08;
+
+	*(boffset +  8) = -ctemp09;
+	*(boffset +  9) = -ctemp10;
+	*(boffset + 10) = -ctemp11;
+	*(boffset + 11) = -ctemp12;
+	*(boffset + 12) = -ctemp13;
+	*(boffset + 13) = -ctemp14;
+	*(boffset + 14) = -ctemp15;
+	*(boffset + 15) = -ctemp16;
+
+      boffset   += 16;
+    }
+  }
+
+  if (n & 4){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset + lda;
+    aoffset += 8;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset1 +  2);
+	ctemp04 = *(aoffset1 +  3);
+	ctemp05 = *(aoffset1 +  4);
+	ctemp06 = *(aoffset1 +  5);
+	ctemp07 = *(aoffset1 +  6);
+	ctemp08 = *(aoffset1 +  7);
+
+	ctemp09 = *(aoffset2 +  0);
+	ctemp10 = *(aoffset2 +  1);
+	ctemp11 = *(aoffset2 +  2);
+	ctemp12 = *(aoffset2 +  3);
+	ctemp13 = *(aoffset2 +  4);
+	ctemp14 = *(aoffset2 +  5);
+	ctemp15 = *(aoffset2 +  6);
+	ctemp16 = *(aoffset2 +  7);
+
+	*(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+	*(boffset +  4) = -ctemp05;
+	*(boffset +  5) = -ctemp06;
+	*(boffset +  6) = -ctemp07;
+	*(boffset +  7) = -ctemp08;
+
+	*(boffset +  8) = -ctemp09;
+	*(boffset +  9) = -ctemp10;
+	*(boffset + 10) = -ctemp11;
+	*(boffset + 11) = -ctemp12;
+	*(boffset + 12) = -ctemp13;
+	*(boffset + 13) = -ctemp14;
+	*(boffset + 14) = -ctemp15;
+	*(boffset + 15) = -ctemp16;
+
+	aoffset1 +=  2 * lda;
+	aoffset2 +=  2 * lda;
+	boffset   += 16;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+      ctemp02 = *(aoffset1 +  1);
+      ctemp03 = *(aoffset1 +  2);
+      ctemp04 = *(aoffset1 +  3);
+      ctemp05 = *(aoffset1 +  4);
+      ctemp06 = *(aoffset1 +  5);
+      ctemp07 = *(aoffset1 +  6);
+      ctemp08 = *(aoffset1 +  7);
+
+      *(boffset +  0) = -ctemp01;
+      *(boffset +  1) = -ctemp02;
+      *(boffset +  2) = -ctemp03;
+      *(boffset +  3) = -ctemp04;
+      *(boffset +  4) = -ctemp05;
+      *(boffset +  5) = -ctemp06;
+      *(boffset +  6) = -ctemp07;
+      *(boffset +  7) = -ctemp08;
+
+      boffset   += 8;
+    }
+  }
+
+  if (n & 2){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset + lda;
+    aoffset += 4;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset1 +  2);
+	ctemp04 = *(aoffset1 +  3);
+
+	ctemp05 = *(aoffset2 +  0);
+	ctemp06 = *(aoffset2 +  1);
+	ctemp07 = *(aoffset2 +  2);
+	ctemp08 = *(aoffset2 +  3);
+
+	*(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+	*(boffset +  4) = -ctemp05;
+	*(boffset +  5) = -ctemp06;
+	*(boffset +  6) = -ctemp07;
+	*(boffset +  7) = -ctemp08;
+
+	aoffset1 +=  2 * lda;
+	aoffset2 +=  2 * lda;
+	boffset   += 8;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+      ctemp02 = *(aoffset1 +  1);
+      ctemp03 = *(aoffset1 +  2);
+      ctemp04 = *(aoffset1 +  3);
+
+      *(boffset +  0) = -ctemp01;
+      *(boffset +  1) = -ctemp02;
+      *(boffset +  2) = -ctemp03;
+      *(boffset +  3) = -ctemp04;
+
+      boffset   += 4;
+    }
+  }
+
+  if (n & 1){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset + lda;
+    // aoffset += 2;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset2 +  0);
+	ctemp04 = *(aoffset2 +  1);
+
+	*(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+
+	aoffset1 +=  2 * lda;
+	aoffset2 +=  2 * lda;
+	boffset   += 4;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+      ctemp02 = *(aoffset1 +  1);
+
+      *(boffset +  0) = -ctemp01;
+      *(boffset +  1) = -ctemp02;
+      // boffset   += 2;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zsymm_lcopy_16.c b/kernel/generic/zsymm_lcopy_16.c
new file mode 100644
index 000000000..b32374a5e
--- /dev/null
+++ b/kernel/generic/zsymm_lcopy_16.c
@@ -0,0 +1,333 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js, offset;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
+  FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
+
+  FLOAT *ao1, *ao2,  *ao3,  *ao4,  *ao5,  *ao6,  *ao7,  *ao8;
+  FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16;
+
+  lda *= 2;
+
+  js = (n >> 4);
+  while (js > 0){
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+    if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
+    if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
+    if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
+    if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
+    if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda;
+    if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda;
+    if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda;
+    if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda;
+    if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda;
+    if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda;
+    if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda;
+    if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda;
+
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+      data17 = *(ao9 + 0);
+      data18 = *(ao9 + 1);
+      data19 = *(ao10 + 0);
+      data20 = *(ao10 + 1);
+      data21 = *(ao11 + 0);
+      data22 = *(ao11 + 1);
+      data23 = *(ao12 + 0);
+      data24 = *(ao12 + 1);
+      data25 = *(ao13 + 0);
+      data26 = *(ao13 + 1);
+      data27 = *(ao14 + 0);
+      data28 = *(ao14 + 1);
+      data29 = *(ao15 + 0);
+      data30 = *(ao15 + 1);
+      data31 = *(ao16 + 0);
+      data32 = *(ao16 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+      if (offset >  -4) ao5 += lda; else ao5 += 2;
+      if (offset >  -5) ao6 += lda; else ao6 += 2;
+      if (offset >  -6) ao7 += lda; else ao7 += 2;
+      if (offset >  -7) ao8 += lda; else ao8 += 2;
+      if (offset >  -8) ao9 += lda; else ao9 += 2;
+      if (offset >  -9) ao10 += lda; else ao10 += 2;
+      if (offset >  -10) ao11 += lda; else ao11 += 2;
+      if (offset >  -11) ao12 += lda; else ao12 += 2;
+      if (offset >  -12) ao13 += lda; else ao13 += 2;
+      if (offset >  -13) ao14 += lda; else ao14 += 2;
+      if (offset >  -14) ao15 += lda; else ao15 += 2;
+      if (offset >  -15) ao16 += lda; else ao16 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+      b[ 8] = data09;
+      b[ 9] = data10;
+      b[10] = data11;
+      b[11] = data12;
+      b[12] = data13;
+      b[13] = data14;
+      b[14] = data15;
+      b[15] = data16;
+      b[16] = data17;
+      b[17] = data18;
+      b[18] = data19;
+      b[19] = data20;
+      b[20] = data21;
+      b[21] = data22;
+      b[22] = data23;
+      b[23] = data24;
+      b[24] = data25;
+      b[25] = data26;
+      b[26] = data27;
+      b[27] = data28;
+      b[28] = data29;
+      b[29] = data30;
+      b[30] = data31;
+      b[31] = data32;
+
+      b += 32;
+
+      offset --;
+      i --;
+    }
+
+    posX += 16;
+    js --;
+  }
+
+  if (n & 8) {
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+    if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
+    if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
+    if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
+    if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+      if (offset >  -4) ao5 += lda; else ao5 += 2;
+      if (offset >  -5) ao6 += lda; else ao6 += 2;
+      if (offset >  -6) ao7 += lda; else ao7 += 2;
+      if (offset >  -7) ao8 += lda; else ao8 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+      b[ 8] = data09;
+      b[ 9] = data10;
+      b[10] = data11;
+      b[11] = data12;
+      b[12] = data13;
+      b[13] = data14;
+      b[14] = data15;
+      b[15] = data16;
+
+      b += 16;
+
+      offset --;
+      i --;
+    }
+
+    posX += 8;
+  }
+
+  if (n & 4) {
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+
+      b += 8;
+
+      offset --;
+      i --;
+    }
+
+    posX += 4;
+  }
+
+  if (n & 2) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+
+      b += 4;
+
+      offset --;
+      i --;
+    }
+
+    posX += 2;
+
+  }
+
+  if (n & 1) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+
+      b += 2;
+
+      offset --;
+      i --;
+    }
+
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zsymm_ucopy_16.c b/kernel/generic/zsymm_ucopy_16.c
new file mode 100644
index 000000000..cb19bea47
--- /dev/null
+++ b/kernel/generic/zsymm_ucopy_16.c
@@ -0,0 +1,332 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js, offset;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
+  FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
+
+  FLOAT *ao1, *ao2,  *ao3,  *ao4,  *ao5,  *ao6,  *ao7,  *ao8;
+  FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16;
+
+  lda *= 2;
+
+  js = (n >> 4);
+  while (js > 0){
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+    if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
+    if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
+    if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
+    if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
+    if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda;
+    if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda;
+    if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda;
+    if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda;
+    if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda;
+    if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda;
+    if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda;
+    if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+      data17 = *(ao9 + 0);
+      data18 = *(ao9 + 1);
+      data19 = *(ao10 + 0);
+      data20 = *(ao10 + 1);
+      data21 = *(ao11 + 0);
+      data22 = *(ao11 + 1);
+      data23 = *(ao12 + 0);
+      data24 = *(ao12 + 1);
+      data25 = *(ao13 + 0);
+      data26 = *(ao13 + 1);
+      data27 = *(ao14 + 0);
+      data28 = *(ao14 + 1);
+      data29 = *(ao15 + 0);
+      data30 = *(ao15 + 1);
+      data31 = *(ao16 + 0);
+      data32 = *(ao16 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+      if (offset >  -4) ao5 += 2; else ao5 += lda;
+      if (offset >  -5) ao6 += 2; else ao6 += lda;
+      if (offset >  -6) ao7 += 2; else ao7 += lda;
+      if (offset >  -7) ao8 += 2; else ao8 += lda;
+      if (offset >  -8) ao9 += 2; else ao9 += lda;
+      if (offset >  -9) ao10 += 2; else ao10 += lda;
+      if (offset >  -10) ao11 += 2; else ao11 += lda;
+      if (offset >  -11) ao12 += 2; else ao12 += lda;
+      if (offset >  -12) ao13 += 2; else ao13 += lda;
+      if (offset >  -13) ao14 += 2; else ao14 += lda;
+      if (offset >  -14) ao15 += 2; else ao15 += lda;
+      if (offset >  -15) ao16 += 2; else ao16 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+      b[ 8] = data09;
+      b[ 9] = data10;
+      b[10] = data11;
+      b[11] = data12;
+      b[12] = data13;
+      b[13] = data14;
+      b[14] = data15;
+      b[15] = data16;
+      b[16] = data17;
+      b[17] = data18;
+      b[18] = data19;
+      b[19] = data20;
+      b[20] = data21;
+      b[21] = data22;
+      b[22] = data23;
+      b[23] = data24;
+      b[24] = data25;
+      b[25] = data26;
+      b[26] = data27;
+      b[27] = data28;
+      b[28] = data29;
+      b[29] = data30;
+      b[30] = data31;
+      b[31] = data32;
+
+      b += 32;
+
+      offset --;
+      i --;
+    }
+
+    posX += 16;
+    js --;
+  }
+
+  if (n & 8) {
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+    if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
+    if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
+    if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
+    if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+      if (offset >  -4) ao5 += 2; else ao5 += lda;
+      if (offset >  -5) ao6 += 2; else ao6 += lda;
+      if (offset >  -6) ao7 += 2; else ao7 += lda;
+      if (offset >  -7) ao8 += 2; else ao8 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+      b[ 8] = data09;
+      b[ 9] = data10;
+      b[10] = data11;
+      b[11] = data12;
+      b[12] = data13;
+      b[13] = data14;
+      b[14] = data15;
+      b[15] = data16;
+
+      b += 16;
+
+      offset --;
+      i --;
+    }
+
+    posX += 8;
+  }
+
+  if (n & 4) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+
+      b += 8;
+
+      offset --;
+      i --;
+    }
+
+    posX += 4;
+  }
+
+  if (n & 2) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+
+      b += 4;
+
+      offset --;
+      i --;
+    }
+
+    posX += 2;
+  }
+
+  if (n & 1) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+
+      b += 2;
+
+      offset --;
+      i --;
+    }
+
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmm_lncopy_16.c b/kernel/generic/ztrmm_lncopy_16.c
new file mode 100644
index 000000000..d7fb23176
--- /dev/null
+++ b/kernel/generic/ztrmm_lncopy_16.c
@@ -0,0 +1,2310 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X, ii;
+
+  FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08;
+  FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  lda += lda;
+
+  js = (n >> 4);
+
+  if (js > 0){
+    do {
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+	a09 = a + posY * 2 + (posX + 8) * lda;
+	a10 = a + posY * 2 + (posX + 9) * lda;
+	a11 = a + posY * 2 + (posX + 10) * lda;
+	a12 = a + posY * 2 + (posX + 11) * lda;
+	a13 = a + posY * 2 + (posX + 12) * lda;
+	a14 = a + posY * 2 + (posX + 13) * lda;
+	a15 = a + posY * 2 + (posX + 14) * lda;
+	a16 = a + posY * 2 + (posX + 15) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+	a09 = a + posX * 2 + (posY + 8) * lda;
+	a10 = a + posX * 2 + (posY + 9) * lda;
+	a11 = a + posX * 2 + (posY + 10) * lda;
+	a12 = a + posX * 2 + (posY + 11) * lda;
+	a13 = a + posX * 2 + (posY + 12) * lda;
+	a14 = a + posX * 2 + (posY + 13) * lda;
+	a15 = a + posX * 2 + (posY + 14) * lda;
+	a16 = a + posX * 2 + (posY + 15) * lda;
+      }
+
+      i = (m >> 4);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    for (ii = 0; ii < 16; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+
+		  b[ 16] = *(a09 +  0);
+	      b[ 17] = *(a09 +  1);
+	      b[ 18] = *(a10 +  0);
+	      b[ 19] = *(a10 +  1);
+	      b[ 20] = *(a11 +  0);
+	      b[ 21] = *(a11 +  1);
+	      b[ 22] = *(a12 +  0);
+	      b[ 23] = *(a12 +  1);
+
+	      b[ 24] = *(a13 +  0);
+	      b[ 25] = *(a13 +  1);
+	      b[ 26] = *(a14 +  0);
+	      b[ 27] = *(a14 +  1);
+	      b[ 28] = *(a15 +  0);
+	      b[ 29] = *(a15 +  1);
+	      b[ 30] = *(a16 +  0);
+	      b[ 31] = *(a16 +  1);
+
+	      a01 += 2;
+	      a02 += 2;
+	      a03 += 2;
+	      a04 += 2;
+	      a05 += 2;
+	      a06 += 2;
+	      a07 += 2;
+	      a08 += 2;
+		  a09 += 2;
+	      a10 += 2;
+	      a11 += 2;
+	      a12 += 2;
+	      a13 += 2;
+	      a14 += 2;
+	      a15 += 2;
+	      a16 += 2;
+	      b += 32;
+	    }
+	  } else
+	    if (X < posY) {
+	      a01 += 16 * lda;
+	      a02 += 16 * lda;
+	      a03 += 16 * lda;
+	      a04 += 16 * lda;
+	      a05 += 16 * lda;
+	      a06 += 16 * lda;
+	      a07 += 16 * lda;
+	      a08 += 16 * lda;
+		  a09 += 16 * lda;
+	      a10 += 16 * lda;
+	      a11 += 16 * lda;
+	      a12 += 16 * lda;
+	      a13 += 16 * lda;
+	      a14 += 16 * lda;
+	      a15 += 16 * lda;
+	      a16 += 16 * lda;
+
+	      b += 512;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+		  b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+		  b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+
+	      b[ 32] = *(a01 +  2);
+	      b[ 33] = *(a01 +  3);
+#ifdef UNIT
+	      b[ 34] = ONE;
+	      b[ 35] = ZERO;
+#else
+	      b[ 34] = *(a02 +  2);
+	      b[ 35] = *(a02 +  3);
+#endif
+	      b[ 36] = ZERO;
+	      b[ 37] = ZERO;
+	      b[ 38] = ZERO;
+	      b[ 39] = ZERO;
+	      b[ 40] = ZERO;
+	      b[ 41] = ZERO;
+	      b[ 42] = ZERO;
+	      b[ 43] = ZERO;
+	      b[ 44] = ZERO;
+	      b[ 45] = ZERO;
+	      b[ 46] = ZERO;
+	      b[ 47] = ZERO;
+	      b[ 48] = ZERO;
+	      b[ 49] = ZERO;
+	      b[ 50] = ZERO;
+	      b[ 51] = ZERO;
+	      b[ 52] = ZERO;
+	      b[ 53] = ZERO;
+	      b[ 54] = ZERO;
+	      b[ 55] = ZERO;
+	      b[ 56] = ZERO;
+	      b[ 57] = ZERO;
+	      b[ 58] = ZERO;
+	      b[ 59] = ZERO;
+	      b[ 60] = ZERO;
+	      b[ 61] = ZERO;
+	      b[ 62] = ZERO;
+	      b[ 63] = ZERO;
+
+	      b[ 64] = *(a01 +  4);
+	      b[ 65] = *(a01 +  5);
+	      b[ 66] = *(a02 +  4);
+	      b[ 67] = *(a02 +  5);
+#ifdef UNIT
+	      b[ 68] = ONE;
+	      b[ 69] = ZERO;
+#else
+	      b[ 68] = *(a03 +  4);
+	      b[ 69] = *(a03 +  5);
+#endif
+	      b[ 70] = ZERO;
+	      b[ 71] = ZERO;
+	      b[ 72] = ZERO;
+	      b[ 73] = ZERO;
+	      b[ 74] = ZERO;
+	      b[ 75] = ZERO;
+	      b[ 76] = ZERO;
+	      b[ 77] = ZERO;
+	      b[ 78] = ZERO;
+	      b[ 79] = ZERO;
+	      b[ 80] = ZERO;
+	      b[ 81] = ZERO;
+	      b[ 82] = ZERO;
+		  b[ 83] = ZERO;
+	      b[ 84] = ZERO;
+	      b[ 85] = ZERO;
+	      b[ 86] = ZERO;
+	      b[ 87] = ZERO;
+	      b[ 88] = ZERO;
+	      b[ 89] = ZERO;
+	      b[ 90] = ZERO;
+	      b[ 91] = ZERO;
+	      b[ 92] = ZERO;
+	      b[ 93] = ZERO;
+	      b[ 94] = ZERO;
+	      b[ 95] = ZERO;
+
+	      b[ 96] = *(a01 +  6);
+	      b[ 97] = *(a01 +  7);
+	      b[ 98] = *(a02 +  6);
+	      b[ 99] = *(a02 +  7);
+	      b[100] = *(a03 +  6);
+	      b[101] = *(a03 +  7);
+#ifdef UNIT
+	      b[102] = ONE;
+	      b[103] = ZERO;
+#else
+	      b[102] = *(a04 +  6);
+	      b[103] = *(a04 +  7);
+#endif
+	      b[104] = ZERO;
+	      b[105] = ZERO;
+	      b[106] = ZERO;
+	      b[107] = ZERO;
+	      b[108] = ZERO;
+	      b[109] = ZERO;
+	      b[110] = ZERO;
+	      b[111] = ZERO;
+	      b[112] = ZERO;
+	      b[113] = ZERO;
+	      b[114] = ZERO;
+	      b[115] = ZERO;
+		  b[116] = ZERO;
+	      b[117] = ZERO;
+	      b[118] = ZERO;
+	      b[119] = ZERO;
+	      b[120] = ZERO;
+	      b[121] = ZERO;
+	      b[122] = ZERO;
+	      b[123] = ZERO;
+	      b[124] = ZERO;
+	      b[125] = ZERO;
+	      b[126] = ZERO;
+	      b[127] = ZERO;
+
+	      b[128] = *(a01 +  8);
+	      b[129] = *(a01 +  9);
+	      b[130] = *(a02 +  8);
+	      b[131] = *(a02 +  9);
+	      b[132] = *(a03 +  8);
+	      b[133] = *(a03 +  9);
+	      b[134] = *(a04 +  8);
+	      b[135] = *(a04 +  9);
+#ifdef UNIT
+	      b[136] = ONE;
+	      b[137] = ZERO;
+#else
+	      b[136] = *(a05 +  8);
+	      b[137] = *(a05 +  9);
+#endif
+	      b[138] = ZERO;
+	      b[139] = ZERO;
+	      b[140] = ZERO;
+	      b[141] = ZERO;
+	      b[142] = ZERO;
+	      b[143] = ZERO;
+	      b[144] = ZERO;
+	      b[145] = ZERO;
+	      b[146] = ZERO;
+	      b[147] = ZERO;
+	      b[148] = ZERO;
+		  b[149] = ZERO;
+	      b[150] = ZERO;
+	      b[151] = ZERO;
+	      b[152] = ZERO;
+	      b[153] = ZERO;
+	      b[154] = ZERO;
+	      b[155] = ZERO;
+	      b[156] = ZERO;
+	      b[157] = ZERO;
+	      b[158] = ZERO;
+	      b[159] = ZERO;
+
+	      b[160] = *(a01 + 10);
+	      b[161] = *(a01 + 11);
+	      b[162] = *(a02 + 10);
+	      b[163] = *(a02 + 11);
+	      b[164] = *(a03 + 10);
+	      b[165] = *(a03 + 11);
+	      b[166] = *(a04 + 10);
+	      b[167] = *(a04 + 11);
+	      b[168] = *(a05 + 10);
+	      b[169] = *(a05 + 11);
+#ifdef UNIT
+	      b[170] = ONE;
+	      b[171] = ZERO;
+#else
+	      b[170] = *(a06 + 10);
+	      b[171] = *(a06 + 11);
+#endif
+	      b[172] = ZERO;
+	      b[173] = ZERO;
+	      b[174] = ZERO;
+	      b[175] = ZERO;
+	      b[176] = ZERO;
+	      b[177] = ZERO;
+	      b[178] = ZERO;
+	      b[179] = ZERO;
+	      b[180] = ZERO;
+	      b[181] = ZERO;
+		  b[182] = ZERO;
+	      b[183] = ZERO;
+	      b[184] = ZERO;
+	      b[185] = ZERO;
+	      b[186] = ZERO;
+	      b[187] = ZERO;
+	      b[188] = ZERO;
+	      b[189] = ZERO;
+	      b[190] = ZERO;
+	      b[191] = ZERO;
+
+	      b[192] = *(a01 + 12);
+	      b[193] = *(a01 + 13);
+	      b[194] = *(a02 + 12);
+	      b[195] = *(a02 + 13);
+	      b[196] = *(a03 + 12);
+	      b[197] = *(a03 + 13);
+	      b[198] = *(a04 + 12);
+	      b[199] = *(a04 + 13);
+	      b[200] = *(a05 + 12);
+	      b[201] = *(a05 + 13);
+	      b[202] = *(a06 + 12);
+	      b[203] = *(a06 + 13);
+#ifdef UNIT
+	      b[204] = ONE;
+	      b[205] = ZERO;
+#else
+	      b[204] = *(a07 + 12);
+	      b[205] = *(a07 + 13);
+#endif
+	      b[206] = ZERO;
+	      b[207] = ZERO;
+	      b[208] = ZERO;
+	      b[209] = ZERO;
+	      b[210] = ZERO;
+	      b[211] = ZERO;
+	      b[212] = ZERO;
+	      b[213] = ZERO;
+	      b[214] = ZERO;
+		  b[215] = ZERO;
+	      b[216] = ZERO;
+	      b[217] = ZERO;
+	      b[218] = ZERO;
+	      b[219] = ZERO;
+	      b[220] = ZERO;
+	      b[221] = ZERO;
+	      b[222] = ZERO;
+	      b[223] = ZERO;
+
+	      b[224] = *(a01 + 14);
+	      b[225] = *(a01 + 15);
+	      b[226] = *(a02 + 14);
+	      b[227] = *(a02 + 15);
+	      b[228] = *(a03 + 14);
+	      b[229] = *(a03 + 15);
+	      b[230] = *(a04 + 14);
+	      b[231] = *(a04 + 15);
+	      b[232] = *(a05 + 14);
+	      b[233] = *(a05 + 15);
+	      b[234] = *(a06 + 14);
+	      b[235] = *(a06 + 15);
+	      b[236] = *(a07 + 14);
+	      b[237] = *(a07 + 15);
+#ifdef UNIT
+	      b[238] = ONE;
+	      b[239] = ZERO;
+#else
+	      b[238] = *(a08 + 14);
+	      b[239] = *(a08 + 15);
+#endif
+		  b[240] = ZERO;
+	      b[241] = ZERO;
+	      b[242] = ZERO;
+	      b[243] = ZERO;
+	      b[244] = ZERO;
+	      b[245] = ZERO;
+	      b[246] = ZERO;
+	      b[247] = ZERO;
+		  b[248] = ZERO;
+	      b[249] = ZERO;
+	      b[250] = ZERO;
+	      b[251] = ZERO;
+	      b[252] = ZERO;
+	      b[253] = ZERO;
+	      b[254] = ZERO;
+	      b[255] = ZERO;
+
+	      b[256] = *(a01 + 16);
+	      b[257] = *(a01 + 17);
+	      b[258] = *(a02 + 16);
+	      b[259] = *(a02 + 17);
+	      b[260] = *(a03 + 16);
+	      b[261] = *(a03 + 17);
+	      b[262] = *(a04 + 16);
+	      b[263] = *(a04 + 17);
+	      b[264] = *(a05 + 16);
+	      b[265] = *(a05 + 17);
+	      b[266] = *(a06 + 16);
+	      b[267] = *(a06 + 17);
+	      b[268] = *(a07 + 16);
+	      b[269] = *(a07 + 17);
+	      b[270] = *(a08 + 16);
+	      b[271] = *(a08 + 17);
+#ifdef UNIT
+	      b[272] = ONE;
+		  b[273] = ZERO;
+#else
+	      b[272] = *(a09 + 16);
+		  b[273] = *(a09 + 17);
+#endif
+		  b[274] = ZERO;
+	      b[275] = ZERO;
+	      b[276] = ZERO;
+	      b[277] = ZERO;
+	      b[278] = ZERO;
+	      b[279] = ZERO;
+	      b[280] = ZERO;
+		  b[281] = ZERO;
+	      b[282] = ZERO;
+	      b[283] = ZERO;
+	      b[284] = ZERO;
+	      b[285] = ZERO;
+	      b[286] = ZERO;
+	      b[287] = ZERO;
+
+		  b[288] = *(a01 + 18);
+		  b[289] = *(a01 + 19);
+	      b[290] = *(a02 + 18);
+	      b[291] = *(a02 + 19);
+	      b[292] = *(a03 + 18);
+	      b[293] = *(a03 + 19);
+	      b[294] = *(a04 + 18);
+	      b[295] = *(a04 + 19);
+	      b[296] = *(a05 + 18);
+	      b[297] = *(a05 + 19);
+	      b[298] = *(a06 + 18);
+	      b[299] = *(a06 + 19);
+	      b[300] = *(a07 + 18);
+	      b[301] = *(a07 + 19);
+	      b[302] = *(a08 + 18);
+	      b[303] = *(a08 + 19);
+	      b[304] = *(a09 + 18);
+	      b[305] = *(a09 + 19);
+#ifdef UNIT
+	      b[306] = ONE;
+		  b[307] = ZERO;
+#else
+	      b[306] = *(a10 + 18);
+		  b[307] = *(a10 + 19);
+#endif
+		  b[308] = ZERO;
+	      b[309] = ZERO;
+	      b[310] = ZERO;
+	      b[311] = ZERO;
+	      b[312] = ZERO;
+	      b[313] = ZERO;
+		  b[314] = ZERO;
+	      b[315] = ZERO;
+	      b[316] = ZERO;
+	      b[317] = ZERO;
+	      b[318] = ZERO;
+	      b[319] = ZERO;
+
+		  b[320] = *(a01 + 20);
+		  b[321] = *(a01 + 21);
+	      b[322] = *(a02 + 20);
+	      b[323] = *(a02 + 21);
+	      b[324] = *(a03 + 20);
+	      b[325] = *(a03 + 21);
+	      b[326] = *(a04 + 20);
+	      b[327] = *(a04 + 21);
+	      b[328] = *(a05 + 20);
+	      b[329] = *(a05 + 21);
+	      b[330] = *(a06 + 20);
+	      b[331] = *(a06 + 21);
+	      b[332] = *(a07 + 20);
+	      b[333] = *(a07 + 21);
+	      b[334] = *(a08 + 20);
+	      b[335] = *(a08 + 21);
+	      b[336] = *(a09 + 20);
+	      b[337] = *(a09 + 21);
+	      b[338] = *(a10 + 20);
+	      b[339] = *(a10 + 21);
+#ifdef UNIT
+	      b[340] = ONE;
+	      b[341] = ZERO;
+#else
+	      b[340] = *(a11 + 20);
+	      b[341] = *(a11 + 21);
+#endif
+		  b[342] = ZERO;
+	      b[343] = ZERO;
+	      b[344] = ZERO;
+	      b[345] = ZERO;
+	      b[346] = ZERO;
+		  b[347] = ZERO;
+	      b[348] = ZERO;
+	      b[349] = ZERO;
+	      b[350] = ZERO;
+	      b[351] = ZERO;
+
+		  b[352] = *(a01 + 22);
+		  b[353] = *(a01 + 23);
+	      b[354] = *(a02 + 22);
+	      b[355] = *(a02 + 23);
+	      b[356] = *(a03 + 22);
+	      b[357] = *(a03 + 23);
+	      b[358] = *(a04 + 22);
+	      b[359] = *(a04 + 23);
+	      b[360] = *(a05 + 22);
+	      b[361] = *(a05 + 23);
+	      b[362] = *(a06 + 22);
+	      b[363] = *(a06 + 23);
+	      b[364] = *(a07 + 22);
+	      b[365] = *(a07 + 23);
+	      b[366] = *(a08 + 22);
+	      b[367] = *(a08 + 23);
+	      b[368] = *(a09 + 22);
+	      b[369] = *(a09 + 23);
+	      b[370] = *(a10 + 22);
+	      b[371] = *(a10 + 23);
+	      b[372] = *(a11 + 22);
+	      b[373] = *(a11 + 23);
+#ifdef UNIT
+	      b[374] = ONE;
+	      b[375] = ZERO;
+#else
+	      b[374] = *(a12 + 22);
+	      b[375] = *(a12 + 23);
+#endif
+		  b[376] = ZERO;
+	      b[377] = ZERO;
+	      b[378] = ZERO;
+	      b[379] = ZERO;
+		  b[380] = ZERO;
+	      b[381] = ZERO;
+	      b[382] = ZERO;
+	      b[383] = ZERO;
+
+		  b[384] = *(a01 + 24);
+		  b[385] = *(a01 + 25);
+	      b[386] = *(a02 + 24);
+	      b[387] = *(a02 + 25);
+	      b[388] = *(a03 + 24);
+	      b[389] = *(a03 + 25);
+	      b[390] = *(a04 + 24);
+	      b[391] = *(a04 + 25);
+	      b[392] = *(a05 + 24);
+	      b[393] = *(a05 + 25);
+	      b[394] = *(a06 + 24);
+	      b[395] = *(a06 + 25);
+	      b[396] = *(a07 + 24);
+	      b[397] = *(a07 + 25);
+	      b[398] = *(a08 + 24);
+	      b[399] = *(a08 + 25);
+	      b[400] = *(a09 + 24);
+	      b[401] = *(a09 + 25);
+	      b[402] = *(a10 + 24);
+	      b[403] = *(a10 + 25);
+	      b[404] = *(a11 + 24);
+	      b[405] = *(a11 + 25);
+	      b[406] = *(a12 + 24);
+	      b[407] = *(a12 + 25);
+#ifdef UNIT
+	      b[408] = ONE;
+	      b[409] = ZERO;
+#else
+	      b[408] = *(a13 + 24);
+	      b[409] = *(a13 + 25);
+#endif
+		  b[410] = ZERO;
+	      b[411] = ZERO;
+	      b[412] = ZERO;
+		  b[413] = ZERO;
+	      b[414] = ZERO;
+	      b[415] = ZERO;
+
+		  b[416] = *(a01 + 26);
+		  b[417] = *(a01 + 27);
+	      b[418] = *(a02 + 26);
+	      b[419] = *(a02 + 27);
+	      b[420] = *(a03 + 26);
+	      b[421] = *(a03 + 27);
+	      b[422] = *(a04 + 26);
+	      b[423] = *(a04 + 27);
+	      b[424] = *(a05 + 26);
+	      b[425] = *(a05 + 27);
+	      b[426] = *(a06 + 26);
+	      b[427] = *(a06 + 27);
+	      b[428] = *(a07 + 26);
+	      b[429] = *(a07 + 27);
+	      b[430] = *(a08 + 26);
+	      b[431] = *(a08 + 27);
+	      b[432] = *(a09 + 26);
+	      b[433] = *(a09 + 27);
+	      b[434] = *(a10 + 26);
+	      b[435] = *(a10 + 27);
+	      b[436] = *(a11 + 26);
+	      b[437] = *(a11 + 27);
+	      b[438] = *(a12 + 26);
+	      b[439] = *(a12 + 27);
+	      b[440] = *(a13 + 26);
+	      b[441] = *(a13 + 27);
+#ifdef UNIT
+	      b[442] = ONE;
+	      b[443] = ZERO;
+#else
+	      b[442] = *(a14 + 26);
+	      b[443] = *(a14 + 27);
+#endif
+		  b[444] = ZERO;
+	      b[445] = ZERO;
+		  b[446] = ZERO;
+	      b[447] = ZERO;
+
+		  b[448] = *(a01 + 28);
+		  b[449] = *(a01 + 29);
+	      b[450] = *(a02 + 28);
+	      b[451] = *(a02 + 29);
+	      b[452] = *(a03 + 28);
+	      b[453] = *(a03 + 29);
+	      b[454] = *(a04 + 28);
+	      b[455] = *(a04 + 29);
+	      b[456] = *(a05 + 28);
+	      b[457] = *(a05 + 29);
+	      b[458] = *(a06 + 28);
+	      b[459] = *(a06 + 29);
+	      b[460] = *(a07 + 28);
+	      b[461] = *(a07 + 29);
+	      b[462] = *(a08 + 28);
+	      b[463] = *(a08 + 29);
+	      b[464] = *(a09 + 28);
+	      b[465] = *(a09 + 29);
+	      b[466] = *(a10 + 28);
+	      b[467] = *(a10 + 29);
+	      b[468] = *(a11 + 28);
+	      b[469] = *(a11 + 29);
+	      b[470] = *(a12 + 28);
+	      b[471] = *(a12 + 29);
+	      b[472] = *(a13 + 28);
+	      b[473] = *(a13 + 29);
+	      b[474] = *(a14 + 28);
+	      b[475] = *(a14 + 29);
+#ifdef UNIT
+	      b[476] = ONE;
+	      b[477] = ZERO;
+#else
+	      b[476] = *(a15 + 28);
+	      b[477] = *(a15 + 29);
+#endif
+		  b[478] = ZERO;
+		  b[479] = ZERO;
+
+		  b[480] = *(a01 + 30);
+		  b[481] = *(a01 + 31);
+	      b[482] = *(a02 + 30);
+	      b[483] = *(a02 + 31);
+	      b[484] = *(a03 + 30);
+	      b[485] = *(a03 + 31);
+	      b[486] = *(a04 + 30);
+	      b[487] = *(a04 + 31);
+	      b[488] = *(a05 + 30);
+	      b[489] = *(a05 + 31);
+	      b[490] = *(a06 + 30);
+	      b[491] = *(a06 + 31);
+	      b[492] = *(a07 + 30);
+	      b[493] = *(a07 + 31);
+	      b[494] = *(a08 + 30);
+	      b[495] = *(a08 + 31);
+	      b[496] = *(a09 + 30);
+	      b[497] = *(a09 + 31);
+	      b[498] = *(a10 + 30);
+	      b[499] = *(a10 + 31);
+	      b[500] = *(a11 + 30);
+	      b[501] = *(a11 + 31);
+	      b[502] = *(a12 + 30);
+	      b[503] = *(a12 + 31);
+	      b[504] = *(a13 + 30);
+	      b[505] = *(a13 + 31);
+	      b[506] = *(a14 + 30);
+	      b[507] = *(a14 + 31);
+	      b[508] = *(a15 + 30);
+	      b[509] = *(a15 + 31);
+#ifdef UNIT
+	      b[510] = ONE;
+	      b[511] = ZERO;
+#else
+	      b[510] = *(a16 + 30);
+	      b[511] = *(a16 + 31);
+#endif
+
+	      a01 += 32;
+	      a02 += 32;
+	      a03 += 32;
+	      a04 += 32;
+	      a05 += 32;
+	      a06 += 32;
+	      a07 += 32;
+	      a08 += 32;
+	      a09 += 32;
+	      a10 += 32;
+	      a11 += 32;
+	      a12 += 32;
+	      a13 += 32;
+	      a14 += 32;
+	      a15 += 32;
+	      a16 += 32;
+	      b += 512;
+	    }
+
+	  X += 16;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 15);
+      if (i) {
+
+	if (X > posY) {
+
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+		b[  2] = *(a02 +  0);
+		b[  3] = *(a02 +  1);
+		b[  4] = *(a03 +  0);
+		b[  5] = *(a03 +  1);
+		b[  6] = *(a04 +  0);
+		b[  7] = *(a04 +  1);
+		b[  8] = *(a05 +  0);
+		b[  9] = *(a05 +  1);
+		b[ 10] = *(a06 +  0);
+		b[ 11] = *(a06 +  1);
+		b[ 12] = *(a07 +  0);
+		b[ 13] = *(a07 +  1);
+		b[ 14] = *(a08 +  0);
+		b[ 15] = *(a08 +  1);
+
+		b[ 16] = *(a09 +  0);
+		b[ 17] = *(a09 +  1);
+		b[ 18] = *(a10 +  0);
+		b[ 19] = *(a10 +  1);
+		b[ 20] = *(a11 +  0);
+		b[ 21] = *(a11 +  1);
+		b[ 22] = *(a12 +  0);
+		b[ 23] = *(a12 +  1);
+		b[ 24] = *(a13 +  0);
+		b[ 25] = *(a13 +  1);
+		b[ 26] = *(a14 +  0);
+		b[ 27] = *(a14 +  1);
+		b[ 28] = *(a15 +  0);
+		b[ 29] = *(a15 +  1);
+		b[ 30] = *(a16 +  0);
+		b[ 31] = *(a16 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    a05 += 2;
+	    a06 += 2;
+	    a07 += 2;
+	    a08 += 2;
+		a09 += 2;
+	    a10 += 2;
+	    a11 += 2;
+	    a12 += 2;
+	    a13 += 2;
+	    a14 += 2;
+	    a15 += 2;
+	    a16 += 2;
+	    b += 32;
+	  }
+	} else
+	  if (X < posY) {
+	    /* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda;
+	      a09 += i * lda;
+	      a10 += i * lda;
+	      a11 += i * lda;
+	      a12 += i * lda;
+	      a13 += i * lda;
+	      a14 += i * lda;
+	      a15 += i * lda;
+	      a16 += i * lda; */
+	    b += 32 * i;
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+		  b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+		  b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+	      b += 32;
+
+	      if (i >= 2) {
+		b[ 0] = *(a01 +  2);
+		b[ 1] = *(a01 +  3);
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+		b[  4] = ZERO;
+		b[  5] = ZERO;
+		b[  6] = ZERO;
+		b[  7] = ZERO;
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 3) {
+		b[ 0] = *(a01 +  4);
+		b[ 1] = *(a01 +  5);
+		b[ 2] = *(a02 +  4);
+		b[ 3] = *(a02 +  5);
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[  6] = ZERO;
+		b[  7] = ZERO;
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 4) {
+		b[ 0] = *(a01 +  6);
+		b[ 1] = *(a01 +  7);
+		b[ 2] = *(a02 +  6);
+		b[ 3] = *(a02 +  7);
+		b[ 4] = *(a03 +  6);
+		b[ 5] = *(a03 +  7);
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 5) {
+		b[ 0] = *(a01 +  8);
+		b[ 1] = *(a01 +  9);
+		b[ 2] = *(a02 +  8);
+		b[ 3] = *(a02 +  9);
+		b[ 4] = *(a03 +  8);
+		b[ 5] = *(a03 +  9);
+		b[ 6] = *(a04 +  8);
+		b[ 7] = *(a04 +  9);
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 6) {
+		b[ 0] = *(a01 + 10);
+		b[ 1] = *(a01 + 11);
+		b[ 2] = *(a02 + 10);
+		b[ 3] = *(a02 + 11);
+		b[ 4] = *(a03 + 10);
+		b[ 5] = *(a03 + 11);
+		b[ 6] = *(a04 + 10);
+		b[ 7] = *(a04 + 11);
+		b[ 8] = *(a05 + 10);
+		b[ 9] = *(a05 + 11);
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 7) {
+		b[ 0] = *(a01 + 12);
+		b[ 1] = *(a01 + 13);
+		b[ 2] = *(a02 + 12);
+		b[ 3] = *(a02 + 13);
+		b[ 4] = *(a03 + 12);
+		b[ 5] = *(a03 + 13);
+		b[ 6] = *(a04 + 12);
+		b[ 7] = *(a04 + 13);
+		b[ 8] = *(a05 + 12);
+		b[ 9] = *(a05 + 13);
+		b[10] = *(a06 + 12);
+		b[11] = *(a06 + 13);
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	    }
+
+		if (i >= 8) {
+	      b[  0] = *(a01 + 14);
+	      b[  1] = *(a01 + 15);
+	      b[  2] = *(a02 + 14);
+	      b[  3] = *(a02 + 15);
+	      b[  4] = *(a03 + 14);
+	      b[  5] = *(a03 + 15);
+	      b[  6] = *(a04 + 14);
+	      b[  7] = *(a04 + 15);
+	      b[  8] = *(a05 + 14);
+	      b[  9] = *(a05 + 15);
+	      b[ 10] = *(a06 + 14);
+	      b[ 11] = *(a06 + 15);
+	      b[ 12] = *(a07 + 14);
+	      b[ 13] = *(a07 + 15);
+#ifdef UNIT
+	      b[ 14] = ONE;
+	      b[ 15] = ZERO;
+#else
+	      b[ 14] = *(a08 +  14);
+	      b[ 15] = *(a08 +  15);
+#endif
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 9) {
+	      b[  0] = *(a01 + 16);
+	      b[  1] = *(a01 + 17);
+	      b[  2] = *(a02 + 16);
+	      b[  3] = *(a02 + 17);
+	      b[  4] = *(a03 + 16);
+	      b[  5] = *(a03 + 17);
+	      b[  6] = *(a04 + 16);
+	      b[  7] = *(a04 + 17);
+	      b[  8] = *(a05 + 16);
+	      b[  9] = *(a05 + 17);
+	      b[ 10] = *(a06 + 16);
+	      b[ 11] = *(a06 + 17);
+	      b[ 12] = *(a07 + 16);
+	      b[ 13] = *(a07 + 17);
+	      b[ 14] = *(a08 + 16);
+	      b[ 15] = *(a08 + 17);
+#ifdef UNIT
+	      b[ 16] = ONE;
+	      b[ 17] = ZERO;
+#else
+	      b[ 16] = *(a09 + 16);
+	      b[ 17] = *(a09 + 17);
+#endif
+	    b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 10) {
+	      b[  0] = *(a01 + 18);
+	      b[  1] = *(a01 + 19);
+	      b[  2] = *(a02 + 18);
+	      b[  3] = *(a02 + 19);
+	      b[  4] = *(a03 + 18);
+	      b[  5] = *(a03 + 19);
+	      b[  6] = *(a04 + 18);
+	      b[  7] = *(a04 + 19);
+	      b[  8] = *(a05 + 18);
+	      b[  9] = *(a05 + 19);
+	      b[ 10] = *(a06 + 18);
+	      b[ 11] = *(a06 + 19);
+	      b[ 12] = *(a07 + 18);
+	      b[ 13] = *(a07 + 19);
+	      b[ 14] = *(a08 + 18);
+	      b[ 15] = *(a08 + 19);
+	      b[ 16] = *(a09 + 18);
+	      b[ 17] = *(a09 + 19);
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a10 + 18);
+	      b[ 19] = *(a10 + 19);
+#endif
+	    b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 11) {
+	      b[  0] = *(a01 + 20);
+	      b[  1] = *(a01 + 21);
+	      b[  2] = *(a02 + 20);
+	      b[  3] = *(a02 + 21);
+	      b[  4] = *(a03 + 20);
+	      b[  5] = *(a03 + 21);
+	      b[  6] = *(a04 + 20);
+	      b[  7] = *(a04 + 21);
+	      b[  8] = *(a05 + 20);
+	      b[  9] = *(a05 + 21);
+	      b[ 10] = *(a06 + 20);
+	      b[ 11] = *(a06 + 21);
+	      b[ 12] = *(a07 + 20);
+	      b[ 13] = *(a07 + 21);
+	      b[ 14] = *(a08 + 20);
+	      b[ 15] = *(a08 + 21);
+	      b[ 16] = *(a09 + 20);
+	      b[ 17] = *(a09 + 21);
+	      b[ 18] = *(a10 + 20);
+	      b[ 19] = *(a10 + 21);
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a11 + 20);
+	      b[ 21] = *(a11 + 21);
+#endif
+	    b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 12) {
+	      b[  0] = *(a01 + 22);
+	      b[  1] = *(a01 + 23);
+	      b[  2] = *(a02 + 22);
+	      b[  3] = *(a02 + 23);
+	      b[  4] = *(a03 + 22);
+	      b[  5] = *(a03 + 23);
+	      b[  6] = *(a04 + 22);
+	      b[  7] = *(a04 + 23);
+	      b[  8] = *(a05 + 22);
+	      b[  9] = *(a05 + 23);
+	      b[ 10] = *(a06 + 22);
+	      b[ 11] = *(a06 + 23);
+	      b[ 12] = *(a07 + 22);
+	      b[ 13] = *(a07 + 23);
+	      b[ 14] = *(a08 + 22);
+	      b[ 15] = *(a08 + 23);
+	      b[ 16] = *(a09 + 22);
+	      b[ 17] = *(a09 + 23);
+	      b[ 18] = *(a10 + 22);
+	      b[ 19] = *(a10 + 23);
+	      b[ 20] = *(a11 + 22);
+	      b[ 21] = *(a11 + 23);
+#ifdef UNIT
+	      b[ 22] = ONE;
+	      b[ 23] = ZERO;
+#else
+	      b[ 22] = *(a12 + 22);
+	      b[ 23] = *(a12 + 23);
+#endif
+	    b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 13) {
+	      b[  0] = *(a01 + 24);
+	      b[  1] = *(a01 + 25);
+	      b[  2] = *(a02 + 24);
+	      b[  3] = *(a02 + 25);
+	      b[  4] = *(a03 + 24);
+	      b[  5] = *(a03 + 25);
+	      b[  6] = *(a04 + 24);
+	      b[  7] = *(a04 + 25);
+	      b[  8] = *(a05 + 24);
+	      b[  9] = *(a05 + 25);
+	      b[ 10] = *(a06 + 24);
+	      b[ 11] = *(a06 + 25);
+	      b[ 12] = *(a07 + 24);
+	      b[ 13] = *(a07 + 25);
+	      b[ 14] = *(a08 + 24);
+	      b[ 15] = *(a08 + 25);
+	      b[ 16] = *(a09 + 24);
+	      b[ 17] = *(a09 + 25);
+	      b[ 18] = *(a10 + 24);
+	      b[ 19] = *(a10 + 25);
+	      b[ 20] = *(a11 + 24);
+	      b[ 21] = *(a11 + 25);
+	      b[ 22] = *(a12 + 24);
+	      b[ 23] = *(a12 + 25);
+#ifdef UNIT
+	      b[ 24] = ONE;
+	      b[ 25] = ZERO;
+#else
+	      b[ 24] = *(a13 + 24);
+	      b[ 25] = *(a13 + 25);
+#endif
+	    b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 14) {
+	      b[  0] = *(a01 + 26);
+	      b[  1] = *(a01 + 27);
+	      b[  2] = *(a02 + 26);
+	      b[  3] = *(a02 + 27);
+	      b[  4] = *(a03 + 26);
+	      b[  5] = *(a03 + 27);
+	      b[  6] = *(a04 + 26);
+	      b[  7] = *(a04 + 27);
+	      b[  8] = *(a05 + 26);
+	      b[  9] = *(a05 + 27);
+	      b[ 10] = *(a06 + 26);
+	      b[ 11] = *(a06 + 27);
+	      b[ 12] = *(a07 + 26);
+	      b[ 13] = *(a07 + 27);
+	      b[ 14] = *(a08 + 26);
+	      b[ 15] = *(a08 + 27);
+	      b[ 16] = *(a09 + 26);
+	      b[ 17] = *(a09 + 27);
+	      b[ 18] = *(a10 + 26);
+	      b[ 19] = *(a10 + 27);
+	      b[ 20] = *(a11 + 26);
+	      b[ 21] = *(a11 + 27);
+	      b[ 22] = *(a12 + 26);
+	      b[ 23] = *(a12 + 27);
+	      b[ 24] = *(a13 + 26);
+	      b[ 25] = *(a13 + 27);
+#ifdef UNIT
+	      b[ 26] = ONE;
+	      b[ 27] = ZERO;
+#else
+	      b[ 26] = *(a14 + 26);
+	      b[ 27] = *(a14 + 27);
+#endif
+	    b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 15) {
+	      b[  0] = *(a01 + 28);
+	      b[  1] = *(a01 + 29);
+	      b[  2] = *(a02 + 28);
+	      b[  3] = *(a02 + 29);
+	      b[  4] = *(a03 + 28);
+	      b[  5] = *(a03 + 29);
+	      b[  6] = *(a04 + 28);
+	      b[  7] = *(a04 + 29);
+	      b[  8] = *(a05 + 28);
+	      b[  9] = *(a05 + 29);
+	      b[ 10] = *(a06 + 28);
+	      b[ 11] = *(a06 + 29);
+	      b[ 12] = *(a07 + 28);
+	      b[ 13] = *(a07 + 29);
+	      b[ 14] = *(a08 + 28);
+	      b[ 15] = *(a08 + 29);
+	      b[ 16] = *(a09 + 28);
+	      b[ 17] = *(a09 + 29);
+	      b[ 18] = *(a10 + 28);
+	      b[ 19] = *(a10 + 29);
+	      b[ 20] = *(a11 + 28);
+	      b[ 21] = *(a11 + 29);
+	      b[ 22] = *(a12 + 28);
+	      b[ 23] = *(a12 + 29);
+	      b[ 24] = *(a13 + 28);
+	      b[ 25] = *(a13 + 29);
+	      b[ 26] = *(a14 + 28);
+	      b[ 27] = *(a14 + 29);
+#ifdef UNIT
+	      b[ 28] = ONE;
+	      b[ 29] = ZERO;
+#else
+	      b[ 28] = *(a15 + 28);
+	      b[ 29] = *(a15 + 29);
+#endif
+	      b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+	      b += 32;
+	    }
+	  }
+      }
+
+      posY += 16;
+      js --;
+    } while (js > 0);
+  } /* End of main loop */
+
+
+  if (n & 8){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+      }
+
+      i = (m >> 3);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    for (ii = 0; ii < 8; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+
+	      a01 += 2;
+	      a02 += 2;
+	      a03 += 2;
+	      a04 += 2;
+	      a05 += 2;
+	      a06 += 2;
+	      a07 += 2;
+	      a08 += 2;
+	      b += 16;
+	    }
+	  } else
+	    if (X < posY) {
+	      a01 += 8 * lda;
+	      a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+
+	      b += 128;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+
+	      b[ 16] = *(a01 +  2);
+	      b[ 17] = *(a01 +  3);
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a02 +  2);
+	      b[ 19] = *(a02 +  3);
+#endif
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+	      b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+
+	      b[ 32] = *(a01 +  4);
+	      b[ 33] = *(a01 +  5);
+	      b[ 34] = *(a02 +  4);
+	      b[ 35] = *(a02 +  5);
+#ifdef UNIT
+	      b[ 36] = ONE;
+	      b[ 37] = ZERO;
+#else
+	      b[ 36] = *(a03 +  4);
+	      b[ 37] = *(a03 +  5);
+#endif
+	      b[ 38] = ZERO;
+	      b[ 39] = ZERO;
+	      b[ 40] = ZERO;
+	      b[ 41] = ZERO;
+	      b[ 42] = ZERO;
+	      b[ 43] = ZERO;
+	      b[ 44] = ZERO;
+	      b[ 45] = ZERO;
+	      b[ 46] = ZERO;
+	      b[ 47] = ZERO;
+
+	      b[ 48] = *(a01 +  6);
+	      b[ 49] = *(a01 +  7);
+	      b[ 50] = *(a02 +  6);
+	      b[ 51] = *(a02 +  7);
+	      b[ 52] = *(a03 +  6);
+	      b[ 53] = *(a03 +  7);
+#ifdef UNIT
+	      b[ 54] = ONE;
+	      b[ 55] = ZERO;
+#else
+	      b[ 54] = *(a04 +  6);
+	      b[ 55] = *(a04 +  7);
+#endif
+	      b[ 56] = ZERO;
+	      b[ 57] = ZERO;
+	      b[ 58] = ZERO;
+	      b[ 59] = ZERO;
+	      b[ 60] = ZERO;
+	      b[ 61] = ZERO;
+	      b[ 62] = ZERO;
+	      b[ 63] = ZERO;
+
+	      b[ 64] = *(a01 +  8);
+	      b[ 65] = *(a01 +  9);
+	      b[ 66] = *(a02 +  8);
+	      b[ 67] = *(a02 +  9);
+	      b[ 68] = *(a03 +  8);
+	      b[ 69] = *(a03 +  9);
+	      b[ 70] = *(a04 +  8);
+	      b[ 71] = *(a04 +  9);
+#ifdef UNIT
+	      b[ 72] = ONE;
+	      b[ 73] = ZERO;
+#else
+	      b[ 72] = *(a05 +  8);
+	      b[ 73] = *(a05 +  9);
+#endif
+	      b[ 74] = ZERO;
+	      b[ 75] = ZERO;
+	      b[ 76] = ZERO;
+	      b[ 77] = ZERO;
+	      b[ 78] = ZERO;
+	      b[ 79] = ZERO;
+
+	      b[ 80] = *(a01 + 10);
+	      b[ 81] = *(a01 + 11);
+	      b[ 82] = *(a02 + 10);
+	      b[ 83] = *(a02 + 11);
+	      b[ 84] = *(a03 + 10);
+	      b[ 85] = *(a03 + 11);
+	      b[ 86] = *(a04 + 10);
+	      b[ 87] = *(a04 + 11);
+	      b[ 88] = *(a05 + 10);
+	      b[ 89] = *(a05 + 11);
+#ifdef UNIT
+	      b[ 90] = ONE;
+	      b[ 91] = ZERO;
+#else
+	      b[ 90] = *(a06 + 10);
+	      b[ 91] = *(a06 + 11);
+#endif
+	      b[ 92] = ZERO;
+	      b[ 93] = ZERO;
+	      b[ 94] = ZERO;
+	      b[ 95] = ZERO;
+
+	      b[ 96] = *(a01 + 12);
+	      b[ 97] = *(a01 + 13);
+	      b[ 98] = *(a02 + 12);
+	      b[ 99] = *(a02 + 13);
+	      b[100] = *(a03 + 12);
+	      b[101] = *(a03 + 13);
+	      b[102] = *(a04 + 12);
+	      b[103] = *(a04 + 13);
+	      b[104] = *(a05 + 12);
+	      b[105] = *(a05 + 13);
+	      b[106] = *(a06 + 12);
+	      b[107] = *(a06 + 13);
+#ifdef UNIT
+	      b[108] = ONE;
+	      b[109] = ZERO;
+#else
+	      b[108] = *(a07 + 12);
+	      b[109] = *(a07 + 13);
+#endif
+	      b[110] = ZERO;
+	      b[111] = ZERO;
+
+	      b[112] = *(a01 + 14);
+	      b[113] = *(a01 + 15);
+	      b[114] = *(a02 + 14);
+	      b[115] = *(a02 + 15);
+	      b[116] = *(a03 + 14);
+	      b[117] = *(a03 + 15);
+	      b[118] = *(a04 + 14);
+	      b[119] = *(a04 + 15);
+	      b[120] = *(a05 + 14);
+	      b[121] = *(a05 + 15);
+	      b[122] = *(a06 + 14);
+	      b[123] = *(a06 + 15);
+	      b[124] = *(a07 + 14);
+	      b[125] = *(a07 + 15);
+#ifdef UNIT
+	      b[126] = ONE;
+	      b[127] = ZERO;
+#else
+	      b[126] = *(a08 + 14);
+	      b[127] = *(a08 + 15);
+#endif
+
+	      a01 += 16;
+	      a02 += 16;
+	      a03 += 16;
+	      a04 += 16;
+	      a05 += 16;
+	      a06 += 16;
+	      a07 += 16;
+	      a08 += 16;
+	      b += 128;
+	    }
+
+	  X += 8;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i) {
+
+	if (X > posY) {
+	  for (ii = 0; ii < i; ii++){
+
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a03 +  0);
+	    b[  5] = *(a03 +  1);
+	    b[  6] = *(a04 +  0);
+	    b[  7] = *(a04 +  1);
+
+	    b[  8] = *(a05 +  0);
+	    b[  9] = *(a05 +  1);
+	    b[ 10] = *(a06 +  0);
+	    b[ 11] = *(a06 +  1);
+	    b[ 12] = *(a07 +  0);
+	    b[ 13] = *(a07 +  1);
+	    b[ 14] = *(a08 +  0);
+	    b[ 15] = *(a08 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    a05 += 2;
+	    a06 += 2;
+	    a07 += 2;
+	    a08 += 2;
+	    b += 16;
+	    }
+	} else
+	  if (X < posY) {
+	      /* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda; */
+	      b += 16 * i;
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b += 16;
+
+	    if (i >= 2) {
+	    b[ 0] = *(a01 +  2);
+		b[ 1] = *(a01 +  3);
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+	    b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 3) {
+		b[ 0] = *(a01 +  4);
+		b[ 1] = *(a01 +  5);
+		b[ 2] = *(a02 +  4);
+		b[ 3] = *(a02 +  5);
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 4) {
+		b[ 0] = *(a01 +  6);
+		b[ 1] = *(a01 +  7);
+		b[ 2] = *(a02 +  6);
+		b[ 3] = *(a02 +  7);
+		b[ 4] = *(a03 +  6);
+		b[ 5] = *(a03 +  7);
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 5) {
+		b[ 0] = *(a01 +  8);
+		b[ 1] = *(a01 +  9);
+		b[ 2] = *(a02 +  8);
+		b[ 3] = *(a02 +  9);
+		b[ 4] = *(a03 +  8);
+		b[ 5] = *(a03 +  9);
+		b[ 6] = *(a04 +  8);
+		b[ 7] = *(a04 +  9);
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[10] = ZERO;
+		b[11] = ZERO;
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 6) {
+		b[ 0] = *(a01 + 10);
+		b[ 1] = *(a01 + 11);
+		b[ 2] = *(a02 + 10);
+		b[ 3] = *(a02 + 11);
+		b[ 4] = *(a03 + 10);
+		b[ 5] = *(a03 + 11);
+		b[ 6] = *(a04 + 10);
+		b[ 7] = *(a04 + 11);
+		b[ 8] = *(a05 + 10);
+		b[ 9] = *(a05 + 11);
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 7) {
+		b[ 0] = *(a01 + 12);
+		b[ 1] = *(a01 + 13);
+		b[ 2] = *(a02 + 12);
+		b[ 3] = *(a02 + 13);
+		b[ 4] = *(a03 + 12);
+		b[ 5] = *(a03 + 13);
+		b[ 6] = *(a04 + 12);
+		b[ 7] = *(a04 + 13);
+		b[ 8] = *(a05 + 12);
+		b[ 9] = *(a05 + 13);
+		b[10] = *(a06 + 12);
+		b[11] = *(a06 + 13);
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+	  }
+      }
+
+      posY += 8;
+  }
+
+
+  if (n & 4){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+      }
+
+      i = (m >> 2);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    for (ii = 0; ii < 4; ii++){
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      a01 += 2;
+	      a02 += 2;
+	      a03 += 2;
+	      a04 += 2;
+	      b += 8;
+	    }
+	  } else
+	    if (X < posY) {
+	      a01 += 4 * lda;
+	      a02 += 4 * lda;
+	      a03 += 4 * lda;
+	      a04 += 4 * lda;
+	      b += 32;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+
+	      b[  8] = *(a01 +  2);
+	      b[  9] = *(a01 +  3);
+#ifdef UNIT
+	      b[ 10] = ONE;
+	      b[ 11] = ZERO;
+#else
+	      b[ 10] = *(a02 +  2);
+	      b[ 11] = *(a02 +  3);
+#endif
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+
+	      b[ 16] = *(a01 +  4);
+	      b[ 17] = *(a01 +  5);
+	      b[ 18] = *(a02 +  4);
+	      b[ 19] = *(a02 +  5);
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a03 +  4);
+	      b[ 21] = *(a03 +  5);
+#endif
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+
+	      b[ 24] = *(a01 +  6);
+	      b[ 25] = *(a01 +  7);
+	      b[ 26] = *(a02 +  6);
+	      b[ 27] = *(a02 +  7);
+	      b[ 28] = *(a03 +  6);
+	      b[ 29] = *(a03 +  7);
+#ifdef UNIT
+	      b[ 30] = ONE;
+	      b[ 31] = ZERO;
+#else
+	      b[ 30] = *(a04 +  6);
+	      b[ 31] = *(a04 +  7);
+#endif
+
+	      a01 += 8;
+	      a02 += 8;
+	      a03 += 8;
+	      a04 += 8;
+	      b += 32;
+	    }
+
+	  X += 4;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 3);
+      if (i) {
+
+	if (X > posY) {
+
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a03 +  0);
+	    b[  5] = *(a03 +  1);
+	    b[  6] = *(a04 +  0);
+	    b[  7] = *(a04 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    b += 8;
+	  }
+	} else
+	  if (X < posY) {
+	    /* a01 += i * lda;
+	    a02 += i * lda;
+	    a03 += i * lda;
+	    a04 += i * lda; */
+	    b += 8 * i;
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = ZERO;
+	    b[  3] = ZERO;
+	    b[  4] = ZERO;
+	    b[  5] = ZERO;
+	    b[  6] = ZERO;
+	    b[  7] = ZERO;
+	    b += 8;
+
+	    if (i >= 2) {
+	      b[ 0] = *(a01 +  2);
+	      b[ 1] = *(a01 +  3);
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b += 8;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = *(a01 +  4);
+	      b[ 1] = *(a01 +  5);
+	      b[ 2] = *(a02 +  4);
+	      b[ 3] = *(a02 +  5);
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b += 8;
+	    }
+	  }
+      }
+
+      posY += 4;
+  }
+
+  if (n & 2){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+      }
+
+      i = (m >> 1);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a01 +  2);
+	    b[  5] = *(a01 +  3);
+	    b[  6] = *(a02 +  2);
+	    b[  7] = *(a02 +  3);
+
+	    a01 += 4;
+	    a02 += 4;
+	    b += 8;
+	  } else
+	    if (X < posY) {
+	      a01 += 2 * lda;
+	      a02 += 2 * lda;
+	      b += 8;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+
+	      b[  4] = *(a01 +  2);
+	      b[  5] = *(a01 +  3);
+#ifdef UNIT
+	      b[  6] = ONE;
+	      b[  7] = ZERO;
+#else
+	      b[  6] = *(a02 +  2);
+	      b[  7] = *(a02 +  3);
+#endif
+	      a01 += 4;
+	      a02 += 4;
+	      b += 8;
+	    }
+
+	  X += 2;
+	  i --;
+	} while (i > 0);
+      }
+
+      if (m & 1) {
+
+	if (X > posY) {
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+	  b[  2] = *(a02 +  0);
+	  b[  3] = *(a02 +  1);
+	  /* a01 += 2;
+	  a02 += 2; */
+	  b += 4;
+	} else
+	  if (X < posY) {
+	    /* a01 += 2 * lda;
+	    a02 += 2 * lda; */
+	    b += 4;
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = ZERO;
+	    b[  3] = ZERO;
+	    b += 4;
+	  }
+      }
+      posY += 2;
+  }
+
+  if (n & 1){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+      }
+
+      i = m;
+      if (m > 0) {
+	do {
+	  if (X > posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    a01 += 2;
+	    b += 2;
+	  } else
+	    if (X < posY) {
+	      a01 += lda;
+	      b += 2;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      a01 += 2;
+	      b += 2;
+	    }
+
+	  X += 1;
+	  i --;
+	} while (i > 0);
+      }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmm_ltcopy_16.c b/kernel/generic/ztrmm_ltcopy_16.c
new file mode 100644
index 000000000..8d585e70b
--- /dev/null
+++ b/kernel/generic/ztrmm_ltcopy_16.c
@@ -0,0 +1,2313 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X, ii;
+
+  FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08;
+  FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  lda += lda;
+
+  js = (n >> 4);
+
+  if (js > 0){
+    do {
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+	a09 = a + posY * 2 + (posX + 8) * lda;
+	a10 = a + posY * 2 + (posX + 9) * lda;
+	a11 = a + posY * 2 + (posX + 10) * lda;
+	a12 = a + posY * 2 + (posX + 11) * lda;
+	a13 = a + posY * 2 + (posX + 12) * lda;
+	a14 = a + posY * 2 + (posX + 13) * lda;
+	a15 = a + posY * 2 + (posX + 14) * lda;
+	a16 = a + posY * 2 + (posX + 15) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+	a09 = a + posX * 2 + (posY + 8) * lda;
+	a10 = a + posX * 2 + (posY + 9) * lda;
+	a11 = a + posX * 2 + (posY + 10) * lda;
+	a12 = a + posX * 2 + (posY + 11) * lda;
+	a13 = a + posX * 2 + (posY + 12) * lda;
+	a14 = a + posX * 2 + (posY + 13) * lda;
+	a15 = a + posX * 2 + (posY + 14) * lda;
+	a16 = a + posX * 2 + (posY + 15) * lda;
+      }
+
+      i = (m >> 4);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+		a01 += 32;
+		a02 += 32;
+		a03 += 32;
+		a04 += 32;
+		a05 += 32;
+		a06 += 32;
+		a07 += 32;
+		a08 += 32;
+		a09 += 32;
+		a10 += 32;
+		a11 += 32;
+		a12 += 32;
+		a13 += 32;
+		a14 += 32;
+		a15 += 32;
+		a16 += 32;
+		b += 512;
+	  } else
+	    if (X < posY) {
+	      for (ii = 0; ii < 16; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+		  b[ 16] = *(a01 + 16);
+	      b[ 17] = *(a01 + 17);
+	      b[ 18] = *(a01 + 18);
+	      b[ 19] = *(a01 + 19);
+	      b[ 20] = *(a01 + 20);
+	      b[ 21] = *(a01 + 21);
+	      b[ 22] = *(a01 + 22);
+	      b[ 23] = *(a01 + 23);
+
+	      b[ 24] = *(a01 + 24);
+	      b[ 25] = *(a01 + 25);
+	      b[ 26] = *(a01 + 26);
+	      b[ 27] = *(a01 + 27);
+	      b[ 28] = *(a01 + 28);
+	      b[ 29] = *(a01 + 29);
+	      b[ 30] = *(a01 + 30);
+	      b[ 31] = *(a01 + 31);
+
+	      a01 += lda;
+	      b += 32;
+	    }
+		a02 += 16 * lda;
+		a03 += 16 * lda;
+		a04 += 16 * lda;
+		a05 += 16 * lda;
+		a06 += 16 * lda;
+		a07 += 16 * lda;
+		a08 += 16 * lda;
+		a09 += 16 * lda;
+		a10 += 16 * lda;
+		a11 += 16 * lda;
+		a12 += 16 * lda;
+		a13 += 16 * lda;
+		a14 += 16 * lda;
+		a15 += 16 * lda;
+		a16 += 16 * lda;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+		  b[ 16] = *(a01 + 16);
+	      b[ 17] = *(a01 + 17);
+	      b[ 18] = *(a01 + 18);
+	      b[ 19] = *(a01 + 19);
+	      b[ 20] = *(a01 + 20);
+	      b[ 21] = *(a01 + 21);
+	      b[ 22] = *(a01 + 22);
+	      b[ 23] = *(a01 + 23);
+	      b[ 24] = *(a01 + 24);
+	      b[ 25] = *(a01 + 25);
+	      b[ 26] = *(a01 + 26);
+	      b[ 27] = *(a01 + 27);
+	      b[ 28] = *(a01 + 28);
+	      b[ 29] = *(a01 + 29);
+		  b[ 30] = *(a01 + 30);
+	      b[ 31] = *(a01 + 31);
+
+	      b[ 32] = ZERO;
+	      b[ 33] = ZERO;
+#ifdef UNIT
+	      b[ 34] = ONE;
+	      b[ 35] = ZERO;
+#else
+	      b[ 34] = *(a02 +  2);
+	      b[ 35] = *(a02 +  3);
+#endif
+	      b[ 36] = *(a02 +  4);
+	      b[ 37] = *(a02 +  5);
+	      b[ 38] = *(a02 +  6);
+	      b[ 39] = *(a02 +  7);
+	      b[ 40] = *(a02 +  8);
+	      b[ 41] = *(a02 +  9);
+	      b[ 42] = *(a02 + 10);
+	      b[ 43] = *(a02 + 11);
+	      b[ 44] = *(a02 + 12);
+	      b[ 45] = *(a02 + 13);
+	      b[ 46] = *(a02 + 14);
+	      b[ 47] = *(a02 + 15);
+	      b[ 48] = *(a02 + 16);
+	      b[ 49] = *(a02 + 17);
+	      b[ 50] = *(a02 + 18);
+	      b[ 51] = *(a02 + 19);
+	      b[ 52] = *(a02 + 20);
+	      b[ 53] = *(a02 + 21);
+	      b[ 54] = *(a02 + 22);
+	      b[ 55] = *(a02 + 23);
+	      b[ 56] = *(a02 + 24);
+	      b[ 57] = *(a02 + 25);
+	      b[ 58] = *(a02 + 26);
+	      b[ 59] = *(a02 + 27);
+	      b[ 60] = *(a02 + 28);
+	      b[ 61] = *(a02 + 29);
+	      b[ 62] = *(a02 + 30);
+	      b[ 63] = *(a02 + 31);
+
+	      b[ 64] = ZERO;
+	      b[ 65] = ZERO;
+	      b[ 66] = ZERO;
+	      b[ 67] = ZERO;
+#ifdef UNIT
+	      b[ 68] = ONE;
+	      b[ 69] = ZERO;
+#else
+	      b[ 68] = *(a03 +  4);
+	      b[ 69] = *(a03 +  5);
+#endif
+	      b[ 70] = *(a03 +  6);
+	      b[ 71] = *(a03 +  7);
+	      b[ 72] = *(a03 +  8);
+	      b[ 73] = *(a03 +  9);
+	      b[ 74] = *(a03 + 10);
+	      b[ 75] = *(a03 + 11);
+	      b[ 76] = *(a03 + 12);
+	      b[ 77] = *(a03 + 13);
+	      b[ 78] = *(a03 + 14);
+	      b[ 79] = *(a03 + 15);
+	      b[ 80] = *(a03 + 16);
+	      b[ 81] = *(a03 + 17);
+	      b[ 82] = *(a03 + 18);
+		  b[ 83] = *(a03 + 19);
+	      b[ 84] = *(a03 + 20);
+	      b[ 85] = *(a03 + 21);
+	      b[ 86] = *(a03 + 22);
+	      b[ 87] = *(a03 + 23);
+	      b[ 88] = *(a03 + 24);
+	      b[ 89] = *(a03 + 25);
+	      b[ 90] = *(a03 + 26);
+	      b[ 91] = *(a03 + 27);
+	      b[ 92] = *(a03 + 28);
+	      b[ 93] = *(a03 + 29);
+	      b[ 94] = *(a03 + 30);
+	      b[ 95] = *(a03 + 31);
+
+	      b[ 96] = ZERO;
+	      b[ 97] = ZERO;
+	      b[ 98] = ZERO;
+	      b[ 99] = ZERO;
+	      b[100] = ZERO;
+	      b[101] = ZERO;
+#ifdef UNIT
+	      b[102] = ONE;
+	      b[103] = ZERO;
+#else
+	      b[102] = *(a04 +  6);
+	      b[103] = *(a04 +  7);
+#endif
+	      b[104] = *(a04 +  8);
+	      b[105] = *(a04 +  9);
+	      b[106] = *(a04 + 10);
+	      b[107] = *(a04 + 11);
+	      b[108] = *(a04 + 12);
+	      b[109] = *(a04 + 13);
+	      b[110] = *(a04 + 14);
+	      b[111] = *(a04 + 15);
+	      b[112] = *(a04 + 16);
+	      b[113] = *(a04 + 17);
+	      b[114] = *(a04 + 18);
+	      b[115] = *(a04 + 19);
+		  b[116] = *(a04 + 20);
+	      b[117] = *(a04 + 21);
+	      b[118] = *(a04 + 22);
+	      b[119] = *(a04 + 23);
+	      b[120] = *(a04 + 24);
+	      b[121] = *(a04 + 25);
+	      b[122] = *(a04 + 26);
+	      b[123] = *(a04 + 27);
+	      b[124] = *(a04 + 28);
+	      b[125] = *(a04 + 29);
+	      b[126] = *(a04 + 30);
+	      b[127] = *(a04 + 31);
+
+	      b[128] = ZERO;
+	      b[129] = ZERO;
+	      b[130] = ZERO;
+	      b[131] = ZERO;
+	      b[132] = ZERO;
+	      b[133] = ZERO;
+	      b[134] = ZERO;
+	      b[135] = ZERO;
+#ifdef UNIT
+	      b[136] = ONE;
+	      b[137] = ZERO;
+#else
+	      b[136] = *(a05 +  8);
+	      b[137] = *(a05 +  9);
+#endif
+	      b[138] = *(a05 + 10);
+	      b[139] = *(a05 + 11);
+	      b[140] = *(a05 + 12);
+	      b[141] = *(a05 + 13);
+	      b[142] = *(a05 + 14);
+	      b[143] = *(a05 + 15);
+	      b[144] = *(a05 + 16);
+	      b[145] = *(a05 + 17);
+	      b[146] = *(a05 + 18);
+	      b[147] = *(a05 + 19);
+	      b[148] = *(a05 + 20);
+		  b[149] = *(a05 + 21);
+	      b[150] = *(a05 + 22);
+	      b[151] = *(a05 + 23);
+	      b[152] = *(a05 + 24);
+	      b[153] = *(a05 + 25);
+	      b[154] = *(a05 + 26);
+	      b[155] = *(a05 + 27);
+	      b[156] = *(a05 + 28);
+	      b[157] = *(a05 + 29);
+	      b[158] = *(a05 + 30);
+	      b[159] = *(a05 + 31);
+
+	      b[160] = ZERO;
+	      b[161] = ZERO;
+	      b[162] = ZERO;
+	      b[163] = ZERO;
+	      b[164] = ZERO;
+	      b[165] = ZERO;
+	      b[166] = ZERO;
+	      b[167] = ZERO;
+	      b[168] = ZERO;
+	      b[169] = ZERO;
+#ifdef UNIT
+	      b[170] = ONE;
+	      b[171] = ZERO;
+#else
+	      b[170] = *(a06 + 10);
+	      b[171] = *(a06 + 11);
+#endif
+	      b[172] = *(a06 + 12);
+	      b[173] = *(a06 + 13);
+	      b[174] = *(a06 + 14);
+	      b[175] = *(a06 + 15);
+	      b[176] = *(a06 + 16);
+	      b[177] = *(a06 + 17);
+	      b[178] = *(a06 + 18);
+	      b[179] = *(a06 + 19);
+	      b[180] = *(a06 + 20);
+	      b[181] = *(a06 + 21);
+		  b[182] = *(a06 + 22);
+	      b[183] = *(a06 + 23);
+	      b[184] = *(a06 + 24);
+	      b[185] = *(a06 + 25);
+	      b[186] = *(a06 + 26);
+	      b[187] = *(a06 + 27);
+	      b[188] = *(a06 + 28);
+	      b[189] = *(a06 + 29);
+	      b[190] = *(a06 + 30);
+	      b[191] = *(a06 + 31);
+
+	      b[192] = ZERO;
+	      b[193] = ZERO;
+	      b[194] = ZERO;
+	      b[195] = ZERO;
+	      b[196] = ZERO;
+	      b[197] = ZERO;
+	      b[198] = ZERO;
+	      b[199] = ZERO;
+	      b[200] = ZERO;
+	      b[201] = ZERO;
+	      b[202] = ZERO;
+	      b[203] = ZERO;
+#ifdef UNIT
+	      b[204] = ONE;
+	      b[205] = ZERO;
+#else
+	      b[204] = *(a07 + 12);
+	      b[205] = *(a07 + 13);
+#endif
+	      b[206] = *(a07 + 14);
+	      b[207] = *(a07 + 15);
+	      b[208] = *(a07 + 16);
+	      b[209] = *(a07 + 17);
+	      b[210] = *(a07 + 18);
+	      b[211] = *(a07 + 19);
+	      b[212] = *(a07 + 20);
+	      b[213] = *(a07 + 21);
+	      b[214] = *(a07 + 22);
+		  b[215] = *(a07 + 23);
+	      b[216] = *(a07 + 24);
+	      b[217] = *(a07 + 25);
+	      b[218] = *(a07 + 26);
+	      b[219] = *(a07 + 27);
+	      b[220] = *(a07 + 28);
+	      b[221] = *(a07 + 29);
+	      b[222] = *(a07 + 30);
+	      b[223] = *(a07 + 31);
+
+	      b[224] = ZERO;
+	      b[225] = ZERO;
+	      b[226] = ZERO;
+	      b[227] = ZERO;
+	      b[228] = ZERO;
+	      b[229] = ZERO;
+	      b[230] = ZERO;
+	      b[231] = ZERO;
+	      b[232] = ZERO;
+	      b[233] = ZERO;
+	      b[234] = ZERO;
+	      b[235] = ZERO;
+	      b[236] = ZERO;
+	      b[237] = ZERO;
+#ifdef UNIT
+	      b[238] = ONE;
+	      b[239] = ZERO;
+#else
+	      b[238] = *(a08 + 14);
+	      b[239] = *(a08 + 15);
+#endif
+		  b[240] = *(a08 + 16);
+	      b[241] = *(a08 + 17);
+	      b[242] = *(a08 + 18);
+	      b[243] = *(a08 + 19);
+	      b[244] = *(a08 + 20);
+	      b[245] = *(a08 + 21);
+	      b[246] = *(a08 + 22);
+	      b[247] = *(a08 + 23);
+		  b[248] = *(a08 + 24);
+	      b[249] = *(a08 + 25);
+	      b[250] = *(a08 + 26);
+	      b[251] = *(a08 + 27);
+	      b[252] = *(a08 + 28);
+	      b[253] = *(a08 + 29);
+	      b[254] = *(a08 + 30);
+	      b[255] = *(a08 + 31);
+
+	      b[256] = ZERO;
+	      b[257] = ZERO;
+	      b[258] = ZERO;
+	      b[259] = ZERO;
+	      b[260] = ZERO;
+	      b[261] = ZERO;
+	      b[262] = ZERO;
+	      b[263] = ZERO;
+	      b[264] = ZERO;
+	      b[265] = ZERO;
+	      b[266] = ZERO;
+	      b[267] = ZERO;
+	      b[268] = ZERO;
+	      b[269] = ZERO;
+	      b[270] = ZERO;
+	      b[271] = ZERO;
+#ifdef UNIT
+	      b[272] = ONE;
+		  b[273] = ZERO;
+#else
+	      b[272] = *(a09 + 16);
+		  b[273] = *(a09 + 17);
+#endif
+		  b[274] = *(a09 + 18);
+	      b[275] = *(a09 + 19);
+	      b[276] = *(a09 + 20);
+	      b[277] = *(a09 + 21);
+	      b[278] = *(a09 + 22);
+	      b[279] = *(a09 + 23);
+	      b[280] = *(a09 + 24);
+		  b[281] = *(a09 + 25);
+	      b[282] = *(a09 + 26);
+	      b[283] = *(a09 + 27);
+	      b[284] = *(a09 + 28);
+	      b[285] = *(a09 + 29);
+	      b[286] = *(a09 + 30);
+	      b[287] = *(a09 + 31);
+
+		  b[288] = ZERO;
+		  b[289] = ZERO;
+	      b[290] = ZERO;
+	      b[291] = ZERO;
+	      b[292] = ZERO;
+	      b[293] = ZERO;
+	      b[294] = ZERO;
+	      b[295] = ZERO;
+	      b[296] = ZERO;
+	      b[297] = ZERO;
+	      b[298] = ZERO;
+	      b[299] = ZERO;
+	      b[300] = ZERO;
+	      b[301] = ZERO;
+	      b[302] = ZERO;
+	      b[303] = ZERO;
+	      b[304] = ZERO;
+	      b[305] = ZERO;
+#ifdef UNIT
+	      b[306] = ONE;
+		  b[307] = ZERO;
+#else
+	      b[306] = *(a10 + 18);
+		  b[307] = *(a10 + 19);
+#endif
+		  b[308] = *(a10 + 20);
+	      b[309] = *(a10 + 21);
+	      b[310] = *(a10 + 22);
+	      b[311] = *(a10 + 23);
+	      b[312] = *(a10 + 24);
+	      b[313] = *(a10 + 25);
+		  b[314] = *(a10 + 26);
+	      b[315] = *(a10 + 27);
+	      b[316] = *(a10 + 28);
+	      b[317] = *(a10 + 29);
+	      b[318] = *(a10 + 30);
+	      b[319] = *(a10 + 31);
+
+		  b[320] = ZERO;
+		  b[321] = ZERO;
+	      b[322] = ZERO;
+	      b[323] = ZERO;
+	      b[324] = ZERO;
+	      b[325] = ZERO;
+	      b[326] = ZERO;
+	      b[327] = ZERO;
+	      b[328] = ZERO;
+	      b[329] = ZERO;
+	      b[330] = ZERO;
+	      b[331] = ZERO;
+	      b[332] = ZERO;
+	      b[333] = ZERO;
+	      b[334] = ZERO;
+	      b[335] = ZERO;
+	      b[336] = ZERO;
+	      b[337] = ZERO;
+	      b[338] = ZERO;
+	      b[339] = ZERO;
+#ifdef UNIT
+	      b[340] = ONE;
+	      b[341] = ZERO;
+#else
+	      b[340] = *(a11 + 20);
+	      b[341] = *(a11 + 21);
+#endif
+		  b[342] = *(a11 + 22);
+	      b[343] = *(a11 + 23);
+	      b[344] = *(a11 + 24);
+	      b[345] = *(a11 + 25);
+	      b[346] = *(a11 + 26);
+		  b[347] = *(a11 + 27);
+	      b[348] = *(a11 + 28);
+	      b[349] = *(a11 + 29);
+	      b[350] = *(a11 + 30);
+	      b[351] = *(a11 + 31);
+
+		  b[352] = ZERO;
+		  b[353] = ZERO;
+	      b[354] = ZERO;
+	      b[355] = ZERO;
+	      b[356] = ZERO;
+	      b[357] = ZERO;
+	      b[358] = ZERO;
+	      b[359] = ZERO;
+	      b[360] = ZERO;
+	      b[361] = ZERO;
+	      b[362] = ZERO;
+	      b[363] = ZERO;
+	      b[364] = ZERO;
+	      b[365] = ZERO;
+	      b[366] = ZERO;
+	      b[367] = ZERO;
+	      b[368] = ZERO;
+	      b[369] = ZERO;
+	      b[370] = ZERO;
+	      b[371] = ZERO;
+	      b[372] = ZERO;
+	      b[373] = ZERO;
+#ifdef UNIT
+	      b[374] = ONE;
+	      b[375] = ZERO;
+#else
+	      b[374] = *(a12 + 22);
+	      b[375] = *(a12 + 23);
+#endif
+		  b[376] = *(a12 + 24);
+	      b[377] = *(a12 + 25);
+	      b[378] = *(a12 + 26);
+	      b[379] = *(a12 + 27);
+		  b[380] = *(a12 + 28);
+	      b[381] = *(a12 + 29);
+	      b[382] = *(a12 + 30);
+	      b[383] = *(a12 + 31);
+
+		  b[384] = ZERO;
+		  b[385] = ZERO;
+	      b[386] = ZERO;
+	      b[387] = ZERO;
+	      b[388] = ZERO;
+	      b[389] = ZERO;
+	      b[390] = ZERO;
+	      b[391] = ZERO;
+	      b[392] = ZERO;
+	      b[393] = ZERO;
+	      b[394] = ZERO;
+	      b[395] = ZERO;
+	      b[396] = ZERO;
+	      b[397] = ZERO;
+	      b[398] = ZERO;
+	      b[399] = ZERO;
+	      b[400] = ZERO;
+	      b[401] = ZERO;
+	      b[402] = ZERO;
+	      b[403] = ZERO;
+	      b[404] = ZERO;
+	      b[405] = ZERO;
+	      b[406] = ZERO;
+	      b[407] = ZERO;
+#ifdef UNIT
+	      b[408] = ONE;
+	      b[409] = ZERO;
+#else
+	      b[408] = *(a13 + 24);
+	      b[409] = *(a13 + 25);
+#endif
+		  b[410] = *(a13 + 26);
+	      b[411] = *(a13 + 27);
+	      b[412] = *(a13 + 28);
+		  b[413] = *(a13 + 29);
+	      b[414] = *(a13 + 30);
+	      b[415] = *(a13 + 31);
+
+		  b[416] = ZERO;
+		  b[417] = ZERO;
+	      b[418] = ZERO;
+	      b[419] = ZERO;
+	      b[420] = ZERO;
+	      b[421] = ZERO;
+	      b[422] = ZERO;
+	      b[423] = ZERO;
+	      b[424] = ZERO;
+	      b[425] = ZERO;
+	      b[426] = ZERO;
+	      b[427] = ZERO;
+	      b[428] = ZERO;
+	      b[429] = ZERO;
+	      b[430] = ZERO;
+	      b[431] = ZERO;
+	      b[432] = ZERO;
+	      b[433] = ZERO;
+	      b[434] = ZERO;
+	      b[435] = ZERO;
+	      b[436] = ZERO;
+	      b[437] = ZERO;
+	      b[438] = ZERO;
+	      b[439] = ZERO;
+	      b[440] = ZERO;
+	      b[441] = ZERO;
+#ifdef UNIT
+	      b[442] = ONE;
+	      b[443] = ZERO;
+#else
+	      b[442] = *(a14 + 26);
+	      b[443] = *(a14 + 27);
+#endif
+		  b[444] = *(a14 + 28);
+	      b[445] = *(a14 + 29);
+		  b[446] = *(a14 + 30);
+	      b[447] = *(a14 + 31);
+
+		  b[448] = ZERO;
+		  b[449] = ZERO;
+	      b[450] = ZERO;
+	      b[451] = ZERO;
+	      b[452] = ZERO;
+	      b[453] = ZERO;
+	      b[454] = ZERO;
+	      b[455] = ZERO;
+	      b[456] = ZERO;
+	      b[457] = ZERO;
+	      b[458] = ZERO;
+	      b[459] = ZERO;
+	      b[460] = ZERO;
+	      b[461] = ZERO;
+	      b[462] = ZERO;
+	      b[463] = ZERO;
+	      b[464] = ZERO;
+	      b[465] = ZERO;
+	      b[466] = ZERO;
+	      b[467] = ZERO;
+	      b[468] = ZERO;
+	      b[469] = ZERO;
+	      b[470] = ZERO;
+	      b[471] = ZERO;
+	      b[472] = ZERO;
+	      b[473] = ZERO;
+	      b[474] = ZERO;
+	      b[475] = ZERO;
+#ifdef UNIT
+	      b[476] = ONE;
+	      b[477] = ZERO;
+#else
+	      b[476] = *(a15 + 28);
+	      b[477] = *(a15 + 29);
+#endif
+		  b[478] = *(a15 + 30);
+		  b[479] = *(a15 + 31);
+
+		  b[480] = ZERO;
+		  b[481] = ZERO;
+	      b[482] = ZERO;
+	      b[483] = ZERO;
+	      b[484] = ZERO;
+	      b[485] = ZERO;
+	      b[486] = ZERO;
+	      b[487] = ZERO;
+	      b[488] = ZERO;
+	      b[489] = ZERO;
+	      b[490] = ZERO;
+	      b[491] = ZERO;
+	      b[492] = ZERO;
+	      b[493] = ZERO;
+	      b[494] = ZERO;
+	      b[495] = ZERO;
+	      b[496] = ZERO;
+	      b[497] = ZERO;
+	      b[498] = ZERO;
+	      b[499] = ZERO;
+	      b[500] = ZERO;
+	      b[501] = ZERO;
+	      b[502] = ZERO;
+	      b[503] = ZERO;
+	      b[504] = ZERO;
+	      b[505] = ZERO;
+	      b[506] = ZERO;
+	      b[507] = ZERO;
+	      b[508] = ZERO;
+	      b[509] = ZERO;
+#ifdef UNIT
+	      b[510] = ONE;
+	      b[511] = ZERO;
+#else
+	      b[510] = *(a16 + 30);
+	      b[511] = *(a16 + 31);
+#endif
+
+	      a01 += 32;
+	      a02 += 32;
+	      a03 += 32;
+	      a04 += 32;
+	      a05 += 32;
+	      a06 += 32;
+	      a07 += 32;
+	      a08 += 32;
+	      a09 += 32;
+	      a10 += 32;
+	      a11 += 32;
+	      a12 += 32;
+	      a13 += 32;
+	      a14 += 32;
+	      a15 += 32;
+	      a16 += 32;
+	      b += 512;
+	    }
+
+	  X += 16;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 15);
+      if (i) {
+
+	if (X > posY) {
+		/* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda;
+	      a09 += i * lda;
+	      a10 += i * lda;
+	      a11 += i * lda;
+	      a12 += i * lda;
+	      a13 += i * lda;
+	      a14 += i * lda;
+	      a15 += i * lda;
+	      a16 += i * lda; */
+	    b += 32 * i;
+	} else
+	  if (X < posY) {
+	    for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+		b[  2] = *(a01 +  2);
+		b[  3] = *(a01 +  3);
+		b[  4] = *(a01 +  4);
+		b[  5] = *(a01 +  5);
+		b[  6] = *(a01 +  6);
+		b[  7] = *(a01 +  7);
+		b[  8] = *(a01 +  8);
+		b[  9] = *(a01 +  9);
+		b[ 10] = *(a01 + 10);
+		b[ 11] = *(a01 + 11);
+		b[ 12] = *(a01 + 12);
+		b[ 13] = *(a01 + 13);
+		b[ 14] = *(a01 + 14);
+		b[ 15] = *(a01 + 15);
+
+		b[ 16] = *(a01 + 16);
+		b[ 17] = *(a01 + 17);
+		b[ 18] = *(a01 + 18);
+		b[ 19] = *(a01 + 19);
+		b[ 20] = *(a01 + 20);
+		b[ 21] = *(a01 + 21);
+		b[ 22] = *(a01 + 22);
+		b[ 23] = *(a01 + 23);
+		b[ 24] = *(a01 + 24);
+		b[ 25] = *(a01 + 25);
+		b[ 26] = *(a01 + 26);
+		b[ 27] = *(a01 + 27);
+		b[ 28] = *(a01 + 28);
+		b[ 29] = *(a01 + 29);
+		b[ 30] = *(a01 + 30);
+		b[ 31] = *(a01 + 31);
+
+	    a01 += lda;
+	    a02 += lda;
+	    a03 += lda;
+	    a04 += lda;
+	    a05 += lda;
+	    a06 += lda;
+	    a07 += lda;
+	    a08 += lda;
+		a09 += lda;
+	    a10 += lda;
+	    a11 += lda;
+	    a12 += lda;
+	    a13 += lda;
+	    a14 += lda;
+	    a15 += lda;
+	    a16 += lda;
+	    b += 32;
+	  }
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+		  b[ 16] = *(a01 + 16);
+	      b[ 17] = *(a01 + 17);
+	      b[ 18] = *(a01 + 18);
+	      b[ 19] = *(a01 + 19);
+	      b[ 20] = *(a01 + 20);
+	      b[ 21] = *(a01 + 21);
+	      b[ 22] = *(a01 + 22);
+	      b[ 23] = *(a01 + 23);
+	      b[ 24] = *(a01 + 24);
+	      b[ 25] = *(a01 + 25);
+	      b[ 26] = *(a01 + 26);
+	      b[ 27] = *(a01 + 27);
+	      b[ 28] = *(a01 + 28);
+	      b[ 29] = *(a01 + 29);
+		  b[ 30] = *(a01 + 30);
+	      b[ 31] = *(a01 + 31);
+	      b += 32;
+
+	      if (i >= 2) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+		b[  4] = *(a02 +  4);
+		b[  5] = *(a02 +  5);
+		b[  6] = *(a02 +  6);
+		b[  7] = *(a02 +  7);
+		b[  8] = *(a02 +  8);
+		b[  9] = *(a02 +  9);
+		b[ 10] = *(a02 + 10);
+		b[ 11] = *(a02 + 11);
+		b[ 12] = *(a02 + 12);
+		b[ 13] = *(a02 + 13);
+		b[ 14] = *(a02 + 14);
+		b[ 15] = *(a02 + 15);
+		b[ 16] = *(a02 + 16);
+		b[ 17] = *(a02 + 17);
+		b[ 18] = *(a02 + 18);
+		b[ 19] = *(a02 + 19);
+		b[ 20] = *(a02 + 20);
+		b[ 21] = *(a02 + 21);
+		b[ 22] = *(a02 + 22);
+		b[ 23] = *(a02 + 23);
+		b[ 24] = *(a02 + 24);
+		b[ 25] = *(a02 + 25);
+		b[ 26] = *(a02 + 26);
+		b[ 27] = *(a02 + 27);
+		b[ 28] = *(a02 + 28);
+		b[ 29] = *(a02 + 29);
+		b[ 30] = *(a02 + 30);
+		b[ 31] = *(a02 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 3) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[  6] = *(a03 +  6);
+		b[  7] = *(a03 +  7);
+		b[  8] = *(a03 +  8);
+		b[  9] = *(a03 +  9);
+		b[ 10] = *(a03 + 10);
+		b[ 11] = *(a03 + 11);
+		b[ 12] = *(a03 + 12);
+		b[ 13] = *(a03 + 13);
+		b[ 14] = *(a03 + 14);
+		b[ 15] = *(a03 + 15);
+		b[ 16] = *(a03 + 16);
+		b[ 17] = *(a03 + 17);
+		b[ 18] = *(a03 + 18);
+		b[ 19] = *(a03 + 19);
+		b[ 20] = *(a03 + 20);
+		b[ 21] = *(a03 + 21);
+		b[ 22] = *(a03 + 22);
+		b[ 23] = *(a03 + 23);
+		b[ 24] = *(a03 + 24);
+		b[ 25] = *(a03 + 25);
+		b[ 26] = *(a03 + 26);
+		b[ 27] = *(a03 + 27);
+		b[ 28] = *(a03 + 28);
+		b[ 29] = *(a03 + 29);
+		b[ 30] = *(a03 + 30);
+		b[ 31] = *(a03 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 4) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[  8] = *(a04 +  8);
+		b[  9] = *(a04 +  9);
+		b[ 10] = *(a04 + 10);
+		b[ 11] = *(a04 + 11);
+		b[ 12] = *(a04 + 12);
+		b[ 13] = *(a04 + 13);
+		b[ 14] = *(a04 + 14);
+		b[ 15] = *(a04 + 15);
+		b[ 16] = *(a04 + 16);
+		b[ 17] = *(a04 + 17);
+		b[ 18] = *(a04 + 18);
+		b[ 19] = *(a04 + 19);
+		b[ 20] = *(a04 + 20);
+		b[ 21] = *(a04 + 21);
+		b[ 22] = *(a04 + 22);
+		b[ 23] = *(a04 + 23);
+		b[ 24] = *(a04 + 24);
+		b[ 25] = *(a04 + 25);
+		b[ 26] = *(a04 + 26);
+		b[ 27] = *(a04 + 27);
+		b[ 28] = *(a04 + 28);
+		b[ 29] = *(a04 + 29);
+		b[ 30] = *(a04 + 30);
+		b[ 31] = *(a04 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 5) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[ 10] = *(a05 + 10);
+		b[ 11] = *(a05 + 11);
+		b[ 12] = *(a05 + 12);
+		b[ 13] = *(a05 + 13);
+		b[ 14] = *(a05 + 14);
+		b[ 15] = *(a05 + 15);
+		b[ 16] = *(a05 + 16);
+		b[ 17] = *(a05 + 17);
+		b[ 18] = *(a05 + 18);
+		b[ 19] = *(a05 + 19);
+		b[ 20] = *(a05 + 20);
+		b[ 21] = *(a05 + 21);
+		b[ 22] = *(a05 + 22);
+		b[ 23] = *(a05 + 23);
+		b[ 24] = *(a05 + 24);
+		b[ 25] = *(a05 + 25);
+		b[ 26] = *(a05 + 26);
+		b[ 27] = *(a05 + 27);
+		b[ 28] = *(a05 + 28);
+		b[ 29] = *(a05 + 29);
+		b[ 30] = *(a05 + 30);
+		b[ 31] = *(a05 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 6) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[ 12] = *(a06 + 12);
+		b[ 13] = *(a06 + 13);
+		b[ 14] = *(a06 + 14);
+		b[ 15] = *(a06 + 15);
+		b[ 16] = *(a06 + 16);
+		b[ 17] = *(a06 + 17);
+		b[ 18] = *(a06 + 18);
+		b[ 19] = *(a06 + 19);
+		b[ 20] = *(a06 + 20);
+		b[ 21] = *(a06 + 21);
+		b[ 22] = *(a06 + 22);
+		b[ 23] = *(a06 + 23);
+		b[ 24] = *(a06 + 24);
+		b[ 25] = *(a06 + 25);
+		b[ 26] = *(a06 + 26);
+		b[ 27] = *(a06 + 27);
+		b[ 28] = *(a06 + 28);
+		b[ 29] = *(a06 + 29);
+		b[ 30] = *(a06 + 30);
+		b[ 31] = *(a06 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 7) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[ 14] = *(a07 + 14);
+		b[ 15] = *(a07 + 15);
+		b[ 16] = *(a07 + 16);
+		b[ 17] = *(a07 + 17);
+		b[ 18] = *(a07 + 18);
+		b[ 19] = *(a07 + 19);
+		b[ 20] = *(a07 + 20);
+		b[ 21] = *(a07 + 21);
+		b[ 22] = *(a07 + 22);
+		b[ 23] = *(a07 + 23);
+		b[ 24] = *(a07 + 24);
+		b[ 25] = *(a07 + 25);
+		b[ 26] = *(a07 + 26);
+		b[ 27] = *(a07 + 27);
+		b[ 28] = *(a07 + 28);
+		b[ 29] = *(a07 + 29);
+		b[ 30] = *(a07 + 30);
+		b[ 31] = *(a07 + 31);
+		b += 32;
+	    }
+
+		if (i >= 8) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+#ifdef UNIT
+	      b[ 14] = ONE;
+	      b[ 15] = ZERO;
+#else
+	      b[ 14] = *(a08 +  14);
+	      b[ 15] = *(a08 +  15);
+#endif
+		b[ 16] = *(a08 + 16);
+		b[ 17] = *(a08 + 17);
+		b[ 18] = *(a08 + 18);
+		b[ 19] = *(a08 + 19);
+		b[ 20] = *(a08 + 20);
+		b[ 21] = *(a08 + 21);
+		b[ 22] = *(a08 + 22);
+		b[ 23] = *(a08 + 23);
+		b[ 24] = *(a08 + 24);
+		b[ 25] = *(a08 + 25);
+		b[ 26] = *(a08 + 26);
+		b[ 27] = *(a08 + 27);
+		b[ 28] = *(a08 + 28);
+		b[ 29] = *(a08 + 29);
+		b[ 30] = *(a08 + 30);
+		b[ 31] = *(a08 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 9) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+#ifdef UNIT
+	      b[ 16] = ONE;
+	      b[ 17] = ZERO;
+#else
+	      b[ 16] = *(a09 + 16);
+	      b[ 17] = *(a09 + 17);
+#endif
+	    b[ 18] = *(a09 + 18);
+		b[ 19] = *(a09 + 19);
+		b[ 20] = *(a09 + 20);
+		b[ 21] = *(a09 + 21);
+		b[ 22] = *(a09 + 22);
+		b[ 23] = *(a09 + 23);
+		b[ 24] = *(a09 + 24);
+		b[ 25] = *(a09 + 25);
+		b[ 26] = *(a09 + 26);
+		b[ 27] = *(a09 + 27);
+		b[ 28] = *(a09 + 28);
+		b[ 29] = *(a09 + 29);
+		b[ 30] = *(a09 + 30);
+		b[ 31] = *(a09 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 10) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a10 + 18);
+	      b[ 19] = *(a10 + 19);
+#endif
+	    b[ 20] = *(a10 + 20);
+		b[ 21] = *(a10 + 21);
+		b[ 22] = *(a10 + 22);
+		b[ 23] = *(a10 + 23);
+		b[ 24] = *(a10 + 24);
+		b[ 25] = *(a10 + 25);
+		b[ 26] = *(a10 + 26);
+		b[ 27] = *(a10 + 27);
+		b[ 28] = *(a10 + 28);
+		b[ 29] = *(a10 + 29);
+		b[ 30] = *(a10 + 30);
+		b[ 31] = *(a10 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 11) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a11 + 20);
+	      b[ 21] = *(a11 + 21);
+#endif
+	    b[ 22] = *(a11 + 22);
+		b[ 23] = *(a11 + 23);
+		b[ 24] = *(a11 + 24);
+		b[ 25] = *(a11 + 25);
+		b[ 26] = *(a11 + 26);
+		b[ 27] = *(a11 + 27);
+		b[ 28] = *(a11 + 28);
+		b[ 29] = *(a11 + 29);
+		b[ 30] = *(a11 + 30);
+		b[ 31] = *(a11 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 12) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+#ifdef UNIT
+	      b[ 22] = ONE;
+	      b[ 23] = ZERO;
+#else
+	      b[ 22] = *(a12 + 22);
+	      b[ 23] = *(a12 + 23);
+#endif
+	    b[ 24] = *(a12 + 24);
+		b[ 25] = *(a12 + 25);
+		b[ 26] = *(a12 + 26);
+		b[ 27] = *(a12 + 27);
+		b[ 28] = *(a12 + 28);
+		b[ 29] = *(a12 + 29);
+		b[ 30] = *(a12 + 30);
+		b[ 31] = *(a12 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 13) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+#ifdef UNIT
+	      b[ 24] = ONE;
+	      b[ 25] = ZERO;
+#else
+	      b[ 24] = *(a13 + 24);
+	      b[ 25] = *(a13 + 25);
+#endif
+	    b[ 26] = *(a13 + 26);
+		b[ 27] = *(a13 + 27);
+		b[ 28] = *(a13 + 28);
+		b[ 29] = *(a12 + 29);
+		b[ 30] = *(a13 + 30);
+		b[ 31] = *(a13 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 14) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+#ifdef UNIT
+	      b[ 26] = ONE;
+	      b[ 27] = ZERO;
+#else
+	      b[ 26] = *(a14 + 26);
+	      b[ 27] = *(a14 + 27);
+#endif
+	    b[ 28] = *(a14 + 28);
+		b[ 29] = *(a14 + 29);
+		b[ 30] = *(a14 + 30);
+		b[ 31] = *(a14 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 15) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+#ifdef UNIT
+	      b[ 28] = ONE;
+	      b[ 29] = ZERO;
+#else
+	      b[ 28] = *(a15 + 28);
+	      b[ 29] = *(a15 + 29);
+#endif
+	      b[ 30] = *(a15 + 30);
+		  b[ 31] = *(a15 + 31);
+	      b += 32;
+	    }
+	  }
+      }
+
+      posY += 16;
+      js --;
+    } while (js > 0);
+  } /* End of main loop */
+
+
+  if (n & 8){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX +  0) * lda;
+	a02 = a + posY * 2 + (posX +  1) * lda;
+	a03 = a + posY * 2 + (posX +  2) * lda;
+	a04 = a + posY * 2 + (posX +  3) * lda;
+	a05 = a + posY * 2 + (posX +  4) * lda;
+	a06 = a + posY * 2 + (posX +  5) * lda;
+	a07 = a + posY * 2 + (posX +  6) * lda;
+	a08 = a + posY * 2 + (posX +  7) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY +  0) * lda;
+	a02 = a + posX * 2 + (posY +  1) * lda;
+	a03 = a + posX * 2 + (posY +  2) * lda;
+	a04 = a + posX * 2 + (posY +  3) * lda;
+	a05 = a + posX * 2 + (posY +  4) * lda;
+	a06 = a + posX * 2 + (posY +  5) * lda;
+	a07 = a + posX * 2 + (posY +  6) * lda;
+	a08 = a + posX * 2 + (posY +  7) * lda;
+      }
+
+      i = (m >> 3);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    a01 += 16;
+	    a02 += 16;
+	    a03 += 16;
+	    a04 += 16;
+	    a05 += 16;
+	    a06 += 16;
+	    a07 += 16;
+	    a08 += 16;
+	    b += 128;
+	  } else
+	    if (X < posY) {
+	      for (ii = 0; ii < 8; ii++){
+
+		b[  0] = *(a01 +  0);
+		b[  1] = *(a01 +  1);
+		b[  2] = *(a01 +  2);
+		b[  3] = *(a01 +  3);
+		b[  4] = *(a01 +  4);
+		b[  5] = *(a01 +  5);
+		b[  6] = *(a01 +  6);
+		b[  7] = *(a01 +  7);
+
+		b[  8] = *(a01 +  8);
+		b[  9] = *(a01 +  9);
+		b[ 10] = *(a01 + 10);
+		b[ 11] = *(a01 + 11);
+		b[ 12] = *(a01 + 12);
+		b[ 13] = *(a01 + 13);
+		b[ 14] = *(a01 + 14);
+		b[ 15] = *(a01 + 15);
+
+		a01 += lda;
+		b += 16;
+	      }
+		  a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a02 +  2);
+	      b[ 19] = *(a02 +  3);
+#endif
+	      b[ 20] = *(a02 +  4);
+	      b[ 21] = *(a02 +  5);
+	      b[ 22] = *(a02 +  6);
+	      b[ 23] = *(a02 +  7);
+	      b[ 24] = *(a02 +  8);
+	      b[ 25] = *(a02 +  9);
+	      b[ 26] = *(a02 + 10);
+	      b[ 27] = *(a02 + 11);
+	      b[ 28] = *(a02 + 12);
+	      b[ 29] = *(a02 + 13);
+	      b[ 30] = *(a02 + 14);
+	      b[ 31] = *(a02 + 15);
+
+	      b[ 32] = ZERO;
+	      b[ 33] = ZERO;
+	      b[ 34] = ZERO;
+	      b[ 35] = ZERO;
+#ifdef UNIT
+	      b[ 36] = ONE;
+	      b[ 37] = ZERO;
+#else
+	      b[ 36] = *(a03 +  4);
+	      b[ 37] = *(a03 +  5);
+#endif
+	      b[ 38] = *(a03 +  6);
+	      b[ 39] = *(a03 +  7);
+	      b[ 40] = *(a03 +  8);
+	      b[ 41] = *(a03 +  9);
+	      b[ 42] = *(a03 + 10);
+	      b[ 43] = *(a03 + 11);
+	      b[ 44] = *(a03 + 12);
+	      b[ 45] = *(a03 + 13);
+	      b[ 46] = *(a03 + 14);
+	      b[ 47] = *(a03 + 15);
+
+	      b[ 48] = ZERO;
+	      b[ 49] = ZERO;
+	      b[ 50] = ZERO;
+	      b[ 51] = ZERO;
+	      b[ 52] = ZERO;
+	      b[ 53] = ZERO;
+#ifdef UNIT
+	      b[ 54] = ONE;
+	      b[ 55] = ZERO;
+#else
+	      b[ 54] = *(a04 +  6);
+	      b[ 55] = *(a04 +  7);
+#endif
+	      b[ 56] = *(a04 +  8);
+	      b[ 57] = *(a04 +  9);
+	      b[ 58] = *(a04 + 10);
+	      b[ 59] = *(a04 + 11);
+	      b[ 60] = *(a04 + 12);
+	      b[ 61] = *(a04 + 13);
+	      b[ 62] = *(a04 + 14);
+	      b[ 63] = *(a04 + 15);
+
+	      b[ 64] = ZERO;
+	      b[ 65] = ZERO;
+	      b[ 66] = ZERO;
+	      b[ 67] = ZERO;
+	      b[ 68] = ZERO;
+	      b[ 69] = ZERO;
+	      b[ 70] = ZERO;
+	      b[ 71] = ZERO;
+#ifdef UNIT
+	      b[ 72] = ONE;
+	      b[ 73] = ZERO;
+#else
+	      b[ 72] = *(a05 +  8);
+	      b[ 73] = *(a05 +  9);
+#endif
+	      b[ 74] = *(a05 + 10);
+	      b[ 75] = *(a05 + 11);
+	      b[ 76] = *(a05 + 12);
+	      b[ 77] = *(a05 + 13);
+	      b[ 78] = *(a05 + 14);
+	      b[ 79] = *(a05 + 15);
+
+	      b[ 80] = ZERO;
+	      b[ 81] = ZERO;
+	      b[ 82] = ZERO;
+	      b[ 83] = ZERO;
+	      b[ 84] = ZERO;
+	      b[ 85] = ZERO;
+	      b[ 86] = ZERO;
+	      b[ 87] = ZERO;
+	      b[ 88] = ZERO;
+	      b[ 89] = ZERO;
+#ifdef UNIT
+	      b[ 90] = ONE;
+	      b[ 91] = ZERO;
+#else
+	      b[ 90] = *(a06 + 10);
+	      b[ 91] = *(a06 + 11);
+#endif
+	      b[ 92] = *(a06 + 12);
+	      b[ 93] = *(a06 + 13);
+	      b[ 94] = *(a06 + 14);
+	      b[ 95] = *(a06 + 15);
+
+	      b[ 96] = ZERO;
+	      b[ 97] = ZERO;
+	      b[ 98] = ZERO;
+	      b[ 99] = ZERO;
+	      b[100] = ZERO;
+	      b[101] = ZERO;
+	      b[102] = ZERO;
+	      b[103] = ZERO;
+	      b[104] = ZERO;
+	      b[105] = ZERO;
+	      b[106] = ZERO;
+	      b[107] = ZERO;
+#ifdef UNIT
+	      b[108] = ONE;
+	      b[109] = ZERO;
+#else
+	      b[108] = *(a07 + 12);
+	      b[109] = *(a07 + 13);
+#endif
+	      b[110] = *(a07 + 14);
+	      b[111] = *(a07 + 15);
+
+	      b[112] = ZERO;
+	      b[113] = ZERO;
+	      b[114] = ZERO;
+	      b[115] = ZERO;
+	      b[116] = ZERO;
+	      b[117] = ZERO;
+	      b[118] = ZERO;
+	      b[119] = ZERO;
+	      b[120] = ZERO;
+	      b[121] = ZERO;
+	      b[122] = ZERO;
+	      b[123] = ZERO;
+	      b[124] = ZERO;
+	      b[125] = ZERO;
+#ifdef UNIT
+	      b[126] = ONE;
+	      b[127] = ZERO;
+#else
+	      b[126] = *(a08 + 14);
+	      b[127] = *(a08 + 15);
+#endif
+
+	      a01 += 16;
+	      a02 += 16;
+	      a03 += 16;
+	      a04 += 16;
+	      a05 += 16;
+	      a06 += 16;
+	      a07 += 16;
+	      a08 += 16;
+	      b += 128;
+	    }
+
+	  X += 8;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i) {
+
+	if (X > posY) {
+	  /* a01 += 2 * i;
+	  a02 += 2 * i;
+	  a03 += 2 * i;
+	  a04 += 2 * i;
+	  a05 += 2 * i;
+	  a06 += 2 * i;
+	  a07 += 2 * i;
+	  a08 += 2 * i; */
+	  b += 16 * i;
+	} else
+	  if (X < posY) {
+	      for (ii = 0; ii < i; ii++){
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+	      a01 += lda;
+	      a02 += lda;
+	      a03 += lda;
+	      a04 += lda;
+	      a05 += lda;
+	      a06 += lda;
+	      a07 += lda;
+	      a08 += lda;
+	      b += 16;
+	    }
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = *(a01 +  2);
+	    b[  3] = *(a01 +  3);
+	    b[  4] = *(a01 +  4);
+	    b[  5] = *(a01 +  5);
+	    b[  6] = *(a01 +  6);
+	    b[  7] = *(a01 +  7);
+
+	    b[  8] = *(a01 +  8);
+	    b[  9] = *(a01 +  9);
+	    b[ 10] = *(a01 + 10);
+	    b[ 11] = *(a01 + 11);
+	    b[ 12] = *(a01 + 12);
+	    b[ 13] = *(a01 + 13);
+	    b[ 14] = *(a01 + 14);
+	    b[ 15] = *(a01 + 15);
+	    b += 16;
+
+	    if (i >= 2) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = *(a02 +  4);
+	      b[ 5] = *(a02 +  5);
+	      b[ 6] = *(a02 +  6);
+	      b[ 7] = *(a02 +  7);
+
+	      b[ 8] = *(a02 +  8);
+	      b[ 9] = *(a02 +  9);
+	      b[10] = *(a02 + 10);
+	      b[11] = *(a02 + 11);
+	      b[12] = *(a02 + 12);
+	      b[13] = *(a02 + 13);
+	      b[14] = *(a02 + 14);
+	      b[15] = *(a02 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = *(a03 +  6);
+	      b[ 7] = *(a03 +  7);
+
+	      b[ 8] = *(a03 +  8);
+	      b[ 9] = *(a03 +  9);
+	      b[10] = *(a03 + 10);
+	      b[11] = *(a03 + 11);
+	      b[12] = *(a03 + 12);
+	      b[13] = *(a03 + 13);
+	      b[14] = *(a03 + 14);
+	      b[15] = *(a03 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 4) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+#ifdef UNIT
+	      b[ 6] = ONE;
+	      b[ 7] = ZERO;
+#else
+	      b[ 6] = *(a04 +  6);
+	      b[ 7] = *(a04 +  7);
+#endif
+
+	      b[ 8] = *(a04 +  8);
+	      b[ 9] = *(a04 +  9);
+	      b[10] = *(a04 + 10);
+	      b[11] = *(a04 + 11);
+	      b[12] = *(a04 + 12);
+	      b[13] = *(a04 + 13);
+	      b[14] = *(a04 + 14);
+	      b[15] = *(a04 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 5) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+
+#ifdef UNIT
+	      b[ 8] = ONE;
+	      b[ 9] = ZERO;
+#else
+	      b[ 8] = *(a05 +  8);
+	      b[ 9] = *(a05 +  9);
+#endif
+	      b[10] = *(a05 + 10);
+	      b[11] = *(a05 + 11);
+	      b[12] = *(a05 + 12);
+	      b[13] = *(a05 + 13);
+	      b[14] = *(a05 + 14);
+	      b[15] = *(a05 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 6) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+#ifdef UNIT
+	      b[10] = ONE;
+	      b[11] = ZERO;
+#else
+	      b[10] = *(a06 + 10);
+	      b[11] = *(a06 + 11);
+#endif
+	      b[12] = *(a06 + 12);
+	      b[13] = *(a06 + 13);
+	      b[14] = *(a06 + 14);
+	      b[15] = *(a06 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 7) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+#ifdef UNIT
+	      b[12] = ONE;
+	      b[13] = ZERO;
+#else
+	      b[12] = *(a07 + 12);
+	      b[13] = *(a07 + 13);
+#endif
+	      b[14] = *(a07 + 14);
+	      b[15] = *(a07 + 15);
+	      b += 16;
+	    }
+	  }
+      }
+
+      posY += 8;
+  }
+
+
+  if (n & 4){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+      a02 = a + posY * 2 + (posX +  1) * lda;
+      a03 = a + posY * 2 + (posX +  2) * lda;
+      a04 = a + posY * 2 + (posX +  3) * lda;
+    } else {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+      a02 = a + posX * 2 + (posY +  1) * lda;
+      a03 = a + posX * 2 + (posY +  2) * lda;
+      a04 = a + posX * 2 + (posY +  3) * lda;
+    }
+
+    i = (m >> 2);
+    if (i > 0) {
+      do {
+	if (X > posY) {
+	  a01 += 8;
+	  a02 += 8;
+	  a03 += 8;
+	  a04 += 8;
+	  b += 32;
+	} else
+	  if (X < posY) {
+	    for (ii = 0; ii < 4; ii++){
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      a01 += lda;
+	      b += 8;
+	      }
+
+	    a02 += 4 * lda;
+	    a03 += 4 * lda;
+	    a04 += 4 * lda;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+#ifdef UNIT
+	      b[ 10] = ONE;
+	      b[ 11] = ZERO;
+#else
+	      b[ 10] = *(a02 +  2);
+	      b[ 11] = *(a02 +  3);
+#endif
+	      b[ 12] = *(a02 +  4);
+	      b[ 13] = *(a02 +  5);
+	      b[ 14] = *(a02 +  6);
+	      b[ 15] = *(a02 +  7);
+
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a03 +  4);
+	      b[ 21] = *(a03 +  5);
+#endif
+	      b[ 22] = *(a03 +  6);
+	      b[ 23] = *(a03 +  7);
+
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+#ifdef UNIT
+	      b[ 30] = ONE;
+	      b[ 31] = ZERO;
+#else
+	      b[ 30] = *(a04 +  6);
+	      b[ 31] = *(a04 +  7);
+#endif
+
+	      a01 += 8;
+	      a02 += 8;
+	      a03 += 8;
+	      a04 += 8;
+	      b += 32;
+	    }
+
+	X += 4;
+	i --;
+      } while (i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0) {
+      if (X > posY) {
+	/* a01 += 2 * i;
+	a02 += 2 * i;
+	a03 += 2 * i;
+	a04 += 2 * i; */
+	b += 8 * i;
+      } else
+	if (X < posY) {
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a01 +  2);
+	    b[  3] = *(a01 +  3);
+	    b[  4] = *(a01 +  4);
+	    b[  5] = *(a01 +  5);
+	    b[  6] = *(a01 +  6);
+	    b[  7] = *(a01 +  7);
+
+	    a01 += lda;
+	    a02 += lda;
+	    a03 += lda;
+	    a04 += lda;
+	    b += 8;
+	  }
+	} else {
+#ifdef UNIT
+	  b[  0] = ONE;
+	  b[  1] = ZERO;
+#else
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+#endif
+	  b[  2] = *(a01 +  2);
+	  b[  3] = *(a01 +  3);
+	  b[  4] = *(a01 +  4);
+	  b[  5] = *(a01 +  5);
+	  b[  6] = *(a01 +  6);
+	  b[  7] = *(a01 +  7);
+	  b += 8;
+
+	  if (i >= 2) {
+	    b[ 0] = ZERO;
+	    b[ 1] = ZERO;
+#ifdef UNIT
+	    b[ 2] = ONE;
+	    b[ 3] = ZERO;
+#else
+	    b[ 2] = *(a02 +  2);
+	    b[ 3] = *(a02 +  3);
+#endif
+	    b[ 4] = *(a02 +  4);
+	    b[ 5] = *(a02 +  5);
+	    b[ 6] = *(a02 +  6);
+	    b[ 7] = *(a02 +  7);
+	    b += 8;
+	  }
+
+	  if (i >= 3) {
+	    b[ 0] = ZERO;
+	    b[ 1] = ZERO;
+	    b[ 2] = ZERO;
+	    b[ 3] = ZERO;
+#ifdef UNIT
+	    b[ 4] = ONE;
+	    b[ 5] = ZERO;
+#else
+	    b[ 4] = *(a03 +  4);
+	    b[ 5] = *(a03 +  5);
+#endif
+	    b[ 6] = *(a03 +  6);
+	    b[ 7] = *(a03 +  7);
+	    b += 8;
+	  }
+	}
+    }
+    posY += 4;
+  }
+
+  if (n & 2){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+      a02 = a + posY * 2 + (posX +  1) * lda;
+    } else {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+      a02 = a + posX * 2 + (posY +  1) * lda;
+    }
+
+    i = (m >> 1);
+    if (i > 0) {
+      do {
+	if (X > posY) {
+	  a01 += 4;
+	  a02 += 4;
+	  b += 8;
+	} else
+	  if (X < posY) {
+	    b[0] = *(a01 +  0);
+	    b[1] = *(a01 +  1);
+	    b[2] = *(a01 +  2);
+	    b[3] = *(a01 +  3);
+	    b[4] = *(a02 +  0);
+	    b[5] = *(a02 +  1);
+	    b[6] = *(a02 +  2);
+	    b[7] = *(a02 +  3);
+	    a01 += 2 * lda;
+	    a02 += 2 * lda;
+	    b += 8;
+	  } else {
+#ifdef UNIT
+	    b[0] = ONE;
+	    b[1] = ZERO;
+#else
+	    b[0] = *(a01 +  0);
+	    b[1] = *(a01 +  1);
+#endif
+	    b[2] = *(a01 +  2);
+	    b[3] = *(a01 +  3);
+
+	    b[4] = ZERO;
+	    b[5] = ZERO;
+#ifdef UNIT
+	    b[6] = ONE;
+	    b[7] = ZERO;
+#else
+	    b[6] = *(a02 +  2);
+	    b[7] = *(a02 +  3);
+#endif
+	    a01 += 4;
+	    a02 += 4;
+	    b += 8;
+	  }
+
+	X += 2;
+	i --;
+      } while (i > 0);
+    }
+
+    i = (m & 1);
+    if (i > 0) {
+      if (X > posY) {
+	/* a01 += 2;
+	a02 += 2; */
+	b += 4;
+      } else
+	if (X < posY) {
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+	  b[  2] = *(a01 +  2);
+	  b[  3] = *(a01 +  3);
+
+	  /* a01 += lda;
+	  a02 += lda; */
+	  b += 4;
+	} else {
+#ifdef UNIT
+	  b[  0] = ONE;
+	  b[  1] = ZERO;
+#else
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+#endif
+	  b[  2] = *(a01 +  2);
+	  b[  3] = *(a01 +  3);
+	  b += 4;
+	}
+    }
+    posY += 2;
+  }
+
+  if (n & 1){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+    } else {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+    }
+
+    i = m;
+    if (i > 0) {
+      do {
+
+	if (X > posY) {
+	  a01 += 2;
+	  b += 2;
+	} else
+	  if (X < posY) {
+	    b[0] = *(a01 + 0);
+	    b[1] = *(a01 + 1);
+	    a01 += lda;
+	    b += 2;
+	  } else {
+#ifdef UNIT
+	    b[0] = ONE;
+	    b[1] = ZERO;
+#else
+	    b[0] = *(a01 + 0);
+	    b[1] = *(a01 + 1);
+#endif
+	    a01 += 2;
+	    b += 2;
+	  }
+
+	X += 1;
+	i --;
+      } while (i > 0);
+    }
+    // posY += 1;
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmm_uncopy_16.c b/kernel/generic/ztrmm_uncopy_16.c
new file mode 100644
index 000000000..40b85db38
--- /dev/null
+++ b/kernel/generic/ztrmm_uncopy_16.c
@@ -0,0 +1,2316 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X, ii;
+
+  FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08;
+  FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  lda += lda;
+
+  js = (n >> 4);
+
+  if (js > 0){
+    do {
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+	a09 = a + posX * 2 + (posY + 8) * lda;
+	a10 = a + posX * 2 + (posY + 9) * lda;
+	a11 = a + posX * 2 + (posY + 10) * lda;
+	a12 = a + posX * 2 + (posY + 11) * lda;
+	a13 = a + posX * 2 + (posY + 12) * lda;
+	a14 = a + posX * 2 + (posY + 13) * lda;
+	a15 = a + posX * 2 + (posY + 14) * lda;
+	a16 = a + posX * 2 + (posY + 15) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+	a09 = a + posY * 2 + (posX + 8) * lda;
+	a10 = a + posY * 2 + (posX + 9) * lda;
+	a11 = a + posY * 2 + (posX + 10) * lda;
+	a12 = a + posY * 2 + (posX + 11) * lda;
+	a13 = a + posY * 2 + (posX + 12) * lda;
+	a14 = a + posY * 2 + (posX + 13) * lda;
+	a15 = a + posY * 2 + (posX + 14) * lda;
+	a16 = a + posY * 2 + (posX + 15) * lda;
+      }
+
+      i = (m >> 4);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	    for (ii = 0; ii < 16; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+
+		  b[ 16] = *(a09 +  0);
+	      b[ 17] = *(a09 +  1);
+	      b[ 18] = *(a10 +  0);
+	      b[ 19] = *(a10 +  1);
+	      b[ 20] = *(a11 +  0);
+	      b[ 21] = *(a11 +  1);
+	      b[ 22] = *(a12 +  0);
+	      b[ 23] = *(a12 +  1);
+
+	      b[ 24] = *(a13 +  0);
+	      b[ 25] = *(a13 +  1);
+	      b[ 26] = *(a14 +  0);
+	      b[ 27] = *(a14 +  1);
+	      b[ 28] = *(a15 +  0);
+	      b[ 29] = *(a15 +  1);
+	      b[ 30] = *(a16 +  0);
+	      b[ 31] = *(a16 +  1);
+
+	      a01 += 2;
+	      a02 += 2;
+	      a03 += 2;
+	      a04 += 2;
+	      a05 += 2;
+	      a06 += 2;
+	      a07 += 2;
+	      a08 += 2;
+		  a09 += 2;
+	      a10 += 2;
+	      a11 += 2;
+	      a12 += 2;
+	      a13 += 2;
+	      a14 += 2;
+	      a15 += 2;
+	      a16 += 2;
+	      b += 32;
+	    }
+	  } else
+	    if (X > posY) {
+	      a01 += 16 * lda;
+	      a02 += 16 * lda;
+	      a03 += 16 * lda;
+	      a04 += 16 * lda;
+	      a05 += 16 * lda;
+	      a06 += 16 * lda;
+	      a07 += 16 * lda;
+	      a08 += 16 * lda;
+		  a09 += 16 * lda;
+	      a10 += 16 * lda;
+	      a11 += 16 * lda;
+	      a12 += 16 * lda;
+	      a13 += 16 * lda;
+	      a14 += 16 * lda;
+	      a15 += 16 * lda;
+	      a16 += 16 * lda;
+
+	      b += 512;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+		  b[ 16] = *(a09 +  0);
+	      b[ 17] = *(a09 +  1);
+	      b[ 18] = *(a10 +  0);
+	      b[ 19] = *(a10 +  1);
+	      b[ 20] = *(a11 +  0);
+	      b[ 21] = *(a11 +  1);
+	      b[ 22] = *(a12 +  0);
+	      b[ 23] = *(a12 +  1);
+	      b[ 24] = *(a13 +  0);
+	      b[ 25] = *(a13 +  1);
+	      b[ 26] = *(a14 +  0);
+	      b[ 27] = *(a14 +  1);
+	      b[ 28] = *(a15 +  0);
+	      b[ 29] = *(a15 +  1);
+		  b[ 30] = *(a16 +  0);
+	      b[ 31] = *(a16 +  1);
+
+	      b[ 32] = ZERO;
+	      b[ 33] = ZERO;
+#ifdef UNIT
+	      b[ 34] = ONE;
+	      b[ 35] = ZERO;
+#else
+	      b[ 34] = *(a02 +  2);
+	      b[ 35] = *(a02 +  3);
+#endif
+	      b[ 36] = *(a03 +  2);
+	      b[ 37] = *(a03 +  3);
+	      b[ 38] = *(a04 +  2);
+	      b[ 39] = *(a04 +  3);
+	      b[ 40] = *(a05 +  2);
+	      b[ 41] = *(a05 +  3);
+	      b[ 42] = *(a06 +  2);
+	      b[ 43] = *(a06 +  3);
+	      b[ 44] = *(a07 +  2);
+	      b[ 45] = *(a07 +  3);
+	      b[ 46] = *(a08 +  2);
+	      b[ 47] = *(a08 +  3);
+	      b[ 48] = *(a09 +  2);
+	      b[ 49] = *(a09 +  3);
+	      b[ 50] = *(a10 +  2);
+	      b[ 51] = *(a10 +  3);
+	      b[ 52] = *(a11 +  2);
+	      b[ 53] = *(a11 +  3);
+	      b[ 54] = *(a12 +  2);
+	      b[ 55] = *(a12 +  3);
+	      b[ 56] = *(a13 +  2);
+	      b[ 57] = *(a13 +  3);
+	      b[ 58] = *(a14 +  2);
+	      b[ 59] = *(a14 +  3);
+	      b[ 60] = *(a15 +  2);
+	      b[ 61] = *(a15 +  3);
+	      b[ 62] = *(a16 +  2);
+	      b[ 63] = *(a16 +  3);
+
+	      b[ 64] = ZERO;
+	      b[ 65] = ZERO;
+	      b[ 66] = ZERO;
+	      b[ 67] = ZERO;
+#ifdef UNIT
+	      b[ 68] = ONE;
+	      b[ 69] = ZERO;
+#else
+	      b[ 68] = *(a03 +  4);
+	      b[ 69] = *(a03 +  5);
+#endif
+	      b[ 70] = *(a04 +  4);
+	      b[ 71] = *(a04 +  5);
+	      b[ 72] = *(a05 +  4);
+	      b[ 73] = *(a05 +  5);
+	      b[ 74] = *(a06 +  4);
+	      b[ 75] = *(a06 +  5);
+	      b[ 76] = *(a07 +  4);
+	      b[ 77] = *(a07 +  5);
+	      b[ 78] = *(a08 +  4);
+	      b[ 79] = *(a08 +  5);
+	      b[ 80] = *(a09 +  4);
+	      b[ 81] = *(a09 +  5);
+	      b[ 82] = *(a10 +  4);
+		  b[ 83] = *(a10 +  5);
+	      b[ 84] = *(a11 +  4);
+	      b[ 85] = *(a11 +  5);
+	      b[ 86] = *(a12 +  4);
+	      b[ 87] = *(a12 +  5);
+	      b[ 88] = *(a13 +  4);
+	      b[ 89] = *(a13 +  5);
+	      b[ 90] = *(a14 +  4);
+	      b[ 91] = *(a14 +  5);
+	      b[ 92] = *(a15 +  4);
+	      b[ 93] = *(a15 +  5);
+	      b[ 94] = *(a16 +  4);
+	      b[ 95] = *(a16 +  5);
+
+	      b[ 96] = ZERO;
+	      b[ 97] = ZERO;
+	      b[ 98] = ZERO;
+	      b[ 99] = ZERO;
+	      b[100] = ZERO;
+	      b[101] = ZERO;
+#ifdef UNIT
+	      b[102] = ONE;
+	      b[103] = ZERO;
+#else
+	      b[102] = *(a04 +  6);
+	      b[103] = *(a04 +  7);
+#endif
+	      b[104] = *(a05 +  6);
+	      b[105] = *(a05 +  7);
+	      b[106] = *(a06 +  6);
+	      b[107] = *(a06 +  7);
+	      b[108] = *(a07 +  6);
+	      b[109] = *(a07 +  7);
+	      b[110] = *(a08 +  6);
+	      b[111] = *(a08 +  7);
+	      b[112] = *(a09 +  6);
+	      b[113] = *(a09 +  7);
+	      b[114] = *(a10 +  6);
+	      b[115] = *(a10 +  7);
+		  b[116] = *(a11 +  6);
+	      b[117] = *(a11 +  7);
+	      b[118] = *(a12 +  6);
+	      b[119] = *(a12 +  7);
+	      b[120] = *(a13 +  6);
+	      b[121] = *(a13 +  7);
+	      b[122] = *(a14 +  6);
+	      b[123] = *(a14 +  7);
+	      b[124] = *(a15 +  6);
+	      b[125] = *(a15 +  7);
+	      b[126] = *(a16 +  6);
+	      b[127] = *(a16 +  7);
+
+	      b[128] = ZERO;
+	      b[129] = ZERO;
+	      b[130] = ZERO;
+	      b[131] = ZERO;
+	      b[132] = ZERO;
+	      b[133] = ZERO;
+	      b[134] = ZERO;
+	      b[135] = ZERO;
+#ifdef UNIT
+	      b[136] = ONE;
+	      b[137] = ZERO;
+#else
+	      b[136] = *(a05 +  8);
+	      b[137] = *(a05 +  9);
+#endif
+	      b[138] = *(a06 +  8);
+	      b[139] = *(a06 +  9);
+	      b[140] = *(a07 +  8);
+	      b[141] = *(a07 +  9);
+	      b[142] = *(a08 +  8);
+	      b[143] = *(a08 +  9);
+	      b[144] = *(a09 +  8);
+	      b[145] = *(a09 +  9);
+	      b[146] = *(a10 +  8);
+	      b[147] = *(a10 +  9);
+	      b[148] = *(a11 +  8);
+		  b[149] = *(a11 +  9);
+	      b[150] = *(a12 +  8);
+	      b[151] = *(a12 +  9);
+	      b[152] = *(a13 +  8);
+	      b[153] = *(a13 +  9);
+	      b[154] = *(a14 +  8);
+	      b[155] = *(a14 +  9);
+	      b[156] = *(a15 +  8);
+	      b[157] = *(a15 +  9);
+	      b[158] = *(a16 +  8);
+	      b[159] = *(a16 +  9);
+
+	      b[160] = ZERO;
+	      b[161] = ZERO;
+	      b[162] = ZERO;
+	      b[163] = ZERO;
+	      b[164] = ZERO;
+	      b[165] = ZERO;
+	      b[166] = ZERO;
+	      b[167] = ZERO;
+	      b[168] = ZERO;
+	      b[169] = ZERO;
+#ifdef UNIT
+	      b[170] = ONE;
+	      b[171] = ZERO;
+#else
+	      b[170] = *(a06 + 10);
+	      b[171] = *(a06 + 11);
+#endif
+	      b[172] = *(a07 + 10);
+	      b[173] = *(a07 + 11);
+	      b[174] = *(a08 + 10);
+	      b[175] = *(a08 + 11);
+	      b[176] = *(a09 + 10);
+	      b[177] = *(a09 + 11);
+	      b[178] = *(a10 + 10);
+	      b[179] = *(a10 + 11);
+	      b[180] = *(a11 + 10);
+	      b[181] = *(a11 + 11);
+		  b[182] = *(a12 + 10);
+	      b[183] = *(a12 + 11);
+	      b[184] = *(a13 + 10);
+	      b[185] = *(a13 + 11);
+	      b[186] = *(a14 + 10);
+	      b[187] = *(a14 + 11);
+	      b[188] = *(a15 + 10);
+	      b[189] = *(a15 + 11);
+	      b[190] = *(a16 + 10);
+	      b[191] = *(a16 + 11);
+
+	      b[192] = ZERO;
+	      b[193] = ZERO;
+	      b[194] = ZERO;
+	      b[195] = ZERO;
+	      b[196] = ZERO;
+	      b[197] = ZERO;
+	      b[198] = ZERO;
+	      b[199] = ZERO;
+	      b[200] = ZERO;
+	      b[201] = ZERO;
+	      b[202] = ZERO;
+	      b[203] = ZERO;
+#ifdef UNIT
+	      b[204] = ONE;
+	      b[205] = ZERO;
+#else
+	      b[204] = *(a07 + 12);
+	      b[205] = *(a07 + 13);
+#endif
+	      b[206] = *(a08 + 12);
+	      b[207] = *(a08 + 13);
+	      b[208] = *(a09 + 12);
+	      b[209] = *(a09 + 13);
+	      b[210] = *(a10 + 12);
+	      b[211] = *(a10 + 13);
+	      b[212] = *(a11 + 12);
+	      b[213] = *(a11 + 13);
+	      b[214] = *(a12 + 12);
+		  b[215] = *(a12 + 13);
+	      b[216] = *(a13 + 12);
+	      b[217] = *(a13 + 13);
+	      b[218] = *(a14 + 12);
+	      b[219] = *(a14 + 13);
+	      b[220] = *(a15 + 12);
+	      b[221] = *(a15 + 13);
+	      b[222] = *(a16 + 12);
+	      b[223] = *(a16 + 13);
+
+	      b[224] = ZERO;
+	      b[225] = ZERO;
+	      b[226] = ZERO;
+	      b[227] = ZERO;
+	      b[228] = ZERO;
+	      b[229] = ZERO;
+	      b[230] = ZERO;
+	      b[231] = ZERO;
+	      b[232] = ZERO;
+	      b[233] = ZERO;
+	      b[234] = ZERO;
+	      b[235] = ZERO;
+	      b[236] = ZERO;
+	      b[237] = ZERO;
+#ifdef UNIT
+	      b[238] = ONE;
+	      b[239] = ZERO;
+#else
+	      b[238] = *(a08 + 14);
+	      b[239] = *(a08 + 15);
+#endif
+		  b[240] = *(a09 + 14);
+	      b[241] = *(a09 + 15);
+	      b[242] = *(a10 + 14);
+	      b[243] = *(a10 + 15);
+	      b[244] = *(a11 + 14);
+	      b[245] = *(a11 + 15);
+	      b[246] = *(a12 + 14);
+	      b[247] = *(a12 + 15);
+		  b[248] = *(a13 + 14);
+	      b[249] = *(a13 + 15);
+	      b[250] = *(a14 + 14);
+	      b[251] = *(a14 + 15);
+	      b[252] = *(a15 + 14);
+	      b[253] = *(a15 + 15);
+	      b[254] = *(a16 + 14);
+	      b[255] = *(a16 + 15);
+
+	      b[256] = ZERO;
+	      b[257] = ZERO;
+	      b[258] = ZERO;
+	      b[259] = ZERO;
+	      b[260] = ZERO;
+	      b[261] = ZERO;
+	      b[262] = ZERO;
+	      b[263] = ZERO;
+	      b[264] = ZERO;
+	      b[265] = ZERO;
+	      b[266] = ZERO;
+	      b[267] = ZERO;
+	      b[268] = ZERO;
+	      b[269] = ZERO;
+	      b[270] = ZERO;
+	      b[271] = ZERO;
+#ifdef UNIT
+	      b[272] = ONE;
+		  b[273] = ZERO;
+#else
+	      b[272] = *(a09 + 16);
+		  b[273] = *(a09 + 17);
+#endif
+		  b[274] = *(a10 + 16);
+	      b[275] = *(a10 + 17);
+	      b[276] = *(a11 + 16);
+	      b[277] = *(a11 + 17);
+	      b[278] = *(a12 + 16);
+	      b[279] = *(a12 + 17);
+	      b[280] = *(a13 + 16);
+		  b[281] = *(a13 + 17);
+	      b[282] = *(a14 + 16);
+	      b[283] = *(a14 + 17);
+	      b[284] = *(a15 + 16);
+	      b[285] = *(a15 + 17);
+	      b[286] = *(a16 + 16);
+	      b[287] = *(a16 + 17);
+
+		  b[288] = ZERO;
+		  b[289] = ZERO;
+	      b[290] = ZERO;
+	      b[291] = ZERO;
+	      b[292] = ZERO;
+	      b[293] = ZERO;
+	      b[294] = ZERO;
+	      b[295] = ZERO;
+	      b[296] = ZERO;
+	      b[297] = ZERO;
+	      b[298] = ZERO;
+	      b[299] = ZERO;
+	      b[300] = ZERO;
+	      b[301] = ZERO;
+	      b[302] = ZERO;
+	      b[303] = ZERO;
+	      b[304] = ZERO;
+	      b[305] = ZERO;
+#ifdef UNIT
+	      b[306] = ONE;
+		  b[307] = ZERO;
+#else
+	      b[306] = *(a10 + 18);
+		  b[307] = *(a10 + 19);
+#endif
+		  b[308] = *(a11 + 18);
+	      b[309] = *(a11 + 19);
+	      b[310] = *(a12 + 18);
+	      b[311] = *(a12 + 19);
+	      b[312] = *(a13 + 18);
+	      b[313] = *(a13 + 19);
+		  b[314] = *(a14 + 18);
+	      b[315] = *(a14 + 19);
+	      b[316] = *(a15 + 18);
+	      b[317] = *(a15 + 19);
+	      b[318] = *(a16 + 18);
+	      b[319] = *(a16 + 19);
+
+		  b[320] = ZERO;
+		  b[321] = ZERO;
+	      b[322] = ZERO;
+	      b[323] = ZERO;
+	      b[324] = ZERO;
+	      b[325] = ZERO;
+	      b[326] = ZERO;
+	      b[327] = ZERO;
+	      b[328] = ZERO;
+	      b[329] = ZERO;
+	      b[330] = ZERO;
+	      b[331] = ZERO;
+	      b[332] = ZERO;
+	      b[333] = ZERO;
+	      b[334] = ZERO;
+	      b[335] = ZERO;
+	      b[336] = ZERO;
+	      b[337] = ZERO;
+	      b[338] = ZERO;
+	      b[339] = ZERO;
+#ifdef UNIT
+	      b[340] = ONE;
+	      b[341] = ZERO;
+#else
+	      b[340] = *(a11 + 20);
+	      b[341] = *(a11 + 21);
+#endif
+		  b[342] = *(a12 + 20);
+	      b[343] = *(a12 + 21);
+	      b[344] = *(a13 + 20);
+	      b[345] = *(a13 + 21);
+	      b[346] = *(a14 + 20);
+		  b[347] = *(a14 + 21);
+	      b[348] = *(a15 + 20);
+	      b[349] = *(a15 + 21);
+	      b[350] = *(a16 + 20);
+	      b[351] = *(a16 + 21);
+
+		  b[352] = ZERO;
+		  b[353] = ZERO;
+	      b[354] = ZERO;
+	      b[355] = ZERO;
+	      b[356] = ZERO;
+	      b[357] = ZERO;
+	      b[358] = ZERO;
+	      b[359] = ZERO;
+	      b[360] = ZERO;
+	      b[361] = ZERO;
+	      b[362] = ZERO;
+	      b[363] = ZERO;
+	      b[364] = ZERO;
+	      b[365] = ZERO;
+	      b[366] = ZERO;
+	      b[367] = ZERO;
+	      b[368] = ZERO;
+	      b[369] = ZERO;
+	      b[370] = ZERO;
+	      b[371] = ZERO;
+	      b[372] = ZERO;
+	      b[373] = ZERO;
+#ifdef UNIT
+	      b[374] = ONE;
+	      b[375] = ZERO;
+#else
+	      b[374] = *(a12 + 22);
+	      b[375] = *(a12 + 23);
+#endif
+		  b[376] = *(a13 + 22);
+	      b[377] = *(a13 + 23);
+	      b[378] = *(a14 + 22);
+	      b[379] = *(a14 + 23);
+		  b[380] = *(a15 + 22);
+	      b[381] = *(a15 + 23);
+	      b[382] = *(a16 + 22);
+	      b[383] = *(a16 + 23);
+
+		  b[384] = ZERO;
+		  b[385] = ZERO;
+	      b[386] = ZERO;
+	      b[387] = ZERO;
+	      b[388] = ZERO;
+	      b[389] = ZERO;
+	      b[390] = ZERO;
+	      b[391] = ZERO;
+	      b[392] = ZERO;
+	      b[393] = ZERO;
+	      b[394] = ZERO;
+	      b[395] = ZERO;
+	      b[396] = ZERO;
+	      b[397] = ZERO;
+	      b[398] = ZERO;
+	      b[399] = ZERO;
+	      b[400] = ZERO;
+	      b[401] = ZERO;
+	      b[402] = ZERO;
+	      b[403] = ZERO;
+	      b[404] = ZERO;
+	      b[405] = ZERO;
+	      b[406] = ZERO;
+	      b[407] = ZERO;
+#ifdef UNIT
+	      b[408] = ONE;
+	      b[409] = ZERO;
+#else
+	      b[408] = *(a13 + 24);
+	      b[409] = *(a13 + 25);
+#endif
+		  b[410] = *(a14 + 24);
+	      b[411] = *(a14 + 25);
+	      b[412] = *(a15 + 24);
+		  b[413] = *(a15 + 25);
+	      b[414] = *(a16 + 24);
+	      b[415] = *(a16 + 25);
+
+		  b[416] = ZERO;
+		  b[417] = ZERO;
+	      b[418] = ZERO;
+	      b[419] = ZERO;
+	      b[420] = ZERO;
+	      b[421] = ZERO;
+	      b[422] = ZERO;
+	      b[423] = ZERO;
+	      b[424] = ZERO;
+	      b[425] = ZERO;
+	      b[426] = ZERO;
+	      b[427] = ZERO;
+	      b[428] = ZERO;
+	      b[429] = ZERO;
+	      b[430] = ZERO;
+	      b[431] = ZERO;
+	      b[432] = ZERO;
+	      b[433] = ZERO;
+	      b[434] = ZERO;
+	      b[435] = ZERO;
+	      b[436] = ZERO;
+	      b[437] = ZERO;
+	      b[438] = ZERO;
+	      b[439] = ZERO;
+	      b[440] = ZERO;
+	      b[441] = ZERO;
+#ifdef UNIT
+	      b[442] = ONE;
+	      b[443] = ZERO;
+#else
+	      b[442] = *(a14 + 26);
+	      b[443] = *(a14 + 27);
+#endif
+		  b[444] = *(a15 + 26);
+	      b[445] = *(a15 + 27);
+		  b[446] = *(a16 + 26);
+	      b[447] = *(a16 + 27);
+
+		  b[448] = ZERO;
+		  b[449] = ZERO;
+	      b[450] = ZERO;
+	      b[451] = ZERO;
+	      b[452] = ZERO;
+	      b[453] = ZERO;
+	      b[454] = ZERO;
+	      b[455] = ZERO;
+	      b[456] = ZERO;
+	      b[457] = ZERO;
+	      b[458] = ZERO;
+	      b[459] = ZERO;
+	      b[460] = ZERO;
+	      b[461] = ZERO;
+	      b[462] = ZERO;
+	      b[463] = ZERO;
+	      b[464] = ZERO;
+	      b[465] = ZERO;
+	      b[466] = ZERO;
+	      b[467] = ZERO;
+	      b[468] = ZERO;
+	      b[469] = ZERO;
+	      b[470] = ZERO;
+	      b[471] = ZERO;
+	      b[472] = ZERO;
+	      b[473] = ZERO;
+	      b[474] = ZERO;
+	      b[475] = ZERO;
+#ifdef UNIT
+	      b[476] = ONE;
+	      b[477] = ZERO;
+#else
+	      b[476] = *(a15 + 28);
+	      b[477] = *(a15 + 29);
+#endif
+		  b[478] = *(a16 + 28);
+		  b[479] = *(a16 + 29);
+
+		  b[480] = ZERO;
+		  b[481] = ZERO;
+	      b[482] = ZERO;
+	      b[483] = ZERO;
+	      b[484] = ZERO;
+	      b[485] = ZERO;
+	      b[486] = ZERO;
+	      b[487] = ZERO;
+	      b[488] = ZERO;
+	      b[489] = ZERO;
+	      b[490] = ZERO;
+	      b[491] = ZERO;
+	      b[492] = ZERO;
+	      b[493] = ZERO;
+	      b[494] = ZERO;
+	      b[495] = ZERO;
+	      b[496] = ZERO;
+	      b[497] = ZERO;
+	      b[498] = ZERO;
+	      b[499] = ZERO;
+	      b[500] = ZERO;
+	      b[501] = ZERO;
+	      b[502] = ZERO;
+	      b[503] = ZERO;
+	      b[504] = ZERO;
+	      b[505] = ZERO;
+	      b[506] = ZERO;
+	      b[507] = ZERO;
+	      b[508] = ZERO;
+	      b[509] = ZERO;
+#ifdef UNIT
+	      b[510] = ONE;
+	      b[511] = ZERO;
+#else
+	      b[510] = *(a16 + 30);
+	      b[511] = *(a16 + 31);
+#endif
+
+	      a01 += 16 * lda;
+	      a02 += 16 * lda;
+	      a03 += 16 * lda;
+	      a04 += 16 * lda;
+	      a05 += 16 * lda;
+	      a06 += 16 * lda;
+	      a07 += 16 * lda;
+	      a08 += 16 * lda;
+	      a09 += 16 * lda;
+	      a10 += 16 * lda;
+	      a11 += 16 * lda;
+	      a12 += 16 * lda;
+	      a13 += 16 * lda;
+	      a14 += 16 * lda;
+	      a15 += 16 * lda;
+	      a16 += 16 * lda;
+	      b += 512;
+	    }
+
+	  X += 16;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 15);
+      if (i) {
+
+	if (X < posY) {
+
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+		b[  2] = *(a02 +  0);
+		b[  3] = *(a02 +  1);
+		b[  4] = *(a03 +  0);
+		b[  5] = *(a03 +  1);
+		b[  6] = *(a04 +  0);
+		b[  7] = *(a04 +  1);
+		b[  8] = *(a05 +  0);
+		b[  9] = *(a05 +  1);
+		b[ 10] = *(a06 +  0);
+		b[ 11] = *(a06 +  1);
+		b[ 12] = *(a07 +  0);
+		b[ 13] = *(a07 +  1);
+		b[ 14] = *(a08 +  0);
+		b[ 15] = *(a08 +  1);
+
+		b[ 16] = *(a09 +  0);
+		b[ 17] = *(a09 +  1);
+		b[ 18] = *(a10 +  0);
+		b[ 19] = *(a10 +  1);
+		b[ 20] = *(a11 +  0);
+		b[ 21] = *(a11 +  1);
+		b[ 22] = *(a12 +  0);
+		b[ 23] = *(a12 +  1);
+		b[ 24] = *(a13 +  0);
+		b[ 25] = *(a13 +  1);
+		b[ 26] = *(a14 +  0);
+		b[ 27] = *(a14 +  1);
+		b[ 28] = *(a15 +  0);
+		b[ 29] = *(a15 +  1);
+		b[ 30] = *(a16 +  0);
+		b[ 31] = *(a16 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    a05 += 2;
+	    a06 += 2;
+	    a07 += 2;
+	    a08 += 2;
+		a09 += 2;
+	    a10 += 2;
+	    a11 += 2;
+	    a12 += 2;
+	    a13 += 2;
+	    a14 += 2;
+	    a15 += 2;
+	    a16 += 2;
+	    b += 32;
+	  }
+	} else
+	  if (X > posY) {
+	    /* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda;
+	      a09 += i * lda;
+	      a10 += i * lda;
+	      a11 += i * lda;
+	      a12 += i * lda;
+	      a13 += i * lda;
+	      a14 += i * lda;
+	      a15 += i * lda;
+	      a16 += i * lda; */
+	    b += 32 * i;
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+		  b[ 16] = *(a09 +  0);
+	      b[ 17] = *(a09 +  1);
+	      b[ 18] = *(a10 +  0);
+	      b[ 19] = *(a10 +  1);
+	      b[ 20] = *(a11 +  0);
+	      b[ 21] = *(a11 +  1);
+	      b[ 22] = *(a12 +  0);
+	      b[ 23] = *(a12 +  1);
+	      b[ 24] = *(a13 +  0);
+	      b[ 25] = *(a13 +  1);
+	      b[ 26] = *(a14 +  0);
+	      b[ 27] = *(a14 +  1);
+	      b[ 28] = *(a15 +  0);
+	      b[ 29] = *(a15 +  1);
+		  b[ 30] = *(a16 +  0);
+	      b[ 31] = *(a16 +  1);
+	      b += 32;
+
+	      if (i >= 2) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+		b[  4] = *(a03 +  2);
+		b[  5] = *(a03 +  3);
+		b[  6] = *(a04 +  2);
+		b[  7] = *(a04 +  3);
+		b[  8] = *(a05 +  2);
+		b[  9] = *(a05 +  3);
+		b[ 10] = *(a06 +  2);
+		b[ 11] = *(a06 +  3);
+		b[ 12] = *(a07 +  2);
+		b[ 13] = *(a07 +  3);
+		b[ 14] = *(a08 +  2);
+		b[ 15] = *(a08 +  3);
+		b[ 16] = *(a09 +  2);
+		b[ 17] = *(a09 +  3);
+		b[ 18] = *(a10 +  2);
+		b[ 19] = *(a10 +  3);
+		b[ 20] = *(a11 +  2);
+		b[ 21] = *(a11 +  3);
+		b[ 22] = *(a12 +  2);
+		b[ 23] = *(a12 +  3);
+		b[ 24] = *(a13 +  2);
+		b[ 25] = *(a13 +  3);
+		b[ 26] = *(a14 +  2);
+		b[ 27] = *(a14 +  3);
+		b[ 28] = *(a15 +  2);
+		b[ 29] = *(a15 +  3);
+		b[ 30] = *(a16 +  2);
+		b[ 31] = *(a16 +  3);
+		b += 32;
+	      }
+
+	      if (i >= 3) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[  6] = *(a04 +  4);
+		b[  7] = *(a04 +  5);
+		b[  8] = *(a05 +  4);
+		b[  9] = *(a05 +  5);
+		b[ 10] = *(a06 +  4);
+		b[ 11] = *(a06 +  5);
+		b[ 12] = *(a07 +  4);
+		b[ 13] = *(a07 +  5);
+		b[ 14] = *(a08 +  4);
+		b[ 15] = *(a08 +  5);
+		b[ 16] = *(a09 +  4);
+		b[ 17] = *(a09 +  5);
+		b[ 18] = *(a10 +  4);
+		b[ 19] = *(a10 +  5);
+		b[ 20] = *(a11 +  4);
+		b[ 21] = *(a11 +  5);
+		b[ 22] = *(a12 +  4);
+		b[ 23] = *(a12 +  5);
+		b[ 24] = *(a13 +  4);
+		b[ 25] = *(a13 +  5);
+		b[ 26] = *(a14 +  4);
+		b[ 27] = *(a14 +  5);
+		b[ 28] = *(a15 +  4);
+		b[ 29] = *(a15 +  5);
+		b[ 30] = *(a16 +  4);
+		b[ 31] = *(a16 +  5);
+		b += 32;
+	      }
+
+	      if (i >= 4) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[  8] = *(a05 +  6);
+		b[  9] = *(a05 +  7);
+		b[ 10] = *(a06 +  6);
+		b[ 11] = *(a06 +  7);
+		b[ 12] = *(a07 +  6);
+		b[ 13] = *(a07 +  7);
+		b[ 14] = *(a08 +  6);
+		b[ 15] = *(a08 +  7);
+		b[ 16] = *(a09 +  6);
+		b[ 17] = *(a09 +  7);
+		b[ 18] = *(a10 +  6);
+		b[ 19] = *(a10 +  7);
+		b[ 20] = *(a11 +  6);
+		b[ 21] = *(a11 +  7);
+		b[ 22] = *(a12 +  6);
+		b[ 23] = *(a12 +  7);
+		b[ 24] = *(a13 +  6);
+		b[ 25] = *(a13 +  7);
+		b[ 26] = *(a14 +  6);
+		b[ 27] = *(a14 +  7);
+		b[ 28] = *(a15 +  6);
+		b[ 29] = *(a15 +  7);
+		b[ 30] = *(a16 +  6);
+		b[ 31] = *(a16 +  7);
+		b += 32;
+	      }
+
+	      if (i >= 5) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[ 10] = *(a06 +  8);
+		b[ 11] = *(a06 +  9);
+		b[ 12] = *(a07 +  8);
+		b[ 13] = *(a07 +  9);
+		b[ 14] = *(a08 +  8);
+		b[ 15] = *(a08 +  9);
+		b[ 16] = *(a09 +  8);
+		b[ 17] = *(a09 +  9);
+		b[ 18] = *(a10 +  8);
+		b[ 19] = *(a10 +  9);
+		b[ 20] = *(a11 +  8);
+		b[ 21] = *(a11 +  9);
+		b[ 22] = *(a12 +  8);
+		b[ 23] = *(a12 +  9);
+		b[ 24] = *(a13 +  8);
+		b[ 25] = *(a13 +  9);
+		b[ 26] = *(a14 +  8);
+		b[ 27] = *(a14 +  9);
+		b[ 28] = *(a15 +  8);
+		b[ 29] = *(a15 +  9);
+		b[ 30] = *(a16 +  8);
+		b[ 31] = *(a16 +  9);
+		b += 32;
+	      }
+
+	      if (i >= 6) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[ 12] = *(a07 + 10);
+		b[ 13] = *(a07 + 11);
+		b[ 14] = *(a08 + 10);
+		b[ 15] = *(a08 + 11);
+		b[ 16] = *(a09 + 10);
+		b[ 17] = *(a09 + 11);
+		b[ 18] = *(a10 + 10);
+		b[ 19] = *(a10 + 11);
+		b[ 20] = *(a11 + 10);
+		b[ 21] = *(a11 + 11);
+		b[ 22] = *(a12 + 10);
+		b[ 23] = *(a12 + 11);
+		b[ 24] = *(a13 + 10);
+		b[ 25] = *(a13 + 11);
+		b[ 26] = *(a14 + 10);
+		b[ 27] = *(a14 + 11);
+		b[ 28] = *(a15 + 10);
+		b[ 29] = *(a15 + 11);
+		b[ 30] = *(a16 + 10);
+		b[ 31] = *(a16 + 11);
+		b += 32;
+	      }
+
+	      if (i >= 7) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[ 14] = *(a08 + 12);
+		b[ 15] = *(a08 + 13);
+		b[ 16] = *(a09 + 12);
+		b[ 17] = *(a09 + 13);
+		b[ 18] = *(a10 + 12);
+		b[ 19] = *(a10 + 13);
+		b[ 20] = *(a11 + 12);
+		b[ 21] = *(a11 + 13);
+		b[ 22] = *(a12 + 12);
+		b[ 23] = *(a12 + 13);
+		b[ 24] = *(a13 + 12);
+		b[ 25] = *(a13 + 13);
+		b[ 26] = *(a14 + 12);
+		b[ 27] = *(a14 + 13);
+		b[ 28] = *(a15 + 12);
+		b[ 29] = *(a15 + 13);
+		b[ 30] = *(a16 + 12);
+		b[ 31] = *(a16 + 13);
+		b += 32;
+	    }
+
+		if (i >= 8) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+#ifdef UNIT
+	      b[ 14] = ONE;
+	      b[ 15] = ZERO;
+#else
+	      b[ 14] = *(a08 +  14);
+	      b[ 15] = *(a08 +  15);
+#endif
+		b[ 16] = *(a09 + 14);
+		b[ 17] = *(a09 + 15);
+		b[ 18] = *(a10 + 14);
+		b[ 19] = *(a10 + 15);
+		b[ 20] = *(a11 + 14);
+		b[ 21] = *(a11 + 15);
+		b[ 22] = *(a12 + 14);
+		b[ 23] = *(a12 + 15);
+		b[ 24] = *(a13 + 14);
+		b[ 25] = *(a13 + 15);
+		b[ 26] = *(a14 + 14);
+		b[ 27] = *(a14 + 15);
+		b[ 28] = *(a15 + 14);
+		b[ 29] = *(a15 + 15);
+		b[ 30] = *(a16 + 14);
+		b[ 31] = *(a16 + 15);
+	      b += 32;
+	    }
+
+		if (i >= 9) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+#ifdef UNIT
+	      b[ 16] = ONE;
+	      b[ 17] = ZERO;
+#else
+	      b[ 16] = *(a09 + 16);
+	      b[ 17] = *(a09 + 17);
+#endif
+	    b[ 18] = *(a10 + 16);
+		b[ 19] = *(a10 + 17);
+		b[ 20] = *(a11 + 16);
+		b[ 21] = *(a11 + 17);
+		b[ 22] = *(a12 + 16);
+		b[ 23] = *(a12 + 17);
+		b[ 24] = *(a13 + 16);
+		b[ 25] = *(a13 + 17);
+		b[ 26] = *(a14 + 16);
+		b[ 27] = *(a14 + 17);
+		b[ 28] = *(a15 + 16);
+		b[ 29] = *(a15 + 17);
+		b[ 30] = *(a16 + 16);
+		b[ 31] = *(a16 + 17);
+	      b += 32;
+	    }
+
+		if (i >= 10) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a10 + 18);
+	      b[ 19] = *(a10 + 19);
+#endif
+	    b[ 20] = *(a11 + 18);
+		b[ 21] = *(a11 + 19);
+		b[ 22] = *(a12 + 18);
+		b[ 23] = *(a12 + 19);
+		b[ 24] = *(a13 + 18);
+		b[ 25] = *(a13 + 19);
+		b[ 26] = *(a14 + 18);
+		b[ 27] = *(a14 + 19);
+		b[ 28] = *(a15 + 18);
+		b[ 29] = *(a15 + 19);
+		b[ 30] = *(a16 + 18);
+		b[ 31] = *(a16 + 19);
+	      b += 32;
+	    }
+
+		if (i >= 11) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a11 + 20);
+	      b[ 21] = *(a11 + 21);
+#endif
+	    b[ 22] = *(a12 + 20);
+		b[ 23] = *(a12 + 21);
+		b[ 24] = *(a13 + 20);
+		b[ 25] = *(a13 + 21);
+		b[ 26] = *(a14 + 20);
+		b[ 27] = *(a14 + 21);
+		b[ 28] = *(a15 + 20);
+		b[ 29] = *(a15 + 21);
+		b[ 30] = *(a16 + 20);
+		b[ 31] = *(a16 + 21);
+	      b += 32;
+	    }
+
+		if (i >= 12) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+	    b[ 20] = ZERO;
+	    b[ 21] = ZERO;
+#ifdef UNIT
+	      b[ 22] = ONE;
+	      b[ 23] = ZERO;
+#else
+	      b[ 22] = *(a12 + 22);
+	      b[ 23] = *(a12 + 23);
+#endif
+	    b[ 24] = *(a13 + 22);
+		b[ 25] = *(a13 + 23);
+		b[ 26] = *(a14 + 22);
+		b[ 27] = *(a14 + 23);
+		b[ 28] = *(a15 + 22);
+		b[ 29] = *(a15 + 23);
+		b[ 30] = *(a16 + 22);
+		b[ 31] = *(a16 + 23);
+	      b += 32;
+	    }
+
+		if (i >= 13) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+	    b[ 20] = ZERO;
+	    b[ 21] = ZERO;
+	    b[ 22] = ZERO;
+	    b[ 23] = ZERO;
+#ifdef UNIT
+	      b[ 24] = ONE;
+	      b[ 25] = ZERO;
+#else
+	      b[ 24] = *(a13 + 24);
+	      b[ 25] = *(a13 + 25);
+#endif
+	    b[ 26] = *(a14 + 24);
+		b[ 27] = *(a14 + 25);
+		b[ 28] = *(a15 + 24);
+		b[ 29] = *(a15 + 25);
+		b[ 30] = *(a16 + 24);
+		b[ 31] = *(a16 + 25);
+	      b += 32;
+	    }
+
+		if (i >= 14) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+	    b[ 20] = ZERO;
+	    b[ 21] = ZERO;
+	    b[ 22] = ZERO;
+	    b[ 23] = ZERO;
+	    b[ 24] = ZERO;
+	    b[ 25] = ZERO;
+#ifdef UNIT
+	      b[ 26] = ONE;
+	      b[ 27] = ZERO;
+#else
+	      b[ 26] = *(a14 + 26);
+	      b[ 27] = *(a14 + 27);
+#endif
+	    b[ 28] = *(a15 + 26);
+		b[ 29] = *(a15 + 27);
+		b[ 30] = *(a16 + 26);
+		b[ 31] = *(a16 + 27);
+	      b += 32;
+	    }
+
+		if (i >= 15) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+	    b[ 20] = ZERO;
+	    b[ 21] = ZERO;
+	    b[ 22] = ZERO;
+	    b[ 23] = ZERO;
+	    b[ 24] = ZERO;
+	    b[ 25] = ZERO;
+	    b[ 26] = ZERO;
+	    b[ 27] = ZERO;
+#ifdef UNIT
+	      b[ 28] = ONE;
+	      b[ 29] = ZERO;
+#else
+	      b[ 28] = *(a15 + 28);
+	      b[ 29] = *(a15 + 29);
+#endif
+	    b[ 30] = *(a16 + 28);
+		b[ 31] = *(a16 + 29);
+	      b += 32;
+	    }
+	  }
+      }
+
+      posY += 16;
+      js --;
+    } while (js > 0);
+  } /* End of main loop */
+
+
+  if (n & 8){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+      }
+
+      i = (m >> 3);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	    for (ii = 0; ii < 8; ii++){
+
+	    b[  0] = *(a01 +  0);
+		b[  1] = *(a01 +  1);
+		b[  2] = *(a02 +  0);
+		b[  3] = *(a02 +  1);
+		b[  4] = *(a03 +  0);
+		b[  5] = *(a03 +  1);
+		b[  6] = *(a04 +  0);
+		b[  7] = *(a04 +  1);
+
+		b[  8] = *(a05 +  0);
+		b[  9] = *(a05 +  1);
+		b[ 10] = *(a06 +  0);
+		b[ 11] = *(a06 +  1);
+		b[ 12] = *(a07 +  0);
+		b[ 13] = *(a07 +  1);
+		b[ 14] = *(a08 +  0);
+		b[ 15] = *(a08 +  1);
+
+		a01 += 2;
+		a02 += 2;
+		a03 += 2;
+		a04 += 2;
+		a05 += 2;
+		a06 += 2;
+		a07 += 2;
+		a08 += 2;
+		b += 16;
+	    }
+	  } else
+	    if (X > posY) {
+	      a01 += 8 * lda;
+	      a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+
+	      b += 128;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a02 +  2);
+	      b[ 19] = *(a02 +  3);
+#endif
+	      b[ 20] = *(a03 +  2);
+	      b[ 21] = *(a03 +  3);
+	      b[ 22] = *(a04 +  2);
+	      b[ 23] = *(a04 +  3);
+	      b[ 24] = *(a05 +  2);
+	      b[ 25] = *(a05 +  3);
+	      b[ 26] = *(a06 +  2);
+	      b[ 27] = *(a06 +  3);
+	      b[ 28] = *(a07 +  2);
+	      b[ 29] = *(a07 +  3);
+	      b[ 30] = *(a08 +  2);
+	      b[ 31] = *(a08 +  3);
+
+	      b[ 32] = ZERO;
+	      b[ 33] = ZERO;
+	      b[ 34] = ZERO;
+	      b[ 35] = ZERO;
+#ifdef UNIT
+	      b[ 36] = ONE;
+	      b[ 37] = ZERO;
+#else
+	      b[ 36] = *(a03 +  4);
+	      b[ 37] = *(a03 +  5);
+#endif
+	      b[ 38] = *(a04 +  4);
+	      b[ 39] = *(a04 +  5);
+	      b[ 40] = *(a05 +  4);
+	      b[ 41] = *(a05 +  5);
+	      b[ 42] = *(a06 +  4);
+	      b[ 43] = *(a06 +  5);
+	      b[ 44] = *(a07 +  4);
+	      b[ 45] = *(a07 +  5);
+	      b[ 46] = *(a08 +  4);
+	      b[ 47] = *(a08 +  5);
+
+	      b[ 48] = ZERO;
+	      b[ 49] = ZERO;
+	      b[ 50] = ZERO;
+	      b[ 51] = ZERO;
+	      b[ 52] = ZERO;
+	      b[ 53] = ZERO;
+#ifdef UNIT
+	      b[ 54] = ONE;
+	      b[ 55] = ZERO;
+#else
+	      b[ 54] = *(a04 +  6);
+	      b[ 55] = *(a04 +  7);
+#endif
+	      b[ 56] = *(a05 +  6);
+	      b[ 57] = *(a05 +  7);
+	      b[ 58] = *(a06 +  6);
+	      b[ 59] = *(a06 +  7);
+	      b[ 60] = *(a07 +  6);
+	      b[ 61] = *(a07 +  7);
+	      b[ 62] = *(a08 +  6);
+	      b[ 63] = *(a08 +  7);
+
+	      b[ 64] = ZERO;
+	      b[ 65] = ZERO;
+	      b[ 66] = ZERO;
+	      b[ 67] = ZERO;
+	      b[ 68] = ZERO;
+	      b[ 69] = ZERO;
+	      b[ 70] = ZERO;
+	      b[ 71] = ZERO;
+#ifdef UNIT
+	      b[ 72] = ONE;
+	      b[ 73] = ZERO;
+#else
+	      b[ 72] = *(a05 +  8);
+	      b[ 73] = *(a05 +  9);
+#endif
+	      b[ 74] = *(a06 +  8);
+	      b[ 75] = *(a06 +  9);
+	      b[ 76] = *(a07 +  8);
+	      b[ 77] = *(a07 +  9);
+	      b[ 78] = *(a08 +  8);
+	      b[ 79] = *(a08 +  9);
+
+	      b[ 80] = ZERO;
+	      b[ 81] = ZERO;
+	      b[ 82] = ZERO;
+	      b[ 83] = ZERO;
+	      b[ 84] = ZERO;
+	      b[ 85] = ZERO;
+	      b[ 86] = ZERO;
+	      b[ 87] = ZERO;
+	      b[ 88] = ZERO;
+	      b[ 89] = ZERO;
+#ifdef UNIT
+	      b[ 90] = ONE;
+	      b[ 91] = ZERO;
+#else
+	      b[ 90] = *(a06 + 10);
+	      b[ 91] = *(a06 + 11);
+#endif
+	      b[ 92] = *(a07 + 10);
+	      b[ 93] = *(a07 + 11);
+	      b[ 94] = *(a08 + 10);
+	      b[ 95] = *(a08 + 11);
+
+	      b[ 96] = ZERO;
+	      b[ 97] = ZERO;
+	      b[ 98] = ZERO;
+	      b[ 99] = ZERO;
+	      b[100] = ZERO;
+	      b[101] = ZERO;
+	      b[102] = ZERO;
+	      b[103] = ZERO;
+	      b[104] = ZERO;
+	      b[105] = ZERO;
+	      b[106] = ZERO;
+	      b[107] = ZERO;
+#ifdef UNIT
+	      b[108] = ONE;
+	      b[109] = ZERO;
+#else
+	      b[108] = *(a07 + 12);
+	      b[109] = *(a07 + 13);
+#endif
+	      b[110] = *(a08 + 12);
+	      b[111] = *(a08 + 13);
+
+	      b[112] = ZERO;
+	      b[113] = ZERO;
+	      b[114] = ZERO;
+	      b[115] = ZERO;
+	      b[116] = ZERO;
+	      b[117] = ZERO;
+	      b[118] = ZERO;
+	      b[119] = ZERO;
+	      b[120] = ZERO;
+	      b[121] = ZERO;
+	      b[122] = ZERO;
+	      b[123] = ZERO;
+	      b[124] = ZERO;
+	      b[125] = ZERO;
+#ifdef UNIT
+	      b[126] = ONE;
+	      b[127] = ZERO;
+#else
+	      b[126] = *(a08 + 14);
+	      b[127] = *(a08 + 15);
+#endif
+
+	      a01 += 8 * lda;
+	      a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+	      b += 128;
+	    }
+
+	  X += 8;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i) {
+
+	if (X < posY) {
+	  for (ii = 0; ii < i; ii++){
+
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a03 +  0);
+	    b[  5] = *(a03 +  1);
+	    b[  6] = *(a04 +  0);
+	    b[  7] = *(a04 +  1);
+
+	    b[  8] = *(a05 +  0);
+	    b[  9] = *(a05 +  1);
+	    b[ 10] = *(a06 +  0);
+	    b[ 11] = *(a06 +  1);
+	    b[ 12] = *(a07 +  0);
+	    b[ 13] = *(a07 +  1);
+	    b[ 14] = *(a08 +  0);
+	    b[ 15] = *(a08 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    a05 += 2;
+	    a06 += 2;
+	    a07 += 2;
+	    a08 += 2;
+	    b += 16;
+	    }
+	} else
+	  if (X > posY) {
+	      /* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda; */
+	      b += 16 * i;
+	  } else {
+#ifdef UNIT
+	    b[ 0] = ONE;
+	    b[ 1] = ZERO;
+#else
+	    b[ 0] = *(a01 +  0);
+	    b[ 1] = *(a01 +  1);
+#endif
+	    b[ 2] = *(a02 +  0);
+	    b[ 3] = *(a02 +  1);
+	    b[ 4] = *(a03 +  0);
+	    b[ 5] = *(a03 +  1);
+	    b[ 6] = *(a04 +  0);
+	    b[ 7] = *(a04 +  1);
+	    b[ 8] = *(a05 +  0);
+	    b[ 9] = *(a05 +  1);
+	    b[10] = *(a06 +  0);
+	    b[11] = *(a06 +  1);
+	    b[12] = *(a07 +  0);
+	    b[13] = *(a07 +  1);
+	    b[14] = *(a08 +  0);
+	    b[15] = *(a08 +  1);
+	    b += 16;
+
+	    if(i >= 2) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = *(a03 +  2);
+	      b[ 5] = *(a03 +  3);
+	      b[ 6] = *(a04 +  2);
+	      b[ 7] = *(a04 +  3);
+	      b[ 8] = *(a05 +  2);
+	      b[ 9] = *(a05 +  3);
+	      b[10] = *(a06 +  2);
+	      b[11] = *(a06 +  3);
+	      b[12] = *(a07 +  2);
+	      b[13] = *(a07 +  3);
+	      b[14] = *(a08 +  2);
+	      b[15] = *(a08 +  3);
+	      b += 16;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = *(a04 +  4);
+	      b[ 7] = *(a04 +  5);
+	      b[ 8] = *(a05 +  4);
+	      b[ 9] = *(a05 +  5);
+	      b[10] = *(a06 +  4);
+	      b[11] = *(a06 +  5);
+	      b[12] = *(a07 +  4);
+	      b[13] = *(a07 +  5);
+	      b[14] = *(a08 +  4);
+	      b[15] = *(a08 +  5);
+	      b += 16;
+	    }
+
+	    if (i >= 4) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+#ifdef UNIT
+	      b[ 6] = ONE;
+	      b[ 7] = ZERO;
+#else
+	      b[ 6] = *(a04 +  6);
+	      b[ 7] = *(a04 +  7);
+#endif
+	      b[ 8] = *(a05 +  6);
+	      b[ 9] = *(a05 +  7);
+	      b[10] = *(a06 +  6);
+	      b[11] = *(a06 +  7);
+	      b[12] = *(a07 +  6);
+	      b[13] = *(a07 +  7);
+	      b[14] = *(a08 +  6);
+	      b[15] = *(a08 +  7);
+	      b += 16;
+	    }
+
+	    if (i >= 5) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+#ifdef UNIT
+	      b[ 8] = ONE;
+	      b[ 9] = ZERO;
+#else
+	      b[ 8] = *(a05 +  8);
+	      b[ 9] = *(a05 +  9);
+#endif
+	      b[10] = *(a06 +  8);
+	      b[11] = *(a06 +  9);
+	      b[12] = *(a07 +  8);
+	      b[13] = *(a07 +  9);
+	      b[14] = *(a08 +  8);
+	      b[15] = *(a08 +  9);
+	      b += 16;
+	    }
+
+	    if (i >= 6) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+#ifdef UNIT
+	      b[10] = ONE;
+	      b[11] = ZERO;
+#else
+	      b[10] = *(a06 + 10);
+	      b[11] = *(a06 + 11);
+#endif
+	      b[12] = *(a07 + 10);
+	      b[13] = *(a07 + 11);
+	      b[14] = *(a08 + 10);
+	      b[15] = *(a08 + 11);
+	      b += 16;
+	    }
+
+	    if (i >= 7) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+#ifdef UNIT
+	      b[12] = ONE;
+	      b[13] = ZERO;
+#else
+	      b[12] = *(a07 + 12);
+	      b[13] = *(a07 + 13);
+#endif
+	      b[14] = *(a08 + 12);
+	      b[15] = *(a08 + 13);
+	      b += 16;
+	    }
+	  }
+      }
+
+      posY += 8;
+  }
+
+
+  if (n & 4){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+      }
+
+      i = (m >> 2);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	      for (ii = 0; ii < 4; ii++){
+
+		b[  0] = *(a01 +  0);
+		b[  1] = *(a01 +  1);
+		b[  2] = *(a02 +  0);
+		b[  3] = *(a02 +  1);
+		b[  4] = *(a03 +  0);
+		b[  5] = *(a03 +  1);
+		b[  6] = *(a04 +  0);
+		b[  7] = *(a04 +  1);
+
+		a01 += 2;
+		a02 += 2;
+		a03 += 2;
+		a04 += 2;
+		b += 8;
+	      }
+	  } else
+	    if (X > posY) {
+	      a01 += 4 * lda;
+	      a02 += 4 * lda;
+	      a03 += 4 * lda;
+	      a04 += 4 * lda;
+	      b += 32;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+#ifdef UNIT
+	      b[ 10] = ONE;
+	      b[ 11] = ZERO;
+#else
+	      b[ 10] = *(a02 +  2);
+	      b[ 11] = *(a02 +  3);
+#endif
+	      b[ 12] = *(a03 +  2);
+	      b[ 13] = *(a03 +  3);
+	      b[ 14] = *(a04 +  2);
+	      b[ 15] = *(a04 +  3);
+
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a03 +  4);
+	      b[ 21] = *(a03 +  5);
+#endif
+	      b[ 22] = *(a04 +  4);
+	      b[ 23] = *(a04 +  5);
+
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+#ifdef UNIT
+	      b[ 30] = ONE;
+	      b[ 31] = ZERO;
+#else
+	      b[ 30] = *(a04 +  6);
+	      b[ 31] = *(a04 +  7);
+#endif
+
+	      a01 += 4 * lda;
+	      a02 += 4 * lda;
+	      a03 += 4 * lda;
+	      a04 += 4 * lda;
+
+	      b += 32;
+	    }
+
+	  X += 4;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 3);
+      if (i) {
+
+	if (X < posY) {
+
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a03 +  0);
+	    b[  5] = *(a03 +  1);
+	    b[  6] = *(a04 +  0);
+	    b[  7] = *(a04 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    b += 8;
+	  }
+	} else
+	  if (X > posY) {
+	    /* a01 += i * lda;
+	    a02 += i * lda;
+	    a03 += i * lda;
+	    a04 += i * lda; */
+	    b += 8 * i;
+	  } else {
+#ifdef UNIT
+	    b[ 0] = ONE;
+	    b[ 1] = ZERO;
+#else
+	    b[ 0] = *(a01 +  0);
+	    b[ 1] = *(a01 +  1);
+#endif
+	    b[ 2] = *(a02 +  0);
+	    b[ 3] = *(a02 +  1);
+	    b[ 4] = *(a03 +  0);
+	    b[ 5] = *(a03 +  1);
+	    b[ 6] = *(a04 +  0);
+	    b[ 7] = *(a04 +  1);
+	    b += 8;
+
+	    if(i >= 2) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = *(a03 +  2);
+	      b[ 5] = *(a03 +  3);
+	      b[ 6] = *(a04 +  2);
+	      b[ 7] = *(a04 +  3);
+	      b += 8;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = *(a04 +  4);
+	      b[ 7] = *(a04 +  5);
+	      b += 8;
+	    }
+	  }
+      }
+
+      posY += 4;
+  }
+
+  if (n & 2){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+      }
+
+      i = (m >> 1);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a01 +  2);
+	    b[  5] = *(a01 +  3);
+	    b[  6] = *(a02 +  2);
+	    b[  7] = *(a02 +  3);
+
+	    a01 += 4;
+	    a02 += 4;
+	    b += 8;
+	  } else
+	    if (X > posY) {
+	      a01 += 2 * lda;
+	      a02 += 2 * lda;
+	      b += 8;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+#ifdef UNIT
+	      b[  6] = ONE;
+	      b[  7] = ZERO;
+#else
+	      b[  6] = *(a02 +  2);
+	      b[  7] = *(a02 +  3);
+#endif
+
+	      a01 += 2 * lda;
+	      a02 += 2 * lda;
+	      b += 8;
+	    }
+
+	  X += 2;
+	  i --;
+	} while (i > 0);
+      }
+
+      if (m & 1) {
+
+	if (X < posY) {
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+	  b[  2] = *(a02 +  0);
+	  b[  3] = *(a02 +  1);
+	  /* a01 += 2;
+	  a02 += 2; */
+	  b += 4;
+	} else
+	  if (X > posY) {
+	    /* a01 += 2 * lda;
+	    a02 += 2 * lda; */
+	    b += 4;
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+#endif
+	    b += 2;
+	  }
+      }
+      posY += 2;
+  }
+
+  if (n & 1){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+      }
+
+      i = m;
+      if (m > 0) {
+	do {
+	  if (X < posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    a01 += 2;
+	    b += 2;
+	  } else
+	    if (X > posY) {
+	      a01 += lda;
+	      b += 2;
+	    } else {
+#ifdef UNIT
+	      b[ 0] = ONE;
+	      b[ 1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      a01 += lda;
+	      b += 2;
+	    }
+
+	  X += 1;
+	  i --;
+	} while (i > 0);
+      }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmm_utcopy_16.c b/kernel/generic/ztrmm_utcopy_16.c
new file mode 100644
index 000000000..5aba3727a
--- /dev/null
+++ b/kernel/generic/ztrmm_utcopy_16.c
@@ -0,0 +1,2318 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X, ii;
+
+  FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08;
+  FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  lda += lda;
+
+  js = (n >> 4);
+
+  if (js > 0){
+    do {
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+	a09 = a + posX * 2 + (posY + 8) * lda;
+	a10 = a + posX * 2 + (posY + 9) * lda;
+	a11 = a + posX * 2 + (posY + 10) * lda;
+	a12 = a + posX * 2 + (posY + 11) * lda;
+	a13 = a + posX * 2 + (posY + 12) * lda;
+	a14 = a + posX * 2 + (posY + 13) * lda;
+	a15 = a + posX * 2 + (posY + 14) * lda;
+	a16 = a + posX * 2 + (posY + 15) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+	a09 = a + posY * 2 + (posX + 8) * lda;
+	a10 = a + posY * 2 + (posX + 9) * lda;
+	a11 = a + posY * 2 + (posX + 10) * lda;
+	a12 = a + posY * 2 + (posX + 11) * lda;
+	a13 = a + posY * 2 + (posX + 12) * lda;
+	a14 = a + posY * 2 + (posX + 13) * lda;
+	a15 = a + posY * 2 + (posX + 14) * lda;
+	a16 = a + posY * 2 + (posX + 15) * lda;
+      }
+
+      i = (m >> 4);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+		a01 += 32;
+	    a02 += 32;
+	    a03 += 32;
+	    a04 += 32;
+	    a05 += 32;
+	    a06 += 32;
+	    a07 += 32;
+	    a08 += 32;
+	    a09 += 32;
+	    a10 += 32;
+	    a11 += 32;
+	    a12 += 32;
+	    a13 += 32;
+	    a14 += 32;
+	    a15 += 32;
+	    a16 += 32;
+
+	    b += 512;
+	  } else
+	    if (X > posY) {
+			for (ii = 0; ii < 16; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+		  b[ 16] = *(a01 + 16);
+	      b[ 17] = *(a01 + 17);
+	      b[ 18] = *(a01 + 18);
+	      b[ 19] = *(a01 + 19);
+	      b[ 20] = *(a01 + 20);
+	      b[ 21] = *(a01 + 21);
+	      b[ 22] = *(a01 + 22);
+	      b[ 23] = *(a01 + 23);
+
+	      b[ 24] = *(a01 + 24);
+	      b[ 25] = *(a01 + 25);
+	      b[ 26] = *(a01 + 26);
+	      b[ 27] = *(a01 + 27);
+	      b[ 28] = *(a01 + 28);
+	      b[ 29] = *(a01 + 29);
+	      b[ 30] = *(a01 + 30);
+	      b[ 31] = *(a01 + 31);
+
+	      a01 += lda;
+	      b += 32;
+	    }
+
+		a02 += 16 * lda;
+		a03 += 16 * lda;
+		a04 += 16 * lda;
+		a05 += 16 * lda;
+		a06 += 16 * lda;
+		a07 += 16 * lda;
+		a08 += 16 * lda;
+		a09 += 16 * lda;
+		a10 += 16 * lda;
+		a11 += 16 * lda;
+		a12 += 16 * lda;
+		a13 += 16 * lda;
+		a14 += 16 * lda;
+		a15 += 16 * lda;
+		a16 += 16 * lda;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+		  b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+		  b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+
+	      b[ 32] = *(a02 +  0);
+	      b[ 33] = *(a02 +  1);
+#ifdef UNIT
+	      b[ 34] = ONE;
+	      b[ 35] = ZERO;
+#else
+	      b[ 34] = *(a02 +  2);
+	      b[ 35] = *(a02 +  3);
+#endif
+	      b[ 36] = ZERO;
+	      b[ 37] = ZERO;
+	      b[ 38] = ZERO;
+	      b[ 39] = ZERO;
+	      b[ 40] = ZERO;
+	      b[ 41] = ZERO;
+	      b[ 42] = ZERO;
+	      b[ 43] = ZERO;
+	      b[ 44] = ZERO;
+	      b[ 45] = ZERO;
+	      b[ 46] = ZERO;
+	      b[ 47] = ZERO;
+	      b[ 48] = ZERO;
+	      b[ 49] = ZERO;
+	      b[ 50] = ZERO;
+	      b[ 51] = ZERO;
+	      b[ 52] = ZERO;
+	      b[ 53] = ZERO;
+	      b[ 54] = ZERO;
+	      b[ 55] = ZERO;
+	      b[ 56] = ZERO;
+	      b[ 57] = ZERO;
+	      b[ 58] = ZERO;
+	      b[ 59] = ZERO;
+	      b[ 60] = ZERO;
+	      b[ 61] = ZERO;
+	      b[ 62] = ZERO;
+	      b[ 63] = ZERO;
+
+	      b[ 64] = *(a03 +  0);
+	      b[ 65] = *(a03 +  1);
+	      b[ 66] = *(a03 +  2);
+	      b[ 67] = *(a03 +  3);
+#ifdef UNIT
+	      b[ 68] = ONE;
+	      b[ 69] = ZERO;
+#else
+	      b[ 68] = *(a03 +  4);
+	      b[ 69] = *(a03 +  5);
+#endif
+	      b[ 70] = ZERO;
+	      b[ 71] = ZERO;
+	      b[ 72] = ZERO;
+	      b[ 73] = ZERO;
+	      b[ 74] = ZERO;
+	      b[ 75] = ZERO;
+	      b[ 76] = ZERO;
+	      b[ 77] = ZERO;
+	      b[ 78] = ZERO;
+	      b[ 79] = ZERO;
+	      b[ 80] = ZERO;
+	      b[ 81] = ZERO;
+	      b[ 82] = ZERO;
+		  b[ 83] = ZERO;
+	      b[ 84] = ZERO;
+	      b[ 85] = ZERO;
+	      b[ 86] = ZERO;
+	      b[ 87] = ZERO;
+	      b[ 88] = ZERO;
+	      b[ 89] = ZERO;
+	      b[ 90] = ZERO;
+	      b[ 91] = ZERO;
+	      b[ 92] = ZERO;
+	      b[ 93] = ZERO;
+	      b[ 94] = ZERO;
+	      b[ 95] = ZERO;
+
+	      b[ 96] = *(a04 +  0);
+	      b[ 97] = *(a04 +  1);
+	      b[ 98] = *(a04 +  2);
+	      b[ 99] = *(a04 +  3);
+	      b[100] = *(a04 +  4);
+	      b[101] = *(a04 +  5);
+#ifdef UNIT
+	      b[102] = ONE;
+	      b[103] = ZERO;
+#else
+	      b[102] = *(a04 +  6);
+	      b[103] = *(a04 +  7);
+#endif
+	      b[104] = ZERO;
+	      b[105] = ZERO;
+	      b[106] = ZERO;
+	      b[107] = ZERO;
+	      b[108] = ZERO;
+	      b[109] = ZERO;
+	      b[110] = ZERO;
+	      b[111] = ZERO;
+	      b[112] = ZERO;
+	      b[113] = ZERO;
+	      b[114] = ZERO;
+	      b[115] = ZERO;
+		  b[116] = ZERO;
+	      b[117] = ZERO;
+	      b[118] = ZERO;
+	      b[119] = ZERO;
+	      b[120] = ZERO;
+	      b[121] = ZERO;
+	      b[122] = ZERO;
+	      b[123] = ZERO;
+	      b[124] = ZERO;
+	      b[125] = ZERO;
+	      b[126] = ZERO;
+	      b[127] = ZERO;
+
+	      b[128] = *(a05 +  0);
+	      b[129] = *(a05 +  1);
+	      b[130] = *(a05 +  2);
+	      b[131] = *(a05 +  3);
+	      b[132] = *(a05 +  4);
+	      b[133] = *(a05 +  5);
+	      b[134] = *(a05 +  6);
+	      b[135] = *(a05 +  7);
+#ifdef UNIT
+	      b[136] = ONE;
+	      b[137] = ZERO;
+#else
+	      b[136] = *(a05 +  8);
+	      b[137] = *(a05 +  9);
+#endif
+	      b[138] = ZERO;
+	      b[139] = ZERO;
+	      b[140] = ZERO;
+	      b[141] = ZERO;
+	      b[142] = ZERO;
+	      b[143] = ZERO;
+	      b[144] = ZERO;
+	      b[145] = ZERO;
+	      b[146] = ZERO;
+	      b[147] = ZERO;
+	      b[148] = ZERO;
+		  b[149] = ZERO;
+	      b[150] = ZERO;
+	      b[151] = ZERO;
+	      b[152] = ZERO;
+	      b[153] = ZERO;
+	      b[154] = ZERO;
+	      b[155] = ZERO;
+	      b[156] = ZERO;
+	      b[157] = ZERO;
+	      b[158] = ZERO;
+	      b[159] = ZERO;
+
+	      b[160] = *(a06 +  0);
+	      b[161] = *(a06 +  1);
+	      b[162] = *(a06 +  2);
+	      b[163] = *(a06 +  3);
+	      b[164] = *(a06 +  4);
+	      b[165] = *(a06 +  5);
+	      b[166] = *(a06 +  6);
+	      b[167] = *(a06 +  7);
+	      b[168] = *(a06 +  8);
+	      b[169] = *(a06 +  9);
+#ifdef UNIT
+	      b[170] = ONE;
+	      b[171] = ZERO;
+#else
+	      b[170] = *(a06 + 10);
+	      b[171] = *(a06 + 11);
+#endif
+	      b[172] = ZERO;
+	      b[173] = ZERO;
+	      b[174] = ZERO;
+	      b[175] = ZERO;
+	      b[176] = ZERO;
+	      b[177] = ZERO;
+	      b[178] = ZERO;
+	      b[179] = ZERO;
+	      b[180] = ZERO;
+	      b[181] = ZERO;
+		  b[182] = ZERO;
+	      b[183] = ZERO;
+	      b[184] = ZERO;
+	      b[185] = ZERO;
+	      b[186] = ZERO;
+	      b[187] = ZERO;
+	      b[188] = ZERO;
+	      b[189] = ZERO;
+	      b[190] = ZERO;
+	      b[191] = ZERO;
+
+	      b[192] = *(a07 + 0);
+	      b[193] = *(a07 + 1);
+	      b[194] = *(a07 + 2);
+	      b[195] = *(a07 + 3);
+	      b[196] = *(a07 + 4);
+	      b[197] = *(a07 + 5);
+	      b[198] = *(a07 + 6);
+	      b[199] = *(a07 + 7);
+	      b[200] = *(a07 + 8);
+	      b[201] = *(a07 + 9);
+	      b[202] = *(a07 + 10);
+	      b[203] = *(a07 + 11);
+#ifdef UNIT
+	      b[204] = ONE;
+	      b[205] = ZERO;
+#else
+	      b[204] = *(a07 + 12);
+	      b[205] = *(a07 + 13);
+#endif
+	      b[206] = ZERO;
+	      b[207] = ZERO;
+	      b[208] = ZERO;
+	      b[209] = ZERO;
+	      b[210] = ZERO;
+	      b[211] = ZERO;
+	      b[212] = ZERO;
+	      b[213] = ZERO;
+	      b[214] = ZERO;
+		  b[215] = ZERO;
+	      b[216] = ZERO;
+	      b[217] = ZERO;
+	      b[218] = ZERO;
+	      b[219] = ZERO;
+	      b[220] = ZERO;
+	      b[221] = ZERO;
+	      b[222] = ZERO;
+	      b[223] = ZERO;
+
+	      b[224] = *(a08 + 0);
+	      b[225] = *(a08 + 1);
+	      b[226] = *(a08 + 2);
+	      b[227] = *(a08 + 3);
+	      b[228] = *(a08 + 4);
+	      b[229] = *(a08 + 5);
+	      b[230] = *(a08 + 6);
+	      b[231] = *(a08 + 7);
+	      b[232] = *(a08 + 8);
+	      b[233] = *(a08 + 9);
+	      b[234] = *(a08 + 10);
+	      b[235] = *(a08 + 11);
+	      b[236] = *(a08 + 12);
+	      b[237] = *(a08 + 13);
+#ifdef UNIT
+	      b[238] = ONE;
+	      b[239] = ZERO;
+#else
+	      b[238] = *(a08 + 14);
+	      b[239] = *(a08 + 15);
+#endif
+		  b[240] = ZERO;
+	      b[241] = ZERO;
+	      b[242] = ZERO;
+	      b[243] = ZERO;
+	      b[244] = ZERO;
+	      b[245] = ZERO;
+	      b[246] = ZERO;
+	      b[247] = ZERO;
+		  b[248] = ZERO;
+	      b[249] = ZERO;
+	      b[250] = ZERO;
+	      b[251] = ZERO;
+	      b[252] = ZERO;
+	      b[253] = ZERO;
+	      b[254] = ZERO;
+	      b[255] = ZERO;
+
+	      b[256] = *(a09 + 0);
+	      b[257] = *(a09 + 1);
+	      b[258] = *(a09 + 2);
+	      b[259] = *(a09 + 3);
+	      b[260] = *(a09 + 4);
+	      b[261] = *(a09 + 5);
+	      b[262] = *(a09 + 6);
+	      b[263] = *(a09 + 7);
+	      b[264] = *(a09 + 8);
+	      b[265] = *(a09 + 9);
+	      b[266] = *(a09 + 10);
+	      b[267] = *(a09 + 11);
+	      b[268] = *(a09 + 12);
+	      b[269] = *(a09 + 13);
+	      b[270] = *(a09 + 14);
+	      b[271] = *(a09 + 15);
+#ifdef UNIT
+	      b[272] = ONE;
+		  b[273] = ZERO;
+#else
+	      b[272] = *(a09 + 16);
+		  b[273] = *(a09 + 17);
+#endif
+		  b[274] = ZERO;
+	      b[275] = ZERO;
+	      b[276] = ZERO;
+	      b[277] = ZERO;
+	      b[278] = ZERO;
+	      b[279] = ZERO;
+	      b[280] = ZERO;
+		  b[281] = ZERO;
+	      b[282] = ZERO;
+	      b[283] = ZERO;
+	      b[284] = ZERO;
+	      b[285] = ZERO;
+	      b[286] = ZERO;
+	      b[287] = ZERO;
+
+		  b[288] = *(a10 + 0);
+		  b[289] = *(a10 + 1);
+	      b[290] = *(a10 + 2);
+	      b[291] = *(a10 + 3);
+	      b[292] = *(a10 + 4);
+	      b[293] = *(a10 + 5);
+	      b[294] = *(a10 + 6);
+	      b[295] = *(a10 + 7);
+	      b[296] = *(a10 + 8);
+	      b[297] = *(a10 + 9);
+	      b[298] = *(a10 + 10);
+	      b[299] = *(a10 + 11);
+	      b[300] = *(a10 + 12);
+	      b[301] = *(a10 + 13);
+	      b[302] = *(a10 + 14);
+	      b[303] = *(a10 + 15);
+	      b[304] = *(a10 + 16);
+	      b[305] = *(a10 + 17);
+#ifdef UNIT
+	      b[306] = ONE;
+		  b[307] = ZERO;
+#else
+	      b[306] = *(a10 + 18);
+		  b[307] = *(a10 + 19);
+#endif
+		  b[308] = ZERO;
+	      b[309] = ZERO;
+	      b[310] = ZERO;
+	      b[311] = ZERO;
+	      b[312] = ZERO;
+	      b[313] = ZERO;
+		  b[314] = ZERO;
+	      b[315] = ZERO;
+	      b[316] = ZERO;
+	      b[317] = ZERO;
+	      b[318] = ZERO;
+	      b[319] = ZERO;
+
+		  b[320] = *(a11 + 0);
+		  b[321] = *(a11 + 1);
+	      b[322] = *(a11 + 2);
+	      b[323] = *(a11 + 3);
+	      b[324] = *(a11 + 4);
+	      b[325] = *(a11 + 5);
+	      b[326] = *(a11 + 6);
+	      b[327] = *(a11 + 7);
+	      b[328] = *(a11 + 8);
+	      b[329] = *(a11 + 9);
+	      b[330] = *(a11 + 10);
+	      b[331] = *(a11 + 11);
+	      b[332] = *(a11 + 12);
+	      b[333] = *(a11 + 13);
+	      b[334] = *(a11 + 14);
+	      b[335] = *(a11 + 15);
+	      b[336] = *(a11 + 16);
+	      b[337] = *(a11 + 17);
+	      b[338] = *(a11 + 18);
+	      b[339] = *(a11 + 19);
+#ifdef UNIT
+	      b[340] = ONE;
+	      b[341] = ZERO;
+#else
+	      b[340] = *(a11 + 20);
+	      b[341] = *(a11 + 21);
+#endif
+		  b[342] = ZERO;
+	      b[343] = ZERO;
+	      b[344] = ZERO;
+	      b[345] = ZERO;
+	      b[346] = ZERO;
+		  b[347] = ZERO;
+	      b[348] = ZERO;
+	      b[349] = ZERO;
+	      b[350] = ZERO;
+	      b[351] = ZERO;
+
+		  b[352] = *(a12 + 0);
+		  b[353] = *(a12 + 1);
+	      b[354] = *(a12 + 2);
+	      b[355] = *(a12 + 3);
+	      b[356] = *(a12 + 4);
+	      b[357] = *(a12 + 5);
+	      b[358] = *(a12 + 6);
+	      b[359] = *(a12 + 7);
+	      b[360] = *(a12 + 8);
+	      b[361] = *(a12 + 9);
+	      b[362] = *(a12 + 10);
+	      b[363] = *(a12 + 11);
+	      b[364] = *(a12 + 12);
+	      b[365] = *(a12 + 13);
+	      b[366] = *(a12 + 14);
+	      b[367] = *(a12 + 15);
+	      b[368] = *(a12 + 16);
+	      b[369] = *(a12 + 17);
+	      b[370] = *(a12 + 18);
+	      b[371] = *(a12 + 19);
+	      b[372] = *(a12 + 20);
+	      b[373] = *(a12 + 21);
+#ifdef UNIT
+	      b[374] = ONE;
+	      b[375] = ZERO;
+#else
+	      b[374] = *(a12 + 22);
+	      b[375] = *(a12 + 23);
+#endif
+		  b[376] = ZERO;
+	      b[377] = ZERO;
+	      b[378] = ZERO;
+	      b[379] = ZERO;
+		  b[380] = ZERO;
+	      b[381] = ZERO;
+	      b[382] = ZERO;
+	      b[383] = ZERO;
+
+		  b[384] = *(a13 + 0);
+		  b[385] = *(a13 + 1);
+	      b[386] = *(a13 + 2);
+	      b[387] = *(a13 + 3);
+	      b[388] = *(a13 + 4);
+	      b[389] = *(a13 + 5);
+	      b[390] = *(a13 + 6);
+	      b[391] = *(a13 + 7);
+	      b[392] = *(a13 + 8);
+	      b[393] = *(a13 + 9);
+	      b[394] = *(a13 + 10);
+	      b[395] = *(a13 + 11);
+	      b[396] = *(a13 + 12);
+	      b[397] = *(a13 + 13);
+	      b[398] = *(a13 + 14);
+	      b[399] = *(a13 + 15);
+	      b[400] = *(a13 + 16);
+	      b[401] = *(a13 + 17);
+	      b[402] = *(a13 + 18);
+	      b[403] = *(a13 + 19);
+	      b[404] = *(a13 + 20);
+	      b[405] = *(a13 + 21);
+	      b[406] = *(a13 + 22);
+	      b[407] = *(a13 + 23);
+#ifdef UNIT
+	      b[408] = ONE;
+	      b[409] = ZERO;
+#else
+	      b[408] = *(a13 + 24);
+	      b[409] = *(a13 + 25);
+#endif
+		  b[410] = ZERO;
+	      b[411] = ZERO;
+	      b[412] = ZERO;
+		  b[413] = ZERO;
+	      b[414] = ZERO;
+	      b[415] = ZERO;
+
+		  b[416] = *(a14 + 0);
+		  b[417] = *(a14 + 1);
+	      b[418] = *(a14 + 2);
+	      b[419] = *(a14 + 3);
+	      b[420] = *(a14 + 4);
+	      b[421] = *(a14 + 5);
+	      b[422] = *(a14 + 6);
+	      b[423] = *(a14 + 7);
+	      b[424] = *(a14 + 8);
+	      b[425] = *(a14 + 9);
+	      b[426] = *(a14 + 10);
+	      b[427] = *(a14 + 11);
+	      b[428] = *(a14 + 12);
+	      b[429] = *(a14 + 13);
+	      b[430] = *(a14 + 14);
+	      b[431] = *(a14 + 15);
+	      b[432] = *(a14 + 16);
+	      b[433] = *(a14 + 17);
+	      b[434] = *(a14 + 18);
+	      b[435] = *(a14 + 19);
+	      b[436] = *(a14 + 20);
+	      b[437] = *(a14 + 21);
+	      b[438] = *(a14 + 22);
+	      b[439] = *(a14 + 23);
+	      b[440] = *(a14 + 24);
+	      b[441] = *(a14 + 25);
+#ifdef UNIT
+	      b[442] = ONE;
+	      b[443] = ZERO;
+#else
+	      b[442] = *(a14 + 26);
+	      b[443] = *(a14 + 27);
+#endif
+		  b[444] = ZERO;
+	      b[445] = ZERO;
+		  b[446] = ZERO;
+	      b[447] = ZERO;
+
+		  b[448] = *(a15 + 0);
+		  b[449] = *(a15 + 1);
+	      b[450] = *(a15 + 2);
+	      b[451] = *(a15 + 3);
+	      b[452] = *(a15 + 4);
+	      b[453] = *(a15 + 5);
+	      b[454] = *(a15 + 6);
+	      b[455] = *(a15 + 7);
+	      b[456] = *(a15 + 8);
+	      b[457] = *(a15 + 9);
+	      b[458] = *(a15 + 10);
+	      b[459] = *(a15 + 11);
+	      b[460] = *(a15 + 12);
+	      b[461] = *(a15 + 13);
+	      b[462] = *(a15 + 14);
+	      b[463] = *(a15 + 15);
+	      b[464] = *(a15 + 16);
+	      b[465] = *(a15 + 17);
+	      b[466] = *(a15 + 18);
+	      b[467] = *(a15 + 19);
+	      b[468] = *(a15 + 20);
+	      b[469] = *(a15 + 21);
+	      b[470] = *(a15 + 22);
+	      b[471] = *(a15 + 23);
+	      b[472] = *(a15 + 24);
+	      b[473] = *(a15 + 25);
+	      b[474] = *(a15 + 26);
+	      b[475] = *(a15 + 27);
+#ifdef UNIT
+	      b[476] = ONE;
+	      b[477] = ZERO;
+#else
+	      b[476] = *(a15 + 28);
+	      b[477] = *(a15 + 29);
+#endif
+		  b[478] = ZERO;
+		  b[479] = ZERO;
+
+		  b[480] = *(a16 + 0);
+		  b[481] = *(a16 + 1);
+	      b[482] = *(a16 + 2);
+	      b[483] = *(a16 + 3);
+	      b[484] = *(a16 + 4);
+	      b[485] = *(a16 + 5);
+	      b[486] = *(a16 + 6);
+	      b[487] = *(a16 + 7);
+	      b[488] = *(a16 + 8);
+	      b[489] = *(a16 + 9);
+	      b[490] = *(a16 + 10);
+	      b[491] = *(a16 + 11);
+	      b[492] = *(a16 + 12);
+	      b[493] = *(a16 + 13);
+	      b[494] = *(a16 + 14);
+	      b[495] = *(a16 + 15);
+	      b[496] = *(a16 + 16);
+	      b[497] = *(a16 + 17);
+	      b[498] = *(a16 + 18);
+	      b[499] = *(a16 + 19);
+	      b[500] = *(a16 + 20);
+	      b[501] = *(a16 + 21);
+	      b[502] = *(a16 + 22);
+	      b[503] = *(a16 + 23);
+	      b[504] = *(a16 + 24);
+	      b[505] = *(a16 + 25);
+	      b[506] = *(a16 + 26);
+	      b[507] = *(a16 + 27);
+	      b[508] = *(a16 + 28);
+	      b[509] = *(a16 + 29);
+#ifdef UNIT
+	      b[510] = ONE;
+	      b[511] = ZERO;
+#else
+	      b[510] = *(a16 + 30);
+	      b[511] = *(a16 + 31);
+#endif
+
+	      a01 += 16 * lda;
+	      a02 += 16 * lda;
+	      a03 += 16 * lda;
+	      a04 += 16 * lda;
+	      a05 += 16 * lda;
+	      a06 += 16 * lda;
+	      a07 += 16 * lda;
+	      a08 += 16 * lda;
+	      a09 += 16 * lda;
+	      a10 += 16 * lda;
+	      a11 += 16 * lda;
+	      a12 += 16 * lda;
+	      a13 += 16 * lda;
+	      a14 += 16 * lda;
+	      a15 += 16 * lda;
+	      a16 += 16 * lda;
+	      b += 512;
+	    }
+
+	  X += 16;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 15);
+      if (i) {
+
+	if (X < posY) {
+	//   a01 += 2 * i;
+	//   a02 += 2 * i;
+	//   a03 += 2 * i;
+	//   a04 += 2 * i;
+	//   a05 += 2 * i;
+	//   a06 += 2 * i;
+	//   a07 += 2 * i;
+	//   a08 += 2 * i;
+	//   a09 += 2 * i;
+	//   a10 += 2 * i;
+	//   a11 += 2 * i;
+	//   a12 += 2 * i;
+	//   a13 += 2 * i;
+	//   a14 += 2 * i;
+	//   a15 += 2 * i;
+	//   a16 += 2 * i;
+	  b += 32 * i;
+
+	} else
+	  if (X > posY) {
+	    for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+		b[  2] = *(a01 +  2);
+		b[  3] = *(a01 +  3);
+		b[  4] = *(a01 +  4);
+		b[  5] = *(a01 +  5);
+		b[  6] = *(a01 +  6);
+		b[  7] = *(a01 +  7);
+		b[  8] = *(a01 +  8);
+		b[  9] = *(a01 +  9);
+		b[ 10] = *(a01 + 10);
+		b[ 11] = *(a01 + 11);
+		b[ 12] = *(a01 + 12);
+		b[ 13] = *(a01 + 13);
+		b[ 14] = *(a01 + 14);
+		b[ 15] = *(a01 + 15);
+
+		b[ 16] = *(a01 + 16);
+		b[ 17] = *(a01 + 17);
+		b[ 18] = *(a01 + 18);
+		b[ 19] = *(a01 + 19);
+		b[ 20] = *(a01 + 20);
+		b[ 21] = *(a01 + 21);
+		b[ 22] = *(a01 + 22);
+		b[ 23] = *(a01 + 23);
+		b[ 24] = *(a01 + 24);
+		b[ 25] = *(a01 + 25);
+		b[ 26] = *(a01 + 26);
+		b[ 27] = *(a01 + 27);
+		b[ 28] = *(a01 + 28);
+		b[ 29] = *(a01 + 29);
+		b[ 30] = *(a01 + 30);
+		b[ 31] = *(a01 + 31);
+
+	    a01 += lda;
+	    a02 += lda;
+	    a03 += lda;
+	    a04 += lda;
+	    a05 += lda;
+	    a06 += lda;
+	    a07 += lda;
+	    a08 += lda;
+		a09 += lda;
+	    a10 += lda;
+	    a11 += lda;
+	    a12 += lda;
+	    a13 += lda;
+	    a14 += lda;
+	    a15 += lda;
+	    a16 += lda;
+	    b += 32;
+	  }
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+		  b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+		  b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+	      b += 32;
+
+	      if (i >= 2) {
+		b[ 0] = *(a02 +  0);
+		b[ 1] = *(a02 +  1);
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+		b[  4] = ZERO;
+		b[  5] = ZERO;
+		b[  6] = ZERO;
+		b[  7] = ZERO;
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 3) {
+		b[ 0] = *(a03 +  0);
+		b[ 1] = *(a03 +  1);
+		b[ 2] = *(a03 +  2);
+		b[ 3] = *(a03 +  3);
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[  6] = ZERO;
+		b[  7] = ZERO;
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 4) {
+		b[ 0] = *(a04 +  0);
+		b[ 1] = *(a04 +  1);
+		b[ 2] = *(a04 +  2);
+		b[ 3] = *(a04 +  3);
+		b[ 4] = *(a04 +  4);
+		b[ 5] = *(a04 +  5);
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 5) {
+		b[ 0] = *(a05 +  0);
+		b[ 1] = *(a05 +  1);
+		b[ 2] = *(a05 +  2);
+		b[ 3] = *(a05 +  3);
+		b[ 4] = *(a05 +  4);
+		b[ 5] = *(a05 +  5);
+		b[ 6] = *(a05 +  6);
+		b[ 7] = *(a05 +  7);
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 6) {
+		b[ 0] = *(a06 + 0);
+		b[ 1] = *(a06 + 1);
+		b[ 2] = *(a06 + 2);
+		b[ 3] = *(a06 + 3);
+		b[ 4] = *(a06 + 4);
+		b[ 5] = *(a06 + 5);
+		b[ 6] = *(a06 + 6);
+		b[ 7] = *(a06 + 7);
+		b[ 8] = *(a06 + 8);
+		b[ 9] = *(a06 + 9);
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 7) {
+		b[ 0] = *(a07 + 0);
+		b[ 1] = *(a07 + 1);
+		b[ 2] = *(a07 + 2);
+		b[ 3] = *(a07 + 3);
+		b[ 4] = *(a07 + 4);
+		b[ 5] = *(a07 + 5);
+		b[ 6] = *(a07 + 6);
+		b[ 7] = *(a07 + 7);
+		b[ 8] = *(a07 + 8);
+		b[ 9] = *(a07 + 9);
+		b[10] = *(a07 + 10);
+		b[11] = *(a07 + 11);
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	    }
+
+		if (i >= 8) {
+	      b[  0] = *(a08 + 0);
+	      b[  1] = *(a08 + 1);
+	      b[  2] = *(a08 + 2);
+	      b[  3] = *(a08 + 3);
+	      b[  4] = *(a08 + 4);
+	      b[  5] = *(a08 + 5);
+	      b[  6] = *(a08 + 6);
+	      b[  7] = *(a08 + 7);
+	      b[  8] = *(a08 + 8);
+	      b[  9] = *(a08 + 9);
+	      b[ 10] = *(a08 + 10);
+	      b[ 11] = *(a08 + 11);
+	      b[ 12] = *(a08 + 12);
+	      b[ 13] = *(a08 + 13);
+#ifdef UNIT
+	      b[ 14] = ONE;
+	      b[ 15] = ZERO;
+#else
+	      b[ 14] = *(a08 +  14);
+	      b[ 15] = *(a08 +  15);
+#endif
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 9) {
+	      b[  0] = *(a09 + 0);
+	      b[  1] = *(a09 + 1);
+	      b[  2] = *(a09 + 2);
+	      b[  3] = *(a09 + 3);
+	      b[  4] = *(a09 + 4);
+	      b[  5] = *(a09 + 5);
+	      b[  6] = *(a09 + 6);
+	      b[  7] = *(a09 + 7);
+	      b[  8] = *(a09 + 8);
+	      b[  9] = *(a09 + 9);
+	      b[ 10] = *(a09 + 10);
+	      b[ 11] = *(a09 + 11);
+	      b[ 12] = *(a09 + 12);
+	      b[ 13] = *(a09 + 13);
+	      b[ 14] = *(a09 + 14);
+	      b[ 15] = *(a09 + 15);
+#ifdef UNIT
+	      b[ 16] = ONE;
+	      b[ 17] = ZERO;
+#else
+	      b[ 16] = *(a09 + 16);
+	      b[ 17] = *(a09 + 17);
+#endif
+	    b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 10) {
+	      b[  0] = *(a10 + 0);
+	      b[  1] = *(a10 + 1);
+	      b[  2] = *(a10 + 2);
+	      b[  3] = *(a10 + 3);
+	      b[  4] = *(a10 + 4);
+	      b[  5] = *(a10 + 5);
+	      b[  6] = *(a10 + 6);
+	      b[  7] = *(a10 + 7);
+	      b[  8] = *(a10 + 8);
+	      b[  9] = *(a10 + 9);
+	      b[ 10] = *(a10 + 10);
+	      b[ 11] = *(a10 + 11);
+	      b[ 12] = *(a10 + 12);
+	      b[ 13] = *(a10 + 13);
+	      b[ 14] = *(a10 + 14);
+	      b[ 15] = *(a10 + 15);
+	      b[ 16] = *(a10 + 16);
+	      b[ 17] = *(a10 + 17);
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a10 + 18);
+	      b[ 19] = *(a10 + 19);
+#endif
+	    b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 11) {
+	      b[  0] = *(a11 + 0);
+	      b[  1] = *(a11 + 1);
+	      b[  2] = *(a11 + 2);
+	      b[  3] = *(a11 + 3);
+	      b[  4] = *(a11 + 4);
+	      b[  5] = *(a11 + 5);
+	      b[  6] = *(a11 + 6);
+	      b[  7] = *(a11 + 7);
+	      b[  8] = *(a11 + 8);
+	      b[  9] = *(a11 + 9);
+	      b[ 10] = *(a11 + 10);
+	      b[ 11] = *(a11 + 11);
+	      b[ 12] = *(a11 + 12);
+	      b[ 13] = *(a11 + 13);
+	      b[ 14] = *(a11 + 14);
+	      b[ 15] = *(a11 + 15);
+	      b[ 16] = *(a11 + 16);
+	      b[ 17] = *(a11 + 17);
+	      b[ 18] = *(a11 + 18);
+	      b[ 19] = *(a11 + 19);
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a11 + 20);
+	      b[ 21] = *(a11 + 21);
+#endif
+	    b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 12) {
+	      b[  0] = *(a12 + 0);
+	      b[  1] = *(a12 + 1);
+	      b[  2] = *(a12 + 2);
+	      b[  3] = *(a12 + 3);
+	      b[  4] = *(a12 + 4);
+	      b[  5] = *(a12 + 5);
+	      b[  6] = *(a12 + 6);
+	      b[  7] = *(a12 + 7);
+	      b[  8] = *(a12 + 8);
+	      b[  9] = *(a12 + 9);
+	      b[ 10] = *(a12 + 10);
+	      b[ 11] = *(a12 + 11);
+	      b[ 12] = *(a12 + 12);
+	      b[ 13] = *(a12 + 13);
+	      b[ 14] = *(a12 + 14);
+	      b[ 15] = *(a12 + 15);
+	      b[ 16] = *(a12 + 16);
+	      b[ 17] = *(a12 + 17);
+	      b[ 18] = *(a12 + 18);
+	      b[ 19] = *(a12 + 19);
+	      b[ 20] = *(a12 + 20);
+	      b[ 21] = *(a12 + 21);
+#ifdef UNIT
+	      b[ 22] = ONE;
+	      b[ 23] = ZERO;
+#else
+	      b[ 22] = *(a12 + 22);
+	      b[ 23] = *(a12 + 23);
+#endif
+	    b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 13) {
+	      b[  0] = *(a13 + 0);
+	      b[  1] = *(a13 + 1);
+	      b[  2] = *(a13 + 2);
+	      b[  3] = *(a13 + 3);
+	      b[  4] = *(a13 + 4);
+	      b[  5] = *(a13 + 5);
+	      b[  6] = *(a13 + 6);
+	      b[  7] = *(a13 + 7);
+	      b[  8] = *(a13 + 8);
+	      b[  9] = *(a13 + 9);
+	      b[ 10] = *(a13 + 10);
+	      b[ 11] = *(a13 + 11);
+	      b[ 12] = *(a13 + 12);
+	      b[ 13] = *(a13 + 13);
+	      b[ 14] = *(a13 + 14);
+	      b[ 15] = *(a13 + 15);
+	      b[ 16] = *(a13 + 16);
+	      b[ 17] = *(a13 + 17);
+	      b[ 18] = *(a13 + 18);
+	      b[ 19] = *(a13 + 19);
+	      b[ 20] = *(a13 + 20);
+	      b[ 21] = *(a13 + 21);
+	      b[ 22] = *(a13 + 22);
+	      b[ 23] = *(a13 + 23);
+#ifdef UNIT
+	      b[ 24] = ONE;
+	      b[ 25] = ZERO;
+#else
+	      b[ 24] = *(a13 + 24);
+	      b[ 25] = *(a13 + 25);
+#endif
+	    b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 14) {
+	      b[  0] = *(a14 + 0);
+	      b[  1] = *(a14 + 1);
+	      b[  2] = *(a14 + 2);
+	      b[  3] = *(a14 + 3);
+	      b[  4] = *(a14 + 4);
+	      b[  5] = *(a14 + 5);
+	      b[  6] = *(a14 + 6);
+	      b[  7] = *(a14 + 7);
+	      b[  8] = *(a14 + 8);
+	      b[  9] = *(a14 + 9);
+	      b[ 10] = *(a14 + 10);
+	      b[ 11] = *(a14 + 11);
+	      b[ 12] = *(a14 + 12);
+	      b[ 13] = *(a14 + 13);
+	      b[ 14] = *(a14 + 14);
+	      b[ 15] = *(a14 + 15);
+	      b[ 16] = *(a14 + 16);
+	      b[ 17] = *(a14 + 17);
+	      b[ 18] = *(a14 + 18);
+	      b[ 19] = *(a14 + 19);
+	      b[ 20] = *(a14 + 20);
+	      b[ 21] = *(a14 + 21);
+	      b[ 22] = *(a14 + 22);
+	      b[ 23] = *(a14 + 23);
+	      b[ 24] = *(a14 + 24);
+	      b[ 25] = *(a14 + 25);
+#ifdef UNIT
+	      b[ 26] = ONE;
+	      b[ 27] = ZERO;
+#else
+	      b[ 26] = *(a14 + 26);
+	      b[ 27] = *(a14 + 27);
+#endif
+	    b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 15) {
+	      b[  0] = *(a15 + 0);
+	      b[  1] = *(a15 + 1);
+	      b[  2] = *(a15 + 2);
+	      b[  3] = *(a15 + 3);
+	      b[  4] = *(a15 + 4);
+	      b[  5] = *(a15 + 5);
+	      b[  6] = *(a15 + 6);
+	      b[  7] = *(a15 + 7);
+	      b[  8] = *(a15 + 8);
+	      b[  9] = *(a15 + 9);
+	      b[ 10] = *(a15 + 10);
+	      b[ 11] = *(a15 + 11);
+	      b[ 12] = *(a15 + 12);
+	      b[ 13] = *(a15 + 13);
+	      b[ 14] = *(a15 + 14);
+	      b[ 15] = *(a15 + 15);
+	      b[ 16] = *(a15 + 16);
+	      b[ 17] = *(a15 + 17);
+	      b[ 18] = *(a15 + 18);
+	      b[ 19] = *(a15 + 19);
+	      b[ 20] = *(a15 + 20);
+	      b[ 21] = *(a15 + 21);
+	      b[ 22] = *(a15 + 22);
+	      b[ 23] = *(a15 + 23);
+	      b[ 24] = *(a15 + 24);
+	      b[ 25] = *(a15 + 25);
+	      b[ 26] = *(a15 + 26);
+	      b[ 27] = *(a15 + 27);
+#ifdef UNIT
+	      b[ 28] = ONE;
+	      b[ 29] = ZERO;
+#else
+	      b[ 28] = *(a15 + 28);
+	      b[ 29] = *(a15 + 29);
+#endif
+	      b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+	      b += 32;
+	    }
+	  }
+      }
+
+      posY += 16;
+      js --;
+    } while (js > 0);
+  } /* End of main loop */
+
+
+  if (n & 8){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY +  0) * lda;
+	a02 = a + posX * 2 + (posY +  1) * lda;
+	a03 = a + posX * 2 + (posY +  2) * lda;
+	a04 = a + posX * 2 + (posY +  3) * lda;
+	a05 = a + posX * 2 + (posY +  4) * lda;
+	a06 = a + posX * 2 + (posY +  5) * lda;
+	a07 = a + posX * 2 + (posY +  6) * lda;
+	a08 = a + posX * 2 + (posY +  7) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX +  0) * lda;
+	a02 = a + posY * 2 + (posX +  1) * lda;
+	a03 = a + posY * 2 + (posX +  2) * lda;
+	a04 = a + posY * 2 + (posX +  3) * lda;
+	a05 = a + posY * 2 + (posX +  4) * lda;
+	a06 = a + posY * 2 + (posX +  5) * lda;
+	a07 = a + posY * 2 + (posX +  6) * lda;
+	a08 = a + posY * 2 + (posX +  7) * lda;
+      }
+
+      i = (m >> 3);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	    a01 += 16;
+	    a02 += 16;
+	    a03 += 16;
+	    a04 += 16;
+	    a05 += 16;
+	    a06 += 16;
+	    a07 += 16;
+	    a08 += 16;
+	    b += 128;
+	  } else
+	    if (X > posY) {
+	      for (ii = 0; ii < 8; ii++){
+
+		b[  0] = *(a01 +  0);
+		b[  1] = *(a01 +  1);
+		b[  2] = *(a01 +  2);
+		b[  3] = *(a01 +  3);
+		b[  4] = *(a01 +  4);
+		b[  5] = *(a01 +  5);
+		b[  6] = *(a01 +  6);
+		b[  7] = *(a01 +  7);
+
+		b[  8] = *(a01 +  8);
+		b[  9] = *(a01 +  9);
+		b[ 10] = *(a01 + 10);
+		b[ 11] = *(a01 + 11);
+		b[ 12] = *(a01 + 12);
+		b[ 13] = *(a01 + 13);
+		b[ 14] = *(a01 + 14);
+		b[ 15] = *(a01 + 15);
+
+		a01 += lda;
+		b += 16;
+	      }
+		  a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+
+	      b[ 16] = *(a02 +  0);
+	      b[ 17] = *(a02 +  1);
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a02 +  2);
+	      b[ 19] = *(a02 +  3);
+#endif
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+	      b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+
+	      b[ 32] = *(a03 +  0);
+	      b[ 33] = *(a03 +  1);
+	      b[ 34] = *(a03 +  2);
+	      b[ 35] = *(a03 +  3);
+#ifdef UNIT
+	      b[ 36] = ONE;
+	      b[ 37] = ZERO;
+#else
+	      b[ 36] = *(a03 +  4);
+	      b[ 37] = *(a03 +  5);
+#endif
+	      b[ 38] = ZERO;
+	      b[ 39] = ZERO;
+	      b[ 40] = ZERO;
+	      b[ 41] = ZERO;
+	      b[ 42] = ZERO;
+	      b[ 43] = ZERO;
+	      b[ 44] = ZERO;
+	      b[ 45] = ZERO;
+	      b[ 46] = ZERO;
+	      b[ 47] = ZERO;
+
+	      b[ 48] = *(a04 +  0);
+	      b[ 49] = *(a04 +  1);
+	      b[ 50] = *(a04 +  2);
+	      b[ 51] = *(a04 +  3);
+	      b[ 52] = *(a04 +  4);
+	      b[ 53] = *(a04 +  5);
+#ifdef UNIT
+	      b[ 54] = ONE;
+	      b[ 55] = ZERO;
+#else
+	      b[ 54] = *(a04 +  6);
+	      b[ 55] = *(a04 +  7);
+#endif
+	      b[ 56] = ZERO;
+	      b[ 57] = ZERO;
+	      b[ 58] = ZERO;
+	      b[ 59] = ZERO;
+	      b[ 60] = ZERO;
+	      b[ 61] = ZERO;
+	      b[ 62] = ZERO;
+	      b[ 63] = ZERO;
+
+	      b[ 64] = *(a05 +  0);
+	      b[ 65] = *(a05 +  1);
+	      b[ 66] = *(a05 +  2);
+	      b[ 67] = *(a05 +  3);
+	      b[ 68] = *(a05 +  4);
+	      b[ 69] = *(a05 +  5);
+	      b[ 70] = *(a05 +  6);
+	      b[ 71] = *(a05 +  7);
+#ifdef UNIT
+	      b[ 72] = ONE;
+	      b[ 73] = ZERO;
+#else
+	      b[ 72] = *(a05 +  8);
+	      b[ 73] = *(a05 +  9);
+#endif
+	      b[ 74] = ZERO;
+	      b[ 75] = ZERO;
+	      b[ 76] = ZERO;
+	      b[ 77] = ZERO;
+	      b[ 78] = ZERO;
+	      b[ 79] = ZERO;
+
+	      b[ 80] = *(a06 +  0);
+	      b[ 81] = *(a06 +  1);
+	      b[ 82] = *(a06 +  2);
+	      b[ 83] = *(a06 +  3);
+	      b[ 84] = *(a06 +  4);
+	      b[ 85] = *(a06 +  5);
+	      b[ 86] = *(a06 +  6);
+	      b[ 87] = *(a06 +  7);
+	      b[ 88] = *(a06 +  8);
+	      b[ 89] = *(a06 +  9);
+#ifdef UNIT
+	      b[ 90] = ONE;
+	      b[ 91] = ZERO;
+#else
+	      b[ 90] = *(a06 + 10);
+	      b[ 91] = *(a06 + 11);
+#endif
+	      b[ 92] = ZERO;
+	      b[ 93] = ZERO;
+	      b[ 94] = ZERO;
+	      b[ 95] = ZERO;
+
+	      b[ 96] = *(a07 +  0);
+	      b[ 97] = *(a07 +  1);
+	      b[ 98] = *(a07 +  2);
+	      b[ 99] = *(a07 +  3);
+	      b[100] = *(a07 +  4);
+	      b[101] = *(a07 +  5);
+	      b[102] = *(a07 +  6);
+	      b[103] = *(a07 +  7);
+	      b[104] = *(a07 +  8);
+	      b[105] = *(a07 +  9);
+	      b[106] = *(a07 + 10);
+	      b[107] = *(a07 + 11);
+#ifdef UNIT
+	      b[108] = ONE;
+	      b[109] = ZERO;
+#else
+	      b[108] = *(a07 + 12);
+	      b[109] = *(a07 + 13);
+#endif
+	      b[110] = ZERO;
+	      b[111] = ZERO;
+
+	      b[112] = *(a08 +  0);
+	      b[113] = *(a08 +  1);
+	      b[114] = *(a08 +  2);
+	      b[115] = *(a08 +  3);
+	      b[116] = *(a08 +  4);
+	      b[117] = *(a08 +  5);
+	      b[118] = *(a08 +  6);
+	      b[119] = *(a08 +  7);
+	      b[120] = *(a08 +  8);
+	      b[121] = *(a08 +  9);
+	      b[122] = *(a08 + 10);
+	      b[123] = *(a08 + 11);
+	      b[124] = *(a08 + 12);
+	      b[125] = *(a08 + 13);
+#ifdef UNIT
+	      b[126] = ONE;
+	      b[127] = ZERO;
+#else
+	      b[126] = *(a08 + 14);
+	      b[127] = *(a08 + 15);
+#endif
+
+	      a01 += 8 * lda;
+	      a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+	      b += 128;
+	    }
+
+	  X += 8;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i) {
+
+	if (X < posY) {
+	  /* a01 += 2 * i;
+	    a02 += 2 * i;
+	    a03 += 2 * i;
+	    a04 += 2 * i;
+	    a05 += 2 * i;
+	    a06 += 2 * i;
+	    a07 += 2 * i;
+	    a08 += 2 * i; */
+	    b += 16 * i;
+	} else
+	  if (X > posY) {
+	      for (ii = 0; ii < i; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+	      a01 += lda;
+	      a02 += lda;
+	      a03 += lda;
+	      a04 += lda;
+	      a05 += lda;
+	      a06 += lda;
+	      a07 += lda;
+	      a08 += lda;
+	      b += 16;
+	    }
+	  } else {
+#ifdef UNIT
+	    b[ 0] = ONE;
+	    b[ 1] = ZERO;
+#else
+	    b[ 0] = *(a01 +  0);
+	    b[ 1] = *(a01 +  1);
+#endif
+	    b[ 2] = ZERO;
+	    b[ 3] = ZERO;
+	    b[ 4] = ZERO;
+	    b[ 5] = ZERO;
+	    b[ 6] = ZERO;
+	    b[ 7] = ZERO;
+	    b[ 8] = ZERO;
+	    b[ 9] = ZERO;
+	    b[10] = ZERO;
+	    b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[14] = ZERO;
+	    b[15] = ZERO;
+	    b += 16;
+
+	    if(i >= 2) {
+	      b[ 0] = *(a02 +  0);
+	      b[ 1] = *(a02 +  1);
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = *(a03 +  0);
+	      b[ 1] = *(a03 +  1);
+	      b[ 2] = *(a03 +  2);
+	      b[ 3] = *(a03 +  3);
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 4) {
+	      b[ 0] = *(a04 +  0);
+	      b[ 1] = *(a04 +  1);
+	      b[ 2] = *(a04 +  2);
+	      b[ 3] = *(a04 +  3);
+	      b[ 4] = *(a04 +  4);
+	      b[ 5] = *(a04 +  5);
+#ifdef UNIT
+	      b[ 6] = ONE;
+	      b[ 7] = ZERO;
+#else
+	      b[ 6] = *(a04 +  6);
+	      b[ 7] = *(a04 +  7);
+#endif
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 5) {
+	      b[ 0] = *(a05 +  0);
+	      b[ 1] = *(a05 +  1);
+	      b[ 2] = *(a05 +  2);
+	      b[ 3] = *(a05 +  3);
+	      b[ 4] = *(a05 +  4);
+	      b[ 5] = *(a05 +  5);
+	      b[ 6] = *(a05 +  6);
+	      b[ 7] = *(a05 +  7);
+#ifdef UNIT
+	      b[ 8] = ONE;
+	      b[ 9] = ZERO;
+#else
+	      b[ 8] = *(a05 +  8);
+	      b[ 9] = *(a05 +  9);
+#endif
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 6) {
+	      b[ 0] = *(a06 +  0);
+	      b[ 1] = *(a06 +  1);
+	      b[ 2] = *(a06 +  2);
+	      b[ 3] = *(a06 +  3);
+	      b[ 4] = *(a06 +  4);
+	      b[ 5] = *(a06 +  5);
+	      b[ 6] = *(a06 +  6);
+	      b[ 7] = *(a06 +  7);
+	      b[ 8] = *(a06 +  8);
+	      b[ 9] = *(a06 +  9);
+#ifdef UNIT
+	      b[10] = ONE;
+	      b[11] = ZERO;
+#else
+	      b[10] = *(a06 + 10);
+	      b[11] = *(a06 + 11);
+#endif
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 7) {
+	      b[ 0] = *(a07 +  0);
+	      b[ 1] = *(a07 +  1);
+	      b[ 2] = *(a07 +  2);
+	      b[ 3] = *(a07 +  3);
+	      b[ 4] = *(a07 +  4);
+	      b[ 5] = *(a07 +  5);
+	      b[ 6] = *(a07 +  6);
+	      b[ 7] = *(a07 +  7);
+	      b[ 8] = *(a07 +  8);
+	      b[ 9] = *(a07 +  9);
+	      b[10] = *(a07 + 10);
+	      b[11] = *(a07 + 11);
+#ifdef UNIT
+	      b[12] = ONE;
+	      b[13] = ZERO;
+#else
+	      b[12] = *(a07 + 12);
+	      b[13] = *(a07 + 13);
+#endif
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+	  }
+      }
+
+      posY += 8;
+  }
+
+
+  if (n & 4){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+      a02 = a + posX * 2 + (posY +  1) * lda;
+      a03 = a + posX * 2 + (posY +  2) * lda;
+      a04 = a + posX * 2 + (posY +  3) * lda;
+    } else {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+      a02 = a + posY * 2 + (posX +  1) * lda;
+      a03 = a + posY * 2 + (posX +  2) * lda;
+      a04 = a + posY * 2 + (posX +  3) * lda;
+    }
+
+    i = (m >> 2);
+    if (i > 0) {
+      do {
+	if (X < posY) {
+	  a01 += 8;
+	  a02 += 8;
+	  a03 += 8;
+	  a04 += 8;
+	  b += 32;
+	} else
+	  if (X > posY) {
+
+	    for (ii = 0; ii < 4; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      a01 += lda;
+	      b += 8;
+	    }
+
+	    a02 += 4 * lda;
+	    a03 += 4 * lda;
+	    a04 += 4 * lda;
+
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = ZERO;
+	    b[  3] = ZERO;
+	    b[  4] = ZERO;
+	    b[  5] = ZERO;
+	    b[  6] = ZERO;
+	    b[  7] = ZERO;
+
+	    b[  8] = *(a02 +  0);
+	    b[  9] = *(a02 +  1);
+#ifdef UNIT
+	    b[ 10] = ONE;
+	    b[ 11] = ZERO;
+#else
+	    b[ 10] = *(a02 +  2);
+	    b[ 11] = *(a02 +  3);
+#endif
+	    b[ 12] = ZERO;
+	    b[ 13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+
+	    b[ 16] = *(a03 +  0);
+	    b[ 17] = *(a03 +  1);
+	    b[ 18] = *(a03 +  2);
+	    b[ 19] = *(a03 +  3);
+#ifdef UNIT
+	    b[ 20] = ONE;
+	    b[ 21] = ZERO;
+#else
+	    b[ 20] = *(a03 +  4);
+	    b[ 21] = *(a03 +  5);
+#endif
+	    b[ 22] = ZERO;
+	    b[ 23] = ZERO;
+
+	    b[ 24] = *(a04 +  0);
+	    b[ 25] = *(a04 +  1);
+	    b[ 26] = *(a04 +  2);
+	    b[ 27] = *(a04 +  3);
+	    b[ 28] = *(a04 +  4);
+	    b[ 29] = *(a04 +  5);
+#ifdef UNIT
+	    b[ 30] = ONE;
+	    b[ 31] = ZERO;
+#else
+	    b[ 30] = *(a04 +  6);
+	    b[ 31] = *(a04 +  7);
+#endif
+
+	    a01 += 4 * lda;
+	    a02 += 4 * lda;
+	    a03 += 4 * lda;
+	    a04 += 4 * lda;
+	    b += 32;
+	  }
+
+	X += 4;
+	i --;
+      } while (i > 0);
+    }
+
+    i = (m & 3);
+    if (i) {
+
+      if (X < posY) {
+	/* a01 += 2 * i;
+	a02 += 2 * i;
+	a03 += 2 * i;
+	a04 += 2 * i; */
+	b += 8 * i;
+      } else
+	if (X > posY) {
+
+	  for (ii = 0; ii < i; ii++){
+
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a01 +  2);
+	    b[  3] = *(a01 +  3);
+	    b[  4] = *(a01 +  4);
+	    b[  5] = *(a01 +  5);
+	    b[  6] = *(a01 +  6);
+	    b[  7] = *(a01 +  7);
+
+	    a01 += lda;
+	    a02 += lda;
+	    a03 += lda;
+	    a04 += lda;
+	    b += 8;
+	  }
+	} else {
+
+#ifdef UNIT
+	  b[ 0] = ONE;
+	  b[ 1] = ZERO;
+#else
+	  b[ 0] = *(a01 +  0);
+	  b[ 1] = *(a01 +  1);
+#endif
+	  b[ 2] = ZERO;
+	  b[ 3] = ZERO;
+	  b[ 4] = ZERO;
+	  b[ 5] = ZERO;
+	  b[ 6] = ZERO;
+	  b[ 7] = ZERO;
+	  b += 8;
+
+	  if(i >= 2) {
+	    b[ 0] = *(a02 +  0);
+	    b[ 1] = *(a02 +  1);
+#ifdef UNIT
+	    b[ 2] = ONE;
+	    b[ 3] = ZERO;
+#else
+	    b[ 2] = *(a02 +  2);
+	    b[ 3] = *(a02 +  3);
+#endif
+	    b[ 4] = ZERO;
+	    b[ 5] = ZERO;
+	    b[ 6] = ZERO;
+	    b[ 7] = ZERO;
+	    b += 8;
+	  }
+
+	  if (i >= 3) {
+	    b[ 0] = *(a03 +  0);
+	    b[ 1] = *(a03 +  1);
+	    b[ 2] = *(a03 +  2);
+	    b[ 3] = *(a03 +  3);
+#ifdef UNIT
+	    b[ 4] = ONE;
+	    b[ 5] = ZERO;
+#else
+	    b[ 4] = *(a03 +  4);
+	    b[ 5] = *(a03 +  5);
+#endif
+	    b[ 6] = ZERO;
+	    b[ 7] = ZERO;
+	    b += 8;
+	  }
+	}
+    }
+
+    posY += 4;
+  }
+
+
+  if (n & 2){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+      a02 = a + posX * 2 + (posY +  1) * lda;
+    } else {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+      a02 = a + posY * 2 + (posX +  1) * lda;
+    }
+
+    i = (m >> 1);
+    if (i > 0) {
+      do {
+	if (X < posY) {
+	    a01 += 4;
+	    a02 += 4;
+	    b += 8;
+	} else
+	  if (X > posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a01 +  2);
+	    b[  3] = *(a01 +  3);
+	    b[  4] = *(a02 +  0);
+	    b[  5] = *(a02 +  1);
+	    b[  6] = *(a02 +  2);
+	    b[  7] = *(a02 +  3);
+
+	    a01 += 2 * lda;
+	    a02 += 2 * lda;
+	    b += 8;
+	  } else {
+
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = ZERO;
+	    b[  3] = ZERO;
+
+	    b[  4] = *(a02 +  0);
+	    b[  5] = *(a02 +  1);
+#ifdef UNIT
+	    b[  6] = ONE;
+	    b[  7] = ZERO;
+#else
+	    b[  6] = *(a02 +  2);
+	    b[  7] = *(a02 +  3);
+#endif
+
+	    a01 += 2 * lda;
+	    a02 += 2 * lda;
+	    b += 8;
+	  }
+
+	X += 2;
+	i --;
+      } while (i > 0);
+    }
+
+    i = (m & 1);
+    if (i) {
+
+      if (X < posY) {
+	b += 4;
+      } else
+	if (X > posY) {
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+	  b[  2] = *(a01 +  2);
+	  b[  3] = *(a01 +  3);
+	  b += 4;
+	  }
+#if 1
+	}
+#else
+	} else {
+#ifdef UNIT
+	  b[ 0] = ONE;
+	  b[ 1] = ZERO;
+#else
+	  b[ 0] = *(a01 +  0);
+	  b[ 1] = *(a01 +  1);
+#endif
+	  b[ 2] = *(a02 +  0);
+	  b[ 3] = *(a02 +  1);
+	  b += 4;
+	}
+#endif
+    posY += 2;
+  }
+
+  if (n & 1){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+    } else {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+    }
+
+    i = m;
+    if (m > 0) {
+      do {
+	if (X < posY) {
+	  a01 += 2;
+	} else {
+#ifdef UNIT
+	  if (X > posY) {
+#endif
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#ifdef UNIT
+	  } else {
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+	  }
+#endif
+	  a01 += lda;
+	}
+	b += 2;
+	X ++;
+	i --;
+      } while (i > 0);
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrsm_lncopy_16.c b/kernel/generic/ztrsm_lncopy_16.c
new file mode 100644
index 000000000..4fd72c13e
--- /dev/null
+++ b/kernel/generic/ztrsm_lncopy_16.c
@@ -0,0 +1,308 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj, k;
+
+  FLOAT *a1, *a2,  *a3,  *a4,  *a5,  *a6,  *a7,  *a8;
+  FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  FLOAT data1, data2;
+
+  lda *= 2;
+  jj = offset;
+
+  j = (n >> 4);
+  while (j > 0){
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a5  = a +  4 * lda;
+    a6  = a +  5 * lda;
+    a7  = a +  6 * lda;
+    a8  = a +  7 * lda;
+    a9  = a +  8 * lda;
+    a10 = a +  9 * lda;
+    a11 = a + 10 * lda;
+    a12 = a + 11 * lda;
+    a13 = a + 12 * lda;
+    a14 = a + 13 * lda;
+    a15 = a + 14 * lda;
+    a16 = a + 15 * lda;
+
+    a += 16 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 16)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 16) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+	*(b +  8) = *(a5  +  0);
+	*(b +  9) = *(a5  +  1);
+	*(b + 10) = *(a6  +  0);
+	*(b + 11) = *(a6  +  1);
+	*(b + 12) = *(a7  +  0);
+	*(b + 13) = *(a7  +  1);
+	*(b + 14) = *(a8  +  0);
+	*(b + 15) = *(a8  +  1);
+  *(b + 16) = *(a9  +  0);
+	*(b + 17) = *(a9  +  1);
+	*(b + 18) = *(a10  +  0);
+	*(b + 19) = *(a10  +  1);
+	*(b + 20) = *(a11  +  0);
+	*(b + 21) = *(a11  +  1);
+	*(b + 22) = *(a12  +  0);
+	*(b + 23) = *(a12  +  1);
+	*(b + 24) = *(a13  +  0);
+	*(b + 25) = *(a13  +  1);
+	*(b + 26) = *(a14  +  0);
+	*(b + 27) = *(a14  +  1);
+	*(b + 28) = *(a15  +  0);
+	*(b + 29) = *(a15  +  1);
+	*(b + 30) = *(a16  +  0);
+	*(b + 31) = *(a16  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      a5  += 2;
+      a6  += 2;
+      a7  += 2;
+      a8  += 2;
+      a9  += 2;
+      a10  += 2;
+      a11  += 2;
+      a12  += 2;
+      a13  += 2;
+      a14  += 2;
+      a15  += 2;
+      a16  += 2;
+      b  += 32;
+      ii ++;
+    }
+
+    jj += 16;
+    j --;
+  }
+
+  if (n & 8) {
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a5  = a +  4 * lda;
+    a6  = a +  5 * lda;
+    a7  = a +  6 * lda;
+    a8  = a +  7 * lda;
+
+    a += 8 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 8)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 8) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+	*(b +  8) = *(a5  +  0);
+	*(b +  9) = *(a5  +  1);
+	*(b + 10) = *(a6  +  0);
+	*(b + 11) = *(a6  +  1);
+	*(b + 12) = *(a7  +  0);
+	*(b + 13) = *(a7  +  1);
+	*(b + 14) = *(a8  +  0);
+	*(b + 15) = *(a8  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      a5  += 2;
+      a6  += 2;
+      a7  += 2;
+      a8  += 2;
+      b  += 16;
+      ii ++;
+    }
+
+    jj += 8;
+  }
+
+  if (n & 4) {
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a += 4 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 4)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 4) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      b  += 8;
+      ii ++;
+    }
+
+    jj += 4;
+  }
+
+  if (n & 2) {
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a += 2 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 2)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 2) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      b  += 4;
+      ii ++;
+    }
+
+    jj += 2;
+  }
+
+  if (n & 1) {
+
+    a1  = a +  0 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 1)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 1) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+      }
+
+      a1  += 2;
+      b  += 2;
+      ii ++;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrsm_ltcopy_16.c b/kernel/generic/ztrsm_ltcopy_16.c
new file mode 100644
index 000000000..e9aeae1ad
--- /dev/null
+++ b/kernel/generic/ztrsm_ltcopy_16.c
@@ -0,0 +1,264 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj, k;
+
+  FLOAT *a1;
+  FLOAT data1, data2;
+
+  lda *= 2;
+  jj = offset;
+
+  j = (n >> 4);
+  while (j > 0){
+
+    a1 = a;
+    a += 32;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 16)) {
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 16; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+	*(b +  8) = *(a1 +  8);
+	*(b +  9) = *(a1 +  9);
+	*(b + 10) = *(a1 + 10);
+	*(b + 11) = *(a1 + 11);
+	*(b + 12) = *(a1 + 12);
+	*(b + 13) = *(a1 + 13);
+	*(b + 14) = *(a1 + 14);
+	*(b + 15) = *(a1 + 15);
+  *(b + 16) = *(a1 + 16);
+	*(b + 17) = *(a1 + 17);
+	*(b + 18) = *(a1 + 18);
+	*(b + 19) = *(a1 + 19);
+	*(b + 20) = *(a1 + 20);
+	*(b + 21) = *(a1 + 21);
+	*(b + 22) = *(a1 + 22);
+	*(b + 23) = *(a1 + 23);
+	*(b + 24) = *(a1 + 24);
+	*(b + 25) = *(a1 + 25);
+	*(b + 26) = *(a1 + 26);
+	*(b + 27) = *(a1 + 27);
+	*(b + 28) = *(a1 + 28);
+	*(b + 29) = *(a1 + 29);
+	*(b + 30) = *(a1 + 30);
+	*(b + 31) = *(a1 + 31);
+      }
+
+      b  += 32;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 16;
+    j --;
+  }
+
+  j = (n & 8);
+  if (j > 0) {
+    a1 = a;
+    a += 16;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 8)) {
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 8; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+	*(b +  8) = *(a1 +  8);
+	*(b +  9) = *(a1 +  9);
+	*(b + 10) = *(a1 + 10);
+	*(b + 11) = *(a1 + 11);
+	*(b + 12) = *(a1 + 12);
+	*(b + 13) = *(a1 + 13);
+	*(b + 14) = *(a1 + 14);
+	*(b + 15) = *(a1 + 15);
+      }
+
+      b  += 16;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 8;
+  }
+
+  j = (n & 4);
+  if (j > 0) {
+
+    a1 = a;
+    a += 8;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 4)) {
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 4; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+      }
+
+      b  += 8;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 4;
+  }
+
+  j = (n & 2);
+  if (j > 0) {
+
+    a1 = a;
+    a += 4;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 2)) {
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 2; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+      }
+
+      b  += 4;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 2;
+  }
+
+  j = (n & 1);
+  if (j > 0) {
+
+    a1 = a;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 1)) {
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+      }
+
+      b  += 2;
+      a1 += lda;
+      ii ++;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrsm_uncopy_16.c b/kernel/generic/ztrsm_uncopy_16.c
new file mode 100644
index 000000000..e84d96891
--- /dev/null
+++ b/kernel/generic/ztrsm_uncopy_16.c
@@ -0,0 +1,313 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj, k;
+
+  FLOAT *a1, *a2,  *a3,  *a4,  *a5,  *a6,  *a7,  *a8;
+  FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  FLOAT data1, data2;
+
+  lda *= 2;
+  jj = offset;
+
+  j = (n >> 4);
+  while (j > 0){
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a5  = a +  4 * lda;
+    a6  = a +  5 * lda;
+    a7  = a +  6 * lda;
+    a8  = a +  7 * lda;
+    a9  = a +  8 * lda;
+    a10 = a +  9 * lda;
+    a11 = a + 10 * lda;
+    a12 = a + 11 * lda;
+    a13 = a + 12 * lda;
+    a14 = a + 13 * lda;
+    a15 = a + 14 * lda;
+    a16 = a + 15 * lda;
+
+    a += 16 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 16)) {
+
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 16; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+	*(b +  8) = *(a5  +  0);
+	*(b +  9) = *(a5  +  1);
+	*(b + 10) = *(a6  +  0);
+	*(b + 11) = *(a6  +  1);
+	*(b + 12) = *(a7  +  0);
+	*(b + 13) = *(a7  +  1);
+	*(b + 14) = *(a8  +  0);
+	*(b + 15) = *(a8  +  1);
+  *(b + 16) = *(a9  +  0);
+	*(b + 17) = *(a9  +  1);
+	*(b + 18) = *(a10  +  0);
+	*(b + 19) = *(a10  +  1);
+	*(b + 20) = *(a11  +  0);
+	*(b + 21) = *(a11  +  1);
+	*(b + 22) = *(a12  +  0);
+	*(b + 23) = *(a12  +  1);
+	*(b + 24) = *(a13  +  0);
+	*(b + 25) = *(a13  +  1);
+	*(b + 26) = *(a14  +  0);
+	*(b + 27) = *(a14  +  1);
+	*(b + 28) = *(a15  +  0);
+	*(b + 29) = *(a15  +  1);
+	*(b + 30) = *(a16  +  0);
+	*(b + 31) = *(a16  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      a5  += 2;
+      a6  += 2;
+      a7  += 2;
+      a8  += 2;
+      a9  += 2;
+      a10  += 2;
+      a11  += 2;
+      a12  += 2;
+      a13  += 2;
+      a14  += 2;
+      a15  += 2;
+      a16  += 2;
+      b  += 32;
+      ii ++;
+    }
+
+    jj += 16;
+    j --;
+  }
+
+  if (n & 8) {
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a5  = a +  4 * lda;
+    a6  = a +  5 * lda;
+    a7  = a +  6 * lda;
+    a8  = a +  7 * lda;
+
+    a += 8 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 8)) {
+
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 8; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+    }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+	*(b +  8) = *(a5  +  0);
+	*(b +  9) = *(a5  +  1);
+	*(b + 10) = *(a6  +  0);
+	*(b + 11) = *(a6  +  1);
+	*(b + 12) = *(a7  +  0);
+	*(b + 13) = *(a7  +  1);
+	*(b + 14) = *(a8  +  0);
+	*(b + 15) = *(a8  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      a5  += 2;
+      a6  += 2;
+      a7  += 2;
+      a8  += 2;
+      b  += 16;
+      ii ++;
+    }
+
+    jj += 8;
+  }
+
+  if (n & 4) {
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a += 4 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 4)) {
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 4; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      b  += 8;
+      ii ++;
+    }
+
+    jj += 4;
+  }
+
+  if (n & 2) {
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a += 2 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 2)) {
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+	for (k = ii - jj + 1; k < 2; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      b  += 4;
+      ii ++;
+    }
+
+    jj += 2;
+  }
+
+  if (n & 1) {
+
+    a1  = a +  0 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 1)) {
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+	for (k = ii - jj + 1; k < 1; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+      }
+
+      a1  += 2;
+      b  += 2;
+      ii ++;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrsm_utcopy_16.c b/kernel/generic/ztrsm_utcopy_16.c
new file mode 100644
index 000000000..efcea5c3f
--- /dev/null
+++ b/kernel/generic/ztrsm_utcopy_16.c
@@ -0,0 +1,261 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+  BLASLONG i, ii, j, jj, k;
+
+  FLOAT *a1, data1, data2;
+
+  lda *= 2;
+
+  jj = offset;
+
+  j = (n >> 4);
+  while (j > 0){
+
+    a1 = a;
+    a += 32;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 16)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 16) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+	*(b +  8) = *(a1 +  8);
+	*(b +  9) = *(a1 +  9);
+	*(b + 10) = *(a1 + 10);
+	*(b + 11) = *(a1 + 11);
+	*(b + 12) = *(a1 + 12);
+	*(b + 13) = *(a1 + 13);
+	*(b + 14) = *(a1 + 14);
+	*(b + 15) = *(a1 + 15);
+  *(b + 16) = *(a1 + 16);
+	*(b + 17) = *(a1 + 17);
+	*(b + 18) = *(a1 + 18);
+	*(b + 19) = *(a1 + 19);
+	*(b + 20) = *(a1 + 20);
+	*(b + 21) = *(a1 + 21);
+	*(b + 22) = *(a1 + 22);
+	*(b + 23) = *(a1 + 23);
+	*(b + 24) = *(a1 + 24);
+	*(b + 25) = *(a1 + 25);
+	*(b + 26) = *(a1 + 26);
+	*(b + 27) = *(a1 + 27);
+	*(b + 28) = *(a1 + 28);
+	*(b + 29) = *(a1 + 29);
+	*(b + 30) = *(a1 + 30);
+	*(b + 31) = *(a1 + 31);
+      }
+
+      b  += 32;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 16;
+    j --;
+  }
+
+  j = (n & 8);
+  if (j > 0) {
+    a1 = a;
+    a += 16;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 8)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 8) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+	*(b +  8) = *(a1 +  8);
+	*(b +  9) = *(a1 +  9);
+	*(b + 10) = *(a1 + 10);
+	*(b + 11) = *(a1 + 11);
+	*(b + 12) = *(a1 + 12);
+	*(b + 13) = *(a1 + 13);
+	*(b + 14) = *(a1 + 14);
+	*(b + 15) = *(a1 + 15);
+      }
+
+      b  += 16;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 8;
+  }
+
+  j = (n & 4);
+  if (j > 0) {
+
+    a1 = a;
+    a += 8;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 4)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 4) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+      }
+
+      b  += 8;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 4;
+  }
+
+  j = (n & 2);
+  if (j > 0) {
+
+    a1 = a;
+    a += 4;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 2)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 2) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+      }
+
+      b  += 4;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 2;
+  }
+
+  j = (n & 1);
+  if (j > 0) {
+
+    a1 = a;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 1)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 1) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+      }
+
+      b  += 2;
+      a1 += lda;
+      ii ++;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 17d15656a..ce9268b93 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -111,9 +111,13 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 SGEMVNKERNEL = sgemv_n_8_lasx.S
 SGEMVTKERNEL = sgemv_t_8_lasx.S
 
-CGEMMKERNEL  = cgemm_kernel_2x2_lsx.S
-CGEMMONCOPY  = cgemm_ncopy_2_lsx.S
-CGEMMOTCOPY  = cgemm_tcopy_2_lsx.S
+CGEMMKERNEL  = cgemm_kernel_16x4_lasx.S
+CGEMMINCOPY  = cgemm_ncopy_16_lasx.S
+CGEMMITCOPY  = cgemm_tcopy_16_lasx.S
+CGEMMONCOPY  = cgemm_ncopy_4_lasx.S
+CGEMMOTCOPY  = cgemm_tcopy_4_lasx.S
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
diff --git a/kernel/loongarch64/cgemm_kernel_16x4_lasx.S b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S
new file mode 100644
index 000000000..249abe102
--- /dev/null
+++ b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S
@@ -0,0 +1,3757 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+#define T2     $r26
+#define T3     $r27
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $xr30
+#define U1     $xr31
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define VALPHAR $xr28
+#define VALPHAI $xr29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    xvldrepl.w  VALPHAR, $sp, 112
+    xvldrepl.w  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   2
+
+    move       J,      $r0
+    srai.d     T0,     N,     2  //bn/4
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/4; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    add.d      C2,     C1,    TL
+    add.d      C3,     C2,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     4  //bm/16
+    beq        I,      T0,    .L11
+
+.L101:  /* for(i=0; i<bm/16; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     T3,     OFF,   0x07
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   16
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+    xvxor.v    U8,     U8,   U8
+    xvxor.v    U9,     U9,   U9
+    xvxor.v    U10,     U10,   U10
+    xvxor.v    U11,     U11,   U11
+    xvxor.v    U12,     U12,   U12
+    xvxor.v    U13,     U13,   U13
+    xvxor.v    U14,     U14,   U14
+    xvxor.v    U15,     U15,   U15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L103
+    blt        TL,     L,     .L103
+
+.L102:  /* for(k=0; k<temp; k+=1) */
+    xvld       D2,     B0,    0x00  // b0ri b1ri b2ri b3ri
+
+    vldrepl.w  $vr20,  A0,    0x00
+    vldrepl.w  $vr16,  A0,    0x08
+
+    vldrepl.w  $vr21,  A0,    0x04
+    vldrepl.w  $vr19,  A0,    0x0c
+
+    xvpermi.q  D4,     D0,    0x02  //a0rrrr a1rrrr
+    xvpermi.q  D5,     D3,    0x02  //a0iiii a1iiii
+
+    xvpickev.w D6,     D2,    D2    //b0r b1r b0r b1r b2r b3r b2r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r b0r b1r b2r b3r
+
+    xvpickod.w D7,     D2,    D2    //b0i b1i b0i b1i b2i b3i b2i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r 01r 11r 21r 31r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i 01i 11i 21i 31i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    vldrepl.w  $vr20,  A0,    0x10
+    vldrepl.w  $vr16,  A0,    0x18
+
+    vldrepl.w  $vr21,  A0,    0x14
+    vldrepl.w  $vr19,  A0,    0x1c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U2,     D4,    D6,     U2  //02r 12r 22r 32r 03r 13r 23r 33r
+    XVMADD2    U3,     D5,    D6,     U3  //02i 12i 22i 32i 03i 13i 23i 33i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    vldrepl.w  $vr20,  A0,    0x20
+    vldrepl.w  $vr16,  A0,    0x28
+
+    vldrepl.w  $vr21,  A0,    0x24
+    vldrepl.w  $vr19,  A0,    0x2c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U4,     D4,    D6,     U4  //04r 14r 24r 34r 05r 15r 25r 35r
+    XVMADD2    U5,     D5,    D6,     U5  //04i 14i 24i 34i 05i 15i 25i 35i
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    vldrepl.w  $vr20,  A0,    0x30
+    vldrepl.w  $vr16,  A0,    0x38
+
+    vldrepl.w  $vr21,  A0,    0x34
+    vldrepl.w  $vr19,  A0,    0x3c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U6,     D4,    D6,     U6  //06r 16r 26r 36r 07r 17r 27r 37r
+    XVMADD2    U7,     D5,    D6,     U7  //06i 16i 26i 36i 07i 17i 27i 37i
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    vldrepl.w  $vr20,  A0,    0x40
+    vldrepl.w  $vr16,  A0,    0x48
+
+    vldrepl.w  $vr21,  A0,    0x44
+    vldrepl.w  $vr19,  A0,    0x4c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U8,     D4,    D6,     U8  //08r 18r 28r 38r 09r 19r 29r 39r
+    XVMADD2    U9,     D5,    D6,     U9  //08i 18i 28i 38i 09i 19i 29i 39i
+    XVMADD3    U8,     D5,    D7,     U8
+    XVMADD4    U9,     D4,    D7,     U9
+
+    vldrepl.w  $vr20,  A0,    0x50
+    vldrepl.w  $vr16,  A0,    0x58
+
+    vldrepl.w  $vr21,  A0,    0x54
+    vldrepl.w  $vr19,  A0,    0x5c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U10,     D4,    D6,     U10  //0ar 1ar 2ar 3ar 0br 1br 2br 3br
+    XVMADD2    U11,     D5,    D6,     U11  //0ai 1ai 2ai 3ai 0bi 1bi 2bi 3bi
+    XVMADD3    U10,     D5,    D7,     U10
+    XVMADD4    U11,     D4,    D7,     U11
+
+    vldrepl.w  $vr20,  A0,    0x60
+    vldrepl.w  $vr16,  A0,    0x68
+
+    vldrepl.w  $vr21,  A0,    0x64
+    vldrepl.w  $vr19,  A0,    0x6c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U12,     D4,    D6,     U12  //0cr 1cr 2cr 3cr 0dr 1dr 2dr 3dr
+    XVMADD2    U13,     D5,    D6,     U13  //0ci 1ci 2ci 3ci 0di 1di 2di 3di
+    XVMADD3    U12,     D5,    D7,     U12
+    XVMADD4    U13,     D4,    D7,     U13
+
+    vldrepl.w  $vr20,  A0,    0x70
+    vldrepl.w  $vr16,  A0,    0x78
+
+    vldrepl.w  $vr21,  A0,    0x74
+    vldrepl.w  $vr19,  A0,    0x7c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U14,     D4,    D6,     U14  //0er 1er 2er 3er 0fr 1fr 2fr 3fr
+    XVMADD2    U15,     D5,    D6,     U15  //0ei 1ei 2ei 3ei 0fi 1fi 2fi 3fi
+    XVMADD3    U14,     D5,    D7,     U14
+    XVMADD4    U15,     D4,    D7,     U15
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L102
+
+.L103:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    xvfmul.s      D6,    U0,    VALPHAR
+    xvfmul.s      D8,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    xvfmul.s      D7,    U2,    VALPHAR
+    xvfmul.s      D9,    U3,    VALPHAR
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    //res04 res14 res24 res34 res05 res15 res25 res35
+    //res06 res16 res26 res36 res07 res17 res27 res37
+    xvld       D0,     C0,    0x20
+    xvld       D1,     C1,    0x20
+    xvld       D2,     C2,    0x20
+    xvld       D3,     C3,    0x20
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    xvfmul.s      D6,    U4,    VALPHAR
+    xvfmul.s      D8,    U5,    VALPHAR
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D8,    U4,    VALPHAI, D8
+
+    xvfmul.s      D7,    U6,    VALPHAR
+    xvfmul.s      D9,    U7,    VALPHAR
+    XVNMSUB      D7,    U7,    VALPHAI, D7
+    XVFMADD      D9,    U6,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x20
+    xvst       D2,     C1,    0x20
+    xvst       D1,     C2,    0x20
+    xvst       D3,     C3,    0x20
+
+    //res08 res18 res28 res38 res09 res19 res29 res39
+    //res0a res1a res2a res3a res0b res1b res2b res3b
+    xvld       D0,     C0,    0x40
+    xvld       D1,     C1,    0x40
+    xvld       D2,     C2,    0x40
+    xvld       D3,     C3,    0x40
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    xvfmul.s      D6,    U8,    VALPHAR
+    xvfmul.s      D8,    U9,    VALPHAR
+    XVNMSUB      D6,    U9,    VALPHAI, D6
+    XVFMADD      D8,    U8,    VALPHAI, D8
+
+    xvfmul.s      D7,    U10,    VALPHAR
+    xvfmul.s      D9,    U11,    VALPHAR
+    XVNMSUB      D7,    U11,    VALPHAI, D7
+    XVFMADD      D9,    U10,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x40
+    xvst       D2,     C1,    0x40
+    xvst       D1,     C2,    0x40
+    xvst       D3,     C3,    0x40
+
+    //res0c res1c res2c res3c res0d res1d res2d res3d
+    //res0e res1e res2e res3e res0f res1f res2f res3f
+    xvld       D0,     C0,    0x60
+    xvld       D1,     C1,    0x60
+    xvld       D2,     C2,    0x60
+    xvld       D3,     C3,    0x60
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    xvfmul.s      D6,    U12,    VALPHAR
+    xvfmul.s      D8,    U13,    VALPHAR
+    XVNMSUB      D6,    U13,    VALPHAI, D6
+    XVFMADD      D8,    U12,    VALPHAI, D8
+
+    xvfmul.s      D7,    U14,    VALPHAR
+    xvfmul.s      D9,    U15,    VALPHAR
+    XVNMSUB      D7,    U15,    VALPHAI, D7
+    XVFMADD      D9,    U14,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x60
+    xvst       D2,     C1,    0x60
+    xvst       D1,     C2,    0x60
+    xvst       D3,     C3,    0x60
+
+    addi.d     C0,     C0,    0x80
+    addi.d     C1,     C1,    0x80
+    addi.d     C2,     C2,    0x80
+    addi.d     C3,     C3,    0x80
+#else
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D8,    U1,    VALPHAR, D8
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    XVFMADD      D7,    U2,    VALPHAR, D7
+    XVFMADD      D9,    U3,    VALPHAR, D9
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    //res04 res14 res24 res34 res05 res15 res25 res35
+    //res06 res16 res26 res36 res07 res17 res27 res37
+    xvld       D0,     C0,    0x20
+    xvld       D1,     C1,    0x20
+    xvld       D2,     C2,    0x20
+    xvld       D3,     C3,    0x20
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    XVFMADD      D6,    U4,    VALPHAR, D6
+    XVFMADD      D8,    U5,    VALPHAR, D8
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D8,    U4,    VALPHAI, D8
+
+    XVFMADD      D7,    U6,    VALPHAR, D7
+    XVFMADD      D9,    U7,    VALPHAR, D9
+    XVNMSUB      D7,    U7,    VALPHAI, D7
+    XVFMADD      D9,    U6,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x20
+    xvst       D2,     C1,    0x20
+    xvst       D1,     C2,    0x20
+    xvst       D3,     C3,    0x20
+
+    //res08 res18 res28 res38 res09 res19 res29 res39
+    //res0a res1a res2a res3a res0b res1b res2b res3b
+    xvld       D0,     C0,    0x40
+    xvld       D1,     C1,    0x40
+    xvld       D2,     C2,    0x40
+    xvld       D3,     C3,    0x40
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    XVFMADD      D6,    U8,    VALPHAR, D6
+    XVFMADD      D8,    U9,    VALPHAR, D8
+    XVNMSUB      D6,    U9,    VALPHAI, D6
+    XVFMADD      D8,    U8,    VALPHAI, D8
+
+    XVFMADD      D7,    U10,    VALPHAR, D7
+    XVFMADD      D9,    U11,    VALPHAR, D9
+    XVNMSUB      D7,    U11,    VALPHAI, D7
+    XVFMADD      D9,    U10,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x40
+    xvst       D2,     C1,    0x40
+    xvst       D1,     C2,    0x40
+    xvst       D3,     C3,    0x40
+
+    //res0c res1c res2c res3c res0d res1d res2d res3d
+    //res0e res1e res2e res3e res0f res1f res2f res3f
+    xvld       D0,     C0,    0x60
+    xvld       D1,     C1,    0x60
+    xvld       D2,     C2,    0x60
+    xvld       D3,     C3,    0x60
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    XVFMADD      D6,    U12,    VALPHAR, D6
+    XVFMADD      D8,    U13,    VALPHAR, D8
+    XVNMSUB      D6,    U13,    VALPHAI, D6
+    XVFMADD      D8,    U12,    VALPHAI, D8
+
+    XVFMADD      D7,    U14,    VALPHAR, D7
+    XVFMADD      D9,    U15,    VALPHAR, D9
+    XVNMSUB      D7,    U15,    VALPHAI, D7
+    XVFMADD      D9,    U14,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x60
+    xvst       D2,     C1,    0x60
+    xvst       D1,     C2,    0x60
+    xvst       D3,     C3,    0x60
+
+    addi.d     C0,     C0,    0x80
+    addi.d     C1,     C1,    0x80
+    addi.d     C2,     C2,    0x80
+    addi.d     C3,     C3,    0x80
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -16
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   16
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L101
+
+.L11:  /* if ( bm & 8 ) */
+    move       I,      $r0
+    andi       T0,     M,     8
+    beq        I,      T0,    .L150
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L13
+    blt        TL,     L,     .L13
+
+.L12:  /* for(k=0; k<temp; k+=1) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri a2ri a3ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri b2ri b3ri
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x02  //a0ri a1ri a0ri a1ri
+    xvpermi.d  D1,     D1,    0xd8  //a0ri a0ri a1ri a1ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a0rrrr a1rrrr
+    xvpermi.w  D5,     D5,    0x55  //a0iiii a1iiii
+
+    xvpackev.w D6,     D2,    D2    //b0rr b1rr b2rr b3rr
+    xvpermi.w  D6,     D6,    0x88  //b0r b1r b0r b1r b2r b3r b2r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r b0r b1r b2r b3r
+
+    xvpackod.w D7,     D2,    D2    //b0ii b1ii b2ii b3ii
+    xvpermi.w  D7,     D7,    0x88  //b0i b1i b0i b1i b2i b3i b2i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0r b1r b2r b3r b0r b1r b2r b3r
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r 01r 11r 21r 31r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i 01i 11i 21i 31i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x31  //a0ri a1ri a0ri a1ri
+    xvpermi.d  D1,     D1,    0xd8  //a0ri a0ri a1ri a1ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a0rrrr a1rrrr
+    xvpermi.w  D5,     D5,    0x55  //a0iiii a1iiii
+
+    XVMADD1    U2,     D4,    D6,     U2  //02r 12r 22r 32r 03r 13r 23r 33r
+    XVMADD2    U3,     D5,    D6,     U3  //02i 12i 22i 32i 03i 13i 23i 33i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    xvld       D0,     A0,    0x20  // a4ri a5ri a6ri a7ri
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x02  //a4ri a5ri a4ri a5ri
+    xvpermi.d  D1,     D1,    0xd8  //a4ri a4ri a5ri a5ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a4rrrr a5rrrr
+    xvpermi.w  D5,     D5,    0x55  //a4iiii a5iiii
+
+    XVMADD1    U4,     D4,    D6,     U4  //04r 14r 24r 34r 05r 15r 25r 35r
+    XVMADD2    U5,     D5,    D6,     U5  //04i 14i 24i 34i 05i 15i 25i 35i
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x31  //a6ri a7ri a6ri a7ri
+    xvpermi.d  D1,     D1,    0xd8  //a6ri a6ri a7ri a7ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a6rrrr a7rrrr
+    xvpermi.w  D5,     D5,    0x55  //a6iiii a7iiii
+
+    XVMADD1    U6,     D4,    D6,     U6  //06r 16r 26r 36r 07r 17r 27r 37r
+    XVMADD2    U7,     D5,    D6,     U7  //06i 16i 26i 36i 07i 17i 27i 37i
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L12
+
+.L13:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    xvfmul.s      D6,    U0,    VALPHAR
+    xvfmul.s      D8,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    xvfmul.s      D7,    U2,    VALPHAR
+    xvfmul.s      D9,    U3,    VALPHAR
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    //res04 res14 res24 res34 res05 res15 res25 res35
+    //res06 res16 res26 res36 res07 res17 res27 res37
+    xvld       D0,     C0,    0x20
+    xvld       D1,     C1,    0x20
+    xvld       D2,     C2,    0x20
+    xvld       D3,     C3,    0x20
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    xvfmul.s      D6,    U4,    VALPHAR
+    xvfmul.s      D8,    U5,    VALPHAR
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D8,    U4,    VALPHAI, D8
+
+    xvfmul.s      D7,    U6,    VALPHAR
+    xvfmul.s      D9,    U7,    VALPHAR
+    XVNMSUB      D7,    U7,    VALPHAI, D7
+    XVFMADD      D9,    U6,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x20
+    xvst       D2,     C1,    0x20
+    xvst       D1,     C2,    0x20
+    xvst       D3,     C3,    0x20
+
+    addi.d     C0,     C0,    0x40
+    addi.d     C1,     C1,    0x40
+    addi.d     C2,     C2,    0x40
+    addi.d     C3,     C3,    0x40
+#else
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D8,    U1,    VALPHAR, D8
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    XVFMADD      D7,    U2,    VALPHAR, D7
+    XVFMADD      D9,    U3,    VALPHAR, D9
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    //res04 res14 res24 res34 res05 res15 res25 res35
+    //res06 res16 res26 res36 res07 res17 res27 res37
+    xvld       D0,     C0,    0x20
+    xvld       D1,     C1,    0x20
+    xvld       D2,     C2,    0x20
+    xvld       D3,     C3,    0x20
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    XVFMADD      D6,    U4,    VALPHAR, D6
+    XVFMADD      D8,    U5,    VALPHAR, D8
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D8,    U4,    VALPHAI, D8
+
+    XVFMADD      D7,    U6,    VALPHAR, D7
+    XVFMADD      D9,    U7,    VALPHAR, D9
+    XVNMSUB      D7,    U7,    VALPHAI, D7
+    XVFMADD      D9,    U6,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x20
+    xvst       D2,     C1,    0x20
+    xvst       D1,     C2,    0x20
+    xvst       D3,     C3,    0x20
+
+    addi.d     C0,     C0,    0x40
+    addi.d     C1,     C1,    0x40
+    addi.d     C2,     C2,    0x40
+    addi.d     C3,     C3,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     4
+    beq        I,      T0,    .L18
+
+.L15:  /* if (bm & 4) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri a2ri a3ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri b2ri b3ri
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x02  //a0ri a1ri a0ri a1ri
+    xvpermi.d  D1,     D1,    0xd8  //a0ri a0ri a1ri a1ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a0rrrr a1rrrr
+    xvpermi.w  D5,     D5,    0x55  //a0iiii a1iiii
+
+    xvpackev.w D6,     D2,    D2    //b0rr b1rr b2rr b3rr
+    xvpermi.w  D6,     D6,    0x88  //b0r b1r b0r b1r b2r b3r b2r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r b0r b1r b2r b3r
+
+    xvpackod.w D7,     D2,    D2    //b0ii b1ii b2ii b3ii
+    xvpermi.w  D7,     D7,    0x88  //b0i b1i b0i b1i b2i b3i b2i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r 01r 11r 21r 31r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i 01i 11i 21i 31i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x31  //a2ri a3ri a2ri a3ri
+    xvpermi.d  D1,     D1,    0xd8  //a2ri a2ri a3ri a3ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a2rrrr a3rrrr
+    xvpermi.w  D5,     D5,    0x55  //a2iiii a3iiii
+
+    XVMADD1    U2,     D4,    D6,     U2  //02r 12r 22r 32r 03r 13r 23r 33r
+    XVMADD2    U3,     D5,    D6,     U3  //02i 12i 22i 32i 03i 13i 23i 33i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    xvfmul.s      D6,    U0,    VALPHAR
+    xvfmul.s      D8,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    xvfmul.s      D7,    U2,    VALPHAR
+    xvfmul.s      D9,    U3,    VALPHAR
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#else
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D8,    U1,    VALPHAR, D8
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    XVFMADD      D7,    U2,    VALPHAR, D7
+    XVFMADD      D9,    U3,    VALPHAR, D9
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L18:   /* if (bm & 2) */
+    move       I,      $r0
+    andi       T0,     M,     2
+    beq        I,      T0,    .L183
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr2,     $vr2,   $vr2
+    vxor.v    $vr3,     $vr3,   $vr3
+    vxor.v    $vr4,     $vr4,   $vr4
+    vxor.v    $vr5,     $vr5,   $vr5
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L182
+    blt        TL,     L,     .L182
+
+.L181:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri a1ri
+    vld       $vr18,     B0,    0x00  // b0ri b1ri
+    vld       $vr19,     B0,    0x10  // b2ri b3ri
+
+    vshuf4i.w  $vr20,     $vr16,    0x00  //a0r
+    vshuf4i.w  $vr21,     $vr16,    0x55  //a0i
+
+    vpackev.w  $vr22,     $vr19,    $vr18
+    vshuf4i.w  $vr22,     $vr22,    0xd8  //b0r b1r b2r b3r
+
+    vpackod.w  $vr23,     $vr19,    $vr18
+    vshuf4i.w  $vr23,     $vr23,    0xd8  //b0i b1i b2i b3i
+
+    VMADD1    $vr2,     $vr20,    $vr22,     $vr2  //00r 10r 20r 30r
+    VMADD2    $vr3,     $vr21,    $vr22,     $vr3  //00i 10i 20i 30i
+    VMADD3    $vr2,     $vr21,    $vr23,     $vr2
+    VMADD4    $vr3,     $vr20,    $vr23,     $vr3
+
+    vshuf4i.w  $vr20,     $vr16,    0xaa  //a1r
+    vshuf4i.w  $vr21,     $vr16,    0xff  //a1i
+
+    VMADD1    $vr4,     $vr20,    $vr22,     $vr4  //01r 11r 21r 31r
+    VMADD2    $vr5,     $vr21,    $vr22,     $vr5  //01i 11i 21i 31i
+    VMADD3    $vr4,     $vr21,    $vr23,     $vr4
+    VMADD4    $vr5,     $vr20,    $vr23,     $vr5
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L181
+
+.L182:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C1,    0x00 //c1: 0 1 2 3
+    vld       $vr18,     C2,    0x00 //c2: 0 1 2 3
+    vld       $vr19,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    $vr20,     $vr17,    $vr17
+    vpermi.w  $vr20,     $vr16,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  $vr22,     $vr20,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  $vr23,     $vr20,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    $vr21,     $vr19,    $vr19
+    vpermi.w  $vr21,     $vr18,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  $vr24,     $vr21,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  $vr25,     $vr21,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  $vr24,     $vr22,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  $vr25,     $vr23,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      $vr24,    $vr2,    $vr28
+    vfmul.s      $vr25,    $vr3,    $vr28
+    VNMSUB      $vr24,    $vr3,    $vr29, $vr24
+    VFMADD      $vr25,    $vr2,    $vr29, $vr25
+
+    vand.v    $vr26,     $vr25,    $vr25 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    $vr27,     $vr25,    $vr25 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  $vr26,     $vr24,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  $vr26,     $vr26,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  $vr27,     $vr24,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  $vr27,     $vr27,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    $vr20,     $vr17,    $vr17
+    vpermi.w  $vr20,     $vr16,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  $vr22,     $vr20,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  $vr23,     $vr20,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    $vr21,     $vr19,    $vr19
+    vpermi.w  $vr21,     $vr18,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  $vr24,     $vr21,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  $vr25,     $vr21,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  $vr24,     $vr22,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  $vr25,     $vr23,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      $vr24,    $vr4,    $vr28
+    vfmul.s      $vr25,    $vr5,    $vr28
+    VNMSUB      $vr24,    $vr5,    $vr29, $vr24
+    VFMADD      $vr25,    $vr4,    $vr29, $vr25
+
+    vand.v    $vr20,     $vr25,    $vr25
+    vpermi.w  $vr20,     $vr24,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w $vr20,     $vr20,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    $vr18,     $vr20,    $vr20
+
+    vand.v    $vr21,     $vr25,    $vr25
+    vpermi.w  $vr21,     $vr24,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w $vr21,     $vr21,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    $vr19,     $vr21,    $vr21
+
+    vand.v    $vr16,     $vr26,    $vr26 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    $vr17,     $vr27,    $vr27 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  $vr20,     $vr16,     0x44 //c0: 0 1 2 3
+    vpermi.w  $vr18,     $vr16,     0xee //c1: 0 1 2 3
+    vpermi.w  $vr21,     $vr17,     0x44 //c2: 0 1 2 3
+    vpermi.w  $vr19,     $vr17,     0xee //c3: 0 1 2 3
+
+    vst       $vr20,     C0,    0x00
+    vst       $vr18,     C1,    0x00
+    vst       $vr21,     C2,    0x00
+    vst       $vr19,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C1,    0x00 //c1: 0 1 2 3
+    vld       $vr18,     C2,    0x00 //c2: 0 1 2 3
+    vld       $vr19,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    $vr20,     $vr17,    $vr17
+    vpermi.w  $vr20,     $vr16,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  $vr22,     $vr20,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  $vr23,     $vr20,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    $vr21,     $vr19,    $vr19
+    vpermi.w  $vr21,     $vr18,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  $vr24,     $vr21,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  $vr25,     $vr21,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  $vr24,     $vr22,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  $vr25,     $vr23,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      $vr24,    $vr2,    $vr28, $vr24
+    VFMADD      $vr25,    $vr3,    $vr28, $vr25
+    VNMSUB      $vr24,    $vr3,    $vr29, $vr24
+    VFMADD      $vr25,    $vr2,    $vr29, $vr25
+
+    vand.v    $vr26,     $vr25,    $vr25 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    $vr27,     $vr25,    $vr25 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  $vr26,     $vr24,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  $vr26,     $vr26,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  $vr27,     $vr24,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  $vr27,     $vr27,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    $vr20,     $vr17,    $vr17
+    vpermi.w  $vr20,     $vr16,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  $vr22,     $vr20,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  $vr23,     $vr20,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    $vr21,     $vr19,    $vr19
+    vpermi.w  $vr21,     $vr18,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  $vr24,     $vr21,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  $vr25,     $vr21,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  $vr24,     $vr22,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  $vr25,     $vr23,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      $vr24,    $vr4,    $vr28, $vr24
+    VFMADD      $vr25,    $vr5,    $vr28, $vr25
+    VNMSUB      $vr24,    $vr5,    $vr29, $vr24
+    VFMADD      $vr25,    $vr4,    $vr29, $vr25
+
+    vand.v    $vr20,     $vr25,    $vr25
+    vpermi.w  $vr20,     $vr24,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w $vr20,     $vr20,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    $vr18,     $vr20,    $vr20
+
+    vand.v    $vr21,     $vr25,    $vr25
+    vpermi.w  $vr21,     $vr24,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w $vr21,     $vr21,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    $vr19,     $vr21,    $vr21
+
+    vand.v    $vr16,     $vr26,    $vr26 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    $vr17,     $vr27,    $vr27 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  $vr20,     $vr16,     0x44 //c0: 0 1 2 3
+    vpermi.w  $vr18,     $vr16,     0xee //c1: 0 1 2 3
+    vpermi.w  $vr21,     $vr17,     0x44 //c2: 0 1 2 3
+    vpermi.w  $vr19,     $vr17,     0xee //c3: 0 1 2 3
+
+    vst       $vr20,     C0,    0x00
+    vst       $vr18,     C1,    0x00
+    vst       $vr21,     C2,    0x00
+    vst       $vr19,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L183:   /* if (bm & 1) */
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L186
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x03
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+    MTC        c31,    $r0
+    MTC        c32,    $r0
+    MTC        c41,    $r0
+    MTC        c42,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L185
+    blt        TL,     L,     .L185
+
+.L184:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+    LD         b3,     B0,    0x08        //b1r
+    LD         b4,     B0,    0x0c        //b1i
+    LD         b5,     B0,    0x10        //b2r
+    LD         b6,     B0,    0x14        //b2i
+    LD         b7,     B0,    0x18        //b3r
+    LD         b8,     B0,    0x1c        //b3i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a1,    b3,     c21  //res10r
+    MADD2      c22,    a2,    b3,     c22  //res10i
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    MADD1      c31,    a1,    b5,     c31  //res20r
+    MADD2      c32,    a2,    b5,     c32  //res20i
+    MADD3      c31,    a2,    b6,     c31
+    MADD4      c32,    a1,    b6,     c32
+
+    MADD1      c41,    a1,    b7,     c41  //res30r
+    MADD2      c42,    a2,    b7,     c42  //res30i
+    MADD3      c41,    a2,    b8,     c41
+    MADD4      c42,    a1,    b8,     c42
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L184
+
+.L185:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MUL       a5,     c21,   ALPHA_R
+    MUL       a6,     c22,   ALPHA_R
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    LD         a5,     C2,    0x00    //C2[0]
+    LD         a6,     C2,    0x04    //C2[1]
+
+    MUL       a5,     c31,   ALPHA_R
+    MUL       a6,     c32,   ALPHA_R
+    NMSUB      a5,     c32,   ALPHA_I, a5
+    MADD       a6,     c31,   ALPHA_I, a6
+
+    ST         a5,     C2,    0x00
+    ST         a6,     C2,    0x04
+
+    LD         a5,     C3,    0x00    //C3[0]
+    LD         a6,     C3,    0x04    //C3[1]
+
+    MUL       a5,     c41,   ALPHA_R
+    MUL       a6,     c42,   ALPHA_R
+    NMSUB      a5,     c42,   ALPHA_I, a5
+    MADD       a6,     c41,   ALPHA_I, a6
+
+    ST         a5,     C3,    0x00
+    ST         a6,     C3,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+    addi.d     C2,     C2,    0x08
+    addi.d     C3,     C3,    0x08
+#else
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c21,   ALPHA_R, a5
+    MADD       a6,     c22,   ALPHA_R, a6
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    LD         a5,     C2,    0x00    //C2[0]
+    LD         a6,     C2,    0x04    //C2[1]
+
+    MADD       a5,     c31,   ALPHA_R, a5
+    MADD       a6,     c32,   ALPHA_R, a6
+    NMSUB      a5,     c32,   ALPHA_I, a5
+    MADD       a6,     c31,   ALPHA_I, a6
+
+    ST         a5,     C2,    0x00
+    ST         a6,     C2,    0x04
+
+    LD         a5,     C3,    0x00    //C3[0]
+    LD         a6,     C3,    0x04    //C3[1]
+
+    MADD       a5,     c41,   ALPHA_R, a5
+    MADD       a6,     c42,   ALPHA_R, a6
+    NMSUB      a5,     c42,   ALPHA_I, a5
+    MADD       a6,     c41,   ALPHA_I, a6
+
+    ST         a5,     C3,    0x00
+    ST         a6,     C3,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+    addi.d     C2,     C2,    0x08
+    addi.d     C3,     C3,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+
+.L186:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   4
+#endif
+
+    slli.d     L,      K,     0x05
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x03
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     2
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     2
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&2); j+=2) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     4  //bm/16
+    beq        I,      T0,    .L21
+
+.L201:  /* for (i=0; i<bm/16; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   16
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L203
+    blt        TL,     L,     .L203
+
+.L202:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    vld       $vr18,     B0,    0x00  //b0ri b1ri
+
+    xvpackev.w D4,     D0,    D0  //a0rr a1rr a2rr a3rr
+    xvpackod.w D5,     D0,    D0  //a0ii a1ii a2ii a3ii
+
+    vpackev.w $vr22,     $vr18,    $vr18 //b0rr b1rr
+    vpackod.w $vr23,     $vr18,    $vr18 //b0ii b1ii
+
+    vpermi.w  $vr22,     $vr22,    0x88 //b0r b1r b0r b1r
+    vpermi.w  $vr23,     $vr23,    0x88 //b0i b1i b0i b1i
+
+    xvpermi.d  D6,     D6,    0x00 //b0r b1r b0r b1r b0r b1r b0r b1r
+    xvpermi.d  D7,     D7,    0x00 //b0i b1i b0i b1i b0i b1i b0i b1i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r 02r 12r 03r 13r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i 02i 12i 03i 13i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvld       D0,     A0,    0x20
+
+    xvpackev.w D4,     D0,    D0
+    xvpackod.w D5,     D0,    D0
+
+    XVMADD1    U2,     D4,    D6,     U2  //04r 14r 05r 15r 06r 16r 07r 17r
+    XVMADD2    U3,     D5,    D6,     U3  //04i 14i 05i 15i 06i 16i 07i 17i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    xvld       D0,     A0,    0x40
+
+    xvpackev.w D4,     D0,    D0
+    xvpackod.w D5,     D0,    D0
+
+    XVMADD1    U4,     D4,    D6,     U4  //08r 18r 09r 19r 0ar 1ar 0br 1br
+    XVMADD2    U5,     D5,    D6,     U5  //08i 18i 09i 19i 0ai 1ai 0bi 1bi
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    xvld       D0,     A0,    0x60
+
+    xvpackev.w D4,     D0,    D0
+    xvpackod.w D5,     D0,    D0
+
+    XVMADD1    U6,     D4,    D6,     U6  //0cr 1cr 0dr 1dr 0er 1er 0fr 1fr
+    XVMADD2    U7,     D5,    D6,     U7  //0ci 1ci 0di 1di 0ei 1ei 0fi 1fi
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L202
+
+.L203:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    xvfmul.s      D4,    U0,    VALPHAR
+    xvfmul.s      D5,    U1,    VALPHAR
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res04 res14 res05 res15 res06 res16 res07 res17
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    xvfmul.s      D4,    U2,    VALPHAR
+    xvfmul.s      D5,    U3,    VALPHAR
+    XVNMSUB      D4,    U3,    VALPHAI, D4
+    XVFMADD      D5,    U2,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res08 res18 res09 res19 res0a res1a res0b res1b
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    xvfmul.s      D4,    U4,    VALPHAR
+    xvfmul.s      D5,    U5,    VALPHAR
+    XVNMSUB      D4,    U5,    VALPHAI, D4
+    XVFMADD      D5,    U4,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res0c res1c res0d res1d res0e res1e res0f res1f
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    xvfmul.s      D4,    U6,    VALPHAR
+    xvfmul.s      D5,    U7,    VALPHAR
+    XVNMSUB      D4,    U7,    VALPHAI, D4
+    XVFMADD      D5,    U6,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#else
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    XVFMADD      D4,    U0,    VALPHAR, D4
+    XVFMADD      D5,    U1,    VALPHAR, D5
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res04 res14 res05 res15 res06 res16 res07 res17
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    XVFMADD      D4,    U2,    VALPHAR, D4
+    XVFMADD      D5,    U3,    VALPHAR, D5
+    XVNMSUB      D4,    U3,    VALPHAI, D4
+    XVFMADD      D5,    U2,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res08 res18 res09 res19 res0a res1a res0b res1b
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    XVFMADD      D4,    U4,    VALPHAR, D4
+    XVFMADD      D5,    U5,    VALPHAR, D5
+    XVNMSUB      D4,    U5,    VALPHAI, D4
+    XVFMADD      D5,    U4,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res0c res1c res0d res1d res0e res1e res0f res1f
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    XVFMADD      D4,    U6,    VALPHAR, D4
+    XVFMADD      D5,    U7,    VALPHAR, D5
+    XVNMSUB      D4,    U7,    VALPHAI, D4
+    XVFMADD      D5,    U6,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -16
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   16
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L201
+
+.L21:  /* if (bm & 8) */
+    move       I,      $r0
+    andi       T1,     M,     8    //bm&8
+    beq        I,      T1,    .L24
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    vld       $vr18,     B0,    0x00  //b0ri b1ri
+
+    xvpackev.w D4,     D0,    D0  //a0rr a1rr a2rr a3rr
+    xvpackod.w D5,     D0,    D0  //a0ii a1ii a2ii a3ii
+
+    vpackev.w $vr22,     $vr18,    $vr18 //b0rr b1rr
+    vpackod.w $vr23,     $vr18,    $vr18 //b0ii b1ii
+
+    vpermi.w  $vr22,     $vr22,    0x88 //b0r b1r b0r b1r
+    vpermi.w  $vr23,     $vr23,    0x88 //b0i b1i b0i b1i
+
+    xvpermi.d  D6,     D6,    0x00 //b0r b1r b0r b1r b0r b1r b0r b1r
+    xvpermi.d  D7,     D7,    0x00 //b0i b1i b0i b1i b0i b1i b0i b1i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r 02r 12r 03r 13r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i 02i 12i 03i 13i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvld       D0,     A0,    0x20
+
+    xvpackev.w D4,     D0,    D0
+    xvpackod.w D5,     D0,    D0
+
+    XVMADD1    U2,     D4,    D6,     U2  //04r 14r 05r 15r 06r 16r 07r 17r
+    XVMADD2    U3,     D5,    D6,     U3  //04i 14i 05i 15i 06i 16i 07i 17i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    xvfmul.s      D4,    U0,    VALPHAR
+    xvfmul.s      D5,    U1,    VALPHAR
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res04 res14 res05 res15 res06 res16 res07 res17
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    xvfmul.s      D4,    U2,    VALPHAR
+    xvfmul.s      D5,    U3,    VALPHAR
+    XVNMSUB      D4,    U3,    VALPHAI, D4
+    XVFMADD      D5,    U2,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#else
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    XVFMADD      D4,    U0,    VALPHAR, D4
+    XVFMADD      D5,    U1,    VALPHAR, D5
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res04 res14 res05 res15 res06 res16 res07 res17
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    XVFMADD      D4,    U2,    VALPHAR, D4
+    XVFMADD      D5,    U3,    VALPHAR, D5
+    XVNMSUB      D4,    U3,    VALPHAI, D4
+    XVFMADD      D5,    U2,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L24:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L280
+
+.L25:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    vld       $vr18,     B0,    0x00  //b0ri b1ri
+
+    xvpackev.w D4,     D0,    D0  //a0rr a1rr a2rr a3rr
+    xvpackod.w D5,     D0,    D0  //a0ii a1ii a2ii a3ii
+
+    vpackev.w $vr22,     $vr18,    $vr18 //b0rr b1rr
+    vpackod.w $vr23,     $vr18,    $vr18 //b0ii b1ii
+
+    vpermi.w  $vr22,     $vr22,    0x88 //b0r b1r b0r b1r
+    vpermi.w  $vr23,     $vr23,    0x88 //b0i b1i b0i b1i
+
+    xvpermi.d  D6,     D6,    0x00 //b0r b1r b0r b1r b0r b1r b0r b1r
+    xvpermi.d  D7,     D7,    0x00 //b0i b1i b0i b1i b0i b1i b0i b1i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r 02r 12r 03r 13r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i 02i 12i 03i 13i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    xvfmul.s      D4,    U0,    VALPHAR
+    xvfmul.s      D5,    U1,    VALPHAR
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#else
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    XVFMADD      D4,    U0,    VALPHAR, D4
+    XVFMADD      D5,    U1,    VALPHAR, D5
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L280:   /* if ( bm & 2 )*/
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L284
+
+.L281:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr2,     $vr2,   $vr2
+    vxor.v    $vr3,     $vr3,   $vr3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L283
+    blt        TL,     L,     .L283
+
+.L282:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri a1ri
+    vld       $vr18,     B0,    0x00  // b0ri b1ri
+
+    vpackev.w $vr20,     $vr16,    $vr16  //a0rr a1rr
+    vpackod.w $vr21,     $vr16,    $vr16  //a0ii a1ii
+
+    vpackev.w $vr22,     $vr18,    $vr18 //b0rr b1rr
+    vpackod.w $vr23,     $vr18,    $vr18 //b0ii b1ii
+
+    vpermi.w  $vr22,     $vr22,    0x88 //b0r b1r b0r b1r
+    vpermi.w  $vr23,     $vr23,    0x88 //b0i b1i b0i b1i
+
+    VMADD1    $vr2,     $vr20,    $vr22,     $vr2  //00r 10r 01r 11r
+    VMADD2    $vr3,     $vr21,    $vr22,     $vr3  //00i 10i 01i 11i
+    VMADD3    $vr2,     $vr21,    $vr23,     $vr2
+    VMADD4    $vr3,     $vr20,    $vr23,     $vr3
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L282
+
+.L283:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w $vr18,     $vr17,    $vr16  //c0[0] c1[0] c0[2] c1[2]
+    vpackod.w $vr19,     $vr17,    $vr16  //c0[1] c1[1] c0[3] c1[3]
+
+    vfmul.s      $vr18,    $vr2,    $vr28
+    vfmul.s      $vr19,    $vr3,    $vr28
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.w $vr20,     $vr19,    $vr18
+    vpackod.w $vr21,     $vr19,    $vr18
+
+    vst       $vr20,     C0,    0x00 //c0: 0 1 2 3
+    vst       $vr21,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10 res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w $vr18,     $vr17,    $vr16  //c0[0] c1[0] c0[2] c1[2]
+    vpackod.w $vr19,     $vr17,    $vr16  //c0[1] c1[1] c0[3] c1[3]
+
+    VFMADD      $vr18,    $vr2,    $vr28, $vr18
+    VFMADD      $vr19,    $vr3,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.w $vr20,     $vr19,    $vr18
+    vpackod.w $vr21,     $vr19,    $vr18
+
+    vst       $vr20,     C0,    0x00 //c0: 0 1 2 3
+    vst       $vr21,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L284:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L288
+
+.L285:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x03
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L287
+    blt        TL,     L,     .L287
+
+.L286:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+    LD         b3,     B0,    0x08        //b1r
+    LD         b4,     B0,    0x0c        //b1i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a1,    b3,     c21  //res10r
+    MADD2      c22,    a2,    b3,     c22  //res10i
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L286
+
+.L287:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MUL       a5,     c21,   ALPHA_R
+    MUL       a6,     c22,   ALPHA_R
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+#else
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c21,   ALPHA_R, a5
+    MADD       a6,     c22,   ALPHA_R, a6
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L288:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+    slli.d     L,      K,     4
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   2
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     2
+    andi       T0,     N,     2
+    blt        J,      T0,    .L20
+
+.L30:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L999
+
+.L300:  /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     4  //bm/16
+    beq        I,      T0,    .L31
+
+.L301:  /* for (i=0; i<bm/16; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x03
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   16
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L303
+    blt        TL,     L,     .L303
+
+.L302:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    xvld       D1,     A0,    0x20  //a4ri a5ri a6ri a7ri
+
+    xvldrepl.w D2,     B0,    0x00 //b0r
+    xvldrepl.w D3,     B0,    0x04 //b0i
+
+    xvpackev.w D4,     D1,    D0    //a0r a4r a1r a5r a2r a6r a3r a7r
+    xvpermi.w  D4,     D4,    0xd8  //a0r a1r a4r a5r a2r a3r a6r a7r
+    xvpermi.d  D4,     D4,    0xd8  //a0r a1r a2r a3r a4r a5r a6r a7r
+
+    xvpackod.w D5,     D1,    D0    //a0i a4i a1i a5i a2i a6i a3i a7i
+    xvpermi.w  D5,     D5,    0xd8  //a0i a1i a4i a5i a2i a3i a6i a7i
+    xvpermi.d  D5,     D5,    0xd8  //a0i a1i a2i a3i a4i a5i a6i a7i
+
+    XVMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r 04r 05r 06r 07r
+    XVMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i 04i 05i 06i 07i
+    XVMADD3    U0,     D5,    D3,     U0
+    XVMADD4    U1,     D4,    D3,     U1
+
+    xvld       D0,     A0,    0x40  //a8ri a9ri a10ri a11ri
+    xvld       D1,     A0,    0x60  //a12ri a13ri a14ri a15ri
+
+    xvpackev.w D4,     D1,    D0
+    xvpermi.w  D4,     D4,    0xd8
+    xvpermi.d  D4,     D4,    0xd8  //a8r a9r a10r a11r a12r a13r a14r a15r
+
+    xvpackod.w D5,     D1,    D0
+    xvpermi.w  D5,     D5,    0xd8
+    xvpermi.d  D5,     D5,    0xd8  //a8i a9i a10i a11i a12i a13i a14i a15i
+
+    XVMADD1    U2,     D4,    D2,     U2  //08r 09r 0ar 0br 0cr 0dr 0er 0fr
+    XVMADD2    U3,     D5,    D2,     U3  //08i 09i 0ai 0bi 0ci 0di 0ei 0fi
+    XVMADD3    U2,     D5,    D3,     U2
+    XVMADD4    U3,     D4,    D3,     U3
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L302
+
+.L303:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03 res04 res05 res06 res07
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0
+    xvpermi.w  D3,     D3,    0xd8
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    xvfmul.s      D2,    U0,    VALPHAR
+    xvfmul.s      D3,    U1,    VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3
+    xvpermi.q  D5,     D2,   0x31
+    xvpermi.d  D5,     D5,   0xd8
+    xvpermi.w  D5,     D5,   0xd8
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+
+    //res08 res09 res0a res0b res0c res0d res0e res0f
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0
+    xvpermi.w  D3,     D3,    0xd8
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    xvfmul.s      D2,    U2,    VALPHAR
+    xvfmul.s      D3,    U3,    VALPHAR
+    XVNMSUB      D2,    U3,    VALPHAI, D2
+    XVFMADD      D3,    U2,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3
+    xvpermi.q  D5,     D2,   0x31
+    xvpermi.d  D5,     D5,   0xd8
+    xvpermi.w  D5,     D5,   0xd8
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#else
+    //res00 res01 res02 res03 res04 res05 res06 res07
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0
+    xvpermi.w  D3,     D3,    0xd8
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    XVFMADD      D2,    U0,    VALPHAR, D2
+    XVFMADD      D3,    U1,    VALPHAR, D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3
+    xvpermi.q  D5,     D2,   0x31
+    xvpermi.d  D5,     D5,   0xd8
+    xvpermi.w  D5,     D5,   0xd8
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+
+    //res08 res09 res0a res0b res0c res0d res0e res0f
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0
+    xvpermi.w  D3,     D3,    0xd8
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    XVFMADD      D2,    U2,    VALPHAR, D2
+    XVFMADD      D3,    U3,    VALPHAR, D3
+    XVNMSUB      D2,    U3,    VALPHAI, D2
+    XVFMADD      D3,    U2,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3
+    xvpermi.q  D5,     D2,   0x31
+    xvpermi.d  D5,     D5,   0xd8
+    xvpermi.w  D5,     D5,   0xd8
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -16
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   16
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L301
+
+.L31:  /* if ( bm & 8 )*/
+    move       I,      $r0
+    andi       T1,     M,     8    //bm&8
+    beq        I,      T1,    .L34
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x03
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L33
+    blt        TL,     L,     .L33
+
+.L32:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    xvld       D1,     A0,    0x20  //a4ri a5ri a6ri a7ri
+
+    xvldrepl.w D2,     B0,    0x00 //b0r
+    xvldrepl.w D3,     B0,    0x04 //b0i
+
+    xvpackev.w D4,     D1,    D0    //a0r a4r a1r a5r a2r a6r a3r a7r
+    xvpermi.w  D4,     D4,    0xd8  //a0r a1r a4r a5r a2r a3r a6r a7r
+    xvpermi.d  D4,     D4,    0xd8  //a0r a1r a2r a3r a4r a5r a6r a7r
+
+    xvpackod.w D5,     D1,    D0    //a0i a4i a1i a5i a2i a6i a3i a7i
+    xvpermi.w  D5,     D5,    0xd8  //a0i a1i a4i a5i a2i a3i a6i a7i
+    xvpermi.d  D5,     D5,    0xd8  //a0i a1i a2i a3i a4i a5i a6i a7i
+
+    XVMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r 04r 05r 06r 07r
+    XVMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i 04i 05i 06i 07i
+    XVMADD3    U0,     D5,    D3,     U0
+    XVMADD4    U1,     D4,    D3,     U1
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L32
+
+.L33:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03 res04 res05 res06 res07
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0
+    xvpermi.w  D3,     D3,    0xd8
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    xvfmul.s      D2,    U0,    VALPHAR
+    xvfmul.s      D3,    U1,    VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3
+    xvpermi.q  D5,     D2,   0x31
+    xvpermi.d  D5,     D5,   0xd8
+    xvpermi.w  D5,     D5,   0xd8
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#else
+    //res00 res01 res02 res03 res04 res05 res06 res07
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0
+    xvpermi.w  D3,     D3,    0xd8
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    XVFMADD      D2,    U0,    VALPHAR, D2
+    XVFMADD      D3,    U1,    VALPHAR, D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3
+    xvpermi.q  D5,     D2,   0x31
+    xvpermi.d  D5,     D5,   0xd8
+    xvpermi.w  D5,     D5,   0xd8
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L34:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L38
+
+.L35:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr2,     $vr2,   $vr2
+    vxor.v    $vr3,     $vr3,   $vr3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L37
+    blt        TL,     L,     .L37
+
+.L36:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri a1ri
+    vld       $vr17,     A0,    0x10  // a2ri a3ri
+
+    vldrepl.w $vr18,     B0,    0x00 //b0r
+    vldrepl.w $vr19,     B0,    0x04 //b0i
+
+    vpackev.w $vr20,     $vr17,    $vr16
+    vshuf4i.w  $vr20,     $vr20,    0xd8  //a0r a1r a2r a3r
+
+    vpackod.w $vr21,     $vr17,    $vr16
+    vshuf4i.w  $vr21,     $vr21,    0xd8  //a0i a1i a2i a3i
+
+    VMADD1    $vr2,     $vr20,    $vr18,     $vr2  //00r 01r 02r 03r
+    VMADD2    $vr3,     $vr21,    $vr18,     $vr3  //00i 01i 02i 03i
+    VMADD3    $vr2,     $vr21,    $vr19,     $vr2
+    VMADD4    $vr3,     $vr20,    $vr19,     $vr3
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L36
+
+.L37:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w $vr18,     $vr17,    $vr16
+    vshuf4i.w  $vr18,     $vr18,    0xd8  //0 2 4 6
+    vpackod.w $vr19,     $vr17,    $vr16
+    vshuf4i.w  $vr19,     $vr19,    0xd8  //1 3 5 7
+
+    vfmul.s      $vr18,    $vr2,    $vr28
+    vfmul.s      $vr19,    $vr3,    $vr28
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vand.v    $vr20,     $vr19,   $vr19  //1 3 5 7
+    vpermi.w  $vr20,     $vr18,   0x44 //0 2 1 3
+    vshuf4i.w  $vr20,     $vr20,   0xd8 //0 1 2 3
+
+    vand.v    $vr21,     $vr19,   $vr19  //1 3 5 7
+    vpermi.w  $vr21,     $vr18,   0xee //4 6 5 7
+    vshuf4i.w  $vr21,     $vr21,   0xd8 //4 5 6 7
+
+    vst       $vr20,     C0,    0x00
+    vst       $vr21,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#else
+    //res00 res01 res02 res03
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w $vr18,     $vr17,    $vr16
+    vshuf4i.w  $vr18,     $vr18,    0xd8  //0 2 4 6
+    vpackod.w $vr19,     $vr17,    $vr16
+    vshuf4i.w  $vr19,     $vr19,    0xd8  //1 3 5 7
+
+    VFMADD      $vr18,    $vr2,    $vr28, $vr18
+    VFMADD      $vr19,    $vr3,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vand.v    $vr20,     $vr19,   $vr19  //1 3 5 7
+    vpermi.w  $vr20,     $vr18,   0x44 //0 2 1 3
+    vshuf4i.w  $vr20,     $vr20,   0xd8 //0 1 2 3
+
+    vand.v    $vr21,     $vr19,   $vr19  //1 3 5 7
+    vpermi.w  $vr21,     $vr18,   0xee //4 6 5 7
+    vshuf4i.w  $vr21,     $vr21,   0xd8 //4 5 6 7
+
+    vst       $vr20,     C0,    0x00
+    vst       $vr21,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L38:   /* if ( bm & 2 ) */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L312
+
+.L39:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L311
+    blt        TL,     L,     .L311
+
+.L310:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+    LD         a3,     A0,    0x08        //a1r
+    LD         a4,     A0,    0x0c        //a1i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a3,    b1,     c21  //res01r
+    MADD2      c22,    a4,    b1,     c22  //res01i
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L310
+
+.L311:
+#if defined(TRMMKERNEL)
+    //res00 res01
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C0,    0x08    //C0[2]
+    LD         a6,     C0,    0x0c    //C0[3]
+
+    MUL       a5,     c21,   ALPHA_R
+    MUL       a6,     c22,   ALPHA_R
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x08
+    ST         a6,     C0,    0x0c
+
+    addi.d     C0,     C0,    0x10
+#else
+    //res00 res01
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C0,    0x08    //C0[2]
+    LD         a6,     C0,    0x0c    //C0[3]
+
+    MADD       a5,     c21,   ALPHA_R, a5
+    MADD       a6,     c22,   ALPHA_R, a6
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x08
+    ST         a6,     C0,    0x0c
+
+    addi.d     C0,     C0,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L312:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L316
+
+.L313:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x03
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L315
+    blt        TL,     L,     .L315
+
+.L314:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00
+    LD         a2,     A0,    0x04
+
+    LD         b1,     B0,    0x00
+    LD         b2,     B0,    0x04
+
+    MADD1      c11,    a1,    b1,     c11
+    MADD2      c12,    a2,    b1,     c12
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L314
+
+.L315:
+#if defined(TRMMKERNEL)
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    addi.d     C0,     C0,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    addi.d     C0,     C0,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L316:
+    slli.d     L,      K,     3
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L300
+
+.L999:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_16_lasx.S b/kernel/loongarch64/cgemm_ncopy_16_lasx.S
new file mode 100644
index 000000000..7c2d0ac64
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_16_lasx.S
@@ -0,0 +1,691 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define S9     $r20
+#define S10    $r23
+#define S11    $r24
+#define S12    $r25
+#define S13    $r26
+#define S14    $r27
+#define S15    $r28
+#define S16    $r29
+#define TD     $r30
+#define TS     $r31
+#define TL     $r7
+#define T0     $r6
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -0x90
+    SDARG      $r23, $sp,  0x00
+    SDARG      $r24, $sp,  0x08
+    SDARG      $r25, $sp,  0x10
+    SDARG      $r26, $sp,  0x18
+    SDARG      $r27, $sp,  0x20
+    SDARG      $r28, $sp,  0x28
+    SDARG      $r29, $sp,  0x30
+    SDARG      $r30, $sp,  0x38
+    SDARG      $r31, $sp,  0x40
+    ST         $f23, $sp,  0x48
+    ST         $f24, $sp,  0x50
+    ST         $f25, $sp,  0x58
+    ST         $f26, $sp,  0x60
+    ST         $f27, $sp,  0x68
+    ST         $f28, $sp,  0x70
+    ST         $f29, $sp,  0x78
+    ST         $f30, $sp,  0x80
+    ST         $f31, $sp,  0x88
+
+    move       TD,   DST
+    move       TS,   SRC
+    slli.d     TL,   LDA,  0x03
+    slli.d     T0,   TL,   0x01
+    srai.d     J,    N,    0x04
+    beq        J,    ZERO, .L_N8
+
+.L_J1: /* J-- */
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x03
+    add.d      S3,   S2,   TL
+    addi.d     J,    J,    -1
+    add.d      S4,   S3,   TL
+    add.d      S5,   S3,   T0
+    add.d      S6,   S4,   T0
+    add.d      S7,   S5,   T0
+    add.d      S8,   S6,   T0
+    add.d      S9,   S7,   T0
+    add.d      S10,  S8,   T0
+    add.d      S11,  S9,   T0
+    add.d      S12,  S10,  T0
+    add.d      S13,  S11,  T0
+    add.d      S14,  S12,  T0
+    add.d      S15,  S13,  T0
+    add.d      S16,  S14,  T0
+    add.d      TS,   S15,  T0
+    beq        I,    ZERO, .L_I7
+
+.L_I1: /* I-- */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+    xvld       U8,   S9,   0x00
+    xvld       U9,   S10,  0x00
+    xvld       U10,  S11,  0x00
+    xvld       U11,  S12,  0x00
+    xvld       U12,  S13,  0x00
+    xvld       U13,  S14,  0x00
+    xvld       U14,  S15,  0x00
+    xvld       U15,  S16,  0x00
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    xvpackev.d D4,   U5,   U4
+    xvpackod.d D5,   U5,   U4
+    xvpackev.d D6,   U7,   U6
+    xvpackod.d D7,   U7,   U6
+
+    xvpackev.d D8,   U9,   U8
+    xvpackod.d D9,   U9,   U8
+    xvpackev.d D10,  U11,  U10
+    xvpackod.d D11,  U11,  U10
+    xvpackev.d D12,  U13,  U12
+    xvpackod.d D13,  U13,  U12
+    xvpackev.d D14,  U15,  U14
+    xvpackod.d D15,  U15,  U14
+
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U4,   D4,   D4
+    xvpermi.q  D4,   D6,   0x02  // 1
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 4
+    xvand.v    U5,   D5,   D5
+    xvpermi.q  D5,   D7,   0x02  // 5
+    xvpermi.q  D2,   U0,   0x31  // 8
+    xvpermi.q  D6,   U4,   0x31  // 9
+    xvpermi.q  D3,   U1,   0x31  // 12
+    xvpermi.q  D7,   U5,   0x31  // 13
+
+    xvand.v    U8,   D8,   D8
+    xvpermi.q  D8,   D10,  0x02  // 2
+    xvand.v    U12,  D12,  D12
+    xvpermi.q  D12,  D14,  0x02  // 3
+    xvand.v    U9,   D9,   D9
+    xvpermi.q  D9,   D11,  0x02  // 6
+    xvand.v    U13,  D13,  D13
+    xvpermi.q  D13,  D15,  0x02  // 7
+    xvpermi.q  D10,  U8,   0x31  // 10
+    xvpermi.q  D14,  U12,  0x31  // 11
+    xvpermi.q  D11,  U9,   0x31  // 14
+    xvpermi.q  D15,  U13,  0x31  // 15
+
+    xvst       D0,   TD,   0x00  // 0
+    xvst       D4,   TD,   0x20  // 1
+    xvst       D8,   TD,   0x40  // 2
+    xvst       D12,  TD,   0x60  // 3
+    xvst       D1,   TD,   0x80  // 4
+    xvst       D5,   TD,   0xA0  // 5
+    xvst       D9,   TD,   0xC0  // 6
+    xvst       D13,  TD,   0xE0  // 7
+    addi.d     TD,   TD,   0x100
+    xvst       D2,   TD,   0x00  // 8
+    xvst       D6,   TD,   0x20  // 9
+    xvst       D10,  TD,   0x40  // 10
+    xvst       D14,  TD,   0x60  // 11
+    xvst       D3,   TD,   0x80  // 12
+    xvst       D7,   TD,   0xA0  // 13
+    xvst       D11,  TD,   0xC0  // 14
+    xvst       D15,  TD,   0xE0  // 15
+    addi.d     TD,   TD,   0x100
+
+    xvld       U0,   S1,   0x20
+    xvld       U1,   S2,   0x20
+    xvld       U2,   S3,   0x20
+    xvld       U3,   S4,   0x20
+    xvld       U4,   S5,   0x20
+    xvld       U5,   S6,   0x20
+    xvld       U6,   S7,   0x20
+    xvld       U7,   S8,   0x20
+    xvld       U8,   S9,   0x20
+    xvld       U9,   S10,  0x20
+    xvld       U10,  S11,  0x20
+    xvld       U11,  S12,  0x20
+    xvld       U12,  S13,  0x20
+    xvld       U13,  S14,  0x20
+    xvld       U14,  S15,  0x20
+    xvld       U15,  S16,  0x20
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    xvpackev.d D4,   U5,   U4
+    xvpackod.d D5,   U5,   U4
+    xvpackev.d D6,   U7,   U6
+    xvpackod.d D7,   U7,   U6
+
+    xvpackev.d D8,   U9,   U8
+    xvpackod.d D9,   U9,   U8
+    xvpackev.d D10,  U11,  U10
+    xvpackod.d D11,  U11,  U10
+    xvpackev.d D12,  U13,  U12
+    xvpackod.d D13,  U13,  U12
+    xvpackev.d D14,  U15,  U14
+    xvpackod.d D15,  U15,  U14
+
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U4,   D4,   D4
+    xvpermi.q  D4,   D6,   0x02  // 1
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 4
+    xvand.v    U5,   D5,   D5
+    xvpermi.q  D5,   D7,   0x02  // 5
+    xvpermi.q  D2,   U0,   0x31  // 8
+    xvpermi.q  D6,   U4,   0x31  // 9
+    xvpermi.q  D3,   U1,   0x31  // 12
+    xvpermi.q  D7,   U5,   0x31  // 13
+
+    xvand.v    U8,   D8,   D8
+    xvpermi.q  D8,   D10,  0x02  // 2
+    xvand.v    U12,  D12,  D12
+    xvpermi.q  D12,  D14,  0x02  // 3
+    xvand.v    U9,   D9,   D9
+    xvpermi.q  D9,   D11,  0x02  // 6
+    xvand.v    U13,  D13,  D13
+    xvpermi.q  D13,  D15,  0x02  // 7
+    xvpermi.q  D10,  U8,   0x31  // 10
+    xvpermi.q  D14,  U12,  0x31  // 11
+    xvpermi.q  D11,  U9,   0x31  // 14
+    xvpermi.q  D15,  U13,  0x31  // 15
+
+    xvst       D0,   TD,   0x00  // 0
+    xvst       D4,   TD,   0x20  // 1
+    xvst       D8,   TD,   0x40  // 2
+    xvst       D12,  TD,   0x60  // 3
+    xvst       D1,   TD,   0x80  // 4
+    xvst       D5,   TD,   0xA0  // 5
+    xvst       D9,   TD,   0xC0  // 6
+    xvst       D13,  TD,   0xE0  // 7
+    addi.d     TD,   TD,   0x100
+    xvst       D2,   TD,   0x00  // 8
+    xvst       D6,   TD,   0x20  // 9
+    xvst       D10,  TD,   0x40  // 10
+    xvst       D14,  TD,   0x60  // 11
+    xvst       D3,   TD,   0x80  // 12
+    xvst       D7,   TD,   0xA0  // 13
+    xvst       D11,  TD,   0xC0  // 14
+    xvst       D15,  TD,   0xE0  // 15
+    addi.d     TD,   TD,   0x100
+
+
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     S5,   S5,   0x40
+    addi.d     S6,   S6,   0x40
+    addi.d     S7,   S7,   0x40
+    addi.d     S8,   S8,   0x40
+    addi.d     S9,   S9,   0x40
+    addi.d     S10,  S10,  0x40
+    addi.d     S11,  S11,  0x40
+    addi.d     S12,  S12,  0x40
+    addi.d     S13,  S13,  0x40
+    addi.d     S14,  S14,  0x40
+    addi.d     S15,  S15,  0x40
+    addi.d     S16,  S16,  0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_I7:
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_I0
+
+.L_II1: /* I-- */
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+    fld.d     F4,    S5,  0x00
+    fld.d     F5,    S6,  0x00
+    fld.d     F6,    S7,  0x00
+    fld.d     F7,    S8,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+    fst.d     F4,    TD,  0x20
+    addi.d    S5,    S5,  0x08
+    fst.d     F5,    TD,  0x28
+    addi.d    S6,    S6,  0x08
+    fst.d     F6,    TD,  0x30
+    addi.d    S7,    S7,  0x08
+    fst.d     F7,    TD,  0x38
+    addi.d    S8,    S8,  0x08
+    addi.d    TD,    TD,  0x40
+
+    fld.d     F0,    S9,  0x00
+    fld.d     F1,    S10, 0x00
+    fld.d     F2,    S11, 0x00
+    fld.d     F3,    S12, 0x00
+    fld.d     F4,    S13, 0x00
+    fld.d     F5,    S14, 0x00
+    fld.d     F6,    S15, 0x00
+    fld.d     F7,    S16, 0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S9,    S9,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S10,   S10, 0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S11,   S11, 0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S12,   S12, 0x08
+    fst.d     F4,    TD,  0x20
+    addi.d    S13,   S13, 0x08
+    fst.d     F5,    TD,  0x28
+    addi.d    S14,   S14, 0x08
+    fst.d     F6,    TD,  0x30
+    addi.d    S15,   S15, 0x08
+    fst.d     F7,    TD,  0x38
+    addi.d    S16,   S16, 0x08
+    addi.d    TD,    TD,  0x40
+
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_II1
+
+.L_I0:
+    blt       ZERO,  J,   .L_J1
+
+.L_N8:
+    andi      J,     N,   0x08
+    beq       ZERO,  J,   .L_N4
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x03
+    add.d      S3,   S2,   TL
+    add.d      S4,   S2,   T0
+    add.d      S5,   S3,   T0
+    add.d      S6,   S4,   T0
+    add.d      S7,   S5,   T0
+    add.d      S8,   S6,   T0
+    add.d      TS,   S7,   T0
+    beq        I,    ZERO, .L_8I3
+
+.L_8I1:  /* I-- */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    xvpackev.d D4,   U5,   U4
+    xvpackod.d D5,   U5,   U4
+    xvpackev.d D6,   U7,   U6
+    xvpackod.d D7,   U7,   U6
+
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U4,   D4,   D4
+    xvpermi.q  D4,   D6,   0x02  // 1
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 2
+    xvand.v    U5,   D5,   D5
+    xvpermi.q  D5,   D7,   0x02  // 3
+    xvpermi.q  D2,   U0,   0x31  // 4
+    xvpermi.q  D6,   U4,   0x31  // 5
+    xvpermi.q  D3,   U1,   0x31  // 6
+    xvpermi.q  D7,   U5,   0x31  // 7
+
+    xvst       D0,   TD,   0x00
+    xvst       D4,   TD,   0x20
+    xvst       D1,   TD,   0x40
+    xvst       D5,   TD,   0x60
+    xvst       D2,   TD,   0x80
+    xvst       D6,   TD,   0xA0
+    xvst       D3,   TD,   0xC0
+    xvst       D7,   TD,   0xE0
+    addi.d     TD,   TD,   0x100
+
+    xvld       U0,   S1,   0x20
+    xvld       U1,   S2,   0x20
+    xvld       U2,   S3,   0x20
+    xvld       U3,   S4,   0x20
+    xvld       U4,   S5,   0x20
+    xvld       U5,   S6,   0x20
+    xvld       U6,   S7,   0x20
+    xvld       U7,   S8,   0x20
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    xvpackev.d D4,   U5,   U4
+    xvpackod.d D5,   U5,   U4
+    xvpackev.d D6,   U7,   U6
+    xvpackod.d D7,   U7,   U6
+
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U4,   D4,   D4
+    xvpermi.q  D4,   D6,   0x02  // 1
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 2
+    xvand.v    U5,   D5,   D5
+    xvpermi.q  D5,   D7,   0x02  // 3
+    xvpermi.q  D2,   U0,   0x31  // 4
+    xvpermi.q  D6,   U4,   0x31  // 5
+    xvpermi.q  D3,   U1,   0x31  // 6
+    xvpermi.q  D7,   U5,   0x31  // 7
+
+    xvst       D0,   TD,   0x00
+    xvst       D4,   TD,   0x20
+    xvst       D1,   TD,   0x40
+    xvst       D5,   TD,   0x60
+    xvst       D2,   TD,   0x80
+    xvst       D6,   TD,   0xA0
+    xvst       D3,   TD,   0xC0
+    xvst       D7,   TD,   0xE0
+    addi.d     TD,   TD,   0x100
+
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     S5,   S5,   0x40
+    addi.d     S6,   S6,   0x40
+    addi.d     S7,   S7,   0x40
+    addi.d     S8,   S8,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_8I1
+
+.L_8I3:
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_N4
+
+.L_8I11:
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+    fld.d     F4,    S5,  0x00
+    fld.d     F5,    S6,  0x00
+    fld.d     F6,    S7,  0x00
+    fld.d     F7,    S8,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+    fst.d     F4,    TD,  0x20
+    addi.d    S5,    S5,  0x08
+    fst.d     F5,    TD,  0x28
+    addi.d    S6,    S6,  0x08
+    fst.d     F6,    TD,  0x30
+    addi.d    S7,    S7,  0x08
+    fst.d     F7,    TD,  0x38
+    addi.d    S8,    S8,  0x08
+
+    addi.d    TD,    TD,  0x40
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_8I11
+
+.L_N4:
+    andi      J,     N,   0x04
+    beq       ZERO,  J,   .L_N2
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x02
+    add.d      S3,   S2,   TL
+    add.d      S4,   S2,   T0
+    add.d      TS,   S3,   T0
+    beq        I,    ZERO, .L_I3
+
+.L_4I1: /* I-- */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 1
+    xvpermi.q  D2,   U0,   0x31  // 2
+    xvpermi.q  D3,   U1,   0x31  // 3
+
+    xvst       D0,   TD,   0x00
+    xvst       D1,   TD,   0x20
+    xvst       D2,   TD,   0x40
+    xvst       D3,   TD,   0x60
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_4I1
+
+.L_I3:
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_N2
+
+.L_4II1:
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+
+    addi.d    TD,    TD,  0x20
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_4II1
+
+.L_N2:
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x01
+    add.d      TS,   S2,   TL
+    beq        I,    ZERO, .L_NI1
+
+.L_2I1: /* I-- */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+
+    xvpermi.q  D0,   D1,   0x02  // 0
+
+    xvst       D0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_2I1
+
+.L_NI1:
+    andi      I,     M,    0x01
+    beq       I,     ZERO, .L_N1
+
+
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    addi.d    TD,    TD,  0x10
+
+.L_N1:
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0
+
+.L_M1:
+    fld.d     F0,    S1,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F0,    TD,  0x00
+    addi.d    TD,    TD,  0x08
+    addi.d    M,     M,   -1
+    blt       ZERO,  M,   .L_M1
+
+.L_N0:
+    LDARG      $r23, $sp,  0x00
+    LDARG      $r24, $sp,  0x08
+    LDARG      $r25, $sp,  0x10
+    LDARG      $r26, $sp,  0x18
+    LDARG      $r27, $sp,  0x20
+    LDARG      $r28, $sp,  0x28
+    LDARG      $r29, $sp,  0x30
+    LDARG      $r30, $sp,  0x38
+    LDARG      $r31, $sp,  0x40
+    LD         $f23, $sp,  0x48
+    LD         $f24, $sp,  0x50
+    LD         $f25, $sp,  0x58
+    LD         $f26, $sp,  0x60
+    LD         $f27, $sp,  0x68
+    LD         $f28, $sp,  0x70
+    LD         $f29, $sp,  0x78
+    LD         $f30, $sp,  0x80
+    LD         $f31, $sp,  0x88
+    addi.d     $sp,  $sp,  0x90
+    jirl       $r0,  $r1,  0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_4_lasx.S b/kernel/loongarch64/cgemm_ncopy_4_lasx.S
new file mode 100644
index 000000000..24c2e6748
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_4_lasx.S
@@ -0,0 +1,325 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define TD     $r20
+#define TS     $r11
+#define TL     $r19
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x02
+    slli.d     TL,   TL,   0x01
+
+    srai.d     J,    N,    0x02
+    beq        J,    ZERO,  .L_N0
+
+.L_J1: /* J-- */
+    move       S1,   TS
+    add.d      S2,   S1,   TL
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+
+    slli.d     T0,   TL,   0x02
+    add.d      TS,   TS,   T0
+
+    srai.d     I,    M,    0x02
+    beq        I,    ZERO,  .L_I3
+
+.L_I1: /* I-- */
+    xvld       U0,   S1,   0x00 //1 2 3 4 5 6 7 8
+    xvld       U1,   S2,   0x00 //9 10 11 12 13 14 15 16
+    xvld       U2,   S3,   0x00 //17 18 19 20 21 22 23 24
+    xvld       U3,   S4,   0x00 //25 26 27 28 29 30 31 32
+
+    xvand.v    D0,   U0,   U0
+    xvand.v    D1,   U1,   U1
+    xvand.v    D2,   U2,   U2
+    xvand.v    D3,   U3,   U3
+
+    xvshuf4i.d D0,   U1,  0x88 //1 2 9 10 5 6 13 14
+    xvshuf4i.d D2,   U3,  0x88 //17 18 25 26 21 22 29 30
+    xvshuf4i.d D1,   U0,  0x77 //3 4 11 12 7 8 15 16
+    xvshuf4i.d D3,   U2,  0x77 //19 20 27 28 23 24 31 32
+
+    xvand.v    U4,   D0,   D0
+    xvand.v    U5,   D1,   D1
+
+    xvpermi.q  U4,   D2,  0x02 //1 2 9 10 17 18 25 26
+    xvpermi.q  U5,   D3,  0x02 //3 4 11 12 19 20 27 28
+    xvpermi.q  D2,   D0,  0x31 //5 6 13 14 21 22 29 30
+    xvpermi.q  D3,   D1,  0x31 //7 8 15 16 23 24 31 32
+
+    xvst       U4,   TD,   0x00
+    xvst       U5,   TD,   0x20
+    xvst       D2,   TD,   0x40
+    xvst       D3,   TD,   0x60
+
+    addi.d     S1,   S1,   0x20   // a_offset
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80  // b_offset
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_I3:  /* if(m&2) */
+    andi       I,    M,    0x02
+    beq        I,    ZERO, .L_II20
+
+    vld       $vr0,   S1,   0x00
+    vld       $vr1,   S2,   0x00
+    vld       $vr2,   S3,   0x00
+    vld       $vr3,   S4,   0x00
+
+    vand.v    $vr8,   $vr1,   $vr1
+    vand.v    $vr9,   $vr1,   $vr1
+    vand.v    $vr10,   $vr3,   $vr3
+    vand.v    $vr11,   $vr3,   $vr3
+
+    vpermi.w  $vr8,   $vr0,   0x44
+    vpermi.w  $vr10,   $vr2,   0x44
+    vpermi.w  $vr9,   $vr0,   0xee
+    vpermi.w  $vr11,   $vr2,   0xee
+
+    vst       $vr8,   TD,   0x00
+    vst       $vr10,   TD,   0x10
+    vst       $vr9,   TD,   0x20
+    vst       $vr11,   TD,   0x30
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     TD,   TD,   0x40
+
+.L_II20:  /* if(m&1) */
+    andi       I,    M,    0x01
+    beq        I,    ZERO, .L_J0
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+    fst.s      F4,   TD,   0x10
+    fst.s      F5,   TD,   0x14
+    fst.s      F6,   TD,   0x18
+    fst.s      F7,   TD,   0x1c
+
+    addi.d     TD,   TD,   0x20
+
+.L_J0:
+    addi.d    J,     J,    -1
+    blt       ZERO,  J,    .L_J1
+
+.L_N0:  /* if(n&2) */
+    andi       I,     N,   0x02
+    beq        ZERO,  I,   .L_N20
+
+    move       S1,    TS
+    add.d      S2,    S1,   TL
+
+    slli.d     T0,    TL,   0x01
+    add.d      TS,    TS,   T0
+
+    srai.d     I,     M,    0x02
+    beq        ZERO,  I,    .L_N10
+
+.L_N11: /* if(i>0) */
+    xvld       U0,   S1,   0x00 //1 2 3 4 5 6 7 8
+    xvld       U1,   S2,   0x00 //9 10 11 12 13 14 15 16
+
+    xvand.v    D0,   U0,   U0
+    xvand.v    D1,   U1,   U1
+
+    xvshuf4i.d D0,   U1,  0x88 //1 2 9 10 5 6 13 14
+    xvshuf4i.d D1,   U0,  0x77 //3 4 11 12 7 8 15 16
+
+    xvand.v    U4,   D0,   D0
+
+    xvpermi.q  U4,   D1,  0x02 //1 2 9 10 3 4 11 12
+    xvpermi.q  D1,   D0,  0x31 //5 6 13 14 7 8 15 16
+
+    xvst       U4,   TD,   0x00
+    xvst       D1,   TD,   0x20
+
+    addi.d     S1,    S1,   0x20   // a_offset
+    addi.d     S2,    S2,   0x20
+    addi.d     TD,    TD,   0x40   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N11
+
+.L_N10:  /* if(m&2) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N130
+
+    vld       $vr0,    S1,   0x00
+    vld       $vr1,    S2,   0x00
+    vand.v    $vr8,    $vr1,   $vr1
+
+    vpermi.w  $vr8,    $vr0,   0x44
+    vpermi.w  $vr1,    $vr0,   0xee
+
+    vst       $vr8,    TD,   0x00
+    vst       $vr1,    TD,   0x10
+
+    addi.d     S1,    S1,   0x10   // a_offset
+    addi.d     S2,    S2,   0x10
+    addi.d     TD,    TD,   0x20   // b_offset
+
+.L_N130:  /* if(m&1) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N20
+
+    fld.s      F0,     S1,   0x00
+    fld.s      F1,     S1,   0x04
+
+    fld.s      F2,     S2,   0x00
+    fld.s      F3,     S2,   0x04
+
+    fst.s      F0,     TD,   0x00
+    fst.s      F1,     TD,   0x04
+    fst.s      F2,     TD,   0x08
+    fst.s      F3,     TD,   0x0c
+
+    addi.d     TD,    TD,   0x10
+
+.L_N20:   /* if(n&1) */
+    andi       I,     N,    0x01
+    beq        I,     ZERO, .L_N00
+
+    move       S1,   TS
+    srai.d     I,    M,    0x02
+
+    beq        I,    ZERO, .L_N30
+
+.L_N21:  /* if(i>0) */
+    xvld       U0,   S1,   0x00
+
+    xvst       U0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x20   // aoffset1
+    addi.d     TD,   TD,   0x20   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N21
+
+.L_N30:  /* if(m&2) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N330
+
+    vld       $vr0,   S1,   0x00
+
+    vst       $vr0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x10   // aoffset1
+    addi.d     TD,   TD,   0x10   // b_offset
+
+.L_N330:  /* if(m&1) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N00
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+
+.L_N00:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_16_lasx.S b/kernel/loongarch64/cgemm_tcopy_16_lasx.S
new file mode 100644
index 000000000..7d9eb94c8
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_16_lasx.S
@@ -0,0 +1,741 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S0     $r11
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define P0     $r20
+#define P1     $r23
+#define P2     $r24
+#define P3     $r25
+#define P4     $r26
+#define P5     $r27
+#define T0     $r28
+#define T1     $r29
+#define TL     $r7
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+#define F8     $f8
+#define F9     $f9
+#define F10     $f10
+#define F11     $f11
+#define F12     $f12
+#define F13     $f13
+#define F14     $f14
+#define F15     $f15
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -56
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    SDARG      $r28,   $sp,   40
+    SDARG      $r29,   $sp,   48
+
+    move       S0,     SRC
+    move       P0,     DST
+
+    srai.d     T0,     N,     0x04
+    srai.d     T1,     N,     0x03
+    slli.d     T0,     T0,    0x04
+    slli.d     T1,     T1,    0x03
+    mul.d      P2,     M,     T0
+    mul.d      P3,     M,     T1
+    slli.d     P2,     P2,    0x03
+    slli.d     P3,     P3,    0x03
+    add.d      P2,     DST,   P2
+    add.d      P3,     DST,   P3
+
+    srai.d     T0,     N,     0x02
+    srai.d     T1,     N,     0x01
+    slli.d     T0,     T0,    0x02
+    slli.d     T1,     T1,    0x01
+    mul.d      P4,     M,     T0
+    mul.d      P5,     M,     T1
+    slli.d     P4,     P4,    0x03
+    slli.d     P5,     P5,    0x03
+    add.d      P4,     DST,   P4
+    add.d      P5,     DST,   P5
+
+    slli.d     TL,     LDA,   0x03
+    srai.d     J,      M,     0x03
+    slli.d     T0,     TL,    0x01
+    slli.d     T1,     M,     0x07
+    beq        ZERO,   J,     .L_M7
+
+.L_J1: /* J-- */
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S5,     S3,    T0
+    add.d      S6,     S4,    T0
+    add.d      S7,     S5,    T0
+    add.d      S8,     S6,    T0
+    add.d      S0,     S7,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x400
+
+    srai.d     I,      N,     0x04
+    addi.d     J,      J,     -1
+    beq        ZERO,   I,     .L_N15
+
+.L_I1: /* I-- */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+    xvld       U4,     S2,    0x00
+    xvld       U5,     S2,    0x20
+    xvld       U6,     S2,    0x40
+    xvld       U7,     S2,    0x60
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    xvld       U0,     S3,    0x00
+    xvld       U1,     S3,    0x20
+    xvld       U2,     S3,    0x40
+    xvld       U3,     S3,    0x60
+    xvld       U4,     S4,    0x00
+    xvld       U5,     S4,    0x20
+    xvld       U6,     S4,    0x40
+    xvld       U7,     S4,    0x60
+
+    xvst       U0,     P1,    0x100
+    xvst       U1,     P1,    0x120
+    xvst       U2,     P1,    0x140
+    xvst       U3,     P1,    0x160
+    xvst       U4,     P1,    0x180
+    xvst       U5,     P1,    0x1A0
+    xvst       U6,     P1,    0x1C0
+    xvst       U7,     P1,    0x1E0
+
+    xvld       U0,     S5,    0x00
+    xvld       U1,     S5,    0x20
+    xvld       U2,     S5,    0x40
+    xvld       U3,     S5,    0x60
+    xvld       U4,     S6,    0x00
+    xvld       U5,     S6,    0x20
+    xvld       U6,     S6,    0x40
+    xvld       U7,     S6,    0x60
+
+    xvst       U0,     P1,    0x200
+    xvst       U1,     P1,    0x220
+    xvst       U2,     P1,    0x240
+    xvst       U3,     P1,    0x260
+    xvst       U4,     P1,    0x280
+    xvst       U5,     P1,    0x2A0
+    xvst       U6,     P1,    0x2C0
+    xvst       U7,     P1,    0x2E0
+
+    xvld       U0,     S7,    0x00
+    xvld       U1,     S7,    0x20
+    xvld       U2,     S7,    0x40
+    xvld       U3,     S7,    0x60
+    xvld       U4,     S8,    0x00
+    xvld       U5,     S8,    0x20
+    xvld       U6,     S8,    0x40
+    xvld       U7,     S8,    0x60
+
+    xvst       U0,     P1,    0x300
+    xvst       U1,     P1,    0x320
+    xvst       U2,     P1,    0x340
+    xvst       U3,     P1,    0x360
+    xvst       U4,     P1,    0x380
+    xvst       U5,     P1,    0x3A0
+    xvst       U6,     P1,    0x3C0
+    xvst       U7,     P1,    0x3E0
+
+    addi.d     S1,     S1,    0x80
+    addi.d     S2,     S2,    0x80
+    addi.d     S3,     S3,    0x80
+    addi.d     S4,     S4,    0x80
+    addi.d     S5,     S5,    0x80
+    addi.d     S6,     S6,    0x80
+    addi.d     S7,     S7,    0x80
+    addi.d     S8,     S8,    0x80
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_I1
+
+.L_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_N7
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+    xvld       U4,     S3,    0x00
+    xvld       U5,     S3,    0x20
+    xvld       U6,     S4,    0x00
+    xvld       U7,     S4,    0x20
+
+    xvst       U0,     P2,    0x00
+    xvst       U1,     P2,    0x20
+    xvst       U2,     P2,    0x40
+    xvst       U3,     P2,    0x60
+    xvst       U4,     P2,    0x80
+    xvst       U5,     P2,    0xA0
+    xvst       U6,     P2,    0xC0
+    xvst       U7,     P2,    0xE0
+
+    xvld       U0,     S5,    0x00
+    xvld       U1,     S5,    0x20
+    xvld       U2,     S6,    0x00
+    xvld       U3,     S6,    0x20
+    xvld       U4,     S7,    0x00
+    xvld       U5,     S7,    0x20
+    xvld       U6,     S8,    0x00
+    xvld       U7,     S8,    0x20
+
+    xvst       U0,     P2,    0x100
+    xvst       U1,     P2,    0x120
+    xvst       U2,     P2,    0x140
+    xvst       U3,     P2,    0x160
+    xvst       U4,     P2,    0x180
+    xvst       U5,     P2,    0x1A0
+    xvst       U6,     P2,    0x1C0
+    xvst       U7,     P2,    0x1E0
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     S3,     S3,    0x40
+    addi.d     S4,     S4,    0x40
+    addi.d     S5,     S5,    0x40
+    addi.d     S6,     S6,    0x40
+    addi.d     S7,     S7,    0x40
+    addi.d     S8,     S8,    0x40
+    addi.d     P2,     P2,    0x200
+
+.L_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N3
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+    xvld       U4,     S5,    0x00
+    xvld       U5,     S6,    0x00
+    xvld       U6,     S7,    0x00
+    xvld       U7,     S8,    0x00
+
+    xvst       U0,     P3,    0x00
+    xvst       U1,     P3,    0x20
+    xvst       U2,     P3,    0x40
+    xvst       U3,     P3,    0x60
+    xvst       U4,     P3,    0x80
+    xvst       U5,     P3,    0xA0
+    xvst       U6,     P3,    0xC0
+    xvst       U7,     P3,    0xE0
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    addi.d     S5,     S5,    0x20
+    addi.d     S6,     S6,    0x20
+    addi.d     S7,     S7,    0x20
+    addi.d     S8,     S8,    0x20
+    addi.d     P3,     P3,    0x100
+
+.L_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N1
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+    xvld       U4,     S5,    0x00
+    xvld       U5,     S6,    0x00
+    xvld       U6,     S7,    0x00
+    xvld       U7,     S8,    0x00
+
+    xvpermi.q  U0,     U1,    0x02
+    xvpermi.q  U2,     U3,    0x02
+    xvpermi.q  U4,     U5,    0x02
+    xvpermi.q  U6,     U7,    0x02
+
+    xvst       U0,     P4,    0x00
+    xvst       U2,     P4,    0x20
+    xvst       U4,     P4,    0x40
+    xvst       U6,     P4,    0x60
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     S5,     S5,    0x10
+    addi.d     S6,     S6,    0x10
+    addi.d     S7,     S7,    0x10
+    addi.d     S8,     S8,    0x10
+    addi.d     P4,     P4,    0x80
+
+.L_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    fld.s      F0,     S1,    0x00
+    fld.s      F1,     S1,    0x04
+
+    fld.s      F2,     S2,    0x00
+    fld.s      F3,     S2,    0x04
+
+    fld.s      F4,     S3,    0x00
+    fld.s      F5,     S3,    0x04
+
+    fld.s      F6,     S4,    0x00
+    fld.s      F7,     S4,    0x04
+
+    fld.s      F8,     S5,    0x00
+    fld.s      F9,     S5,    0x04
+
+    fld.s      F10,     S6,    0x00
+    fld.s      F11,     S6,    0x04
+
+    fld.s      F12,     S7,    0x00
+    fld.s      F13,     S7,    0x04
+
+    fld.s      F14,     S8,    0x00
+    fld.s      F15,     S8,    0x04
+
+    fst.s      F0,     P5,    0x00
+    fst.s      F1,     P5,    0x04
+    fst.s      F2,     P5,    0x08
+    fst.s      F3,     P5,    0x0c
+    fst.s      F4,     P5,    0x10
+    fst.s      F5,     P5,    0x14
+    fst.s      F6,     P5,    0x18
+    fst.s      F7,     P5,    0x1c
+    fst.s      F8,     P5,    0x20
+    fst.s      F9,     P5,    0x24
+    fst.s      F10,     P5,    0x28
+    fst.s      F11,     P5,    0x2c
+    fst.s      F12,     P5,    0x30
+    fst.s      F13,     P5,    0x34
+    fst.s      F14,     P5,    0x38
+    fst.s      F15,     P5,    0x3c
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     S5,     S5,    0x08
+    addi.d     S6,     S6,    0x08
+    addi.d     S7,     S7,    0x08
+    addi.d     S8,     S8,    0x08
+    addi.d     P5,     P5,    0x40
+
+.L_N0:
+    blt        ZERO,   J,     .L_J1
+
+.L_M7:
+    andi       J,      M,     0x04
+    beq        ZERO,   J,     .L_M3
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S0,     S3,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x200
+
+    srai.d     I,      N,     0x04
+    beq        ZERO,   I,     .L_4N15
+
+.L_4I1: /* I-- */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+    xvld       U4,     S2,    0x00
+    xvld       U5,     S2,    0x20
+    xvld       U6,     S2,    0x40
+    xvld       U7,     S2,    0x60
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    xvld       U0,     S3,    0x00
+    xvld       U1,     S3,    0x20
+    xvld       U2,     S3,    0x40
+    xvld       U3,     S3,    0x60
+    xvld       U4,     S4,    0x00
+    xvld       U5,     S4,    0x20
+    xvld       U6,     S4,    0x40
+    xvld       U7,     S4,    0x60
+
+    xvst       U0,     P1,    0x100
+    xvst       U1,     P1,    0x120
+    xvst       U2,     P1,    0x140
+    xvst       U3,     P1,    0x160
+    xvst       U4,     P1,    0x180
+    xvst       U5,     P1,    0x1A0
+    xvst       U6,     P1,    0x1C0
+    xvst       U7,     P1,    0x1E0
+
+    addi.d     S1,     S1,    0x80
+    addi.d     S2,     S2,    0x80
+    addi.d     S3,     S3,    0x80
+    addi.d     S4,     S4,    0x80
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_4I1
+
+.L_4N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_4N7
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+    xvld       U4,     S3,    0x00
+    xvld       U5,     S3,    0x20
+    xvld       U6,     S4,    0x00
+    xvld       U7,     S4,    0x20
+
+    xvst       U0,     P2,    0x00
+    xvst       U1,     P2,    0x20
+    xvst       U2,     P2,    0x40
+    xvst       U3,     P2,    0x60
+    xvst       U4,     P2,    0x80
+    xvst       U5,     P2,    0xA0
+    xvst       U6,     P2,    0xC0
+    xvst       U7,     P2,    0xE0
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     S3,     S3,    0x40
+    addi.d     S4,     S4,    0x40
+    addi.d     P2,     P2,    0x100
+
+.L_4N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_4N3
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+
+    xvst       U0,     P3,    0x00
+    xvst       U1,     P3,    0x20
+    xvst       U2,     P3,    0x40
+    xvst       U3,     P3,    0x60
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    addi.d     P3,     P3,    0x80
+
+.L_4N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_4N1
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+
+    xvpermi.q  U0,     U1,    0x02
+    xvpermi.q  U2,     U3,    0x02
+
+    xvst       U0,     P4,    0x00
+    xvst       U2,     P4,    0x20
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     P4,     P4,    0x40
+
+.L_4N1:
+    andi        I,      N,     0x01
+    beq         ZERO,   I,     .L_M3
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+    fld.d      F2,     S3,    0x00
+    fld.d      F3,     S4,    0x00
+
+    fst.d      F0,     P5,    0x00
+    fst.d      F1,     P5,    0x08
+    fst.d      F2,     P5,    0x10
+    fst.d      F3,     P5,    0x18
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     P5,     P5,    0x20
+
+.L_M3:
+    andi       J,      M,     0x02
+    beq        ZERO,   J,     .L_M1
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S0,     S0,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x100
+
+    srai.d     I,      N,     0x04
+    beq        ZERO,   I,     .L_2N15
+
+.L_2I1: /* I-- */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+    xvld       U4,     S2,    0x00
+    xvld       U5,     S2,    0x20
+    xvld       U6,     S2,    0x40
+    xvld       U7,     S2,    0x60
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    addi.d     S1,     S1,    0x80
+    addi.d     S2,     S2,    0x80
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_2I1
+
+.L_2N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_2N7
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     P2,    0x00
+    xvst       U1,     P2,    0x20
+    xvst       U2,     P2,    0x40
+    xvst       U3,     P2,    0x60
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     P2,     P2,    0x80
+
+.L_2N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_2N3
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    xvst       U0,     P3,    0x00
+    xvst       U1,     P3,    0x20
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     P3,     P3,    0x40
+
+.L_2N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_2N1
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    xvpermi.q  U0,     U1,    0x02
+
+    xvst       U0,     P4,    0x00
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     P4,     P4,    0x20
+
+.L_2N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M1
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+
+    fst.d      F0,     P5,    0x00
+    fst.d      F1,     P5,    0x08
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     P5,     P5,    0x10
+
+.L_M1:
+    andi       J,      M,     0x01
+    beq        ZERO,   J,     .L_M0
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x80
+
+    srai.d     I,      N,     0x04
+    beq        ZERO,   I,     .L_1N15
+
+.L_1I1: /* I-- */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+
+    addi.d     S1,     S1,    0x80
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_1I1
+
+.L_1N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_1N7
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+
+    xvst       U0,     P2,    0x00
+    xvst       U1,     P2,    0x20
+
+    addi.d     S1,     S1,    0x40
+    addi.d     P2,     P2,    0x40
+
+.L_1N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_1N3
+
+    xvld       U0,     S1,    0x00
+
+    xvst       U0,     P3,    0x00
+
+    addi.d     S1,     S1,    0x20
+    addi.d     P3,     P3,    0x20
+
+.L_1N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_1N1
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S1,    0x08
+
+    fst.d      F0,     P4,    0x00
+    fst.d      F1,     P4,    0x08
+
+    addi.d     S1,     S1,    0x10
+    addi.d     P4,     P4,    0x10
+
+.L_1N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M0
+
+    fld.d      F0,     S1,    0x00
+
+    fst.d      F0,     P5,    0x00
+
+    addi.d     S1,     S1,    0x08
+    addi.d     P5,     P5,    0x08
+
+.L_M0:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LDARG      $r28,   $sp,   40
+    LDARG      $r29,   $sp,   48
+    addi.d     $sp,    $sp,   56
+    jirl       $r0,    $r1,   0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_4_lasx.S b/kernel/loongarch64/cgemm_tcopy_4_lasx.S
new file mode 100644
index 000000000..9ff8a35b8
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_4_lasx.S
@@ -0,0 +1,306 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16
+#define TS     $r17
+#define TL     $r18
+#define T0     $r19
+#define S8     $r20
+#define S9     $r23
+#define S10    $r11
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01  //lda
+
+    ori        T0,     ZERO,  0x03
+    andn       T0,     N,     T0
+    mul.w      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02
+    add.d      S9,     DST,   T0  //boffset2
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0
+    mul.w      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02
+    add.d      S10,    DST,   T0  //boffset3
+
+    srai.d     J,      M,     0x02  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+    add.d      S3,     S2,    TL
+    add.d      S4,     S3,    TL
+
+    slli.d     T0,     TL,    0x02
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x80
+
+    srai.d     I,      N,     0x02
+
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+
+    xvst       U0,   S8,   0x00
+    xvst       U1,   S8,   0x20
+    xvst       U2,   S8,   0x40
+    xvst       U3,   S8,   0x60
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    slli.d     T0,   M,    0x05
+    add.d      S8,   S8,   T0
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld       $vr0,   S1,   0x00
+    vld       $vr1,   S2,   0x00
+    vld       $vr2,   S3,   0x00
+    vld       $vr3,   S4,   0x00
+
+    vst       $vr0,   S9,   0x00
+    vst       $vr1,   S9,   0x10
+    vst       $vr2,   S9,   0x20
+    vst       $vr3,   S9,   0x30
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     S9,   S9,   0x40
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   S10,   0x00
+    fst.s      F1,   S10,   0x04
+    fst.s      F2,   S10,   0x08
+    fst.s      F3,   S10,   0x0c
+    fst.s      F4,   S10,   0x10
+    fst.s      F5,   S10,   0x14
+    fst.s      F6,   S10,   0x18
+    fst.s      F7,   S10,   0x1c
+
+    addi.d     S10,  S10,   0x20
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&2) */
+    andi       I,      M,    0x02
+    beq        ZERO,   I,    .L_M2
+
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+
+    slli.d     T0,     TL,    0x01
+    add.d      TS,     TS,    T0
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x40
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    xvst       U0,     S8,    0x00
+    xvst       U1,     S8,    0x20
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    slli.d     T0,     M,     0x05
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld       $vr0,     S1,    0x00
+    vld       $vr1,     S2,    0x00
+
+    vst       $vr0,     S9,    0x00
+    vst       $vr1,     S9,    0x10
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S9,     S9,    0x20
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M2
+
+    fld.s      F0,      S1,    0x00
+    fld.s      F1,      S1,    0x04
+
+    fld.s      F2,      S2,    0x00
+    fld.s      F3,      S2,    0x04
+
+    fst.s      F0,      S10,   0x00
+    fst.s      F1,      S10,   0x04
+    fst.s      F2,      S10,   0x08
+    fst.s      F3,      S10,   0x0c
+
+    addi.d     S10,    S10,   0x10
+
+.L_M2:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    move       S1,     TS     //aoffset1
+    move       S8,     TD     //boffset1
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M2N1
+
+.L_M2I1:  /* if(i>0) */
+    xvld       U0,     S1,    0x00
+
+    xvst       U0,     S8,    0x00
+
+    addi.d     S1,     S1,    0x20
+    slli.d     T0,     M,     0x05
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M2I1
+
+.L_M2N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M2N2
+
+    vld       $vr0,     S1,    0x00
+
+    vst       $vr0,     S9,    0x00
+
+    addi.d     S1,     S1,    0x10
+
+.L_M2N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    fld.s      F0,     S1,    0x00
+    fld.s      F1,     S1,    0x04
+
+    fst.s      F0,     S10,    0x00
+    fst.s      F1,     S10,    0x04
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/param.h b/param.h
index 5d2e960a2..8bdc03380 100644
--- a/param.h
+++ b/param.h
@@ -2845,21 +2845,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_UNROLL_M 2
 #define SGEMM_DEFAULT_UNROLL_N 8
 #define SGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 1
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 1
 #else
 #define DGEMM_DEFAULT_UNROLL_N 4
 #define DGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
 #define SGEMM_DEFAULT_UNROLL_M 16
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 16
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 8
 #endif
 
 #define QGEMM_DEFAULT_UNROLL_N 2
-#define CGEMM_DEFAULT_UNROLL_N 2
-#define ZGEMM_DEFAULT_UNROLL_N 4
 #define XGEMM_DEFAULT_UNROLL_N 1
 
 #define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 2
-#define ZGEMM_DEFAULT_UNROLL_M 8
 #define XGEMM_DEFAULT_UNROLL_M 1
 
 #define SGEMM_DEFAULT_P 256

From bf2310442b7eda57ad8089878518b3f43733efaf Mon Sep 17 00:00:00 2001
From: Chip-Kerchner <chip.kerchner@ibm.com>
Date: Wed, 21 Feb 2024 13:26:28 -0600
Subject: [PATCH 679/718] Fix get_num_cores for AIX.

---
 getarch.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/getarch.c b/getarch.c
index f879e6bbb..2b5459a5f 100644
--- a/getarch.c
+++ b/getarch.c
@@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/sysinfo.h>
 #include <unistd.h>
 #endif
-#if defined(AIX)
+#if defined(_AIX)
+#include <unistd.h>
+#include <sys/systemcfg.h>
 #include <sys/sysinfo.h>
 #endif
 
@@ -1870,11 +1872,13 @@ static int get_num_cores(void) {
   
   return count;
 
-#elif defined(AIX)
+#elif defined(_AIX)
   //returns the number of processors which are currently online
   count = sysconf(_SC_NPROCESSORS_ONLN);
   if (count <= 0) count = 2;
-  
+
+  return count;
+
 #else
   return 2;
 #endif

From 9b24b3141985de115e6191b376b435117a32404a Mon Sep 17 00:00:00 2001
From: frjohnst <frjohnst@us.ibm.com>
Date: Wed, 21 Feb 2024 15:52:29 -0500
Subject: [PATCH 680/718] resolve second_ conflict which breaks xlf timef

---
 lapack-netlib/SRC/Makefile | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index de2242701..205a32d31 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -101,8 +101,10 @@ SCLAUX = la_constants.o \
    slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \
    slasr.o  slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \
    ssteqr.o ssterf.o slaisnan.o sisnan.o \
-   slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o \
-   ../INSTALL/second_$(TIMER).o
+   slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o
+ifneq ($(F_COMPILER), IBM)
+SCLAUX += ../INSTALL/second_$(TIMER).o
+endif
 endif
 
 ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
@@ -124,7 +126,10 @@ DZLAUX = la_constants.o\
    dlasr.o  dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \
    dsteqr.o dsterf.o dlaisnan.o disnan.o \
    dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \
-   ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o
+   ../INSTALL/dlamch.o
+ifneq ($(F_COMPILER), IBM)
+DZLAUX +=  ../INSTALL/dsecnd_$(TIMER).o
+endif
 endif
 
 #ifeq ($(BUILD_SINGLE),1)

From d51ffec3a20299532a1ef5a403df2760f7b5b653 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Thu, 22 Feb 2024 10:46:45 +0800
Subject: [PATCH 681/718] LoongArch64: Opt cgemv with LASX

---
 kernel/loongarch64/KERNEL.LOONGSON3R5 |   3 +
 kernel/loongarch64/cgemv_n_8_lasx.S   | 383 ++++++++++++++++++++++++++
 kernel/loongarch64/cgemv_t_8_lasx.S   | 342 +++++++++++++++++++++++
 kernel/loongarch64/loongarch64_asm.S  | 240 ++++++++++++++++
 4 files changed, 968 insertions(+)
 create mode 100644 kernel/loongarch64/cgemv_n_8_lasx.S
 create mode 100644 kernel/loongarch64/cgemv_t_8_lasx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index ce9268b93..3b2ee6e55 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -121,6 +121,9 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+CGEMVNKERNEL = cgemv_n_8_lasx.S
+CGEMVTKERNEL = cgemv_t_8_lasx.S
+
 CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S
new file mode 100644
index 000000000..b078e3227
--- /dev/null
+++ b/kernel/loongarch64/cgemv_n_8_lasx.S
@@ -0,0 +1,383 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2024/02/20 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M8      $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $xr1
+#define X0      $xr2
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10
+#define Y1      $xr11
+#define A0      $xr12
+#define A1      $xr13
+#define A2      $xr14
+#define A3      $xr15
+#define A4      $xr16
+#define A5      $xr17
+#define A6      $xr18
+#define A7      $xr19
+#define A8      $xr20
+#define A9      $xr21
+#define A10     $xr22
+#define A11     $xr23
+#define A12     $xr24
+#define A13     $xr25
+#define A14     $xr26
+#define A15     $xr27
+#define TMP0    $xr28
+#define TMP1    $xr29
+#define TMP2    $xr30
+
+#if !defined(CONJ)
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  0
+#else
+#define GXCONJ 1
+#define GCONJ  0
+#endif
+#else
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  1
+#else
+#define GXCONJ 1
+#define GCONJ  1
+#endif
+#endif
+
+.macro CLOAD_X_8
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
+                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
+    GCOMPLEXMUL GXCONJ, \
+    xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
+    X1, X1, VALPHA, TMP0, TMP1, TMP2, \
+    X2, X2, VALPHA, TMP0, TMP1, TMP2, \
+    X3, X3, VALPHA, TMP0, TMP1, TMP2, \
+    X4, X4, VALPHA, TMP0, TMP1, TMP2, \
+    X5, X5, VALPHA, TMP0, TMP1, TMP2, \
+    X6, X6, VALPHA, TMP0, TMP1, TMP2, \
+    X7, X7, VALPHA, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_X_8_GAP
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X3,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X4,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X5,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X6,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X7,     T0,     0x00
+
+    GCOMPLEXMUL GXCONJ, \
+    xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
+    X1, X1, VALPHA, TMP0, TMP1, TMP2, \
+    X2, X2, VALPHA, TMP0, TMP1, TMP2, \
+    X3, X3, VALPHA, TMP0, TMP1, TMP2, \
+    X4, X4, VALPHA, TMP0, TMP1, TMP2, \
+    X5, X5, VALPHA, TMP0, TMP1, TMP2, \
+    X6, X6, VALPHA, TMP0, TMP1, TMP2, \
+    X7, X7, VALPHA, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_Y_8
+    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro CLOAD_Y_8_GAP
+    fld.d   $f10,   Y,  0
+    fldx.d  $f13,   Y,  INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1
+    fld.d   $f14,   T0, 0
+    fldx.d  $f15,   T0, INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  2
+    fld.d   $f11,   T0, 0
+    fldx.d  $f17,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    fld.d   $f18,   T0, 0
+    fldx.d  $f19,   T0, INC_Y
+    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
+.endm
+
+.macro CSTORE_Y_8_GAP
+    xvstelm.d   Y0,     Y,      0,      0
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      3
+
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      0
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      3
+.endm
+
+.macro CGEMV_N_8x8
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, s, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, Y1, X2, A5,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, Y1, X3, A7,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X4, A8,  Y0, TMP0, TMP1, TMP2, Y1, X4, A9,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \
+    Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \
+    Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2
+.endm
+
+.macro CSTORE_Y_8
+    GST xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro CLOAD_X_1
+    GLDREPL xv, d, X0, X, 0x00
+    GCOMPLEXMUL GXCONJ, \
+    xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_Y_1
+    fld.d  $f10,   Y,  0
+.endm
+
+.macro CGEMV_N_1x8
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
+                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, s, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X4, A8,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \
+    Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \
+    Y0, X7, A14, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CSTORE_Y_1
+    fst.d   $f10,   Y,  0
+.endm
+
+.macro CGEMV_N_1x1
+    fld.d   $f12,    PA0,    0
+    PTR_ADDI PA0,   PA0,    0x08
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req
+    PTR_SRLI  J,      N,      3
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8
+.L_\XW\()_N_L8:
+    CLOAD_\X_8
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    CLOAD_\Y_8
+    CGEMV_N_8x8
+    CSTORE_\Y_8
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    PTR_ADDI    K,      K,      8
+    bnez        I,      .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x8
+    CSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      7
+    beqz        J,      .L_END
+.L_\XW\()_N_L1:
+    CLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x1
+    CSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI    J,      J,      -1
+    PTR_SUB     K_LDA,  LDA,    M8
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     X,      X,      INC_X
+    bnez        J,      .L_\XW\()_N_L1
+
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 7, 31
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    // Init VALPHA
+    xvpackev.w      $xr0,   $xr1,   $xr0
+    xvreplve0.d     VALPHA, $xr0
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0 // Obtain the offset address
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 31
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/cgemv_t_8_lasx.S b/kernel/loongarch64/cgemv_t_8_lasx.S
new file mode 100644
index 000000000..94e4bd2eb
--- /dev/null
+++ b/kernel/loongarch64/cgemv_t_8_lasx.S
@@ -0,0 +1,342 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2022/02/20 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M8      $r30
+
+#define VALPHA  $xr0
+#define X0      $xr1
+#define X1      $xr2
+#define A0      $xr3
+#define A1      $xr4
+#define A2      $xr5
+#define A3      $xr6
+#define A4      $xr7
+#define A5      $xr8
+#define A6      $xr9
+#define A7      $xr10
+#define A8      $xr11
+#define A9      $xr12
+#define A10     $xr13
+#define A11     $xr14
+#define A12     $xr15
+#define A13     $xr16
+#define A14     $xr17
+#define A15     $xr18
+#define TP0     $xr19
+#define TP1     $xr20
+#define TP2     $xr21
+#define TP3     $xr22
+#define TP4     $xr23
+#define TP5     $xr24
+#define TP6     $xr25
+#define TP7     $xr26
+#define TMP0    $xr27
+#define TMP1    $xr28
+#define TMP2    $xr29
+#define Y0      $xr3
+#define Y1      $xr4
+#define Y2      $xr5
+#define Y3      $xr6
+#define Y4      $xr7
+#define Y5      $xr8
+#define Y6      $xr9
+#define Y7      $xr10
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+#define GXCONJ1 0
+#define GCONJ1  0
+#else
+#define GXCONJ1 1
+#define GCONJ1  0
+#endif
+
+#if !defined(XCONJ)
+#define GXCONJ2 0
+#define GCONJ2  0
+#else
+#define GXCONJ2 0
+#define GCONJ2  1
+#endif
+
+.macro ZERO_Y8
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
+                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
+.endm
+
+.macro ZERO_Y1
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+.macro CLOAD_X8
+    GLD xv, , X0, X, 0x00, X1, X, 0x20
+.endm
+
+.macro CLOAD_X8_GAP
+    fld.d       $f1,    X,    0x00
+    fldx.d      $f2,    X,    INC_X
+    PTR_ALSL    T0,     INC_X,      X,      1
+    fld.d       $f3,    T0,   0x00
+    fldx.d      $f4,    T0,   INC_X
+    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
+    PTR_ALSL    T0,     INC_X,      X,      2
+    fld.d       $f2,    T0,   0x00
+    fldx.d      $f3,    T0,   INC_X
+    PTR_ALSL    T0,     INC_X,      T0,     1
+    fld.d       $f4,    T0,   0x00
+    fldx.d      $f5,    T0,   INC_X
+    GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
+.endm
+
+.macro CGEMV_T_8x8
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, s, TP0, A0,  X0, TP0, TMP0, TMP1, TMP2, TP0, A1,  X1, TP0, TMP0, TMP1, TMP2, \
+    TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2, \
+    TP2, A4,  X0, TP2, TMP0, TMP1, TMP2, TP2, A5,  X1, TP2, TMP0, TMP1, TMP2, \
+    TP3, A6,  X0, TP3, TMP0, TMP1, TMP2, TP3, A7,  X1, TP3, TMP0, TMP1, TMP2, \
+    TP4, A8,  X0, TP4, TMP0, TMP1, TMP2, TP4, A9,  X1, TP4, TMP0, TMP1, TMP2, \
+    TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \
+    TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \
+    TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_T_LASX XW:req, X8:req
+    PTR_SRLI  J,      N,      3
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8
+.L_\XW\()_N_L8:
+    ZERO_Y8
+    move      X,      X_ORG
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    CLOAD_\X8
+    CGEMV_T_8x8
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  3
+    bnez        I,  .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    // Accumulated
+    GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
+                        Y5, TP5, Y6, TP6, Y7, TP7
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.d   $f1,    X,      0x00
+    fld.d   $f11,   PA0,    0x00
+    fld.d   $f12,   PA1,    0x00
+    fld.d   $f13,   PA2,    0x00
+    fld.d   $f14,   PA3,    0x00
+    fld.d   $f15,   PA4,    0x00
+    fld.d   $f16,   PA5,    0x00
+    fld.d   $f17,   PA6,    0x00
+    fld.d   $f18,   PA7,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#else
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#endif
+
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, s, A0, A8,  X0, A0, TMP0, TMP1, TMP2, A1, A9,  X0, A1, TMP0, TMP1, TMP2, \
+    A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \
+    A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \
+    A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2
+
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    PY1, INC_Y,  Y,  2
+    fld.d   $f15,   PY1,    0x00
+    fldx.d  $f16,   PY1,    INC_Y
+    PTR_ALSL    PY2, INC_Y,  PY1, 1
+    fld.d   $f17,   PY2,    0x00
+    fldx.d  $f18,   PY2,    INC_Y
+
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    xvf, s, A8,  VALPHA, A0, A8,  TMP0, TMP1, TMP2, A9,  VALPHA, A1, A9,  TMP0, TMP1, TMP2,\
+    A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\
+    A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\
+    A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    fst.d   $f15,   PY1,    0x00
+    fstx.d  $f16,   PY1,    INC_Y
+    fst.d   $f17,   PY2,    0x00
+    fstx.d  $f18,   PY2,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      7
+    beqz        J,      .L_END
+    PTR_SUB     K_LDA,  LDA,    M8
+.L_\XW\()_N_1:
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.d   $f3,    PA0,    0x00
+    fld.d   $f1,    X,      0x00
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x08
+    bnez      I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI  J,      J,      -1
+    fld.d     $f3,    Y,      0x00
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    fst.d     $f3,    Y,      0x00
+    PTR_ADD   PA0,    PA0,    K_LDA
+    PTR_ADD   Y,      Y,      INC_Y
+    bnez      J,      .L_\XW\()_N_1
+
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 8, 30
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    // Init VALPHA
+    xvpackev.w      $xr0,   $xr1,   $xr0
+    xvreplve0.d     VALPHA, $xr0
+    move     X_ORG,  X
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    CGEMV_T_LASX GAP_0, X8
+.L_GAP_1: /* if (incx != 1) */
+    CGEMV_T_LASX GAP_1, X8_GAP
+.L_END:
+    pop_if_used 17 + 8, 30
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S
index 694dcdaa9..fee46d63e 100644
--- a/kernel/loongarch64/loongarch64_asm.S
+++ b/kernel/loongarch64/loongarch64_asm.S
@@ -384,6 +384,246 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endif
 .endm
 
+//
+// GCOMPLEXACC: Complex accumulate the values of vector registers
+// pre_op: xvf or vf, differentiate between LSX or LASX instruction
+// suf_op: s or d, differentiate between single precision or double precision complex numbers
+// Note: When "pre_op = xvf && suf_op = s", in will be modified.
+//
+.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvpermi.q              \out,   \in,    0x01
+.ifeqs "\suf_op", "s"
+    \pre_op\()add.\suf_op  \in,    \out,   \in
+    xvpackod.d             \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.else
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+
+.ifeqs "\pre_op", "vf"
+.ifeqs "\suf_op", "s"
+    vpackod.d              \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+
+.ifnb \more
+    GCOMPLEXACC \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// GCOMPLEXMUL: Complex multiplication, out = in0 * in1
+// xconj: default value 0.
+// if !(xconj)
+//     out_r = in0_r * in1_r - in0_i * in1_i;
+//     out_i = in0_r * in1_i + in0_i * in1_r;
+// else
+//     out_r = in0_r * in1_r + in0_i * in1_i;
+//     out_i = in0_r * in1_i - in0_i * in1_r;
+// pre_op: xvf or vf, differentiate between LSX or LASX instruction
+// suf_op: s or d, differentiate between single precision or double precision complex numbers
+//
+.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvxor.v           \tmp1,     \tmp1,     \tmp1
+.ifeqs "\suf_op", "s"
+    xvpackev.w  \tmp0,     \in0,      \in0
+.else
+    xvpackev.d  \tmp0,     \in0,      \in0
+.endif
+.else
+    vxor.v           \tmp1,     \tmp1,     \tmp1
+.ifeqs "\suf_op", "s"
+    vpackev.w  \tmp0,     \in0,      \in0
+.else
+    vpackev.d  \tmp0,     \in0,      \in0
+.endif
+.endif
+
+    \pre_op\()sub.\suf_op    \tmp1,     \tmp1,     \in0
+
+.ifeqs "\pre_op", "xvf"
+.ifeqs "\suf_op", "s"
+.ifeqs "\xconj", "0"
+    xvpackod.w  \tmp1,     \in0,      \tmp1
+.else
+    xvpackod.w  \tmp1,     \tmp1,     \in0
+.endif
+    xvshuf4i.w  \tmp2,     \in1,      0xb1
+.else
+.ifeqs "\xconj", "0"
+    xvpackod.d  \tmp1,     \in0,      \tmp1
+.else
+    xvpackod.d  \tmp1,     \tmp1,     \in0
+.endif
+    xvshuf4i.d  \tmp2,     \in1,      0x0b
+.endif
+.else
+.ifeqs "\suf_op", "s"
+.ifeqs "\xconj", "0"
+    vpackod.w  \tmp1,     \in0,      \tmp1
+.else
+    vpackod.w  \tmp1,     \tmp1,     \in0
+.endif
+    vshuf4i.w  \tmp2,     \in1,      0xb1
+.else
+.ifeqs "\xconj", "0"
+    vpackod.d  \tmp1,     \in0,      \tmp1
+.else
+    vpackod.d  \tmp1,     \tmp1,     \in0
+.endif
+    vshuf4i.d  \tmp2,     \in1,      0x0b
+.endif
+.endif
+
+    \pre_op\()mul.\suf_op    \out,      \tmp0,     \in1
+    \pre_op\()madd.\suf_op   \out,      \tmp1,     \tmp2,     \out
+
+.ifnb \more
+    GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2
+// xconj: default value 0
+// conj: default value 0
+// if !(CONJ)
+// if !(XCONJ)
+//     out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
+//     out_i = in0_r * in1_i + in0_i * in1_r + in2_i;
+// else
+//     out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
+//     out_i = in0_r * in1_i - in0_i * in1_r + in2_i;
+// else
+// if !(XCONJ)
+//     out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
+//     out_i = in2_i - (in0_r * in1_i - in0_i * in1_r);
+// else
+//     out_r = in0_r * in1_r   - in0_i * in1_i + in2_r;
+//     out_i = in2_i - (in0_r * in1_i + in0_i * in1_r);
+// pre_op: xvf or vf, differentiate between LSX or LASX instruction
+// suf_op: s or d, differentiate between single precision or double precision complex numbers
+//
+.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvxor.v           \tmp1,     \tmp1,     \tmp1
+.ifeqs "\suf_op", "s"
+    xvpackev.w  \tmp0,     \in0,      \in0
+.else
+    xvpackev.d  \tmp0,     \in0,      \in0
+.endif
+.else
+    vxor.v           \tmp1,     \tmp1,     \tmp1
+.ifeqs "\suf_op", "s"
+    vpackev.w  \tmp0,     \in0,      \in0
+.else
+    vpackev.d  \tmp0,     \in0,      \in0
+.endif
+.endif
+
+    \pre_op\()madd.\suf_op   \tmp2,      \tmp0,     \in1,      \in2
+.ifeqs "\conj", "1"
+    \pre_op\()nmsub.\suf_op  \tmp0,      \tmp0,     \in1,      \in2
+.ifeqs "\pre_op", "xvf"
+.ifeqs "\suf_op", "s"
+    xvshuf4i.w  \tmp0,     \tmp0,      0xb1
+    xvpackev.w  \out,      \tmp0,      \tmp2
+.else
+    xvshuf4i.d  \tmp0,     \tmp0,      0x0b
+    xvpackev.d  \out,      \tmp0,      \tmp2
+.endif
+.else
+.ifeqs "\suf_op", "s"
+    vshuf4i.w  \tmp0,     \tmp0,      0xb1
+    vpackev.w  \out,      \tmp0,      \tmp2
+.else
+    vshuf4i.d  \tmp0,     \tmp0,      0x0b
+    vpackev.d  \out,      \tmp0,      \tmp2
+.endif
+.endif /* pre_op = xvf */
+.else
+    \pre_op\()add.\suf_op    \out,     \tmp2,     \tmp1
+.endif /* conj = 1 */
+
+    \pre_op\()sub.\suf_op    \tmp1,     \tmp1,     \in0
+
+.ifeqs "\pre_op", "xvf"
+.ifeqs "\suf_op", "s"
+.ifeqs "\conj",  "0"
+.ifeqs "\xconj", "0"
+    xvpackod.w  \tmp1,     \in0,      \tmp1
+.else
+    xvpackod.w  \tmp1,     \tmp1,     \in0
+.endif
+.else
+.ifeqs "\xconj", "0"
+    xvpackod.w  \tmp1,     \in0,     \in0
+.else
+    xvpackod.w  \tmp1,     \tmp1,     \tmp1
+.endif
+.endif
+    xvshuf4i.w  \tmp2,     \in1,      0xb1
+.else
+.ifeqs "\conj",  "0"
+.ifeqs "\xconj", "0"
+    xvpackod.d  \tmp1,     \in0,      \tmp1
+.else
+    xvpackod.d  \tmp1,     \tmp1,     \in0
+.endif
+.else
+.ifeqs "\xconj", "0"
+    xvpackod.d  \tmp1,     \in0,     \in0
+.else
+    xvpackod.d  \tmp1,     \tmp1,    \tmp1
+.endif
+.endif
+    xvshuf4i.d  \tmp2,     \in1,      0x0b
+.endif
+.else
+.ifeqs "\suf_op", "s"
+.ifeqs "\conj",  "0"
+.ifeqs "\xconj", "0"
+    vpackod.w  \tmp1,     \in0,      \tmp1
+.else
+    vpackod.w  \tmp1,     \tmp1,     \in0
+.endif
+.else
+.ifeqs "\xconj", "0"
+    vpackod.w  \tmp1,     \in0,     \in0
+.else
+    vpackod.w  \tmp1,     \tmp1,    \tmp1
+.endif
+.endif
+    vshuf4i.w  \tmp2,     \in1,      0xb1
+.else
+.ifeqs "\conj",  "0"
+.ifeqs "\xconj", "0"
+    vpackod.d  \tmp1,     \in0,      \tmp1
+.else
+    vpackod.d  \tmp1,     \tmp1,     \in0
+.endif
+.else
+.ifeqs "\xconj", "0"
+    vpackod.d  \tmp1,     \in0,     \in0
+.else
+    vpackod.d  \tmp1,     \tmp1,    \tmp1
+.endif
+.endif
+    vshuf4i.d  \tmp2,     \in1,      0x0b
+.endif
+.endif
+
+    \pre_op\()madd.\suf_op   \out,      \tmp1,     \tmp2,     \out
+
+.ifnb \more
+    GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more
+.endif
+.endm
+
 //
 // Media Related Macros
 //

From 990507e3b8f833de26da1b2ab6dff0972ae65c3f Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Thu, 22 Feb 2024 11:41:15 +0800
Subject: [PATCH 682/718] LoongArch64: Opt zgemv with LASX

---
 kernel/loongarch64/KERNEL.LOONGSON3R5 |   3 +
 kernel/loongarch64/zgemv_n_4_lasx.S   | 343 ++++++++++++++++++++++++++
 kernel/loongarch64/zgemv_t_4_lasx.S   | 299 ++++++++++++++++++++++
 3 files changed, 645 insertions(+)
 create mode 100644 kernel/loongarch64/zgemv_n_4_lasx.S
 create mode 100644 kernel/loongarch64/zgemv_t_4_lasx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index 3b2ee6e55..9b55d1bbb 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -139,6 +139,9 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
 ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+ZGEMVNKERNEL = zgemv_n_4_lasx.S
+ZGEMVTKERNEL = zgemv_t_4_lasx.S
+
 ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S
new file mode 100644
index 000000000..98b1a6f7d
--- /dev/null
+++ b/kernel/loongarch64/zgemv_n_4_lasx.S
@@ -0,0 +1,343 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2024/02/20 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M16     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $xr1
+#define X0      $xr2
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10
+#define Y1      $xr11
+#define A0      $xr12
+#define A1      $xr13
+#define A2      $xr14
+#define A3      $xr15
+#define A4      $xr16
+#define A5      $xr17
+#define A6      $xr18
+#define A7      $xr19
+#define A8      $xr20
+#define A9      $xr21
+#define A10     $xr22
+#define A11     $xr23
+#define A12     $xr24
+#define A13     $xr25
+#define A14     $xr26
+#define A15     $xr27
+#define TMP0    $xr28
+#define TMP1    $xr29
+#define TMP2    $xr30
+
+#if !defined(CONJ)
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  0
+#else
+#define GXCONJ 1
+#define GCONJ  0
+#endif
+#else
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  1
+#else
+#define GXCONJ 1
+#define GCONJ  1
+#endif
+#endif
+
+.macro ZLOAD_X_4
+    GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30
+    GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
+    GCOMPLEXMUL GXCONJ, \
+    xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
+    X1, X1, VALPHA, TMP0, TMP1, TMP2, \
+    X2, X2, VALPHA, TMP0, TMP1, TMP2, \
+    X3, X3, VALPHA, TMP0, TMP1, TMP2
+.endm
+
+.macro ZLOAD_X_4_GAP
+    xvld        X0,     X,      0
+    xvpermi.q   X0,     X0,     0
+
+    PTR_ADD     T0,     X,      INC_X
+    xvld        X1,     T0,     0
+    xvpermi.q   X1,     X1,     0
+
+    PTR_ADD     T0,     T0,     INC_X
+    xvld        X2,     T0,     0
+    xvpermi.q   X2,     X2,     0
+
+    PTR_ADD     T0,     T0,     INC_X
+    xvld        X3,     T0,     0
+    xvpermi.q   X3,     X3,     0
+
+    GCOMPLEXMUL GXCONJ, \
+    xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
+    X1, X1, VALPHA, TMP0, TMP1, TMP2, \
+    X2, X2, VALPHA, TMP0, TMP1, TMP2, \
+    X3, X3, VALPHA, TMP0, TMP1, TMP2
+.endm
+
+.macro ZLOAD_Y_4
+    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro ZLOAD_Y_4_GAP
+    vld  $vr10, Y, 0
+    vldx $vr13, Y, INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1
+    vld  $vr11, T0, 0
+    vldx $vr14, T0, INC_Y
+    GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02
+.endm
+
+.macro ZGEMV_N_4x4
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, d, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, Y1, X2, A5,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, Y1, X3, A7,  Y1, TMP0, TMP1, TMP2
+.endm
+
+.macro ZSTORE_Y_4
+    GST xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro ZSTORE_Y_4_GAP
+    xvstelm.d   Y0,     Y,      0,      0
+    xvstelm.d   Y0,     Y,      0x08,   1
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      2
+    xvstelm.d   Y0,     T0,     0x08,   3
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      0
+    xvstelm.d   Y1,     T0,     0x08,   1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      2
+    xvstelm.d   Y1,     T0,     0x08,   3
+.endm
+
+.macro ZLOAD_Y_1
+    vld $vr10, Y, 0
+.endm
+
+.macro ZGEMV_N_1x4
+    GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, d, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro ZSTORE_Y_1
+    vst $vr10, Y, 0
+.endm
+
+.macro ZLOAD_X_1
+    GLD xv, , X0, X, 0x00
+    GPERMI xv, q, X0, X0, 0
+    GCOMPLEXMUL GXCONJ, \
+    xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2
+.endm
+
+.macro ZGEMV_N_1x1
+    GLD_INC v, , 0x10, $vr12, PA0, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, d, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
+    PTR_SRLI  J,      N,      2
+    beqz      J,      .L_\XW\()_N_3
+    PTR_SLLI  K_LDA,  LDA,    2
+    PTR_SUB   K_LDA,  K_LDA,  M16
+.L_\XW\()_N_L4:
+    ZLOAD_\X_4
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       2
+    beqz      I,      .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4:
+    ZLOAD_\Y_4
+    ZGEMV_N_4x4
+    ZSTORE_\Y_4
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+    PTR_ADDI    K,      K,      4
+    bnez        I,      .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    ZLOAD_\Y_1
+    ZGEMV_N_1x4
+    ZSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2
+    bnez        J,      .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi        J,      N,      3
+    beqz        J,      .L_END
+.L_\XW\()_N_L1:
+    ZLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    ZLOAD_\Y_1
+    ZGEMV_N_1x1
+    ZSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI    J,      J,      -1
+    PTR_SUB     K_LDA,  LDA,    M16
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     X,      X,      INC_X
+    bnez        J,      .L_\XW\()_N_L1
+
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 7, 31
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1
+    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
+    // Init VALPHA
+    xvpackev.d      $xr0,   $xr1,   $xr0
+    xvreplve0.q     VALPHA, $xr0
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0 // Obtain the offset address
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 31
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
+
diff --git a/kernel/loongarch64/zgemv_t_4_lasx.S b/kernel/loongarch64/zgemv_t_4_lasx.S
new file mode 100644
index 000000000..4d33b8f96
--- /dev/null
+++ b/kernel/loongarch64/zgemv_t_4_lasx.S
@@ -0,0 +1,299 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2024/02/20 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M16     $r30
+
+#define VALPHA  $xr0
+#define X0      $xr1
+#define X1      $xr2
+#define A0      $xr3
+#define A1      $xr4
+#define A2      $xr5
+#define A3      $xr6
+#define A4      $xr7
+#define A5      $xr8
+#define A6      $xr9
+#define A7      $xr10
+#define A8      $xr11
+#define A9      $xr12
+#define A10     $xr13
+#define A11     $xr14
+#define A12     $xr15
+#define A13     $xr16
+#define A14     $xr17
+#define A15     $xr18
+#define TP0     $xr19
+#define TP1     $xr20
+#define TP2     $xr21
+#define TP3     $xr22
+#define TP4     $xr23
+#define TP5     $xr24
+#define TP6     $xr25
+#define TP7     $xr26
+#define TMP0    $xr27
+#define TMP1    $xr28
+#define TMP2    $xr29
+#define Y0      $xr3
+#define Y1      $xr4
+#define Y2      $xr5
+#define Y3      $xr6
+#define Y4      $xr7
+#define Y5      $xr8
+#define Y6      $xr9
+#define Y7      $xr10
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+#define GXCONJ1 0
+#define GCONJ1  0
+#else
+#define GXCONJ1 1
+#define GCONJ1  0
+#endif
+
+#if !defined(XCONJ)
+#define GXCONJ2 0
+#define GCONJ2  0
+#else
+#define GXCONJ2 0
+#define GCONJ2  1
+#endif
+
+.macro ZERO_Y4
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+
+.macro ZERO_Y1
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+.macro ZLOAD_X4
+    GLD xv, , X0, X, 0x00, X1, X, 0x20
+.endm
+
+.macro ZLOAD_X4_GAP
+    xvld        X0,     X,      0
+
+    PTR_ADD     T0,     X,      INC_X
+    xvld        A0,     T0,     0
+    xvpermi.q   X0,     A0,     0x02
+
+    PTR_ADD     T0,     T0,     INC_X
+    xvld        X1,     T0,     0
+
+    PTR_ADD     T0,     T0,     INC_X
+    xvld        A0,     T0,     0
+    xvpermi.q   X1,     A0,     0x02
+.endm
+
+.macro ZGEMV_T_4x4
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, d, TP0, A0,  X0, TP0, TMP0, TMP1, TMP2, TP0, A1,  X1, TP0, TMP0, TMP1, TMP2, \
+    TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2, \
+    TP2, A4,  X0, TP2, TMP0, TMP1, TMP2, TP2, A5,  X1, TP2, TMP0, TMP1, TMP2, \
+    TP3, A6,  X0, TP3, TMP0, TMP1, TMP2, TP3, A7,  X1, TP3, TMP0, TMP1, TMP2
+.endm
+
+.macro ZGEMV_T_LASX XW:req, X4:req
+    PTR_SRLI  J,      N,      2
+    beqz      J,      .L_\XW\()_N_3
+    PTR_SLLI  K_LDA,  LDA,    2
+    PTR_SUB   K_LDA,  K_LDA,  M16
+.L_\XW\()_N_L4:
+    ZERO_Y4
+    move      X,      X_ORG
+    PTR_SRLI  I,      M,       2
+    beqz      I,      .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4:
+    ZLOAD_\X4
+    ZGEMV_T_4x4
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  2
+    bnez        I,  .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    // Accumulated
+    GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    GLD xv, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00, A10, PA2, 0x00, A11, PA3, 0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
+#else
+    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
+#endif
+
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, d, A0, A8,  X0, A0, TMP0, TMP1, TMP2, A1, A9,  X0, A1, TMP0, TMP1, TMP2, \
+    A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
+
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    xvld  A8, Y, 0x00
+    xvldx A9, Y, INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    xvld  A10, PY0, 0x00
+    xvldx A11, PY0, INC_Y
+
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    xvf, d, A8,  VALPHA, A0, A8,  TMP0, TMP1, TMP2, A9,  VALPHA, A1, A9,  TMP0, TMP1, TMP2,\
+    A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    vst  $vr11, Y,   0x00
+    vstx $vr12, Y,   INC_Y
+    vst  $vr13, PY0, 0x00
+    vstx $vr14, PY0, INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+    bnez        J,      .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi        J,      N,      3
+    beqz        J,      .L_END
+    PTR_SUB     K_LDA,  LDA,    M16
+.L_\XW\()_N_1:
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    GLD xv, , A0, PA0, 0x00, X0, X, 0x00
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x10
+    bnez      I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI  J,      J,      -1
+    xvld      A0,     Y,      0x00
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    vst       $vr3,   Y,      0x00
+    PTR_ADD   PA0,    PA0,    K_LDA
+    PTR_ADD   Y,      Y,      INC_Y
+    bnez      J,      .L_\XW\()_N_1
+
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 8, 30
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
+    // Init VALPHA
+    xvpackev.d      $xr0,   $xr1,   $xr0
+    xvreplve0.q     VALPHA, $xr0
+    move     X_ORG,  X
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    ZGEMV_T_LASX GAP_0, X4
+.L_GAP_1: /* if (incx != 1) */
+    ZGEMV_T_LASX GAP_1, X4_GAP
+.L_END:
+    pop_if_used 17 + 8, 30
+    jirl    $r0, $r1, 0x0
+    EPILOGUE

From 892f8ff3e55e24fda9af3f6364319cce3f60116b Mon Sep 17 00:00:00 2001
From: Ayappan Perumal <ayappap2@in.ibm.com>
Date: Thu, 22 Feb 2024 07:05:37 -0600
Subject: [PATCH 683/718] Shared library support for AIX

---
 Makefile         |  3 +++
 Makefile.system  |  4 ----
 exports/Makefile | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index b344abcd2..19bab2915 100644
--- a/Makefile
+++ b/Makefile
@@ -152,6 +152,9 @@ endif
 ifeq ($(OSNAME), CYGWIN_NT)
 	@$(MAKE) -C exports dll
 endif
+ifeq ($(OSNAME), AIX)
+	@$(MAKE) -C exports so
+endif
 endif
 
 tests : shared
diff --git a/Makefile.system b/Makefile.system
index 0088eaff5..f7ccc7746 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1707,11 +1707,7 @@ endif
 
 LIBDLLNAME   = $(LIBPREFIX).dll
 IMPLIBNAME   = lib$(LIBNAMEBASE).dll.a
-ifneq ($(OSNAME), AIX)
 LIBSONAME    = $(LIBNAME:.$(LIBSUFFIX)=.so)
-else
-LIBSONAME    = $(LIBNAME:.$(LIBSUFFIX)=.a)
-endif
 LIBDYNNAME   = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
 LIBDEFNAME   = $(LIBNAME:.$(LIBSUFFIX)=.def)
 LIBEXPNAME   = $(LIBNAME:.$(LIBSUFFIX)=.exp)
diff --git a/exports/Makefile b/exports/Makefile
index 7682f851d..cf948ccb2 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -73,6 +73,10 @@ endif
 endif
 endif
 
+ifeq ($(F_COMPILER)$(OSNAME), IBMAIX)
+EXTRALIB += -lxlf90
+endif
+
 ifeq ($(C_COMPILER), PGI)
 EXTRALIB += -pgf90libs
 endif
@@ -248,6 +252,20 @@ endif
 
 ifeq ($(OSNAME), AIX)
 
+so : ../$(LIBSONAME) linktest.c
+	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK.
+	rm -f linktest
+
+../$(LIBSONAME) : aix.exp
+	$(CC) $(CFLAGS) $(LDFLAGS)  -shared -o ../$(LIBSONAME) \
+	-Wl,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)
+
+aix.exp :
+	/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \
+	|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
+	{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
+	/usr/bin/sort -u > aix.exp
+
 ifeq ($(COMPILER_F77), xlf)
 
 goto32.$(SUFFIX) : ../$(LIBNAME) aix.def

From 82b81c0bbee5d1344dcd2fb0f50cba6a7c0ded9f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Feb 2024 22:11:50 +0100
Subject: [PATCH 684/718] Dont fail if there is no Fortran compiler

---
 cmake/f_check.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake
index df3a4858d..4c4f5ac04 100644
--- a/cmake/f_check.cmake
+++ b/cmake/f_check.cmake
@@ -64,6 +64,7 @@ else ()
     "#define NEEDBUNDERSCORE 1\n")
 endif()
 
+if (CMAKE_Fortran_COMPILER)
 get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
 string(TOUPPER ${F_COMPILER} F_COMPILER)
-
+endif()

From 8fc2c2db043eda5e415c1673779c8bdcd91870fd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Feb 2024 22:14:13 +0100
Subject: [PATCH 685/718] Fix missing support for INTERFACE64 on ARM64 and
 MIPS64

---
 cmake/fc.cmake | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index 5c30be843..bc85a2921 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -6,9 +6,6 @@
 if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
   # This is for classic Flang. LLVM Flang is handled with gfortran below.
   set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
-  if (BINARY64 AND INTERFACE64)
-    set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
-  endif ()
   if (USE_OPENMP)
     set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
   endif ()
@@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
     if (MIPS64)
       if (BINARY64)
         set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
+        if (INTERFACE64)
+          set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+	endif ()
       else ()
         set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
       endif ()
@@ -83,6 +83,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
         endif ()
       endif ()
     endif ()
+    if (ARM64 AND INTERFACE64)
+          set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+    endif ()
   else ()
     if (BINARY64)
       set(FCOMMON_OPT "${FCOMMON_OPT} -m64")

From 3516fff378cf5d9153d18c86d9a117b0976e777d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Feb 2024 22:15:28 +0100
Subject: [PATCH 686/718] Avoid linking both libgomp and libomp in mixed
 clang/gfortran builds

---
 ctest/CMakeLists.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt
index 91338b73b..6e0a7f309 100644
--- a/ctest/CMakeLists.txt
+++ b/ctest/CMakeLists.txt
@@ -40,6 +40,10 @@ else()
     c_${float_char}blas1.c)
 endif()
   target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+        string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(x${float_char}cblat1 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
       target_link_libraries(x${float_char}cblat1 m)
   endif()
@@ -65,6 +69,10 @@ else()
     constant.c)
 endif()
   target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+        string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(x${float_char}cblat2 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
     target_link_libraries(x${float_char}cblat2 m)
   endif()
@@ -90,6 +98,10 @@ else()
     constant.c)
 endif()
   target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+        string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(x${float_char}cblat3 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
     target_link_libraries(x${float_char}cblat3 m)
   endif()

From 4adfe4d53185233bfbeeb362734a2d80814d4457 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Feb 2024 22:16:01 +0100
Subject: [PATCH 687/718] Avoid linking both libgomp and libomp in mixed
 clang/gfortran builds

---
 test/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d68b12d87..ace20dffc 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -21,10 +21,14 @@ endif()
 if (BUILD_COMPLEX16)
 	list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
 endif()
-
+message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID})
 foreach(test_bin ${OpenBLAS_Tests})
 	add_executable(${test_bin} ${test_bin}.f)
 target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
+if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+	string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+target_link_libraries(${test_bin} omp pthread)
+endif()
 endforeach()
 
 # $1 exec, $2 input, $3 output_result

From ca121eb5eda1c635f1353d261de5783e079c21c1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Feb 2024 22:17:05 +0100
Subject: [PATCH 688/718] Avoid linking both libgomp and libomp in mixed
 clang/gfortran builds

---
 lapack-netlib/TESTING/EIG/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt
index e7236677a..b69417853 100644
--- a/lapack-netlib/TESTING/EIG/CMakeLists.txt
+++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt
@@ -107,6 +107,10 @@ set(ZDMDEIGTST zchkdmd.f90)
 macro(add_eig_executable name)
   add_executable(${name} ${ARGN})
   target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+	string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+target_link_libraries(${name} omp pthread)
+endif()
 #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 

From be20588a3c7c0725cd38846ce3408604fbfb2c95 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Feb 2024 22:17:48 +0100
Subject: [PATCH 689/718] Avoid linking both libgomp and libomp in mixed
 clang/gfortran builds

---
 lapack-netlib/TESTING/LIN/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index 143fd0597..9ae0cea79 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -240,6 +240,10 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
 macro(add_lin_executable name)
   add_executable(${name} ${ARGN})
   target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+  if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+	string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(${name} omp pthread)
+  endif()
 #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 

From 16b488cabe8c6113ba41bf140f4dfe3cace6b9ac Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Feb 2024 22:38:05 +0100
Subject: [PATCH 690/718] CI: Add various Apple M1 build configurations to gh
 workflow

---
 .github/workflows/apple_m.yml | 149 ++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 .github/workflows/apple_m.yml

diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml
new file mode 100644
index 000000000..e34eada86
--- /dev/null
+++ b/.github/workflows/apple_m.yml
@@ -0,0 +1,149 @@
+name: apple m
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: macos-14
+
+    strategy:
+      fail-fast: false
+      matrix:
+        build: [cmake, make]
+        fortran: [gfortran]
+        openmp: [0, 1]
+        ilp64: [0, 1]
+        
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Print system information
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            cat /proc/cpuinfo
+          elif [ "$RUNNER_OS" == "macOS" ]; then
+            sysctl -a | grep machdep.cpu
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Install Dependencies
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt-get install -y gfortran cmake ccache libtinfo5
+          elif [ "$RUNNER_OS" == "macOS" ]; then
+            # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
+            brew reinstall gcc
+            brew install coreutils cmake ccache
+            brew install llvm
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          # We include the commit sha in the cache key, as new cache entries are
+          # only created if there is no existing entry for the key yet.
+          # GNU make and cmake call the compilers differently. It looks like
+          # that causes the cache to mismatch. Keep the ccache for both build
+          # tools separate to avoid polluting each other.
+          key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
+          # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
+            ccache-${{ runner.os }}-${{ matrix.build }}
+
+      - name: Configure ccache
+        run: |
+          if [ "${{ matrix.build }}" = "make" ]; then
+            # Add ccache to path
+            if [ "$RUNNER_OS" = "Linux" ]; then
+              echo "/usr/lib/ccache" >> $GITHUB_PATH
+            elif [ "$RUNNER_OS" = "macOS" ]; then
+              echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
+              echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
+              echo "" >>$GITHUB_PATH
+            else
+              echo "::error::$RUNNER_OS not supported"
+              exit 1
+            fi
+          fi
+          # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: Build OpenBLAS
+        run: |
+          export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+          export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+          export CC="/opt/homebrew/opt/llvm/bin/clang"
+          case "${{ matrix.build }}" in
+            "make")
+              make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
+              ;;
+            "cmake")
+              export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
+              mkdir build && cd build
+              cmake -DDYNAMIC_ARCH=1 \
+                    -DUSE_OPENMP=${{matrix.openmp}} \
+                    -DINTERFACE64=${{matrix.ilp64}} \
+                    -DNOFORTRAN=0 \
+                    -DBUILD_WITHOUT_LAPACK=0 \
+                    -DCMAKE_VERBOSE_MAKEFILE=ON \
+                    -DCMAKE_BUILD_TYPE=Release \
+                    -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
+                    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+                    -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
+                    ..
+              cmake --build .
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac
+
+      - name: Show ccache status
+        continue-on-error: true
+        run: ccache -s
+
+      - name: Run tests
+        timeout-minutes: 60
+        run: |
+          case "${{ matrix.build }}" in
+            "make")
+              MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
+              echo "::group::Tests in 'test' directory"
+              make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              echo "::group::Tests in 'ctest' directory"
+              make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              echo "::group::Tests in 'utest' directory"
+              make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              ;;
+            "cmake")
+              cd build && ctest
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac

From 5b953f2f8d3a3f138e4be4ed28624ee25826c968 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 22 Feb 2024 22:41:08 +0100
Subject: [PATCH 691/718] Disable most AppleM1 builds (replaced by gh
 workflows)

---
 .cirrus.yml | 70 ++++++++++++++++++++++++++---------------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index b4c4870d0..9a898f421 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -1,44 +1,44 @@
 macos_instance:
   image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
 
-task:
-  name: AppleM1/LLVM
-  compile_script:
-  - brew install llvm
-  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
-  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
-  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
-  - make TARGET=VORTEX USE_OPENMP=1 CC=clang
+#task:
+#  name: AppleM1/LLVM
+#  compile_script:
+#  - brew install llvm
+#  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+#  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+#  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+#  - make TARGET=VORTEX USE_OPENMP=1 CC=clang
 
-task:
-  name: AppleM1/LLVM/ILP64
-  compile_script:
-  - brew install llvm
-  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
-  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
-  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
-  - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
+#task:
+#  name: AppleM1/LLVM/ILP64
+#  compile_script:
+#  - brew install llvm
+#  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+#  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+#  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+#  - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
 
-task:
-  name: AppleM1/LLVM/CMAKE
-  compile_script:
-  - brew install llvm
-  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
-  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
-  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
-  - mkdir build
-  - cd build
-  - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
-  - make -j 4
+#task:
+#  name: AppleM1/LLVM/CMAKE
+#  compile_script:
+#  - brew install llvm
+#  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+#  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+#  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+#  - mkdir build
+#  - cd build
+#  - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
+#  - make -j 4
 
-task:
-  name: AppleM1/GCC/MAKE/OPENMP
-  compile_script:
-  - brew install gcc@11
-  - export PATH=/opt/homebrew/bin:$PATH
-  - export LDFLAGS="-L/opt/homebrew/lib"
-  - export CPPFLAGS="-I/opt/homebrew/include"
-  - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
+#task:
+#  name: AppleM1/GCC/MAKE/OPENMP
+#  compile_script:
+#  - brew install gcc@11
+#  - export PATH=/opt/homebrew/bin:$PATH
+#  - export LDFLAGS="-L/opt/homebrew/lib"
+#  - export CPPFLAGS="-I/opt/homebrew/include"
+#  - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
   
 macos_instance:
   image: ghcr.io/cirruslabs/macos-monterey-xcode:latest

From bdaa6705ca7b0031223417402e3f87e85fef86ac Mon Sep 17 00:00:00 2001
From: frjohnst <frjohnst@us.ibm.com>
Date: Fri, 23 Feb 2024 10:20:48 -0500
Subject: [PATCH 692/718] fix conlict between PR 4515 and AIX shared obj
 support

---
 exports/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/exports/Makefile b/exports/Makefile
index 15f65aefe..27a291f34 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -315,6 +315,11 @@ test : linktest.c
 
 linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
 	./$(GENSYM) linktest  $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
+ifeq ($(F_COMPILER), IBM)
+	mv linktest.c linktest.c.FIRST
+	egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c
+	rm linktest.c.FIRST
+endif
 
 clean ::
 	@rm -f *.def *.dylib __.SYMDEF* *.renamed

From be5e18c6f94716c6265f9e2cd58517fa138abcf5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 24 Feb 2024 23:55:43 +0100
Subject: [PATCH 693/718] Add kernel definitions for CSUM and ZSUM

---
 kernel/x86_64/KERNEL          | 2 ++
 kernel/x86_64/KERNEL.SKYLAKEX | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index f8278c3b4..ec4290e82 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -489,5 +489,7 @@ XGEMM3MKERNEL    =  xgemm3m_kernel_2x2.S
 
 SSUMKERNEL = ../arm/sum.c
 DSUMKERNEL = ../arm/sum.c
+CSUMKERNEL = zsum_sse.S
+ZSUMKERNEL = zsum_sse2.S
 
 SOMATCOPY_RT = omatcopy_rt.c
diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 548e5dcfc..7e946ef2e 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -46,3 +46,5 @@ ZGEMMKERNEL    =  zgemm_kernel_4x2_skylakex.c
 
 CASUMKERNEL = casum.c
 ZASUMKERNEL = zasum.c
+CSUMKERNEL = csum.c
+ZSUMKERNEL = zsum.c

From 8f8ef3492a3eb753f8e6e1201dfe4c7107c5d779 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 24 Feb 2024 23:57:50 +0100
Subject: [PATCH 694/718] Add CSUM and ZSUM kernels (trivially derived from
 their existing ASUM counterparts)

---
 kernel/x86_64/csum.c                   | 131 +++++++++++
 kernel/x86_64/csum_microk_skylakex-2.c | 289 ++++++++++++++++++++++++
 kernel/x86_64/zsum.c                   | 131 +++++++++++
 kernel/x86_64/zsum_microk_skylakex-2.c | 280 +++++++++++++++++++++++
 kernel/x86_64/zsum_sse.S               | 299 +++++++++++++++++++++++++
 kernel/x86_64/zsum_sse2.S              | 283 +++++++++++++++++++++++
 6 files changed, 1413 insertions(+)
 create mode 100644 kernel/x86_64/csum.c
 create mode 100644 kernel/x86_64/csum_microk_skylakex-2.c
 create mode 100644 kernel/x86_64/zsum.c
 create mode 100644 kernel/x86_64/zsum_microk_skylakex-2.c
 create mode 100644 kernel/x86_64/zsum_sse.S
 create mode 100644 kernel/x86_64/zsum_sse2.S

diff --git a/kernel/x86_64/csum.c b/kernel/x86_64/csum.c
new file mode 100644
index 000000000..e85b5cae1
--- /dev/null
+++ b/kernel/x86_64/csum.c
@@ -0,0 +1,131 @@
+#include "common.h"
+
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
+#include "csum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_CSUM_KERNEL
+static FLOAT csum_kernel(BLASLONG n, FLOAT *x)
+{
+
+    BLASLONG i=0;
+    BLASLONG n_8 = n & -8;
+    FLOAT *x1 = x;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+    FLOAT sum4 = 0.0;
+    
+    while (i < n_8) {
+        sum0 += x1[0];
+        sum1 += x1[1];
+        sum2 += x1[2];
+        sum3 += x1[3];
+        
+        sum0 += x1[4];
+        sum1 += x1[5];
+        sum2 += x1[6];
+        sum3 += x1[7];
+        
+        x1+=8;
+        i+=4;
+    }
+
+     while (i < n) {
+        sum4 += (x1[0] + x1[1]);
+        x1 += 2;
+        i++;
+     }
+
+    return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    BLASLONG ip = 0;
+    BLASLONG inc_x2;
+    FLOAT sumf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+    if (inc_x == 1) {
+        sumf = csum_kernel(n, x);
+    }
+    else {
+        inc_x2 = 2 * inc_x;
+
+        while (i < n) {
+            sumf += x[ip] + x[ip + 1];
+            ip += inc_x2;
+            i++;
+        }
+    }
+
+    return(sumf);
+}
+
+#if defined(SMP)
+static int sum_thread_function(BLASLONG n, 
+        BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+        FLOAT *x, BLASLONG inc_x,
+        FLOAT * dummy3, BLASLONG dummy4,
+        FLOAT * result, BLASLONG dummy5)
+{
+    *(FLOAT *) result = sum_compute(n, x, inc_x);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode, 
+        BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
+        void *a, BLASLONG lda, 
+        void *b, BLASLONG ldb,
+        void *c, BLASLONG ldc,
+        int (*function)(),
+        int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha[2];
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 10000 || inc_x <= 0)
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
+    
+    if (nthreads == 1) {
+        sumf = sum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) *2];
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+        mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, 
+                NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
+        }
+    }
+#else
+    sumf = sum_compute(n, x, inc_x);
+#endif
+    return(sumf);
+}
diff --git a/kernel/x86_64/csum_microk_skylakex-2.c b/kernel/x86_64/csum_microk_skylakex-2.c
new file mode 100644
index 000000000..ec882efa1
--- /dev/null
+++ b/kernel/x86_64/csum_microk_skylakex-2.c
@@ -0,0 +1,289 @@
+/* need a new enough GCC for avx512 support */
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
+
+#define HAVE_CASUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
+{
+    FLOAT *x1 = x;
+    FLOAT sumf=0.0;
+    BLASLONG n2 = n + n;
+    
+    if (n2 < 64) {
+        __m128 accum_10, accum_11, accum_12, accum_13;
+
+        accum_10 = _mm_setzero_ps();
+        accum_11 = _mm_setzero_ps();
+        accum_12 = _mm_setzero_ps();
+        accum_13 = _mm_setzero_ps();
+        
+        _mm_prefetch(&x1[0], _MM_HINT_T0);
+        
+        if (n2 >= 32){
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            __m128 x02 = _mm_loadu_ps(&x1[ 8]);
+            __m128 x03 = _mm_loadu_ps(&x1[12]);
+            
+            _mm_prefetch(&x1[16], _MM_HINT_T0);
+            __m128 x04 = _mm_loadu_ps(&x1[16]);
+            __m128 x05 = _mm_loadu_ps(&x1[20]);
+            __m128 x06 = _mm_loadu_ps(&x1[24]);
+            __m128 x07 = _mm_loadu_ps(&x1[28]);
+
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+            accum_12 = _mm_add_ps(accum_12, x02);
+            accum_13 = _mm_add_ps(accum_13, x03);
+
+            accum_10 = _mm_add_ps(accum_10, x04);
+            accum_11 = _mm_add_ps(accum_11, x05);
+            accum_12 = _mm_add_ps(accum_12, x06);
+            accum_13 = _mm_add_ps(accum_13, x07);
+
+            n2 -= 32;
+            x1 += 32;
+        }
+
+        if (n2 >= 16) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            __m128 x02 = _mm_loadu_ps(&x1[ 8]);
+            __m128 x03 = _mm_loadu_ps(&x1[12]);
+
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+            accum_12 = _mm_add_ps(accum_12, x02);
+            accum_13 = _mm_add_ps(accum_13, x03);
+            
+            n2 -= 16;
+            x1 += 16;
+        }
+
+        if (n2 >= 8) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+
+            n2 -= 8;
+            x1 += 8;
+        }
+        
+        if (n2 >= 4) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            accum_10 = _mm_add_ps(accum_10, x00);
+
+            n2 -= 4;
+            x1 += 4;
+        }
+
+        if (n2) {
+            sumf += (x1[0] + x1[1]);
+        }
+
+        accum_10 = _mm_add_ps(accum_10, accum_11);
+        accum_12 = _mm_add_ps(accum_12, accum_13);
+        accum_10 = _mm_add_ps(accum_10, accum_12);
+
+        accum_10 = _mm_hadd_ps(accum_10, accum_10);
+        accum_10 = _mm_hadd_ps(accum_10, accum_10);
+
+        sumf += accum_10[0];
+    }
+    else {
+        __m512 accum_0, accum_1, accum_2, accum_3;
+        __m512 x00, x01, x02, x03, x04, x05, x06, x07;
+        
+        accum_0 = _mm512_setzero_ps();
+        accum_1 = _mm512_setzero_ps();
+        accum_2 = _mm512_setzero_ps();
+        accum_3 = _mm512_setzero_ps();
+
+        // alignment has side-effect when the size of input array is not large enough
+        if (n2 < 256) {
+            if (n2 >= 128) {
+                x00 = _mm512_loadu_ps(&x1[  0]);
+                x01 = _mm512_loadu_ps(&x1[ 16]);
+                x02 = _mm512_loadu_ps(&x1[ 32]);
+                x03 = _mm512_loadu_ps(&x1[ 48]);
+                x04 = _mm512_loadu_ps(&x1[ 64]);
+                x05 = _mm512_loadu_ps(&x1[ 80]);
+                x06 = _mm512_loadu_ps(&x1[ 96]);
+                x07 = _mm512_loadu_ps(&x1[112]);
+
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+                accum_2 = _mm512_add_ps(accum_2, x02);
+                accum_3 = _mm512_add_ps(accum_3, x03);
+                
+                accum_0 = _mm512_add_ps(accum_0, x04);
+                accum_1 = _mm512_add_ps(accum_1, x05);
+                accum_2 = _mm512_add_ps(accum_2, x06);
+                accum_3 = _mm512_add_ps(accum_3, x07);
+                
+                n2 -= 128;
+                x1 += 128;
+            }
+
+            if (n2 >= 64) {
+                x00 = _mm512_loadu_ps(&x1[ 0]);
+                x01 = _mm512_loadu_ps(&x1[16]);
+                x02 = _mm512_loadu_ps(&x1[32]);
+                x03 = _mm512_loadu_ps(&x1[48]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+                accum_2 = _mm512_add_ps(accum_2, x02);
+                accum_3 = _mm512_add_ps(accum_3, x03);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+
+            if (n2 >= 32) {
+                x00 = _mm512_loadu_ps(&x1[ 0]);
+                x01 = _mm512_loadu_ps(&x1[16]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_loadu_ps(&x1[ 0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2) {
+                uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2));
+                x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+            }
+            accum_0 = _mm512_add_ps(accum_0, accum_1);
+            accum_2 = _mm512_add_ps(accum_2, accum_3);
+            accum_0 = _mm512_add_ps(accum_0, accum_2);
+
+            sumf =  _mm512_reduce_add_ps(accum_0);
+        }
+        // n2 >= 256, doing alignment
+        else {
+
+            int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf;
+
+            if (0 != align_header) {
+                uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header));
+                x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+
+                n2 -= align_header;
+                x1 += align_header;
+            }
+
+            x00 = _mm512_load_ps(&x1[  0]);
+            x01 = _mm512_load_ps(&x1[ 16]);
+            x02 = _mm512_load_ps(&x1[ 32]);
+            x03 = _mm512_load_ps(&x1[ 48]);
+            x04 = _mm512_load_ps(&x1[ 64]);
+            x05 = _mm512_load_ps(&x1[ 80]);
+            x06 = _mm512_load_ps(&x1[ 96]);
+            x07 = _mm512_load_ps(&x1[112]);
+            
+            n2 -= 128;
+            x1 += 128;
+
+            while (n2 >= 128) {
+                
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                x00 = _mm512_load_ps(&x1[  0]);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+                x01 = _mm512_load_ps(&x1[ 16]);
+                accum_2 = _mm512_add_ps(accum_2, x02);
+                x02 = _mm512_load_ps(&x1[ 32]);
+                accum_3 = _mm512_add_ps(accum_3, x03);
+                x03 = _mm512_load_ps(&x1[ 48]);
+                
+                accum_0 = _mm512_add_ps(accum_0, x04);
+                x04 = _mm512_load_ps(&x1[ 64]);
+                accum_1 = _mm512_add_ps(accum_1, x05);
+                x05 = _mm512_load_ps(&x1[ 80]);
+                accum_2 = _mm512_add_ps(accum_2, x06);
+                x06 = _mm512_load_ps(&x1[ 96]);
+                accum_3 = _mm512_add_ps(accum_3, x07);
+                x07 = _mm512_load_ps(&x1[112]);
+
+                n2 -= 128;
+                x1 += 128;
+            }
+            
+            accum_0 = _mm512_add_ps(accum_0, x00);
+            accum_1 = _mm512_add_ps(accum_1, x01);
+            accum_2 = _mm512_add_ps(accum_2, x02);
+            accum_3 = _mm512_add_ps(accum_3, x03);
+            
+            accum_0 = _mm512_add_ps(accum_0, x04);
+            accum_1 = _mm512_add_ps(accum_1, x05);
+            accum_2 = _mm512_add_ps(accum_2, x06);
+            accum_3 = _mm512_add_ps(accum_3, x07);
+
+            if (n2 >= 64) {
+                x00 = _mm512_load_ps(&x1[ 0]);
+                x01 = _mm512_load_ps(&x1[16]);
+                x02 = _mm512_load_ps(&x1[32]);
+                x03 = _mm512_load_ps(&x1[48]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+                accum_2 = _mm512_add_ps(accum_2, x02);
+                accum_3 = _mm512_add_ps(accum_3, x03);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+
+            if (n2 >= 32) {
+                x00 = _mm512_load_ps(&x1[ 0]);
+                x01 = _mm512_load_ps(&x1[16]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_load_ps(&x1[ 0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2) {
+                uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2));
+                x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+            }
+
+            accum_0 = _mm512_add_ps(accum_0, accum_1);
+            accum_2 = _mm512_add_ps(accum_2, accum_3);
+            accum_0 = _mm512_add_ps(accum_0, accum_2);
+            sumf = _mm512_reduce_add_ps(accum_0);
+        }
+    }
+
+    return sumf;
+}
+#endif
+#endif
diff --git a/kernel/x86_64/zsum.c b/kernel/x86_64/zsum.c
new file mode 100644
index 000000000..5973c1253
--- /dev/null
+++ b/kernel/x86_64/zsum.c
@@ -0,0 +1,131 @@
+#include "common.h"
+
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
+#include "zsum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_ZASUM_KERNEL
+static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
+{
+
+    BLASLONG i=0;
+    BLASLONG n_8 = n & -8;
+    FLOAT *x1 = x;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+    FLOAT sum4 = 0.0;
+    
+    while (i < n_8) {
+        sum0 += x1[0];
+        sum1 += x1[1];
+        sum2 += x1[2];
+        sum3 += x1[3];
+        
+        sum0 += x1[4];
+        sum1 += x1[5];
+        sum2 += x1[6];
+        sum3 += x1[7];
+        
+        x1+=8;
+        i+=4;
+    }
+
+     while (i < n) {
+        sum4 += x1[0] + x1[1];
+        x1 += 2;
+        i++;
+     }
+
+    return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;
+    BLASLONG ip = 0;
+    BLASLONG inc_x2;
+    FLOAT sumf = 0.0;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+    if (inc_x == 1) {
+        sumf = zsum_kernel(n, x);
+    }
+    else {
+        inc_x2 = 2 * inc_x;
+
+        while (i < n) {
+            sumf += x[ip] + x[ip + 1];
+            ip += inc_x2;
+            i++;
+        }
+    }
+
+    return(sumf);
+}
+
+#if defined(SMP)
+static int sum_thread_function(BLASLONG n, 
+        BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+        FLOAT *x, BLASLONG inc_x,
+        FLOAT * dummy3, BLASLONG dummy4,
+        FLOAT * result, BLASLONG dummy5)
+{
+    *(FLOAT *) result = sum_compute(n, x, inc_x);
+    return 0;
+}
+
+extern int blas_level1_thread_with_return_value(int mode, 
+        BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
+        void *a, BLASLONG lda, 
+        void *b, BLASLONG ldb,
+        void *c, BLASLONG ldc,
+        int (*function)(),
+        int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha[2];
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 10000 || inc_x <= 0)
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
+    
+    if (nthreads == 1) {
+        sumf = sum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) *2];
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+        mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, 
+                NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);
+        }
+    }
+#else
+    sumf = sum_compute(n, x, inc_x);
+#endif
+    return(sumf);
+}
diff --git a/kernel/x86_64/zsum_microk_skylakex-2.c b/kernel/x86_64/zsum_microk_skylakex-2.c
new file mode 100644
index 000000000..0bca7ce6d
--- /dev/null
+++ b/kernel/x86_64/zsum_microk_skylakex-2.c
@@ -0,0 +1,280 @@
+/* need a new enough GCC for avx512 support */
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
+
+#define HAVE_ZSUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT zsum_kernel(BLASLONG n, FLOAT *x)
+{
+    FLOAT *x1 = x;
+    FLOAT sumf=0.0;
+    BLASLONG n2 = n + n;
+    
+
+    if (n2 < 32) {
+        __m128d accum_10, accum_11, accum_12, accum_13;
+
+        accum_10 = _mm_setzero_pd();
+        accum_11 = _mm_setzero_pd();
+        accum_12 = _mm_setzero_pd();
+        accum_13 = _mm_setzero_pd();
+        
+        _mm_prefetch(&x1[0], _MM_HINT_T0);
+        if (n2 >= 16){
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            __m128d x02 = _mm_loadu_pd(&x1[ 4]);
+            __m128d x03 = _mm_loadu_pd(&x1[ 6]);
+            
+            _mm_prefetch(&x1[8], _MM_HINT_T0);
+            __m128d x04 = _mm_loadu_pd(&x1[ 8]);
+            __m128d x05 = _mm_loadu_pd(&x1[10]);
+            __m128d x06 = _mm_loadu_pd(&x1[12]);
+            __m128d x07 = _mm_loadu_pd(&x1[14]);
+
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+            accum_12 = _mm_add_pd(accum_12, x02);
+            accum_13 = _mm_add_pd(accum_13, x03);
+
+            accum_10 = _mm_add_pd(accum_10, x04);
+            accum_11 = _mm_add_pd(accum_11, x05);
+            accum_12 = _mm_add_pd(accum_12, x06);
+            accum_13 = _mm_add_pd(accum_13, x07);
+
+            x1 += 16;
+            n2 -= 16;
+        }
+
+        if (n2 >= 8) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            __m128d x02 = _mm_loadu_pd(&x1[ 4]);
+            __m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+            accum_12 = _mm_add_pd(accum_12, x02);
+            accum_13 = _mm_add_pd(accum_13, x03);
+            
+            n2 -= 8;
+            x1 += 8;
+        }
+
+        if (n2 >= 4) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+
+            n2 -= 4;
+            x1 += 4;
+        }
+        
+        if (n2) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            accum_10 = _mm_add_pd(accum_10, x00);
+        }
+
+        accum_10 = _mm_add_pd(accum_10, accum_11);
+        accum_12 = _mm_add_pd(accum_12, accum_13);
+        accum_10 = _mm_add_pd(accum_10, accum_12);
+
+        accum_10 = _mm_hadd_pd(accum_10, accum_10);
+
+        sumf = accum_10[0];
+    }
+    else {
+        __m512d accum_0, accum_1, accum_2, accum_3;
+        __m512d x00, x01, x02, x03, x04, x05, x06, x07;
+        __m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff);
+        
+        accum_0 = _mm512_setzero_pd();
+        accum_1 = _mm512_setzero_pd();
+        accum_2 = _mm512_setzero_pd();
+        accum_3 = _mm512_setzero_pd();
+
+        // alignment has side-effect when the size of input array is not large enough
+        if (n2 < 128) {
+            if (n2 >= 64) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x02 = _mm512_loadu_pd(&x1[16]);
+                x03 = _mm512_loadu_pd(&x1[24]);
+                x04 = _mm512_loadu_pd(&x1[32]);
+                x05 = _mm512_loadu_pd(&x1[40]);
+                x06 = _mm512_loadu_pd(&x1[48]);
+                x07 = _mm512_loadu_pd(&x1[56]);
+
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+                
+                accum_0 = _mm512_add_pd(accum_0, x04);
+                accum_1 = _mm512_add_pd(accum_1, x05);
+                accum_2 = _mm512_add_pd(accum_2, x06);
+                accum_3 = _mm512_add_pd(accum_3, x07);
+                
+                n2 -= 64;
+                x1 += 64;
+            }
+
+            if (n2 >= 32) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x02 = _mm512_loadu_pd(&x1[16]);
+                x03 = _mm512_loadu_pd(&x1[24]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2 >= 8) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= 8;
+                x1 += 8;
+            }
+
+            if (n2) {
+                unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
+                x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+            }
+            accum_0 = _mm512_add_pd(accum_0, accum_1);
+            accum_2 = _mm512_add_pd(accum_2, accum_3);
+            accum_0 = _mm512_add_pd(accum_0, accum_2);
+            sumf =  _mm512_reduce_add_pd(accum_0);
+        }
+        // n2 >= 128, doing alignment
+        else {
+
+            int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;
+
+            if (0 != align_header) {
+                unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header));
+                x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= align_header;
+                x1 += align_header;
+            }
+
+            x00 = _mm512_load_pd(&x1[ 0]);
+            x01 = _mm512_load_pd(&x1[ 8]);
+            x02 = _mm512_load_pd(&x1[16]);
+            x03 = _mm512_load_pd(&x1[24]);
+            x04 = _mm512_load_pd(&x1[32]);
+            x05 = _mm512_load_pd(&x1[40]);
+            x06 = _mm512_load_pd(&x1[48]);
+            x07 = _mm512_load_pd(&x1[56]);
+            
+            n2 -= 64;
+            x1 += 64;
+
+            while (n2 >= 64) {
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                x00 = _mm512_load_pd(&x1[ 0]);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                x02 = _mm512_load_pd(&x1[16]);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+                x03 = _mm512_load_pd(&x1[24]);
+                
+                accum_0 = _mm512_add_pd(accum_0, x04);
+                x04 = _mm512_load_pd(&x1[32]);
+                accum_1 = _mm512_add_pd(accum_1, x05);
+                x05 = _mm512_load_pd(&x1[40]);
+                accum_2 = _mm512_add_pd(accum_2, x06);
+                x06 = _mm512_load_pd(&x1[48]);
+                accum_3 = _mm512_add_pd(accum_3, x07);
+                x07 = _mm512_load_pd(&x1[56]);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+            
+            accum_0 = _mm512_add_pd(accum_0, x00);
+            accum_1 = _mm512_add_pd(accum_1, x01);
+            accum_2 = _mm512_add_pd(accum_2, x02);
+            accum_3 = _mm512_add_pd(accum_3, x03);
+            
+            accum_0 = _mm512_add_pd(accum_0, x04);
+            accum_1 = _mm512_add_pd(accum_1, x05);
+            accum_2 = _mm512_add_pd(accum_2, x06);
+            accum_3 = _mm512_add_pd(accum_3, x07);
+
+            if (n2 >= 32) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                x02 = _mm512_load_pd(&x1[16]);
+                x03 = _mm512_load_pd(&x1[24]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2 >= 8) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= 8;
+                x1 += 8;
+            }
+
+            if (n2) {
+                unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));
+                x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+            }
+
+            accum_0 = _mm512_add_pd(accum_0, accum_1);
+            accum_2 = _mm512_add_pd(accum_2, accum_3);
+            accum_0 = _mm512_add_pd(accum_0, accum_2);
+            sumf = _mm512_reduce_add_pd(accum_0);
+        }
+    }
+
+    return sumf;
+}
+#endif
+#endif
diff --git a/kernel/x86_64/zsum_sse.S b/kernel/x86_64/zsum_sse.S
new file mode 100644
index 000000000..b679b42b0
--- /dev/null
+++ b/kernel/x86_64/zsum_sse.S
@@ -0,0 +1,299 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M	ARG1	/* rdi */
+#define X	ARG2	/* rsi */
+#define INCX	ARG3	/* rdx */
+
+#define I	%rax
+
+#include "l1param.h"
+
+	PROLOGUE
+	PROFCODE
+
+	SAVEREGISTERS
+
+	pxor	%xmm0, %xmm0
+	testq	M, M
+	jle	.L999
+	testq	INCX, INCX
+	jle	.L999
+
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+
+	salq	$ZBASE_SHIFT, INCX
+
+	cmpq	$2 * SIZE, INCX
+	jne	.L100
+
+	subq	$-32 * SIZE, X
+	addq	M, M
+
+	cmpq	$3, M
+	jle	.L18
+
+	testq	$4, X
+	je	.L05
+	movss	-32 * SIZE(X), %xmm0
+	addq	$SIZE, X
+	decq	M
+	jle	.L998
+	ALIGN_3
+
+.L05:
+	testq	$8, X
+	je	.L10
+
+#ifdef movsd
+	xorps	%xmm1, %xmm1
+#endif
+	movsd	-32 * SIZE(X), %xmm1
+	addq	$2 * SIZE, X
+	subq	$2, M
+	jle	.L998
+	ALIGN_3
+
+.L10:
+	movq	M,  I
+	sarq	$5, I
+	jle	.L14
+
+	movaps	-32 * SIZE(X), %xmm4
+	movaps	-28 * SIZE(X), %xmm5
+	movaps	-24 * SIZE(X), %xmm6
+	movaps	-20 * SIZE(X), %xmm7
+
+	movaps	-16 * SIZE(X), %xmm8
+	movaps	-12 * SIZE(X), %xmm9
+	movaps	 -8 * SIZE(X), %xmm10
+	movaps	 -4 * SIZE(X), %xmm11
+	decq	I
+	jle	.L12
+	ALIGN_3
+
+.L11:
+#ifdef PREFETCH
+	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
+#endif
+
+	addps	%xmm4,  %xmm0
+	movaps	  0 * SIZE(X), %xmm4
+
+	addps	%xmm5,  %xmm1
+	movaps	  4 * SIZE(X), %xmm5
+
+	addps	%xmm6,  %xmm2
+	movaps	  8 * SIZE(X), %xmm6
+
+	addps	%xmm7,  %xmm3
+	movaps	 12 * SIZE(X), %xmm7
+
+#if defined(PREFETCH) && !defined(FETCH128)
+	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
+#endif
+
+	addps	%xmm8,  %xmm0
+	movaps	 16 * SIZE(X), %xmm8
+
+	addps	%xmm9,  %xmm1
+	movaps	 20 * SIZE(X), %xmm9
+
+	addps	%xmm10, %xmm2
+	movaps	 24 * SIZE(X), %xmm10
+
+	addps	%xmm11, %xmm3
+	movaps	 28 * SIZE(X), %xmm11
+
+	subq	$-32 * SIZE, X
+	decq	I
+	jg	.L11
+	ALIGN_3
+
+.L12:
+	addps	%xmm4,  %xmm0
+	addps	%xmm5,  %xmm1
+
+	addps	%xmm6,  %xmm2
+	addps	%xmm7,  %xmm3
+
+	addps	%xmm8,  %xmm0
+	addps	%xmm9,  %xmm1
+
+	addps	%xmm10, %xmm2
+	addps	%xmm11, %xmm3
+
+	addq	$32 * SIZE, X
+	ALIGN_3
+
+.L14:
+	testq	$31,  M
+	jle	.L998
+
+.L15:
+	testq	$16, M
+	je	.L16
+
+	movaps	-32 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-28 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	movaps	-24 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-20 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	addq	$16 * SIZE, X
+	ALIGN_3
+
+.L16:
+	testq	$8, M
+	je	.L17
+
+	movaps	-32 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-28 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	addq	$8 * SIZE, X
+	ALIGN_3
+
+.L17:
+	testq	$4, M
+	je	.L18
+
+	movaps	-32 * SIZE(X), %xmm6
+	addps	%xmm6, %xmm2
+	addq	$4 * SIZE, X
+	ALIGN_3
+
+.L18:
+	testq	$2, M
+	je	.L19
+
+#ifdef movsd
+	xorps	%xmm7, %xmm7
+#endif
+	movsd	-32 * SIZE(X), %xmm7
+	addps	%xmm7, %xmm3
+	addq	$2 * SIZE, X
+	ALIGN_3
+
+.L19:
+	testq	$1, M
+	je	.L998
+
+	movss	-32 * SIZE(X), %xmm6
+	addps	%xmm6, %xmm2
+	jmp	.L998
+	ALIGN_4
+
+.L100:
+	movq	M,  I
+	sarq	$2, I
+	jle	.L105
+	ALIGN_4
+
+.L101:
+	movsd	(X), %xmm4
+	addq	INCX, X
+	movhps	(X), %xmm4
+	addq	INCX, X
+
+	addps	%xmm4,  %xmm0
+
+	movsd	(X), %xmm5
+	addq	INCX, X
+	movhps	(X), %xmm5
+	addq	INCX, X
+
+	addps	%xmm5,  %xmm1
+
+	decq	I
+	jg	.L101
+	ALIGN_4
+
+.L105:
+#ifdef movsd
+	xorps	%xmm4, %xmm4
+#endif
+	andq	$3,  M
+	jle	.L998
+	ALIGN_4
+
+.L106:
+	movsd	(X), %xmm4
+	addps	%xmm4, %xmm0
+	addq	INCX, X
+	decq	M
+	jg	.L106
+	ALIGN_4
+
+.L998:
+	addps	%xmm1, %xmm0
+	addps	%xmm3, %xmm2
+	addps	%xmm2, %xmm0
+
+#ifndef HAVE_SSE3
+	movhlps	%xmm0, %xmm1
+	addps	%xmm1, %xmm0
+
+	movaps	%xmm0, %xmm1
+	shufps  $1, %xmm0, %xmm0
+	addss	 %xmm1, %xmm0
+#else
+	haddps	%xmm0, %xmm0
+	haddps	%xmm0, %xmm0
+#endif
+	ALIGN_4
+
+.L999:
+	RESTOREREGISTERS
+
+	ret
+
+	EPILOGUE
diff --git a/kernel/x86_64/zsum_sse2.S b/kernel/x86_64/zsum_sse2.S
new file mode 100644
index 000000000..6f667164d
--- /dev/null
+++ b/kernel/x86_64/zsum_sse2.S
@@ -0,0 +1,283 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M	ARG1	/* rdi */
+#define X	ARG2	/* rsi */
+#define INCX	ARG3	/* rdx */
+
+#define I	%rax
+
+#include "l1param.h"
+
+	PROLOGUE
+	PROFCODE
+
+	SAVEREGISTERS
+
+	xorps	%xmm0, %xmm0
+	testq	M, M
+	jle	.L999
+	testq	INCX, INCX
+	jle	.L999
+
+	xorps	%xmm1, %xmm1
+	xorps	%xmm2, %xmm2
+	xorps	%xmm3, %xmm3
+
+	salq	$ZBASE_SHIFT, INCX
+
+	cmpq	$2 * SIZE, INCX
+	jne	.L40
+
+	subq	$-16 * SIZE, X
+	addq	M, M
+
+	testq	$SIZE, X
+	je	.L05
+
+#ifdef movsd
+	xorps	%xmm0, %xmm0
+#endif
+	movsd	-16 * SIZE(X), %xmm0
+	addq	$SIZE, X
+
+	subq	$1, M
+	jle	.L999
+	ALIGN_3
+
+.L05:
+	movq	M,  I
+	sarq	$4, I
+	jle	.L20
+
+	movaps	-16 * SIZE(X), %xmm4
+	movaps	-14 * SIZE(X), %xmm5
+	movaps	-12 * SIZE(X), %xmm6
+	movaps	-10 * SIZE(X), %xmm7
+
+	movaps	 -8 * SIZE(X), %xmm8
+	movaps	 -6 * SIZE(X), %xmm9
+	movaps	 -4 * SIZE(X), %xmm10
+	movaps	 -2 * SIZE(X), %xmm11
+
+	decq	I
+	jle	.L11
+	ALIGN_4
+
+.L10:
+#ifdef PREFETCH
+	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
+#endif
+
+	addpd	%xmm4,  %xmm0
+	movaps	  0 * SIZE(X), %xmm4
+
+	addpd	%xmm5,  %xmm1
+	movaps	  2 * SIZE(X), %xmm5
+
+	addpd	%xmm6,  %xmm2
+	movaps	  4 * SIZE(X), %xmm6
+
+	addpd	%xmm7,  %xmm3
+	movaps	  6 * SIZE(X), %xmm7
+
+#if defined(PREFETCH) && !defined(FETCH128)
+	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
+#endif
+
+	addpd	%xmm8,  %xmm0
+	movaps	  8 * SIZE(X), %xmm8
+
+	addpd	%xmm9,  %xmm1
+	movaps	 10 * SIZE(X), %xmm9
+
+	addpd	%xmm10, %xmm2
+	movaps	 12 * SIZE(X), %xmm10
+
+	addpd	%xmm11, %xmm3
+	movaps	 14 * SIZE(X), %xmm11
+
+	subq	$-16 * SIZE, X
+	decq	I
+	jg	.L10
+	ALIGN_4
+
+.L11:
+
+	addpd	%xmm4,  %xmm0
+	addpd	%xmm5,  %xmm1
+	addpd	%xmm6,  %xmm2
+	addpd	%xmm7,  %xmm3
+
+	addpd	%xmm8,  %xmm0
+	addpd	%xmm9,  %xmm1
+	addpd	%xmm10, %xmm2
+	addpd	%xmm11, %xmm3
+
+	subq	$-16 * SIZE, X
+	ALIGN_3
+
+.L20:
+	andq	$15,  M
+	jle	.L998
+
+	testq	$8, M
+	je	.L21
+
+	movaps	-16 * SIZE(X), %xmm4
+	movaps	-14 * SIZE(X), %xmm5
+	movaps	-12 * SIZE(X), %xmm6
+	movaps	-10 * SIZE(X), %xmm7
+
+	addpd	%xmm4, %xmm0
+	addpd	%xmm5, %xmm1
+	addpd	%xmm6, %xmm2
+	addpd	%xmm7, %xmm3
+	addq	$8 * SIZE, X
+	ALIGN_3
+
+.L21:
+	testq	$4, M
+	je	.L22
+
+	movaps	-16 * SIZE(X), %xmm4
+	movaps	-14 * SIZE(X), %xmm5
+
+	addpd	%xmm4, %xmm0
+	addpd	%xmm5, %xmm1
+
+	addq	$4 * SIZE, X
+	ALIGN_3
+
+.L22:
+	testq	$2, M
+	je	.L23
+
+	movaps	-16 * SIZE(X), %xmm6
+	addpd	%xmm6, %xmm3
+	addq	$2 * SIZE, X
+
+.L23:
+	testq	$1, M
+	je	.L998
+
+#ifdef movsd
+	xorps	%xmm4, %xmm4
+#endif
+	movsd	-16 * SIZE(X), %xmm4
+	addsd	%xmm4, %xmm0
+	jmp	.L998
+	ALIGN_3
+
+
+.L40:
+	movq	M,  I
+	sarq	$2, I
+	jle	.L60
+	ALIGN_4
+
+.L50:
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+	prefetcht0	PREFETCHSIZE * SIZE(X)
+#endif
+
+#ifdef PENTIUM4
+	prefetchnta	PREFETCHSIZE * SIZE(X)
+#endif
+
+	movsd	0 * SIZE(X), %xmm4
+	movhpd	1 * SIZE(X), %xmm4
+	addq	INCX, X
+	addpd	%xmm4, %xmm0
+
+	movsd	0 * SIZE(X), %xmm5
+	movhpd	1 * SIZE(X), %xmm5
+	addq	INCX, X
+	addpd	%xmm5, %xmm1
+
+	movsd	0 * SIZE(X), %xmm6
+	movhpd	1 * SIZE(X), %xmm6
+	addq	INCX, X
+	addpd	%xmm6, %xmm2
+
+	movsd	0 * SIZE(X), %xmm7
+	movhpd	1 * SIZE(X), %xmm7
+	addq	INCX, X
+	addpd	%xmm7, %xmm3
+
+	decq	I
+	jg	.L50
+	ALIGN_4
+
+.L60:
+	andq	$3,  M
+	jle	.L998
+	ALIGN_4
+
+
+.L61:
+	movsd	0 * SIZE(X), %xmm4
+	movhpd	1 * SIZE(X), %xmm4
+	addpd	%xmm4, %xmm0
+	addq	INCX, X
+	decq	M
+	jg	.L61
+	ALIGN_4
+
+.L998:
+	addpd	%xmm1, %xmm0
+	addpd	%xmm3, %xmm2
+	addpd	%xmm2, %xmm0
+
+#ifndef HAVE_SSE3
+	movhlps	%xmm0, %xmm1
+	addsd	%xmm1, %xmm0
+#else
+	haddpd	%xmm0, %xmm0
+#endif
+	ALIGN_4
+
+.L999:
+	RESTOREREGISTERS
+
+	ret
+
+	EPILOGUE

From 12787775d92c700d9f5956ea35cbcb0576e4ab2a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Feb 2024 17:55:36 +0100
Subject: [PATCH 695/718] add csum/zsum kernels (trivially derived from the
 asum ones)s)

---
 kernel/arm64/KERNEL.ARMV8SVE     |   3 +
 kernel/arm64/csum_thunderx2t99.c | 247 +++++++++++++++++++++++++++++++
 kernel/arm64/zsum_thunderx2t99.c | 244 ++++++++++++++++++++++++++++++
 3 files changed, 494 insertions(+)
 create mode 100644 kernel/arm64/csum_thunderx2t99.c
 create mode 100644 kernel/arm64/zsum_thunderx2t99.c

diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE
index ccbce27e1..eeb4844bf 100644
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -1,3 +1,6 @@
+
+CSUMKERNEL    = csum_thunderx2t99.c
+ZSUMKERNEL    = zsum_thunderx2t99.c
 SAMINKERNEL  = ../arm/amin.c
 DAMINKERNEL  = ../arm/amin.c
 CAMINKERNEL  = ../arm/zamin.c
diff --git a/kernel/arm64/csum_thunderx2t99.c b/kernel/arm64/csum_thunderx2t99.c
new file mode 100644
index 000000000..874f4eb5a
--- /dev/null
+++ b/kernel/arm64/csum_thunderx2t99.c
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#include <arm_neon.h>
+
+#define	N	"x0"	/* vector length */
+#define	X	"x1"	/* "X" vector address */
+#define	INC_X	"x2"	/* "X" stride */
+#define J	"x5"	/* loop variable */
+
+#define REG0	"wzr"
+#define SUMF	"s0"
+#define SUMFD	"d0"
+
+/******************************************************************************/
+
+#define KERNEL_F1					\
+	"ldr	d1, ["X"]			\n"	\
+	"add	"X", "X", #8			\n"	\
+	"ext	v2.8b, v1.8b, v1.8b, #4		\n"	\
+	"fadd	s1, s1, s2			\n"	\
+	"fadd	"SUMF", "SUMF", s1		\n"
+
+#define KERNEL_F32					\
+	"ldr	q16, ["X"]			\n"	\
+	"ldr	q17, ["X", #16]			\n"	\
+	"ldr	q18, ["X", #32]			\n"	\
+	"ldr	q19, ["X", #48]			\n"	\
+	"ldp	q20, q21, ["X", #64]		\n"	\
+	"ldp	q22, q23, ["X", #96]		\n"	\
+	"ldp	q24, q25, ["X", #128]		\n"	\
+	"ldp	q26, q27, ["X", #160]		\n"	\
+	"fadd	v16.4s, v16.4s, v17.4s		\n"	\
+	"fadd	v18.4s, v18.4s, v19.4s		\n"	\
+	"ldp	q28, q29, ["X", #192]		\n"	\
+	"ldp	q30, q31, ["X", #224]		\n"	\
+	"add	"X", "X", #256			\n"	\
+	"fadd	v20.4s, v20.4s, v21.4s		\n"	\
+	"fadd	v22.4s, v22.4s, v23.4s		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024]		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+64]	\n"	\
+	"fadd	v24.4s, v24.4s, v25.4s		\n"	\
+	"fadd	v26.4s, v26.4s, v27.4s		\n"	\
+	"fadd	v0.4s, v0.4s, v16.4s		\n"	\
+	"fadd	v1.4s, v1.4s, v18.4s		\n"	\
+	"fadd	v2.4s, v2.4s, v20.4s		\n"	\
+	"fadd	v3.4s, v3.4s, v22.4s		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+128]	\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+192]	\n"	\
+	"fadd	v28.4s, v28.4s, v29.4s		\n"	\
+	"fadd	v30.4s, v30.4s, v31.4s		\n"	\
+	"fadd	v4.4s, v4.4s, v24.4s		\n"	\
+	"fadd	v5.4s, v5.4s, v26.4s		\n"	\
+	"fadd	v6.4s, v6.4s, v28.4s		\n"	\
+	"fadd	v7.4s, v7.4s, v30.4s		\n"
+
+#define KERNEL_F32_FINALIZE				\
+	"fadd	v0.4s, v0.4s, v1.4s		\n"	\
+	"fadd	v2.4s, v2.4s, v3.4s		\n"	\
+	"fadd	v4.4s, v4.4s, v5.4s		\n"	\
+	"fadd	v6.4s, v6.4s, v7.4s		\n"	\
+	"fadd	v0.4s, v0.4s, v2.4s		\n"	\
+	"fadd	v4.4s, v4.4s, v6.4s		\n"	\
+	"fadd	v0.4s, v0.4s, v4.4s		\n"	\
+	"ext	v1.16b, v0.16b, v0.16b, #8	\n"	\
+	"fadd	v0.2s, v0.2s, v1.2s		\n"	\
+	"faddp	"SUMF", v0.2s			\n"
+
+#define INIT_S						\
+	"lsl	"INC_X", "INC_X", #3		\n"
+
+#define KERNEL_S1					\
+	"ldr	d1, ["X"]			\n"	\
+	"add	"X", "X", "INC_X"		\n"	\
+	"ext	v2.8b, v1.8b, v1.8b, #4		\n"	\
+	"fadd	s1, s1, s2			\n"	\
+	"fadd	"SUMF", "SUMF", s1		\n"
+
+
+#if defined(SMP)
+extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
+	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
+	void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+
+
+static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	FLOAT  asum = 0.0 ;
+
+	if ( n < 0 )  return(asum);
+
+	__asm__ __volatile__ (
+	"	mov	"N", %[N_]			\n"
+	"	mov	"X", %[X_]			\n"
+	"	mov	"INC_X", %[INCX_]		\n"
+	"	fmov	"SUMF", "REG0"			\n"
+	"	fmov	s1, "REG0"			\n"
+	"	fmov	s2, "REG0"			\n"
+	"	fmov	s3, "REG0"			\n"
+	"	fmov	s4, "REG0"			\n"
+	"	fmov	s5, "REG0"			\n"
+	"	fmov	s6, "REG0"			\n"
+	"	fmov	s7, "REG0"			\n"
+	"	cmp	"N", xzr			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+	"	cmp	"INC_X", xzr			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+	"	cmp	"INC_X", #1			\n"
+	"	bne	5f //asum_kernel_S_BEGIN	\n"
+
+	"1: //asum_kernel_F_BEGIN:			\n"
+	"	asr	"J", "N", #5			\n"
+	"	cmp	"J", xzr			\n"
+	"	beq	3f //asum_kernel_F1		\n"
+
+	"2: //asum_kernel_F32:				\n"
+	"	"KERNEL_F32"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	2b //asum_kernel_F32		\n"
+	"	"KERNEL_F32_FINALIZE"			\n"
+
+	"3: //asum_kernel_F1:				\n"
+	"	ands	"J", "N", #31			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+
+	"4: //asum_kernel_F10:				\n"
+	"	"KERNEL_F1"				\n"
+	"	subs    "J", "J", #1			\n"
+	"	bne	4b //asum_kernel_F10		\n"
+	"	b	9f //asum_kernel_L999		\n"
+
+	"5: //asum_kernel_S_BEGIN:			\n"
+	"	"INIT_S"				\n"
+	"	asr	"J", "N", #2			\n"
+	"	cmp	"J", xzr			\n"
+	"	ble	7f //asum_kernel_S1		\n"
+
+	"6: //asum_kernel_S4:				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	6b //asum_kernel_S4		\n"
+
+	"7: //asum_kernel_S1:				\n"
+	"	ands	"J", "N", #3			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+
+	"8: //asum_kernel_S10:				\n"
+	"	"KERNEL_S1"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	8b //asum_kernel_S10		\n"
+
+	"9: //asum_kernel_L999:				\n"
+	"	fmov	%[ASUM_], "SUMFD"		\n"
+
+	: [ASUM_] "=r" (asum)		//%0
+	: [N_]    "r"  (n),		//%1
+	  [X_]    "r"  (x),		//%2
+	  [INCX_] "r"  (inc_x)		//%3
+	: "cc",
+	  "memory",
+	  "x0", "x1", "x2", "x3", "x4", "x5",
+	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
+	);
+
+	return asum;
+}
+
+#if defined(SMP)
+static int casum_thread_function(BLASLONG n, BLASLONG dummy0,
+	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
+{
+	*result = casum_compute(n, x, inc_x);
+
+	return 0;
+}
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+	int nthreads;
+	FLOAT dummy_alpha;
+#endif
+	FLOAT asum = 0.0;
+
+#if defined(SMP)
+	if (inc_x == 0 || n <= 10000)
+		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
+
+	if (nthreads == 1) {
+		asum = casum_compute(n, x, inc_x);
+	} else {
+		int mode, i;
+		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
+		FLOAT *ptr;
+
+		mode = BLAS_SINGLE  | BLAS_COMPLEX;
+
+		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
+				   x, inc_x, NULL, 0, result, 0,
+				   ( void *)casum_thread_function, nthreads);
+
+		ptr = (FLOAT *)result;
+		for (i = 0; i < nthreads; i++) {
+			asum = asum + (*ptr);
+			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
+		}
+	}
+#else
+	asum = casum_compute(n, x, inc_x);
+#endif
+
+	return asum;
+}
diff --git a/kernel/arm64/zsum_thunderx2t99.c b/kernel/arm64/zsum_thunderx2t99.c
new file mode 100644
index 000000000..087dae2fe
--- /dev/null
+++ b/kernel/arm64/zsum_thunderx2t99.c
@@ -0,0 +1,244 @@
+/***************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#include <arm_neon.h>
+
+#define	N	"x0"	/* vector length */
+#define	X	"x1"	/* "X" vector address */
+#define	INC_X	"x2"	/* "X" stride */
+#define J	"x5"	/* loop variable */
+
+#define REG0	"xzr"
+#define SUMF	"d0"
+#define TMPF	"d1"
+
+/******************************************************************************/
+
+#define KERNEL_F1					\
+	"ldr	q1, ["X"]			\n"	\
+	"add	"X", "X", #16			\n"	\
+	"faddp	d1, v1.2d			\n"	\
+	"fadd	"SUMF", "SUMF", d1		\n"
+
+#define KERNEL_F16					\
+	"ldr	q16, ["X"]			\n"	\
+	"ldr	q17, ["X", #16]			\n"	\
+	"ldr	q18, ["X", #32]			\n"	\
+	"ldr	q19, ["X", #48]			\n"	\
+	"ldp	q20, q21, ["X", #64]		\n"	\
+	"ldp	q22, q23, ["X", #96]		\n"	\
+	"ldp	q24, q25, ["X", #128]		\n"	\
+	"ldp	q26, q27, ["X", #160]		\n"	\
+	"fadd	v16.2d, v16.2d, v17.2d		\n"	\
+	"fadd	v18.2d, v18.2d, v19.2d		\n"	\
+	"ldp	q28, q29, ["X", #192]		\n"	\
+	"ldp	q30, q31, ["X", #224]		\n"	\
+	"add	"X", "X", #256			\n"	\
+	"fadd	v20.2d, v20.2d, v21.2d		\n"	\
+	"fadd	v22.2d, v22.2d, v23.2d		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024]		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+64]	\n"	\
+	"fadd	v24.2d, v24.2d, v25.2d		\n"	\
+	"fadd	v26.2d, v26.2d, v27.2d		\n"	\
+	"fadd	v28.2d, v28.2d, v29.2d		\n"	\
+	"fadd	v30.2d, v30.2d, v31.2d		\n"	\
+	"fadd	v0.2d, v0.2d, v16.2d		\n"	\
+	"fadd	v1.2d, v1.2d, v18.2d		\n"	\
+	"fadd	v2.2d, v2.2d, v20.2d		\n"	\
+	"fadd	v3.2d, v3.2d, v22.2d		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+128]	\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+192]	\n"	\
+	"fadd	v4.2d, v4.2d, v24.2d		\n"	\
+	"fadd	v5.2d, v5.2d, v26.2d		\n"	\
+	"fadd	v6.2d, v6.2d, v28.2d		\n"	\
+	"fadd	v7.2d, v7.2d, v30.2d		\n"
+
+#define KERNEL_F16_FINALIZE				\
+	"fadd	v0.2d, v0.2d, v1.2d		\n"	\
+	"fadd	v2.2d, v2.2d, v3.2d		\n"	\
+	"fadd	v4.2d, v4.2d, v5.2d		\n"	\
+	"fadd	v6.2d, v6.2d, v7.2d		\n"	\
+	"fadd	v0.2d, v0.2d, v2.2d		\n"	\
+	"fadd	v4.2d, v4.2d, v6.2d		\n"	\
+	"fadd	v0.2d, v0.2d, v4.2d		\n"	\
+	"faddp	"SUMF", v0.2d			\n"
+
+#define INIT_S						\
+	"lsl	"INC_X", "INC_X", #4		\n"
+
+#define KERNEL_S1					\
+	"ldr	q1, ["X"]			\n"	\
+	"add	"X", "X", "INC_X"		\n"	\
+	"faddp	d1, v1.2d			\n"	\
+	"fadd	"SUMF", "SUMF", d1		\n"
+
+
+#if defined(SMP)
+extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
+	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
+	void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+
+
+static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	FLOAT  asum = 0.0 ;
+
+	if ( n < 0 )  return(asum);
+
+	__asm__ __volatile__ (
+	"	mov	"N", %[N_]			\n"
+	"	mov	"X", %[X_]			\n"
+	"	mov	"INC_X", %[INCX_]		\n"
+	"	fmov	"SUMF", "REG0"			\n"
+	"	fmov	d1, "REG0"			\n"
+	"	fmov	d2, "REG0"			\n"
+	"	fmov	d3, "REG0"			\n"
+	"	fmov	d4, "REG0"			\n"
+	"	fmov	d5, "REG0"			\n"
+	"	fmov	d6, "REG0"			\n"
+	"	fmov	d7, "REG0"			\n"
+	"	cmp	"N", xzr			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+	"	cmp	"INC_X", xzr			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+	"	cmp	"INC_X", #1			\n"
+	"	bne	5f //asum_kernel_S_BEGIN	\n"
+
+	"1: //asum_kernel_F_BEGIN:			\n"
+	"	asr	"J", "N", #4			\n"
+	"	cmp	"J", xzr			\n"
+	"	beq	3f //asum_kernel_F1		\n"
+
+	".align 5					\n"
+	"2: //asum_kernel_F16:				\n"
+	"	"KERNEL_F16"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	2b //asum_kernel_F16		\n"
+	"	"KERNEL_F16_FINALIZE"			\n"
+
+	"3: //asum_kernel_F1:				\n"
+	"	ands	"J", "N", #15			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+
+	"4: //asum_kernel_F10:				\n"
+	"	"KERNEL_F1"				\n"
+	"	subs    "J", "J", #1			\n"
+	"	bne	4b //asum_kernel_F10		\n"
+	"	b	9f //asum_kernel_L999		\n"
+
+	"5: //asum_kernel_S_BEGIN:			\n"
+	"	"INIT_S"				\n"
+	"	asr	"J", "N", #2			\n"
+	"	cmp	"J", xzr			\n"
+	"	ble	7f //asum_kernel_S1		\n"
+
+	"6: //asum_kernel_S4:				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	6b //asum_kernel_S4		\n"
+
+	"7: //asum_kernel_S1:				\n"
+	"	ands	"J", "N", #3			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+
+	"8: //asum_kernel_S10:				\n"
+	"	"KERNEL_S1"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	8b //asum_kernel_S10		\n"
+
+	"9: //asum_kernel_L999:				\n"
+	"	fmov	%[ASUM_], "SUMF"		\n"
+
+	: [ASUM_] "=r" (asum)		//%0
+	: [N_]    "r"  (n),		//%1
+	  [X_]    "r"  (x),		//%2
+	  [INCX_] "r"  (inc_x)		//%3
+	: "cc",
+	  "memory",
+	  "x0", "x1", "x2", "x3", "x4", "x5",
+	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
+	);
+
+	return asum;
+}
+
+#if defined(SMP)
+static int zasum_thread_function(BLASLONG n, BLASLONG dummy0,
+	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
+{
+	*result = zasum_compute(n, x, inc_x);
+
+	return 0;
+}
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+	int nthreads;
+	FLOAT dummy_alpha;
+#endif
+	FLOAT asum = 0.0;
+
+#if defined(SMP)
+	if (inc_x == 0 || n <= 10000)
+		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
+
+	if (nthreads == 1) {
+		asum = zasum_compute(n, x, inc_x);
+	} else {
+		int mode, i;
+		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
+		FLOAT *ptr;
+
+		mode = BLAS_DOUBLE | BLAS_COMPLEX;
+
+		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
+				   x, inc_x, NULL, 0, result, 0,
+				   ( void *)zasum_thread_function, nthreads);
+
+		ptr = (FLOAT *)result;
+		for (i = 0; i < nthreads; i++) {
+			asum = asum + (*ptr);
+			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
+		}
+	}
+#else
+	asum = zasum_compute(n, x, inc_x);
+#endif
+
+	return asum;
+}

From 7d506984fa2c1d1f8dd981969d84788d087907f5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Feb 2024 17:57:11 +0100
Subject: [PATCH 696/718] fix assignment of default CSUM kernel

---
 kernel/arm64/KERNEL | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL
index f936cdf47..7d7e648c4 100644
--- a/kernel/arm64/KERNEL
+++ b/kernel/arm64/KERNEL
@@ -1,3 +1,5 @@
+CSUMKERNEL=csum.S
+
 ifndef SNRM2KERNEL
 SNRM2KERNEL = ../arm/nrm2.c
 endif

From baf88564bcf857f088839158d07ec05380ed4c7f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Feb 2024 19:23:41 +0100
Subject: [PATCH 697/718] Fix potential buffer overflow

---
 interface/gemmt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/gemmt.c b/interface/gemmt.c
index 018deb7fb..cae00877e 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -522,7 +522,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 			IDEBUG_START;
 
-			buffer_size = j + k + 128 / sizeof(FLOAT);
+			buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
 			buffer_size += 160 / sizeof(FLOAT);
 #endif
@@ -611,7 +611,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 			IDEBUG_START;
 
-			buffer_size = j + k + 128 / sizeof(FLOAT);
+			buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
 			buffer_size += 160 / sizeof(FLOAT);
 #endif

From 0ce794f0c363754e0a9c7c3b2d997e9fe7465c12 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Feb 2024 23:38:36 +0100
Subject: [PATCH 698/718] Enable GEMM3M tests on supported platforms

---
 ctest/CMakeLists.txt | 46 +++++++++++++++++++++++++++++++++++++++++++-
 ctest/Makefile       | 36 ++++++++++++++++++++++++++++++++--
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt
index 6e0a7f309..d7baadee4 100644
--- a/ctest/CMakeLists.txt
+++ b/ctest/CMakeLists.txt
@@ -10,6 +10,11 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
 	set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
 endif()
 
+set (USE_GEMM3M 0)
+if (${ARCH} MATCHES x86|x86_64|ia64|mips)
+        set(USE_GEMM3M 1)
+endif ()
+
 if(WIN32)
 FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1
 "$ErrorActionPreference = \"Stop\"\n"
@@ -88,6 +93,17 @@ if (NOT NOFORTRAN)
     auxiliary.c
     c_xerbla.c
     constant.c)
+  if (USE_GEMM3M)
+  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+  add_executable(x${float_char}cblat3_3m
+    c_${float_char}blat3_3m.f
+    c_${float_char}blas3_3m.c
+    c_${float_char}3chke_3m.c
+    auxiliary.c
+    c_xerbla.c
+    constant.c)
+  endif()
+  endif()
 else()
   add_executable(x${float_char}cblat3
     c_${float_char}blat3c.c
@@ -96,6 +112,17 @@ else()
     auxiliary.c
     c_xerbla.c
     constant.c)
+  if (USE_GEMM3M)
+  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+  add_executable(x${float_char}cblat3_3m
+    c_${float_char}blat3c_3m.c
+    c_${float_char}blas3_3m.c
+    c_${float_char}3chke_3m.c
+    auxiliary.c
+    c_xerbla.c
+    constant.c)
+  endif()
+  endif()
 endif()
   target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
   if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
@@ -105,7 +132,24 @@ endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
     target_link_libraries(x${float_char}cblat3 m)
   endif()
+  if (USE_GEMM3M)
+  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+  target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+        string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(x${float_char}cblat3 omp pthread)
+  endif()
+  if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
+    target_link_libraries(x${float_char}cblat3_3m m)
+  endif()
+  endif()
+  endif()
   add_test(NAME "x${float_char}cblat3"
     COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
-
+    if (USE_GEMM3M)
+    if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+  add_test(NAME "x${float_char}cblat3_3m"
+    COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
+    endif()
+  endif()
 endforeach()
diff --git a/ctest/Makefile b/ctest/Makefile
index ad960b35a..36682c7b6 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -5,6 +5,24 @@
 TOPDIR = ..
 include $(TOPDIR)/Makefile.system
 
+SUPPORT_GEMM3M = 0
+
+ifeq ($(ARCH), x86)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), x86_64)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), ia64)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), MIPS)
+SUPPORT_GEMM3M = 1
+endif
+
 override CFLAGS += -DADD$(BU) -DCBLAS
 ifeq ($(F_COMPILER),GFORTRAN)
 	override FFLAGS += -fno-tree-vectorize
@@ -43,7 +61,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o
 ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
 
 
-all :: all1 all2 all3
+all :: all1 all2 all3 all3_3m
 
 ifeq ($(BUILD_SINGLE),1)
 all1targets += xscblat1
@@ -182,8 +200,9 @@ endif
 endif
 
 all3_3m: xzcblat3_3m xccblat3_3m
+ifeq ($(SUPPORT_GEMM3M),1)
 ifeq ($(USE_OPENMP), 1)
-ifeq ($(BUILD_SINGLE),1)
+ifeq ($(BUILD_COMPLEX),1)
 	OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
 endif
 ifeq ($(BUILD_COMPLEX16),1)
@@ -197,6 +216,7 @@ ifeq ($(BUILD_COMPLEX16),1)
 	OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
 endif
 endif
+endif
 
 
 
@@ -271,8 +291,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
 xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+ifeq ($(SUPPORT_GEMM3M),1)
 xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 else
 xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) 
@@ -280,6 +302,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
 xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
+ifeq ($(SUPPORT_GEMM3M),1)
+xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
+	$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 endif
 endif
 
@@ -293,8 +319,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
 xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+ifeq ($(SUPPORT_GEMM3M),1)
 xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 else
 xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) 
@@ -302,6 +330,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
 xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
+ifeq ($(SUPPORT_GEMM3M),1)
+xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
+	$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 endif
 endif
 

From ba201c1939fbb2e68ccb384709ee29e9e4559158 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 25 Feb 2024 23:39:24 +0100
Subject: [PATCH 699/718] Enable GEMM3M tests on supported platforms

---
 test/CMakeLists.txt | 19 ++++++++++++++++++-
 test/Makefile       | 27 +++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ace20dffc..b4bf36cee 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -21,7 +21,18 @@ endif()
 if (BUILD_COMPLEX16)
 	list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
 endif()
-message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID})
+
+set (USE_GEMM3M 0)
+if (${ARCH} MATCHES x86|x86_64|ia64|mips)
+	set(USE_GEMM3M 1)
+	if (BUILD_COMPLEX)
+		list (APPEND OpenBLAS_Tests cblat3_3m)
+	endif ()
+	if (BUILD_COMPLEX16)
+		list (APPEND OpenBLAS_Tests zblat3_3m)
+	endif ()
+endif ()
+
 foreach(test_bin ${OpenBLAS_Tests})
 	add_executable(${test_bin} ${test_bin}.f)
 target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
@@ -82,4 +93,10 @@ add_test(NAME "${float_type}blas2"
   COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
 add_test(NAME "${float_type}blas3" 
   COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
+if (USE_GEMM3M)
+if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z"))
+add_test(NAME "${float_type}blas3_3m" 
+  COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3_3m> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM)
+endif()
+endif()
 endforeach()
diff --git a/test/Makefile b/test/Makefile
index 5a4694ce6..6a50b6c98 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN)
         override FFLAGS += -fno-tree-vectorize
 endif
 
+SUPPORT_GEMM3M = 0
+
+ifeq ($(ARCH), x86)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), x86_64)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), ia64)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), MIPS)
+SUPPORT_GEMM3M = 1
+endif
+
 ifeq ($(NOFORTRAN),1)
 all ::
 else
@@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1)
 D3=dblat3
 endif
 ifeq ($(BUILD_COMPLEX),1)
+ifeq ($(SUPPORT_GEMM3M),1)
+C3=cblat3 cblat3_3m
+else
 C3=cblat3
 endif
+endif
 ifeq ($(BUILD_COMPLEX16),1)
+ifeq ($(SUPPORT_GEMM3M),1)
+Z3=zblat3 zblat3_3m
+else
 Z3=zblat3
 endif
+endif
+
 
 level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
 

From 87dd1c710e5bec07f72b0e30d8c81db3446c456d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 07:37:30 +0100
Subject: [PATCH 700/718] fix conditional gemm3m build

---
 ctest/Makefile | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ctest/Makefile b/ctest/Makefile
index 36682c7b6..3a3ee5611 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -61,7 +61,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o
 ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
 
 
-all :: all1 all2 all3 all3_3m
+all :: all1 all2 all3
 
 ifeq ($(BUILD_SINGLE),1)
 all1targets += xscblat1
@@ -162,9 +162,15 @@ all3targets += xdcblat3
 endif
 ifeq ($(BUILD_COMPLEX),1)
 all3targets += xccblat3
+ifeq ($(USE_GEMM3M),1)
+all3targets += xccblat3_3m
+endif
 endif
 ifeq ($(BUILD_COMPLEX16),1)
 all3targets += xzcblat3
+ifeq ($(USE_GEMM3M),1)
+all3targets += xzcblat3_3m
+endif
 endif
 
 all3: $(all3targets)
@@ -199,7 +205,6 @@ endif
 endif
 endif
 
-all3_3m: xzcblat3_3m xccblat3_3m
 ifeq ($(SUPPORT_GEMM3M),1)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(BUILD_COMPLEX),1)

From 8dea25ffffca03dee25db035e7197f83402b117f Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Mon, 26 Feb 2024 02:04:37 -0500
Subject: [PATCH 701/718] LoongArch64: Fixed utest kernel_regress:skx_avx

---
 kernel/loongarch64/rot_lasx.S | 2 +-
 kernel/loongarch64/rot_lsx.S  | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S
index 5d7e3d7cc..71378e0b2 100644
--- a/kernel/loongarch64/rot_lasx.S
+++ b/kernel/loongarch64/rot_lasx.S
@@ -1036,7 +1036,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add.d Y, Y, INCY
     xvfmul.d VT0, VX1, VXC
     xvfmadd.d VT0, VX3, VXS, VT0
-    xvfmul.d VT1, VX0, VXS
+    xvfmul.d VT1, VX1, VXS
     xvfmsub.d VT1, VX3, VXC, VT1
     xvstelm.d VT0, XX, 0, 0
     add.d XX, XX, INCX
diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S
index 4b0e59310..3bb77aaec 100644
--- a/kernel/loongarch64/rot_lsx.S
+++ b/kernel/loongarch64/rot_lsx.S
@@ -1142,7 +1142,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef DOUBLE
     vinsgr2vr.d VX0, t1, 0
     vinsgr2vr.d VX0, t2, 1
-    add.d X, X, INCX
     ld.d t1, Y, 0 * SIZE
     add.d Y, Y, INCY
     ld.d t2, Y, 0 * SIZE
@@ -1199,7 +1198,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add.d Y, Y, INCY
     VMUL VT0, VX1, VXC
     VFMADD VT0, VX3, VXS, VT0
-    VMUL VT1, VX0, VXS
+    VMUL VT1, VX1, VXS
     VMSUB VT1, VX3, VXC, VT1
     vstelm.d VT0, XX, 0, 0
     add.d XX, XX, INCX
@@ -1223,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add.d Y, Y, INCY
     VMUL VT0, VX1, VXC
     VFMADD VT0, VX3, VXS, VT0
-    VMUL VT1, VX0, VXS
+    VMUL VT1, VX1, VXS
     VMSUB VT1, VX3, VXC, VT1
     vstelm.d VT0, XX, 0, 0
     add.d XX, XX, INCX
@@ -1296,7 +1295,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add.d Y, Y, INCY
     VMUL VT0, VX1, VXC
     VFMADD VT0, VX3, VXS, VT0
-    VMUL VT1, VX0, VXS
+    VMUL VT1, VX1, VXS
     VMSUB VT1, VX3, VXC, VT1
     vstelm.w VT0, XX, 0, 0
     add.d XX, XX, INCX

From 5aaeca2896464a21e8c7f435bb28b5a44378c49e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 09:26:14 +0100
Subject: [PATCH 702/718] fix name

---
 ctest/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ctest/Makefile b/ctest/Makefile
index 3a3ee5611..bbaf96f8e 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -162,13 +162,13 @@ all3targets += xdcblat3
 endif
 ifeq ($(BUILD_COMPLEX),1)
 all3targets += xccblat3
-ifeq ($(USE_GEMM3M),1)
+ifeq ($(SUPPORT_GEMM3M),1)
 all3targets += xccblat3_3m
 endif
 endif
 ifeq ($(BUILD_COMPLEX16),1)
 all3targets += xzcblat3
-ifeq ($(USE_GEMM3M),1)
+ifeq ($(SUPPORT_GEMM3M),1)
 all3targets += xzcblat3_3m
 endif
 endif

From ea167328f112bddc82eb5ab8ab9c330b4e210c36 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 14:14:58 +0100
Subject: [PATCH 703/718] Add f2c-converted sources for GEMM3M tests

---
 ctest/c_cblat3c_3m.c | 4854 +++++++++++++++++++++++++++++++++++++++++
 ctest/c_zblat3c_3m.c | 4897 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 9751 insertions(+)
 create mode 100644 ctest/c_cblat3c_3m.c
 create mode 100644 ctest/c_zblat3c_3m.c

diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c
new file mode 100644
index 000000000..9cfa26a41
--- /dev/null
+++ b/ctest/c_cblat3c_3m.c
@@ -0,0 +1,4854 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Common Block Declarations */
+
+struct {
+    integer infot, noutc;
+    logical ok, lerr;
+} infoc_;
+
+#define infoc_1 infoc_
+
+struct {
+    char srnamt[13];
+} srnamc_;
+
+#define srnamc_1 srnamc_
+
+/* Table of constant values */
+
+static complex c_b1 = {0.f,0.f};
+static complex c_b2 = {1.f,0.f};
+static integer c__9 = 9;
+static integer c__1 = 1;
+static integer c__3 = 3;
+static integer c__8 = 8;
+static integer c__4 = 4;
+static integer c__65 = 65;
+static integer c__7 = 7;
+static integer c__6 = 6;
+static integer c__2 = 2;
+static real c_b91 = 1.f;
+static logical c_true = TRUE_;
+static integer c__0 = 0;
+static logical c_false = FALSE_;
+
+/* Main program */ MAIN__(void)
+{
+    /* Initialized data */
+
+    static char snames[13*9] = "cblas_cgemm3m" "cblas_chemm  " "cblas_csymm  "
+	     "cblas_ctrmm  " "cblas_ctrsm  " "cblas_cherk  " "cblas_csyrk  " 
+	    "cblas_cher2k " "cblas_csyr2k ";
+
+    /* Format strings */
+    static char fmt_9997[] = "(\002 NUMBER OF VALUES OF \002,a,\002 IS LESS "
+	    "THAN 1 OR GREATER \002,\002THAN \002,i2)";
+    static char fmt_9996[] = "(\002 VALUE OF N IS LESS THAN 0 OR GREATER THA"
+	    "N \002,i2)";
+    static char fmt_9995[] = "(\002 TESTS OF THE COMPLEX          LEVEL 3 BL"
+	    "AS\002,//\002 THE F\002,\002OLLOWING PARAMETER VALUES WILL BE US"
+	    "ED:\002)";
+    static char fmt_9994[] = "(\002   FOR N              \002,9i6)";
+    static char fmt_9993[] = "(\002   FOR ALPHA          \002,7(\002(\002,f4"
+	    ".1,\002,\002,f4.1,\002)  \002,:))";
+    static char fmt_9992[] = "(\002   FOR BETA           \002,7(\002(\002,f4"
+	    ".1,\002,\002,f4.1,\002)  \002,:))";
+    static char fmt_9984[] = "(\002 ERROR-CALL MYEXITS WILL NOT BE TESTED"
+	    "\002)";
+    static char fmt_9999[] = "(\002 ROUTINES PASS COMPUTATIONAL TESTS IF TES"
+	    "T RATIO IS LES\002,\002S THAN\002,f8.2)";
+    static char fmt_10002[] = "(\002 COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS"
+	    " ARE TESTED\002)";
+    static char fmt_10001[] = "(\002 ROW-MAJOR DATA LAYOUT IS TESTED\002)";
+    static char fmt_10000[] = "(\002 COLUMN-MAJOR DATA LAYOUT IS TESTED\002)";
+    static char fmt_9988[] = "(a13,l2)";
+    static char fmt_9990[] = "(\002 SUBPROGRAM NAME \002,a13,\002 NOT RECOGN"
+	    "IZED\002,/\002 ******* T\002,\002ESTS ABANDONED *******\002)";
+    static char fmt_9998[] = "(\002 RELATIVE MACHINE PRECISION IS TAKEN TO"
+	    " BE\002,1p,e9.1)";
+    static char fmt_9989[] = "(\002 ERROR IN CMMCH -  IN-LINE DOT PRODUCTS A"
+	    "RE BEING EVALU\002,\002ATED WRONGLY.\002,/\002 CMMCH WAS CALLED "
+	    "WITH TRANSA = \002,a1,\002AND TRANSB = \002,a1,/\002 AND RETURNE"
+	    "D SAME = \002,l1,\002 AND \002,\002 ERR = \002,f12.3,\002.\002,"
+	    "/\002 THIS MAY BE DUE TO FAULTS IN THE \002,\002ARITHMETIC OR TH"
+	    "E COMPILER.\002,/\002 ******* TESTS ABANDONED \002,\002******"
+	    "*\002)";
+    static char fmt_9987[] = "(1x,a13,\002 WAS NOT TESTED\002)";
+    static char fmt_9986[] = "(/\002 END OF TESTS\002)";
+    static char fmt_9985[] = "(/\002 ******* FATAL ERROR - TESTS ABANDONED *"
+	    "******\002)";
+    static char fmt_9991[] = "(\002 AMEND DATA FILE OR INCREASE ARRAY SIZES "
+	    "IN PROGRAM\002,/\002 ******* TESTS ABANDONED *******\002)";
+
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4, i__5;
+    real r__1;
+    olist o__1;
+    cllist cl__1;
+
+    /* Local variables */
+    integer nalf, idim[9];
+    logical same;
+    integer nbet, ntra;
+    logical rewi;
+    extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, 
+	    integer *, logical *, logical *, logical *, integer *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, complex *, 
+	    complex *, complex *, complex *, real *, integer *), 
+	    cchk2_(char *, real *, real *, integer *, integer *, logical *, 
+	    logical *, logical *, integer *, integer *, integer *, complex *, 
+	    integer *, complex *, integer *, complex *, complex *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, complex *, 
+	    complex *, real *, integer *), cchk3_(char *, real *, 
+	    real *, integer *, integer *, logical *, logical *, logical *, 
+	    integer *, integer *, integer *, complex *, integer *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, complex *, 
+	    real *, complex *, integer *), cchk4_(char *, real *, 
+	    real *, integer *, integer *, logical *, logical *, logical *, 
+	    integer *, integer *, integer *, complex *, integer *, complex *, 
+	    integer *, complex *, complex *, complex *, complex *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, real *, 
+	    integer *), cchk5_(char *, real *, real *, integer *, 
+	    integer *, logical *, logical *, logical *, integer *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, complex *, 
+	    complex *, complex *, real *, complex *, integer *);
+    complex c__[4225]	/* was [65][65] */;
+    real g[65];
+    integer i__, j, n;
+    logical fatal;
+    complex w[130];
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    extern real sdiff_(real *, real *);
+    logical trace;
+    integer nidim;
+    char snaps[32];
+    integer isnum;
+    logical ltest[9];
+    complex aa[4225], ab[8450]	/* was [65][130] */, bb[4225], cc[4225], as[
+	    4225], bs[4225], cs[4225], ct[65];
+    logical sfatal, corder;
+    char snamet[13], transa[1], transb[1];
+    real thresh;
+    logical rorder;
+    extern /* Subroutine */ int cc3chke_(char *);
+    integer layout;
+    logical ltestt, tsterr;
+    complex alf[7];
+    extern logical lce_(complex *, complex *, integer *);
+    complex bet[7];
+    real eps, err;
+
+    /* Fortran I/O blocks */
+    static cilist io___2 = { 0, 5, 0, 0, 0 };
+    static cilist io___4 = { 0, 5, 0, 0, 0 };
+    static cilist io___7 = { 0, 5, 0, 0, 0 };
+    static cilist io___9 = { 0, 5, 0, 0, 0 };
+    static cilist io___11 = { 0, 5, 0, 0, 0 };
+    static cilist io___13 = { 0, 5, 0, 0, 0 };
+    static cilist io___15 = { 0, 5, 0, 0, 0 };
+    static cilist io___17 = { 0, 5, 0, 0, 0 };
+    static cilist io___19 = { 0, 6, 0, fmt_9997, 0 };
+    static cilist io___20 = { 0, 5, 0, 0, 0 };
+    static cilist io___23 = { 0, 6, 0, fmt_9996, 0 };
+    static cilist io___24 = { 0, 5, 0, 0, 0 };
+    static cilist io___26 = { 0, 6, 0, fmt_9997, 0 };
+    static cilist io___27 = { 0, 5, 0, 0, 0 };
+    static cilist io___29 = { 0, 5, 0, 0, 0 };
+    static cilist io___31 = { 0, 6, 0, fmt_9997, 0 };
+    static cilist io___32 = { 0, 5, 0, 0, 0 };
+    static cilist io___34 = { 0, 6, 0, fmt_9995, 0 };
+    static cilist io___35 = { 0, 6, 0, fmt_9994, 0 };
+    static cilist io___36 = { 0, 6, 0, fmt_9993, 0 };
+    static cilist io___37 = { 0, 6, 0, fmt_9992, 0 };
+    static cilist io___38 = { 0, 6, 0, 0, 0 };
+    static cilist io___39 = { 0, 6, 0, fmt_9984, 0 };
+    static cilist io___40 = { 0, 6, 0, 0, 0 };
+    static cilist io___41 = { 0, 6, 0, fmt_9999, 0 };
+    static cilist io___42 = { 0, 6, 0, 0, 0 };
+    static cilist io___45 = { 0, 6, 0, fmt_10002, 0 };
+    static cilist io___46 = { 0, 6, 0, fmt_10001, 0 };
+    static cilist io___47 = { 0, 6, 0, fmt_10000, 0 };
+    static cilist io___48 = { 0, 6, 0, 0, 0 };
+    static cilist io___50 = { 0, 5, 1, fmt_9988, 0 };
+    static cilist io___53 = { 0, 6, 0, fmt_9990, 0 };
+    static cilist io___55 = { 0, 6, 0, fmt_9998, 0 };
+    static cilist io___68 = { 0, 6, 0, fmt_9989, 0 };
+    static cilist io___69 = { 0, 6, 0, fmt_9989, 0 };
+    static cilist io___70 = { 0, 6, 0, fmt_9989, 0 };
+    static cilist io___71 = { 0, 6, 0, fmt_9989, 0 };
+    static cilist io___73 = { 0, 6, 0, 0, 0 };
+    static cilist io___74 = { 0, 6, 0, fmt_9987, 0 };
+    static cilist io___75 = { 0, 6, 0, 0, 0 };
+    static cilist io___82 = { 0, 6, 0, fmt_9986, 0 };
+    static cilist io___83 = { 0, 6, 0, fmt_9985, 0 };
+    static cilist io___84 = { 0, 6, 0, fmt_9991, 0 };
+
+
+
+/*  Test program for the COMPLEX          Level 3 Blas. */
+
+/*  The program must be driven by a short data file. The first 13 records */
+/*  of the file are read using list-directed input, the last 9 records */
+/*  are read using the format ( A13, L2 ). An annotated example of a data */
+/*  file can be obtained by deleting the first 3 characters from the */
+/*  following 22 lines: */
+/*  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE */
+/*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */
+/*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */
+/*  F        LOGICAL FLAG, T TO STOP ON FAILURES. */
+/*  T        LOGICAL FLAG, T TO TEST ERROR CALL MYEXITS. */
+/*  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */
+/*  16.0     THRESHOLD VALUE OF TEST RATIO */
+/*  6                 NUMBER OF VALUES OF N */
+/*  0 1 2 3 5 9       VALUES OF N */
+/*  3                 NUMBER OF VALUES OF ALPHA */
+/*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA */
+/*  3                 NUMBER OF VALUES OF BETA */
+/*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA */
+/*  cblas_cgemm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_chemm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_csymm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_ctrmm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_ctrsm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_cherk  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_csyrk  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. */
+
+/*  See: */
+
+/*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. */
+/*     A Set of Level 3 Basic Linear Algebra Subprograms. */
+
+/*     Technical Memorandum No.88 (Revision 1), Mathematics and */
+/*     Computer Science Division, Argonne National Laboratory, 9700 */
+/*     South Cass Avenue, Argonne, Illinois 60439, US. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+
+    infoc_1.noutc = 6;
+
+/*     Read name and unit number for snapshot output file and open file. */
+
+    s_rsle(&io___2);
+    do_lio(&c__9, &c__1, snaps, (ftnlen)32);
+    e_rsle();
+    s_rsle(&io___4);
+    do_lio(&c__3, &c__1, (char *)&ntra, (ftnlen)sizeof(integer));
+    e_rsle();
+    trace = ntra >= 0;
+    if (trace) {
+	o__1.oerr = 0;
+	o__1.ounit = ntra;
+	o__1.ofnmlen = 32;
+	o__1.ofnm = snaps;
+	o__1.orl = 0;
+	o__1.osta = 0;
+	o__1.oacc = 0;
+	o__1.ofm = 0;
+	o__1.oblnk = 0;
+	f_open(&o__1);
+    }
+/*     Read the flag that directs rewinding of the snapshot file. */
+    s_rsle(&io___7);
+    do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical));
+    e_rsle();
+    rewi = rewi && trace;
+/*     Read the flag that directs stopping on any failure. */
+    s_rsle(&io___9);
+    do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical));
+    e_rsle();
+/*     Read the flag that indicates whether error exits are to be tested. */
+    s_rsle(&io___11);
+    do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical));
+    e_rsle();
+/*     Read the flag that indicates whether row-major data layout to be tested. */
+    s_rsle(&io___13);
+    do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer));
+    e_rsle();
+/*     Read the threshold value of the test ratio */
+    s_rsle(&io___15);
+    do_lio(&c__4, &c__1, (char *)&thresh, (ftnlen)sizeof(real));
+    e_rsle();
+
+/*     Read and check the parameter values for the tests. */
+
+/*     Values of N */
+    s_rsle(&io___17);
+    do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer));
+    e_rsle();
+    if (nidim < 1 || nidim > 9) {
+	s_wsfe(&io___19);
+	do_fio(&c__1, "N", (ftnlen)1);
+	do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer));
+	e_wsfe();
+	goto L220;
+    }
+    s_rsle(&io___20);
+    i__1 = nidim;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer));
+    }
+    e_rsle();
+    i__1 = nidim;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) {
+	    s_wsfe(&io___23);
+	    do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	    goto L220;
+	}
+/* L10: */
+    }
+/*     Values of ALPHA */
+    s_rsle(&io___24);
+    do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer));
+    e_rsle();
+    if (nalf < 1 || nalf > 7) {
+	s_wsfe(&io___26);
+	do_fio(&c__1, "ALPHA", (ftnlen)5);
+	do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer));
+	e_wsfe();
+	goto L220;
+    }
+    s_rsle(&io___27);
+    i__1 = nalf;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex));
+    }
+    e_rsle();
+/*     Values of BETA */
+    s_rsle(&io___29);
+    do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer));
+    e_rsle();
+    if (nbet < 1 || nbet > 7) {
+	s_wsfe(&io___31);
+	do_fio(&c__1, "BETA", (ftnlen)4);
+	do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer));
+	e_wsfe();
+	goto L220;
+    }
+    s_rsle(&io___32);
+    i__1 = nbet;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_lio(&c__6, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof(complex));
+    }
+    e_rsle();
+
+/*     Report values of parameters. */
+
+    s_wsfe(&io___34);
+    e_wsfe();
+    s_wsfe(&io___35);
+    i__1 = nidim;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer));
+    }
+    e_wsfe();
+    s_wsfe(&io___36);
+    i__1 = nalf;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(real));
+    }
+    e_wsfe();
+    s_wsfe(&io___37);
+    i__1 = nbet;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(real));
+    }
+    e_wsfe();
+    if (! tsterr) {
+	s_wsle(&io___38);
+	e_wsle();
+	s_wsfe(&io___39);
+	e_wsfe();
+    }
+    s_wsle(&io___40);
+    e_wsle();
+    s_wsfe(&io___41);
+    do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(real));
+    e_wsfe();
+    s_wsle(&io___42);
+    e_wsle();
+    rorder = FALSE_;
+    corder = FALSE_;
+    if (layout == 2) {
+	rorder = TRUE_;
+	corder = TRUE_;
+	s_wsfe(&io___45);
+	e_wsfe();
+    } else if (layout == 1) {
+	rorder = TRUE_;
+	s_wsfe(&io___46);
+	e_wsfe();
+    } else if (layout == 0) {
+	corder = TRUE_;
+	s_wsfe(&io___47);
+	e_wsfe();
+    }
+    s_wsle(&io___48);
+    e_wsle();
+
+/*     Read names of subroutines and flags which indicate */
+/*     whether they are to be tested. */
+
+    for (i__ = 1; i__ <= 9; ++i__) {
+	ltest[i__ - 1] = FALSE_;
+/* L20: */
+    }
+L30:
+    i__1 = s_rsfe(&io___50);
+    if (i__1 != 0) {
+	goto L60;
+    }
+    i__1 = do_fio(&c__1, snamet, (ftnlen)13);
+    if (i__1 != 0) {
+	goto L60;
+    }
+    i__1 = do_fio(&c__1, (char *)&ltestt, (ftnlen)sizeof(logical));
+    if (i__1 != 0) {
+	goto L60;
+    }
+    i__1 = e_rsfe();
+    if (i__1 != 0) {
+	goto L60;
+    }
+    for (i__ = 1; i__ <= 9; ++i__) {
+	if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == 
+		0) {
+	    goto L50;
+	}
+/* L40: */
+    }
+    s_wsfe(&io___53);
+    do_fio(&c__1, snamet, (ftnlen)13);
+    e_wsfe();
+    s_stop("", (ftnlen)0);
+L50:
+    ltest[i__ - 1] = ltestt;
+    goto L30;
+
+L60:
+    cl__1.cerr = 0;
+    cl__1.cunit = 5;
+    cl__1.csta = 0;
+    f_clos(&cl__1);
+
+/*     Compute EPS (the machine precision). */
+
+    eps = 1.f;
+L70:
+    r__1 = eps + 1.f;
+    if (sdiff_(&r__1, &c_b91) == 0.f) {
+	goto L80;
+    }
+    eps *= .5f;
+    goto L70;
+L80:
+    eps += eps;
+    s_wsfe(&io___55);
+    do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(real));
+    e_wsfe();
+
+/*     Check the reliability of CMMCH using exact data. */
+
+    n = 32;
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = n;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__ + j * 65 - 66;
+/* Computing MAX */
+	    i__5 = i__ - j + 1;
+	    i__4 = f2cmax(i__5,0);
+	    ab[i__3].r = (real) i__4, ab[i__3].i = 0.f;
+/* L90: */
+	}
+	i__2 = j + 4224;
+	ab[i__2].r = (real) j, ab[i__2].i = 0.f;
+	i__2 = (j + 65) * 65 - 65;
+	ab[i__2].r = (real) j, ab[i__2].i = 0.f;
+	i__2 = j - 1;
+	c__[i__2].r = 0.f, c__[i__2].i = 0.f;
+/* L100: */
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = j - 1;
+	i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3;
+	cc[i__2].r = (real) i__3, cc[i__2].i = 0.f;
+/* L110: */
+    }
+/*     CC holds the exact result. On exit from CMMCH CT holds */
+/*     the result computed by CMMCH. */
+    *(unsigned char *)transa = 'N';
+    *(unsigned char *)transb = 'N';
+    cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lce_(cc, ct, &n);
+    if (! same || err != 0.f) {
+	s_wsfe(&io___68);
+	do_fio(&c__1, transa, (ftnlen)1);
+	do_fio(&c__1, transb, (ftnlen)1);
+	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
+	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real));
+	e_wsfe();
+	s_stop("", (ftnlen)0);
+    }
+    *(unsigned char *)transb = 'C';
+    cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lce_(cc, ct, &n);
+    if (! same || err != 0.f) {
+	s_wsfe(&io___69);
+	do_fio(&c__1, transa, (ftnlen)1);
+	do_fio(&c__1, transb, (ftnlen)1);
+	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
+	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real));
+	e_wsfe();
+	s_stop("", (ftnlen)0);
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = j + 4224;
+	i__3 = n - j + 1;
+	ab[i__2].r = (real) i__3, ab[i__2].i = 0.f;
+	i__2 = (j + 65) * 65 - 65;
+	i__3 = n - j + 1;
+	ab[i__2].r = (real) i__3, ab[i__2].i = 0.f;
+/* L120: */
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = n - j;
+	i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3;
+	cc[i__2].r = (real) i__3, cc[i__2].i = 0.f;
+/* L130: */
+    }
+    *(unsigned char *)transa = 'C';
+    *(unsigned char *)transb = 'N';
+    cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lce_(cc, ct, &n);
+    if (! same || err != 0.f) {
+	s_wsfe(&io___70);
+	do_fio(&c__1, transa, (ftnlen)1);
+	do_fio(&c__1, transb, (ftnlen)1);
+	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
+	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real));
+	e_wsfe();
+	s_stop("", (ftnlen)0);
+    }
+    *(unsigned char *)transb = 'C';
+    cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lce_(cc, ct, &n);
+    if (! same || err != 0.f) {
+	s_wsfe(&io___71);
+	do_fio(&c__1, transa, (ftnlen)1);
+	do_fio(&c__1, transb, (ftnlen)1);
+	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
+	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real));
+	e_wsfe();
+	s_stop("", (ftnlen)0);
+    }
+
+/*     Test each subroutine in turn. */
+
+    for (isnum = 1; isnum <= 9; ++isnum) {
+	s_wsle(&io___73);
+	e_wsle();
+	if (! ltest[isnum - 1]) {
+/*           Subprogram is not to be tested. */
+	    s_wsfe(&io___74);
+	    do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13);
+	    e_wsfe();
+	} else {
+	    s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, (
+		    ftnlen)13);
+/*           Test error exits. */
+	    if (tsterr) {
+		cc3chke_(snames + (isnum - 1) * 13);
+		s_wsle(&io___75);
+		e_wsle();
+	    }
+/*           Test computations. */
+	    infoc_1.infot = 0;
+	    infoc_1.ok = TRUE_;
+	    fatal = FALSE_;
+	    switch (isnum) {
+		case 1:  goto L140;
+		case 2:  goto L150;
+		case 3:  goto L150;
+		case 4:  goto L160;
+		case 5:  goto L160;
+		case 6:  goto L170;
+		case 7:  goto L170;
+		case 8:  goto L180;
+		case 9:  goto L180;
+	    }
+/*           Test CGEMM, 01. */
+L140:
+	    if (corder) {
+		cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test CHEMM, 02, CSYMM, 03. */
+L150:
+	    if (corder) {
+		cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test CTRMM, 04, CTRSM, 05. */
+L160:
+	    if (corder) {
+		cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
+			c__0);
+	    }
+	    if (rorder) {
+		cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
+			c__1);
+	    }
+	    goto L190;
+/*           Test CHERK, 06, CSYRK, 07. */
+L170:
+	    if (corder) {
+		cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test CHER2K, 08, CSYR2K, 09. */
+L180:
+	    if (corder) {
+		cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
+			ct, g, w, &c__0);
+	    }
+	    if (rorder) {
+		cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
+			ct, g, w, &c__1);
+	    }
+	    goto L190;
+
+L190:
+	    if (fatal && sfatal) {
+		goto L210;
+	    }
+	}
+/* L200: */
+    }
+    s_wsfe(&io___82);
+    e_wsfe();
+    goto L230;
+
+L210:
+    s_wsfe(&io___83);
+    e_wsfe();
+    goto L230;
+
+L220:
+    s_wsfe(&io___84);
+    e_wsfe();
+
+L230:
+    if (trace) {
+	cl__1.cerr = 0;
+	cl__1.cunit = ntra;
+	cl__1.csta = 0;
+	f_clos(&cl__1);
+    }
+    cl__1.cerr = 0;
+    cl__1.cunit = 6;
+    cl__1.csta = 0;
+    f_clos(&cl__1);
+    s_stop("", (ftnlen)0);
+
+
+/*     End of CBLAT3. */
+
+    return 0;
+} /* MAIN__ */
+
+/* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	complex *cs, complex *ct, real *g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char ich[3] = "NTC";
+
+    /* Format strings */
+    static char fmt_9994[] = "(\002 ****** FATAL ERROR - ERROR-CALL MYEXIT T"
+	    "AKEN ON VALID\002,\002 CALL ******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7, i__8;
+    alist al__1;
+
+    /* Local variables */
+    complex beta;
+    integer ldas, ldbs, ldcs;
+    logical same, null;
+    integer i__, k, m, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    logical isame[13], trana, tranb;
+    integer nargs;
+    logical reset;
+    extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, integer *, complex *, 
+	    integer *, integer *, complex *, integer *);
+    integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    char tranas[1], tranbs[1], transa[1], transb[1];
+    real errmax;
+    integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
+    extern logical lce_(complex *, complex *, integer *);
+    complex als, bls;
+    real err;
+    extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer *
+	    , integer *, integer *, complex *, complex *, integer *, complex *
+	    , integer *, complex *, complex *, integer *);
+
+    /* Fortran I/O blocks */
+    static cilist io___128 = { 0, 0, 0, fmt_9994, 0 };
+    static cilist io___131 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___133 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___134 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___135 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___136 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___137 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests CGEMM. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+
+    nargs = 13;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDC to 1 more than minimum value if room. */
+	    ldc = m;
+	    if (ldc < *nmax) {
+		++ldc;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldc > *nmax) {
+		goto L100;
+	    }
+	    lcc = ldc * n;
+	    null = n <= 0 || m <= 0;
+
+	    i__3 = *nidim;
+	    for (ik = 1; ik <= i__3; ++ik) {
+		k = idim[ik];
+
+		for (ica = 1; ica <= 3; ++ica) {
+		    *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1]
+			    ;
+		    trana = *(unsigned char *)transa == 'T' || *(unsigned 
+			    char *)transa == 'C';
+
+		    if (trana) {
+			ma = k;
+			na = m;
+		    } else {
+			ma = m;
+			na = k;
+		    }
+/*                 Set LDA to 1 more than minimum value if room. */
+		    lda = ma;
+		    if (lda < *nmax) {
+			++lda;
+		    }
+/*                 Skip tests if not enough room. */
+		    if (lda > *nmax) {
+			goto L80;
+		    }
+		    laa = lda * na;
+
+/*                 Generate the matrix A. */
+
+		    cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[
+			    1], &lda, &reset, &c_b1);
+
+		    for (icb = 1; icb <= 3; ++icb) {
+			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
+				- 1];
+			tranb = *(unsigned char *)transb == 'T' || *(unsigned 
+				char *)transb == 'C';
+
+			if (tranb) {
+			    mb = n;
+			    nb = k;
+			} else {
+			    mb = k;
+			    nb = n;
+			}
+/*                    Set LDB to 1 more than minimum value if room. */
+			ldb = mb;
+			if (ldb < *nmax) {
+			    ++ldb;
+			}
+/*                    Skip tests if not enough room. */
+			if (ldb > *nmax) {
+			    goto L70;
+			}
+			lbb = ldb * nb;
+
+/*                    Generate the matrix B. */
+
+			cmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, &
+				bb[1], &ldb, &reset, &c_b1);
+
+			i__4 = *nalf;
+			for (ia = 1; ia <= i__4; ++ia) {
+			    i__5 = ia;
+			    alpha.r = alf[i__5].r, alpha.i = alf[i__5].i;
+
+			    i__5 = *nbet;
+			    for (ib = 1; ib <= i__5; ++ib) {
+				i__6 = ib;
+				beta.r = bet[i__6].r, beta.i = bet[i__6].i;
+
+/*                          Generate the matrix C. */
+
+				cmake_("ge", " ", " ", &m, &n, &c__[c_offset],
+					 nmax, &cc[1], &ldc, &reset, &c_b1);
+
+				++nc;
+
+/*                          Save every datum before calling the */
+/*                          subroutine. */
+
+				*(unsigned char *)tranas = *(unsigned char *)
+					transa;
+				*(unsigned char *)tranbs = *(unsigned char *)
+					transb;
+				ms = m;
+				ns = n;
+				ks = k;
+				als.r = alpha.r, als.i = alpha.i;
+				i__6 = laa;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    as[i__7].r = aa[i__8].r, as[i__7].i = aa[
+					    i__8].i;
+/* L10: */
+				}
+				ldas = lda;
+				i__6 = lbb;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[
+					    i__8].i;
+/* L20: */
+				}
+				ldbs = ldb;
+				bls.r = beta.r, bls.i = beta.i;
+				i__6 = lcc;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[
+					    i__8].i;
+/* L30: */
+				}
+				ldcs = ldc;
+
+/*                          Call the subroutine. */
+
+				if (*trace) {
+				    cprcn1_(ntra, &nc, sname, iorder, transa, 
+					    transb, &m, &n, &k, &alpha, &lda, 
+					    &ldb, &beta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				ccgemm3m_(iorder, transa, transb, &m, &n, &k, 
+					&alpha, &aa[1], &lda, &bb[1], &ldb, &
+					beta, &cc[1], &ldc);
+
+/*                          Check if error-exit was taken incorrectly. */
+
+				if (! infoc_1.ok) {
+				    io___128.ciunit = *nout;
+				    s_wsfe(&io___128);
+				    e_wsfe();
+				    *fatal = TRUE_;
+				    goto L120;
+				}
+
+/*                          See what data changed inside subroutines. */
+
+				isame[0] = *(unsigned char *)transa == *(
+					unsigned char *)tranas;
+				isame[1] = *(unsigned char *)transb == *(
+					unsigned char *)tranbs;
+				isame[2] = ms == m;
+				isame[3] = ns == n;
+				isame[4] = ks == k;
+				isame[5] = als.r == alpha.r && als.i == 
+					alpha.i;
+				isame[6] = lce_(&as[1], &aa[1], &laa);
+				isame[7] = ldas == lda;
+				isame[8] = lce_(&bs[1], &bb[1], &lbb);
+				isame[9] = ldbs == ldb;
+				isame[10] = bls.r == beta.r && bls.i == 
+					beta.i;
+				if (null) {
+				    isame[11] = lce_(&cs[1], &cc[1], &lcc);
+				} else {
+				    isame[11] = lceres_("ge", " ", &m, &n, &
+					    cs[1], &cc[1], &ldc);
+				}
+				isame[12] = ldcs == ldc;
+
+/*                          If data was incorrectly changed, report */
+/*                          and return. */
+
+				same = TRUE_;
+				i__6 = nargs;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    same = same && isame[i__ - 1];
+				    if (! isame[i__ - 1]) {
+					io___131.ciunit = *nout;
+					s_wsfe(&io___131);
+					do_fio(&c__1, (char *)&i__, (ftnlen)
+						sizeof(integer));
+					e_wsfe();
+				    }
+/* L40: */
+				}
+				if (! same) {
+				    *fatal = TRUE_;
+				    goto L120;
+				}
+
+				if (! null) {
+
+/*                             Check the result. */
+
+				    cmmch_(transa, transb, &m, &n, &k, &alpha,
+					     &a[a_offset], nmax, &b[b_offset],
+					     nmax, &beta, &c__[c_offset], 
+					    nmax, &ct[1], &g[1], &cc[1], &ldc,
+					     eps, &err, fatal, nout, &c_true);
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L120;
+				    }
+				}
+
+/* L50: */
+			    }
+
+/* L60: */
+			}
+
+L70:
+			;
+		    }
+
+L80:
+		    ;
+		}
+
+/* L90: */
+	    }
+
+L100:
+	    ;
+	}
+
+/* L110: */
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___133.ciunit = *nout;
+	    s_wsfe(&io___133);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___134.ciunit = *nout;
+	    s_wsfe(&io___134);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___135.ciunit = *nout;
+	    s_wsfe(&io___135);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___136.ciunit = *nout;
+	    s_wsfe(&io___136);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+    }
+    goto L130;
+
+L120:
+    io___137.ciunit = *nout;
+    s_wsfe(&io___137);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, &
+	    lda, &ldb, &beta, &ldc);
+
+L130:
+    return 0;
+
+/* L9995: */
+
+/*     End of CCHK1. */
+
+} /* cchk1_ */
+
+
+/* Subroutine */ int cprcn1_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *transa, char *transb, integer *m, integer *n, integer *
+	k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer 
+	*ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002,"
+	    "a14,\002,\002,a14,\002,\002)";
+    static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002"
+	    ",\002,f4.1,\002) , C,\002,i3,\002).\002)";
+
+    /* Local variables */
+    char crc[14], cta[14], ctb[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___141 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___142 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(cta, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(cta, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transb == 'N') {
+	s_copy(ctb, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transb == 'T') {
+	s_copy(ctb, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___141.ciunit = *nout;
+    s_wsfe(&io___141);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cta, (ftnlen)14);
+    do_fio(&c__1, ctb, (ftnlen)14);
+    e_wsfe();
+    io___142.ciunit = *nout;
+    s_wsfe(&io___142);
+    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* cprcn1_ */
+
+
+/* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	complex *cs, complex *ct, real *g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char ichs[2] = "LR";
+    static char ichu[2] = "UL";
+
+    /* Format strings */
+    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    alist al__1;
+
+    /* Local variables */
+    complex beta;
+    integer ldas, ldbs, ldcs;
+    logical same;
+    char side[1];
+    logical conj, left, null;
+    char uplo[1];
+    integer i__, m, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    logical isame[13];
+    char sides[1];
+    integer nargs;
+    logical reset;
+    char uplos[1];
+    extern /* Subroutine */ int cprcn2_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, complex *, integer *, 
+	    integer *, complex *, integer *);
+    integer ia, ib, na, nc, im, in;
+    extern /* Subroutine */ int cchemm_(integer *, char *, char *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *);
+    integer ms, ns;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    extern /* Subroutine */ int ccsymm_(integer *, char *, char *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *);
+    real errmax;
+    integer laa, lbb, lda, lcc, ldb, ldc;
+    extern logical lce_(complex *, complex *, integer *);
+    integer ics;
+    complex als, bls;
+    integer icu;
+    real err;
+
+    /* Fortran I/O blocks */
+    static cilist io___181 = { 0, 0, 0, fmt_9994, 0 };
+    static cilist io___184 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___186 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___187 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___188 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___189 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___190 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests CHEMM and CSYMM. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 12;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDC to 1 more than minimum value if room. */
+	    ldc = m;
+	    if (ldc < *nmax) {
+		++ldc;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldc > *nmax) {
+		goto L90;
+	    }
+	    lcc = ldc * n;
+	    null = n <= 0 || m <= 0;
+/*           Set LDB to 1 more than minimum value if room. */
+	    ldb = m;
+	    if (ldb < *nmax) {
+		++ldb;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldb > *nmax) {
+		goto L90;
+	    }
+	    lbb = ldb * n;
+
+/*           Generate the matrix B. */
+
+	    cmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &
+		    reset, &c_b1);
+
+	    for (ics = 1; ics <= 2; ++ics) {
+		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
+		left = *(unsigned char *)side == 'L';
+
+		if (left) {
+		    na = m;
+		} else {
+		    na = n;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = na;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L80;
+		}
+		laa = lda * na;
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+
+/*                 Generate the hermitian or symmetric matrix A. */
+
+		    cmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax,
+			     &aa[1], &lda, &reset, &c_b1);
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+
+/*                       Generate the matrix C. */
+
+			    cmake_("ge", " ", " ", &m, &n, &c__[c_offset], 
+				    nmax, &cc[1], &ldc, &reset, &c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the */
+/*                       subroutine. */
+
+			    *(unsigned char *)sides = *(unsigned char *)side;
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    ms = m;
+			    ns = n;
+			    als.r = alpha.r, als.i = alpha.i;
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    i__5 = lbb;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldbs = ldb;
+			    bls.r = beta.r, bls.i = beta.i;
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L30: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (*trace) {
+				cprcn2_(ntra, &nc, sname, iorder, side, uplo, 
+					&m, &n, &alpha, &lda, &ldb, &beta, &
+					ldc)
+					;
+			    }
+			    if (*rewi) {
+				al__1.aerr = 0;
+				al__1.aunit = *ntra;
+				f_rew(&al__1);
+			    }
+			    if (conj) {
+				cchemm_(iorder, side, uplo, &m, &n, &alpha, &
+					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
+					1], &ldc);
+			    } else {
+				ccsymm_(iorder, side, uplo, &m, &n, &alpha, &
+					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
+					1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				io___181.ciunit = *nout;
+				s_wsfe(&io___181);
+				e_wsfe();
+				*fatal = TRUE_;
+				goto L110;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)sides == *(unsigned 
+				    char *)side;
+			    isame[1] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[2] = ms == m;
+			    isame[3] = ns == n;
+			    isame[4] = als.r == alpha.r && als.i == alpha.i;
+			    isame[5] = lce_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    isame[7] = lce_(&bs[1], &bb[1], &lbb);
+			    isame[8] = ldbs == ldb;
+			    isame[9] = bls.r == beta.r && bls.i == beta.i;
+			    if (null) {
+				isame[10] = lce_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[10] = lceres_("ge", " ", &m, &n, &cs[1],
+					 &cc[1], &ldc);
+			    }
+			    isame[11] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    io___184.ciunit = *nout;
+				    s_wsfe(&io___184);
+				    do_fio(&c__1, (char *)&i__, (ftnlen)
+					    sizeof(integer));
+				    e_wsfe();
+				}
+/* L40: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L110;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result. */
+
+				if (left) {
+				    cmmch_("N", "N", &m, &n, &m, &alpha, &a[
+					    a_offset], nmax, &b[b_offset], 
+					    nmax, &beta, &c__[c_offset], nmax,
+					     &ct[1], &g[1], &cc[1], &ldc, eps,
+					     &err, fatal, nout, &c_true);
+				} else {
+				    cmmch_("N", "N", &m, &n, &n, &alpha, &b[
+					    b_offset], nmax, &a[a_offset], 
+					    nmax, &beta, &c__[c_offset], nmax,
+					     &ct[1], &g[1], &cc[1], &ldc, eps,
+					     &err, fatal, nout, &c_true);
+				}
+				errmax = f2cmax(errmax,err);
+/*                          If got really bad answer, report and */
+/*                          return. */
+				if (*fatal) {
+				    goto L110;
+				}
+			    }
+
+/* L50: */
+			}
+
+/* L60: */
+		    }
+
+/* L70: */
+		}
+
+L80:
+		;
+	    }
+
+L90:
+	    ;
+	}
+
+/* L100: */
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___186.ciunit = *nout;
+	    s_wsfe(&io___186);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___187.ciunit = *nout;
+	    s_wsfe(&io___187);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___188.ciunit = *nout;
+	    s_wsfe(&io___188);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___189.ciunit = *nout;
+	    s_wsfe(&io___189);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+    }
+    goto L120;
+
+L110:
+    io___190.ciunit = *nout;
+    s_wsfe(&io___190);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, 
+	    &beta, &ldc);
+
+L120:
+    return 0;
+
+/* L9995: */
+
+/*     End of CCHK2. */
+
+} /* cchk2_ */
+
+
+/* Subroutine */ int cprcn2_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *side, char *uplo, integer *m, integer *n, complex *
+	alpha, integer *lda, integer *ldb, complex *beta, integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002,"
+	    "a14,\002,\002,a14,\002,\002)";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002,"
+	    "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)";
+
+    /* Local variables */
+    char cs[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___194 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___195 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)side == 'L') {
+	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cs, "    CblasRight", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___194.ciunit = *nout;
+    s_wsfe(&io___194);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cs, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    e_wsfe();
+    io___195.ciunit = *nout;
+    s_wsfe(&io___195);
+    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* cprcn2_ */
+
+
+/* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, 
+	complex *bs, complex *ct, real *g, complex *c__, integer *iorder)
+{
+    /* Initialized data */
+
+    static char ichu[2] = "UL";
+    static char icht[3] = "NTC";
+    static char ichd[2] = "UN";
+    static char ichs[2] = "LR";
+
+    /* Format strings */
+    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    complex q__1;
+    alist al__1;
+
+    /* Local variables */
+    char diag[1];
+    integer ldas, ldbs;
+    logical same;
+    char side[1];
+    logical left, null;
+    char uplo[1];
+    integer i__, j, m, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    char diags[1];
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    logical isame[13];
+    char sides[1];
+    integer nargs;
+    logical reset;
+    char uplos[1];
+    extern /* Subroutine */ int cprcn3_(integer *, integer *, char *, integer 
+	    *, char *, char *, char *, char *, integer *, integer *, complex *
+	    , integer *, integer *);
+    integer ia, na, nc, im, in, ms, ns;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    extern /* Subroutine */ int cctrmm_(integer *, char *, char *, char *, 
+	    char *, integer *, integer *, complex *, complex *, integer *, 
+	    complex *, integer *);
+    char tranas[1], transa[1];
+    extern /* Subroutine */ int cctrsm_(integer *, char *, char *, char *, 
+	    char *, integer *, integer *, complex *, complex *, integer *, 
+	    complex *, integer *);
+    real errmax;
+    integer laa, icd, lbb, lda, ldb;
+    extern logical lce_(complex *, complex *, integer *);
+    integer ics;
+    complex als;
+    integer ict, icu;
+    real err;
+
+    /* Fortran I/O blocks */
+    static cilist io___236 = { 0, 0, 0, fmt_9994, 0 };
+    static cilist io___239 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___241 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___242 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___243 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___244 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___245 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests CTRMM and CTRSM. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --g;
+    --ct;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+
+    nargs = 11;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+/*     Set up zero matrix for CMMCH. */
+    i__1 = *nmax;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = *nmax;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__ + j * c_dim1;
+	    c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L10: */
+	}
+/* L20: */
+    }
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDB to 1 more than minimum value if room. */
+	    ldb = m;
+	    if (ldb < *nmax) {
+		++ldb;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldb > *nmax) {
+		goto L130;
+	    }
+	    lbb = ldb * n;
+	    null = m <= 0 || n <= 0;
+
+	    for (ics = 1; ics <= 2; ++ics) {
+		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
+		left = *(unsigned char *)side == 'L';
+		if (left) {
+		    na = m;
+		} else {
+		    na = n;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = na;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L130;
+		}
+		laa = lda * na;
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+
+		    for (ict = 1; ict <= 3; ++ict) {
+			*(unsigned char *)transa = *(unsigned char *)&icht[
+				ict - 1];
+
+			for (icd = 1; icd <= 2; ++icd) {
+			    *(unsigned char *)diag = *(unsigned char *)&ichd[
+				    icd - 1];
+
+			    i__3 = *nalf;
+			    for (ia = 1; ia <= i__3; ++ia) {
+				i__4 = ia;
+				alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+/*                          Generate the matrix A. */
+
+				cmake_("tr", uplo, diag, &na, &na, &a[
+					a_offset], nmax, &aa[1], &lda, &reset,
+					 &c_b1);
+
+/*                          Generate the matrix B. */
+
+				cmake_("ge", " ", " ", &m, &n, &b[b_offset], 
+					nmax, &bb[1], &ldb, &reset, &c_b1);
+
+				++nc;
+
+/*                          Save every datum before calling the */
+/*                          subroutine. */
+
+				*(unsigned char *)sides = *(unsigned char *)
+					side;
+				*(unsigned char *)uplos = *(unsigned char *)
+					uplo;
+				*(unsigned char *)tranas = *(unsigned char *)
+					transa;
+				*(unsigned char *)diags = *(unsigned char *)
+					diag;
+				ms = m;
+				ns = n;
+				als.r = alpha.r, als.i = alpha.i;
+				i__4 = laa;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    i__5 = i__;
+				    i__6 = i__;
+				    as[i__5].r = aa[i__6].r, as[i__5].i = aa[
+					    i__6].i;
+/* L30: */
+				}
+				ldas = lda;
+				i__4 = lbb;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    i__5 = i__;
+				    i__6 = i__;
+				    bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[
+					    i__6].i;
+/* L40: */
+				}
+				ldbs = ldb;
+
+/*                          Call the subroutine. */
+
+				if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen)
+					2) == 0) {
+				    if (*trace) {
+					cprcn3_(ntra, &nc, sname, iorder, 
+						side, uplo, transa, diag, &m, 
+						&n, &alpha, &lda, &ldb, (
+						ftnlen)13, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1);
+				    }
+				    if (*rewi) {
+					al__1.aerr = 0;
+					al__1.aunit = *ntra;
+					f_rew(&al__1);
+				    }
+				    cctrmm_(iorder, side, uplo, transa, diag, 
+					    &m, &n, &alpha, &aa[1], &lda, &bb[
+					    1], &ldb);
+				} else if (s_cmp(sname + 9, "sm", (ftnlen)2, (
+					ftnlen)2) == 0) {
+				    if (*trace) {
+					cprcn3_(ntra, &nc, sname, iorder, 
+						side, uplo, transa, diag, &m, 
+						&n, &alpha, &lda, &ldb, (
+						ftnlen)13, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1);
+				    }
+				    if (*rewi) {
+					al__1.aerr = 0;
+					al__1.aunit = *ntra;
+					f_rew(&al__1);
+				    }
+				    cctrsm_(iorder, side, uplo, transa, diag, 
+					    &m, &n, &alpha, &aa[1], &lda, &bb[
+					    1], &ldb);
+				}
+
+/*                          Check if error-exit was taken incorrectly. */
+
+				if (! infoc_1.ok) {
+				    io___236.ciunit = *nout;
+				    s_wsfe(&io___236);
+				    e_wsfe();
+				    *fatal = TRUE_;
+				    goto L150;
+				}
+
+/*                          See what data changed inside subroutines. */
+
+				isame[0] = *(unsigned char *)sides == *(
+					unsigned char *)side;
+				isame[1] = *(unsigned char *)uplos == *(
+					unsigned char *)uplo;
+				isame[2] = *(unsigned char *)tranas == *(
+					unsigned char *)transa;
+				isame[3] = *(unsigned char *)diags == *(
+					unsigned char *)diag;
+				isame[4] = ms == m;
+				isame[5] = ns == n;
+				isame[6] = als.r == alpha.r && als.i == 
+					alpha.i;
+				isame[7] = lce_(&as[1], &aa[1], &laa);
+				isame[8] = ldas == lda;
+				if (null) {
+				    isame[9] = lce_(&bs[1], &bb[1], &lbb);
+				} else {
+				    isame[9] = lceres_("ge", " ", &m, &n, &bs[
+					    1], &bb[1], &ldb);
+				}
+				isame[10] = ldbs == ldb;
+
+/*                          If data was incorrectly changed, report and */
+/*                          return. */
+
+				same = TRUE_;
+				i__4 = nargs;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    same = same && isame[i__ - 1];
+				    if (! isame[i__ - 1]) {
+					io___239.ciunit = *nout;
+					s_wsfe(&io___239);
+					do_fio(&c__1, (char *)&i__, (ftnlen)
+						sizeof(integer));
+					e_wsfe();
+				    }
+/* L50: */
+				}
+				if (! same) {
+				    *fatal = TRUE_;
+				    goto L150;
+				}
+
+				if (! null) {
+				    if (s_cmp(sname + 9, "mm", (ftnlen)2, (
+					    ftnlen)2) == 0) {
+
+/*                                Check the result. */
+
+					if (left) {
+					    cmmch_(transa, "N", &m, &n, &m, &
+						    alpha, &a[a_offset], nmax,
+						     &b[b_offset], nmax, &
+						    c_b1, &c__[c_offset], 
+						    nmax, &ct[1], &g[1], &bb[
+						    1], &ldb, eps, &err, 
+						    fatal, nout, &c_true, (
+						    ftnlen)1, (ftnlen)1);
+					} else {
+					    cmmch_("N", transa, &m, &n, &n, &
+						    alpha, &b[b_offset], nmax,
+						     &a[a_offset], nmax, &
+						    c_b1, &c__[c_offset], 
+						    nmax, &ct[1], &g[1], &bb[
+						    1], &ldb, eps, &err, 
+						    fatal, nout, &c_true);
+					}
+				    } else if (s_cmp(sname + 9, "sm", (ftnlen)
+					    2, (ftnlen)2) == 0) {
+
+/*                                Compute approximation to original */
+/*                                matrix. */
+
+					i__4 = n;
+					for (j = 1; j <= i__4; ++j) {
+					    i__5 = m;
+					    for (i__ = 1; i__ <= i__5; ++i__) 
+						    {
+			  i__6 = i__ + j * c_dim1;
+			  i__7 = i__ + (j - 1) * ldb;
+			  c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i;
+			  i__6 = i__ + (j - 1) * ldb;
+			  i__7 = i__ + j * b_dim1;
+			  q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, 
+				  q__1.i = alpha.r * b[i__7].i + alpha.i * b[
+				  i__7].r;
+			  bb[i__6].r = q__1.r, bb[i__6].i = q__1.i;
+/* L60: */
+					    }
+/* L70: */
+					}
+
+					if (left) {
+					    cmmch_(transa, "N", &m, &n, &m, &
+						    c_b2, &a[a_offset], nmax, 
+						    &c__[c_offset], nmax, &
+						    c_b1, &b[b_offset], nmax, 
+						    &ct[1], &g[1], &bb[1], &
+						    ldb, eps, &err, fatal, 
+						    nout, &c_false);
+					} else {
+					    cmmch_("N", transa, &m, &n, &n, &
+						    c_b2, &c__[c_offset], 
+						    nmax, &a[a_offset], nmax, 
+						    &c_b1, &b[b_offset], nmax,
+						     &ct[1], &g[1], &bb[1], &
+						    ldb, eps, &err, fatal, 
+						    nout, &c_false);
+					}
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L150;
+				    }
+				}
+
+/* L80: */
+			    }
+
+/* L90: */
+			}
+
+/* L100: */
+		    }
+
+/* L110: */
+		}
+
+/* L120: */
+	    }
+
+L130:
+	    ;
+	}
+
+/* L140: */
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___241.ciunit = *nout;
+	    s_wsfe(&io___241);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___242.ciunit = *nout;
+	    s_wsfe(&io___242);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___243.ciunit = *nout;
+	    s_wsfe(&io___243);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___244.ciunit = *nout;
+	    s_wsfe(&io___244);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+    }
+    goto L160;
+
+L150:
+    io___245.ciunit = *nout;
+    s_wsfe(&io___245);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    if (*trace) {
+	cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &
+		alpha, &lda, &ldb);
+    }
+
+L160:
+    return 0;
+
+/* L9995: */
+
+/*     End of CCHK3. */
+
+} /* cchk3_ */
+
+
+/* Subroutine */ int cprcn3_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *side, char *uplo, char *transa, char *diag, integer *m,
+	 integer *n, complex *alpha, integer *lda, integer *ldb)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002,"
+	    "a14,\002,\002,a14,\002,\002)";
+    static char fmt_9994[] = "(10x,2(a14,\002,\002),2(i3,\002,\002),\002 "
+	    "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)."
+	    "\002)";
+
+    /* Local variables */
+    char ca[14], cd[14], cs[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___251 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___252 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)side == 'L') {
+	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cs, "    CblasRight", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)diag == 'N') {
+	s_copy(cd, "  CblasNonUnit", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cd, "     CblasUnit", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___251.ciunit = *nout;
+    s_wsfe(&io___251);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cs, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    e_wsfe();
+    io___252.ciunit = *nout;
+    s_wsfe(&io___252);
+    do_fio(&c__1, ca, (ftnlen)14);
+    do_fio(&c__1, cd, (ftnlen)14);
+    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* cprcn3_ */
+
+
+/* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	complex *cs, complex *ct, real *g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char icht[2] = "NC";
+    static char ichu[2] = "UL";
+
+    /* Format strings */
+    static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9995[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
+	    " \002,i3)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    complex q__1;
+    alist al__1;
+
+    /* Local variables */
+    complex beta;
+    integer ldas, ldcs;
+    logical same, conj;
+    complex bets;
+    real rals;
+    logical tran, null;
+    char uplo[1];
+    integer i__, j, k, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    real rbeta;
+    logical isame[13];
+    integer nargs;
+    real rbets;
+    logical reset;
+    char trans[1];
+    logical upper;
+    char uplos[1];
+    extern /* Subroutine */ int cprcn4_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, complex *, integer *, 
+	    complex *, integer *), cprcn6_(integer *, 
+	    integer *, char *, integer *, char *, char *, integer *, integer *
+	    , real *, integer *, real *, integer *);
+    integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks;
+    extern /* Subroutine */ int ccherk_(integer *, char *, char *, integer *, 
+	    integer *, real *, complex *, integer *, real *, complex *, 
+	    integer *);
+    integer ns;
+    real ralpha;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    real errmax;
+    extern /* Subroutine */ int ccsyrk_(integer *, char *, char *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, complex *, 
+	    integer *);
+    char transs[1], transt[1];
+    integer laa, lda, lcc, ldc;
+    extern logical lce_(complex *, complex *, integer *);
+    complex als;
+    integer ict, icu;
+    real err;
+
+    /* Fortran I/O blocks */
+    static cilist io___294 = { 0, 0, 0, fmt_9992, 0 };
+    static cilist io___297 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___304 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___305 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___306 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___307 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___308 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___309 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests CHERK and CSYRK. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 10;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+
+    i__1 = *nidim;
+    for (in = 1; in <= i__1; ++in) {
+	n = idim[in];
+/*        Set LDC to 1 more than minimum value if room. */
+	ldc = n;
+	if (ldc < *nmax) {
+	    ++ldc;
+	}
+/*        Skip tests if not enough room. */
+	if (ldc > *nmax) {
+	    goto L100;
+	}
+	lcc = ldc * n;
+
+	i__2 = *nidim;
+	for (ik = 1; ik <= i__2; ++ik) {
+	    k = idim[ik];
+
+	    for (ict = 1; ict <= 2; ++ict) {
+		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
+		tran = *(unsigned char *)trans == 'C';
+		if (tran && ! conj) {
+		    *(unsigned char *)trans = 'T';
+		}
+		if (tran) {
+		    ma = k;
+		    na = n;
+		} else {
+		    ma = n;
+		    na = k;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = ma;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L80;
+		}
+		laa = lda * na;
+
+/*              Generate the matrix A. */
+
+		cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], &
+			lda, &reset, &c_b1);
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+		    upper = *(unsigned char *)uplo == 'U';
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+			if (conj) {
+			    ralpha = alpha.r;
+			    q__1.r = ralpha, q__1.i = 0.f;
+			    alpha.r = q__1.r, alpha.i = q__1.i;
+			}
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+			    if (conj) {
+				rbeta = beta.r;
+				q__1.r = rbeta, q__1.i = 0.f;
+				beta.r = q__1.r, beta.i = q__1.i;
+			    }
+			    null = n <= 0;
+			    if (conj) {
+				null = null || (k <= 0 || ralpha == 0.f) && 
+					rbeta == 1.f;
+			    }
+
+/*                       Generate the matrix C. */
+
+			    cmake_(sname + 7, uplo, " ", &n, &n, &c__[
+				    c_offset], nmax, &cc[1], &ldc, &reset, &
+				    c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the subroutine. */
+
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    *(unsigned char *)transs = *(unsigned char *)
+				    trans;
+			    ns = n;
+			    ks = k;
+			    if (conj) {
+				rals = ralpha;
+			    } else {
+				als.r = alpha.r, als.i = alpha.i;
+			    }
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    if (conj) {
+				rbets = rbeta;
+			    } else {
+				bets.r = beta.r, bets.i = beta.i;
+			    }
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (conj) {
+				if (*trace) {
+				    cprcn6_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &ralpha, &lda, &
+					    rbeta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				ccherk_(iorder, uplo, trans, &n, &k, &ralpha, 
+					&aa[1], &lda, &rbeta, &cc[1], &ldc);
+			    } else {
+				if (*trace) {
+				    cprcn4_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &
+					    beta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, &
+					aa[1], &lda, &beta, &cc[1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				io___294.ciunit = *nout;
+				s_wsfe(&io___294);
+				e_wsfe();
+				*fatal = TRUE_;
+				goto L120;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[1] = *(unsigned char *)transs == *(unsigned 
+				    char *)trans;
+			    isame[2] = ns == n;
+			    isame[3] = ks == k;
+			    if (conj) {
+				isame[4] = rals == ralpha;
+			    } else {
+				isame[4] = als.r == alpha.r && als.i == 
+					alpha.i;
+			    }
+			    isame[5] = lce_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    if (conj) {
+				isame[7] = rbets == rbeta;
+			    } else {
+				isame[7] = bets.r == beta.r && bets.i == 
+					beta.i;
+			    }
+			    if (null) {
+				isame[8] = lce_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[8] = lceres_(sname + 7, uplo, &n, &n, &
+					cs[1], &cc[1], &ldc);
+			    }
+			    isame[9] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    io___297.ciunit = *nout;
+				    s_wsfe(&io___297);
+				    do_fio(&c__1, (char *)&i__, (ftnlen)
+					    sizeof(integer));
+				    e_wsfe();
+				}
+/* L30: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L120;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result column by column. */
+
+				if (conj) {
+				    *(unsigned char *)transt = 'C';
+				} else {
+				    *(unsigned char *)transt = 'T';
+				}
+				jc = 1;
+				i__5 = n;
+				for (j = 1; j <= i__5; ++j) {
+				    if (upper) {
+					jj = 1;
+					lj = j;
+				    } else {
+					jj = j;
+					lj = n - j + 1;
+				    }
+				    if (tran) {
+					cmmch_(transt, "N", &lj, &c__1, &k, &
+						alpha, &a[jj * a_dim1 + 1], 
+						nmax, &a[j * a_dim1 + 1], 
+						nmax, &beta, &c__[jj + j * 
+						c_dim1], nmax, &ct[1], &g[1], 
+						&cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    } else {
+					cmmch_("N", transt, &lj, &c__1, &k, &
+						alpha, &a[jj + a_dim1], nmax, 
+						&a[j + a_dim1], nmax, &beta, &
+						c__[jj + j * c_dim1], nmax, &
+						ct[1], &g[1], &cc[jc], &ldc, 
+						eps, &err, fatal, nout, &
+						c_true);
+				    }
+				    if (upper) {
+					jc += ldc;
+				    } else {
+					jc = jc + ldc + 1;
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L110;
+				    }
+/* L40: */
+				}
+			    }
+
+/* L50: */
+			}
+
+/* L60: */
+		    }
+
+/* L70: */
+		}
+
+L80:
+		;
+	    }
+
+/* L90: */
+	}
+
+L100:
+	;
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___304.ciunit = *nout;
+	    s_wsfe(&io___304);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___305.ciunit = *nout;
+	    s_wsfe(&io___305);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___306.ciunit = *nout;
+	    s_wsfe(&io___306);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___307.ciunit = *nout;
+	    s_wsfe(&io___307);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+    }
+    goto L130;
+
+L110:
+    if (n > 1) {
+	io___308.ciunit = *nout;
+	s_wsfe(&io___308);
+	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
+	e_wsfe();
+    }
+
+L120:
+    io___309.ciunit = *nout;
+    s_wsfe(&io___309);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    if (conj) {
+	cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, 
+		&rbeta, &ldc);
+    } else {
+	cprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		beta, &ldc);
+    }
+
+L130:
+    return 0;
+
+/* L9994: */
+/* L9993: */
+
+/*     End of CCHK4. */
+
+} /* cchk4_ */
+
+
+/* Subroutine */ int cprcn4_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
+	alpha, integer *lda, complex *beta, integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002,"
+	    "\002))";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C"
+	    ",\002,i3,\002).\002)";
+
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___313 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___314 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___313.ciunit = *nout;
+    s_wsfe(&io___313);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    do_fio(&c__1, ca, (ftnlen)14);
+    e_wsfe();
+    io___314.ciunit = *nout;
+    s_wsfe(&io___314);
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* cprcn4_ */
+
+
+
+/* Subroutine */ int cprcn6_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, real *
+	alpha, integer *lda, real *beta, integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002,"
+	    "\002))";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3"
+	    ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)";
+
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___318 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___319 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___318.ciunit = *nout;
+    s_wsfe(&io___318);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    do_fio(&c__1, ca, (ftnlen)14);
+    e_wsfe();
+    io___319.ciunit = *nout;
+    s_wsfe(&io___319);
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* cprcn6_ */
+
+
+/* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex *
+	as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, 
+	complex *ct, real *g, complex *w, integer *iorder)
+{
+    /* Initialized data */
+
+    static char icht[2] = "NC";
+    static char ichu[2] = "UL";
+
+    /* Format strings */
+    static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9995[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
+	    " \002,i3)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
+    complex q__1, q__2;
+    alist al__1;
+
+    /* Local variables */
+    integer jjab;
+    complex beta;
+    integer ldas, ldbs, ldcs;
+    logical same, conj;
+    complex bets;
+    logical tran, null;
+    char uplo[1];
+    integer i__, j, k, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    real rbeta;
+    logical isame[13];
+    integer nargs;
+    real rbets;
+    logical reset;
+    char trans[1];
+    logical upper;
+    char uplos[1];
+    extern /* Subroutine */ int cprcn5_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, complex *, integer *, 
+	    integer *, complex *, integer *), cprcn7_(
+	    integer *, integer *, char *, integer *, char *, char *, integer *
+	    , integer *, complex *, integer *, integer *, real *, integer *);
+    integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    real errmax;
+    char transs[1], transt[1];
+    extern /* Subroutine */ int ccher2k_(integer *, char *, char *, integer *,
+	     integer *, complex *, complex *, integer *, complex *, integer *,
+	     real *, complex *, integer *);
+    integer laa, lbb, lda, lcc, ldb, ldc;
+    extern logical lce_(complex *, complex *, integer *);
+    extern /* Subroutine */ int ccsyr2k_(integer *, char *, char *, integer *,
+	     integer *, complex *, complex *, integer *, complex *, integer *,
+	     complex *, complex *, integer *);
+    complex als;
+    integer ict, icu;
+    real err;
+
+    /* Fortran I/O blocks */
+    static cilist io___362 = { 0, 0, 0, fmt_9992, 0 };
+    static cilist io___365 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___373 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___374 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___375 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___376 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___377 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___378 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests CHER2K and CSYR2K. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --w;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    --as;
+    --aa;
+    --ab;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 12;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+
+    i__1 = *nidim;
+    for (in = 1; in <= i__1; ++in) {
+	n = idim[in];
+/*        Set LDC to 1 more than minimum value if room. */
+	ldc = n;
+	if (ldc < *nmax) {
+	    ++ldc;
+	}
+/*        Skip tests if not enough room. */
+	if (ldc > *nmax) {
+	    goto L130;
+	}
+	lcc = ldc * n;
+
+	i__2 = *nidim;
+	for (ik = 1; ik <= i__2; ++ik) {
+	    k = idim[ik];
+
+	    for (ict = 1; ict <= 2; ++ict) {
+		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
+		tran = *(unsigned char *)trans == 'C';
+		if (tran && ! conj) {
+		    *(unsigned char *)trans = 'T';
+		}
+		if (tran) {
+		    ma = k;
+		    na = n;
+		} else {
+		    ma = n;
+		    na = k;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = ma;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L110;
+		}
+		laa = lda * na;
+
+/*              Generate the matrix A. */
+
+		if (tran) {
+		    i__3 = *nmax << 1;
+		    cmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], &
+			    lda, &reset, &c_b1);
+		} else {
+		    cmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], &
+			    lda, &reset, &c_b1);
+		}
+
+/*              Generate the matrix B. */
+
+		ldb = lda;
+		lbb = laa;
+		if (tran) {
+		    i__3 = *nmax << 1;
+		    cmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1]
+			    , &ldb, &reset, &c_b1);
+		} else {
+		    cmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax,
+			     &bb[1], &ldb, &reset, &c_b1);
+		}
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+		    upper = *(unsigned char *)uplo == 'U';
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+			    if (conj) {
+				rbeta = beta.r;
+				q__1.r = rbeta, q__1.i = 0.f;
+				beta.r = q__1.r, beta.i = q__1.i;
+			    }
+			    null = n <= 0;
+			    if (conj) {
+				null = null || (k <= 0 || alpha.r == 0.f && 
+					alpha.i == 0.f) && rbeta == 1.f;
+			    }
+
+/*                       Generate the matrix C. */
+
+			    cmake_(sname + 7, uplo, " ", &n, &n, &c__[
+				    c_offset], nmax, &cc[1], &ldc, &reset, &
+				    c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the subroutine. */
+
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    *(unsigned char *)transs = *(unsigned char *)
+				    trans;
+			    ns = n;
+			    ks = k;
+			    als.r = alpha.r, als.i = alpha.i;
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    i__5 = lbb;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldbs = ldb;
+			    if (conj) {
+				rbets = rbeta;
+			    } else {
+				bets.r = beta.r, bets.i = beta.i;
+			    }
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L30: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (conj) {
+				if (*trace) {
+				    cprcn7_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &ldb,
+					     &rbeta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				ccher2k_(iorder, uplo, trans, &n, &k, &alpha, 
+					&aa[1], &lda, &bb[1], &ldb, &rbeta, &
+					cc[1], &ldc);
+			    } else {
+				if (*trace) {
+				    cprcn5_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &ldb,
+					     &beta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, 
+					&aa[1], &lda, &bb[1], &ldb, &beta, &
+					cc[1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				io___362.ciunit = *nout;
+				s_wsfe(&io___362);
+				e_wsfe();
+				*fatal = TRUE_;
+				goto L150;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[1] = *(unsigned char *)transs == *(unsigned 
+				    char *)trans;
+			    isame[2] = ns == n;
+			    isame[3] = ks == k;
+			    isame[4] = als.r == alpha.r && als.i == alpha.i;
+			    isame[5] = lce_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    isame[7] = lce_(&bs[1], &bb[1], &lbb);
+			    isame[8] = ldbs == ldb;
+			    if (conj) {
+				isame[9] = rbets == rbeta;
+			    } else {
+				isame[9] = bets.r == beta.r && bets.i == 
+					beta.i;
+			    }
+			    if (null) {
+				isame[10] = lce_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[10] = lceres_("he", uplo, &n, &n, &cs[1]
+					, &cc[1], &ldc);
+			    }
+			    isame[11] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    io___365.ciunit = *nout;
+				    s_wsfe(&io___365);
+				    do_fio(&c__1, (char *)&i__, (ftnlen)
+					    sizeof(integer));
+				    e_wsfe();
+				}
+/* L40: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L150;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result column by column. */
+
+				if (conj) {
+				    *(unsigned char *)transt = 'C';
+				} else {
+				    *(unsigned char *)transt = 'T';
+				}
+				jjab = 1;
+				jc = 1;
+				i__5 = n;
+				for (j = 1; j <= i__5; ++j) {
+				    if (upper) {
+					jj = 1;
+					lj = j;
+				    } else {
+					jj = j;
+					lj = n - j + 1;
+				    }
+				    if (tran) {
+					i__6 = k;
+					for (i__ = 1; i__ <= i__6; ++i__) {
+					    i__7 = i__;
+					    i__8 = (j - 1 << 1) * *nmax + k + 
+						    i__;
+					    q__1.r = alpha.r * ab[i__8].r - 
+						    alpha.i * ab[i__8].i, 
+						    q__1.i = alpha.r * ab[
+						    i__8].i + alpha.i * ab[
+						    i__8].r;
+					    w[i__7].r = q__1.r, w[i__7].i = 
+						    q__1.i;
+					    if (conj) {
+			  i__7 = k + i__;
+			  r_cnjg(&q__2, &alpha);
+			  i__8 = (j - 1 << 1) * *nmax + i__;
+			  q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, 
+				  q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[
+				  i__8].r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+					    } else {
+			  i__7 = k + i__;
+			  i__8 = (j - 1 << 1) * *nmax + i__;
+			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+					    }
+/* L50: */
+					}
+					i__6 = k << 1;
+					i__7 = *nmax << 1;
+					i__8 = *nmax << 1;
+					cmmch_(transt, "N", &lj, &c__1, &i__6,
+						 &c_b2, &ab[jjab], &i__7, &w[
+						1], &i__8, &beta, &c__[jj + j 
+						* c_dim1], nmax, &ct[1], &g[1]
+						, &cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    } else {
+					i__6 = k;
+					for (i__ = 1; i__ <= i__6; ++i__) {
+					    if (conj) {
+			  i__7 = i__;
+			  r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]);
+			  q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, 
+				  q__1.i = alpha.r * q__2.i + alpha.i * 
+				  q__2.r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+			  i__7 = k + i__;
+			  i__8 = (i__ - 1) * *nmax + j;
+			  q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, q__2.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  r_cnjg(&q__1, &q__2);
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+					    } else {
+			  i__7 = i__;
+			  i__8 = (k + i__ - 1) * *nmax + j;
+			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+			  i__7 = k + i__;
+			  i__8 = (i__ - 1) * *nmax + j;
+			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+					    }
+/* L60: */
+					}
+					i__6 = k << 1;
+					i__7 = *nmax << 1;
+					cmmch_("N", "N", &lj, &c__1, &i__6, &
+						c_b2, &ab[jj], nmax, &w[1], &
+						i__7, &beta, &c__[jj + j * 
+						c_dim1], nmax, &ct[1], &g[1], 
+						&cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    }
+				    if (upper) {
+					jc += ldc;
+				    } else {
+					jc = jc + ldc + 1;
+					if (tran) {
+					    jjab += *nmax << 1;
+					}
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L140;
+				    }
+/* L70: */
+				}
+			    }
+
+/* L80: */
+			}
+
+/* L90: */
+		    }
+
+/* L100: */
+		}
+
+L110:
+		;
+	    }
+
+/* L120: */
+	}
+
+L130:
+	;
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___373.ciunit = *nout;
+	    s_wsfe(&io___373);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___374.ciunit = *nout;
+	    s_wsfe(&io___374);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___375.ciunit = *nout;
+	    s_wsfe(&io___375);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___376.ciunit = *nout;
+	    s_wsfe(&io___376);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+    }
+    goto L160;
+
+L140:
+    if (n > 1) {
+	io___377.ciunit = *nout;
+	s_wsfe(&io___377);
+	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
+	e_wsfe();
+    }
+
+L150:
+    io___378.ciunit = *nout;
+    s_wsfe(&io___378);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    if (conj) {
+	cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		ldb, &rbeta, &ldc);
+    } else {
+	cprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		ldb, &beta, &ldc);
+    }
+
+L160:
+    return 0;
+
+/* L9994: */
+/* L9993: */
+
+/*     End of CCHK5. */
+
+} /* cchk5_ */
+
+
+/* Subroutine */ int cprcn5_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
+	alpha, integer *lda, integer *ldb, complex *beta, integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002,"
+	    "\002))";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002"
+	    ",f4.1,\002), C,\002,i3,\002).\002)";
+
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___382 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___383 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___382.ciunit = *nout;
+    s_wsfe(&io___382);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    do_fio(&c__1, ca, (ftnlen)14);
+    e_wsfe();
+    io___383.ciunit = *nout;
+    s_wsfe(&io___383);
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* cprcn5_ */
+
+
+
+/* Subroutine */ int cprcn7_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
+	alpha, integer *lda, integer *ldb, real *beta, integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002,"
+	    "\002))";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C,"
+	    "\002,i3,\002).\002)";
+
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___387 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___388 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___387.ciunit = *nout;
+    s_wsfe(&io___387);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    do_fio(&c__1, ca, (ftnlen)14);
+    e_wsfe();
+    io___388.ciunit = *nout;
+    s_wsfe(&io___388);
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* cprcn7_ */
+
+
+/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, 
+	integer *n, complex *a, integer *nmax, complex *aa, integer *lda, 
+	logical *reset, complex *transl)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+    real r__1;
+    complex q__1, q__2;
+
+    /* Local variables */
+    extern /* Complex */ VOID cbeg_(complex *, logical *);
+    integer ibeg, iend;
+    logical unit;
+    integer i__, j;
+    logical lower, upper;
+    integer jj;
+    logical gen, her, tri, sym;
+
+
+/*  Generates values for an M by N matrix A. */
+/*  Stores the values in the array AA in the data structure required */
+/*  by the routine, with unwanted elements set to rogue value. */
+
+/*  TYPE is 'ge', 'he', 'sy' or 'tr'. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --aa;
+
+    /* Function Body */
+    gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0;
+    her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0;
+    sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0;
+    tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0;
+    upper = (her || sym || tri) && *(unsigned char *)uplo == 'U';
+    lower = (her || sym || tri) && *(unsigned char *)uplo == 'L';
+    unit = tri && *(unsigned char *)diag == 'U';
+
+/*     Generate data in array A. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    if (gen || upper && i__ <= j || lower && i__ >= j) {
+		i__3 = i__ + j * a_dim1;
+		cbeg_(&q__2, reset);
+		q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i;
+		a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+		if (i__ != j) {
+/*                 Set some elements to zero */
+		    if (*n > 3 && j == *n / 2) {
+			i__3 = i__ + j * a_dim1;
+			a[i__3].r = 0.f, a[i__3].i = 0.f;
+		    }
+		    if (her) {
+			i__3 = j + i__ * a_dim1;
+			r_cnjg(&q__1, &a[i__ + j * a_dim1]);
+			a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+		    } else if (sym) {
+			i__3 = j + i__ * a_dim1;
+			i__4 = i__ + j * a_dim1;
+			a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i;
+		    } else if (tri) {
+			i__3 = j + i__ * a_dim1;
+			a[i__3].r = 0.f, a[i__3].i = 0.f;
+		    }
+		}
+	    }
+/* L10: */
+	}
+	if (her) {
+	    i__2 = j + j * a_dim1;
+	    i__3 = j + j * a_dim1;
+	    r__1 = a[i__3].r;
+	    q__1.r = r__1, q__1.i = 0.f;
+	    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+	}
+	if (tri) {
+	    i__2 = j + j * a_dim1;
+	    i__3 = j + j * a_dim1;
+	    q__1.r = a[i__3].r + 1.f, q__1.i = a[i__3].i + 0.f;
+	    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+	}
+	if (unit) {
+	    i__2 = j + j * a_dim1;
+	    a[i__2].r = 1.f, a[i__2].i = 0.f;
+	}
+/* L20: */
+    }
+
+/*     Store elements in array AS in data structure required by routine. */
+
+    if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		i__4 = i__ + j * a_dim1;
+		aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i;
+/* L30: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = *m + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10f, aa[i__3].i = 1e10f;
+/* L40: */
+	    }
+/* L50: */
+	}
+    } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__,
+	     "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen)
+	    2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (upper) {
+		ibeg = 1;
+		if (unit) {
+		    iend = j - 1;
+		} else {
+		    iend = j;
+		}
+	    } else {
+		if (unit) {
+		    ibeg = j + 1;
+		} else {
+		    ibeg = j;
+		}
+		iend = *n;
+	    }
+	    i__2 = ibeg - 1;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10f, aa[i__3].i = 1e10f;
+/* L60: */
+	    }
+	    i__2 = iend;
+	    for (i__ = ibeg; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		i__4 = i__ + j * a_dim1;
+		aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i;
+/* L70: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = iend + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10f, aa[i__3].i = 1e10f;
+/* L80: */
+	    }
+	    if (her) {
+		jj = j + (j - 1) * *lda;
+		i__2 = jj;
+		i__3 = jj;
+		r__1 = aa[i__3].r;
+		q__1.r = r__1, q__1.i = -1e10f;
+		aa[i__2].r = q__1.r, aa[i__2].i = q__1.i;
+	    }
+/* L90: */
+	}
+    }
+    return 0;
+
+/*     End of CMAKE. */
+
+} /* cmake_ */
+
+/* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer *
+	n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, 
+	integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, 
+	real *g, complex *cc, integer *ldcc, real *eps, real *err, logical *
+	fatal, integer *nout, logical *mv)
+{
+    /* Format strings */
+    static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS"
+	    " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002             "
+	    "          EXPECTED RE\002,\002SULT                    COMPUTED R"
+	    "ESULT\002)";
+    static char fmt_9998[] = "(1x,i7,2(\002  (\002,g15.6,\002,\002,g15.6,"
+	    "\002)\002))";
+    static char fmt_9997[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
+	    " \002,i3)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
+	    cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
+    real r__1, r__2, r__3, r__4, r__5, r__6;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Local variables */
+    real erri;
+    integer i__, j, k;
+    logical trana, tranb, ctrana, ctranb;
+
+    /* Fortran I/O blocks */
+    static cilist io___409 = { 0, 0, 0, fmt_9999, 0 };
+    static cilist io___410 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___411 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___412 = { 0, 0, 0, fmt_9997, 0 };
+
+
+
+/*  Checks the results of the computational tests. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --ct;
+    --g;
+    cc_dim1 = *ldcc;
+    cc_offset = 1 + cc_dim1 * 1;
+    cc -= cc_offset;
+
+    /* Function Body */
+    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 
+	    'C';
+    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 
+	    'C';
+    ctrana = *(unsigned char *)transa == 'C';
+    ctranb = *(unsigned char *)transb == 'C';
+
+/*     Compute expected result, one column at a time, in CT using data */
+/*     in A, B and C. */
+/*     Compute gauges in G. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    ct[i__3].r = 0.f, ct[i__3].i = 0.f;
+	    g[i__] = 0.f;
+/* L10: */
+	}
+	if (! trana && ! tranb) {
+	    i__2 = *kk;
+	    for (k = 1; k <= i__2; ++k) {
+		i__3 = *m;
+		for (i__ = 1; i__ <= i__3; ++i__) {
+		    i__4 = i__;
+		    i__5 = i__;
+		    i__6 = i__ + k * a_dim1;
+		    i__7 = k + j * b_dim1;
+		    q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, 
+			    q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[
+			    i__7].r;
+		    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+			    q__2.i;
+		    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+		    i__4 = i__ + k * a_dim1;
+		    i__5 = k + j * b_dim1;
+		    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(
+			    &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[
+			    i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * 
+			    b_dim1]), abs(r__4)));
+/* L20: */
+		}
+/* L30: */
+	    }
+	} else if (trana && ! tranb) {
+	    if (ctrana) {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+			i__6 = k + j * b_dim1;
+			q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, 
+				q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6]
+				.r;
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+				q__2.i;
+			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			i__4 = k + i__ * a_dim1;
+			i__5 = k + j * b_dim1;
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+				r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * ((
+				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
+				&b[k + j * b_dim1]), abs(r__4)));
+/* L40: */
+		    }
+/* L50: */
+		}
+	    } else {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = k + i__ * a_dim1;
+			i__7 = k + j * b_dim1;
+			q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
+				.i, q__2.i = a[i__6].r * b[i__7].i + a[i__6]
+				.i * b[i__7].r;
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+				q__2.i;
+			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			i__4 = k + i__ * a_dim1;
+			i__5 = k + j * b_dim1;
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+				r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * ((
+				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
+				&b[k + j * b_dim1]), abs(r__4)));
+/* L60: */
+		    }
+/* L70: */
+		}
+	    }
+	} else if (! trana && tranb) {
+	    if (ctranb) {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = i__ + k * a_dim1;
+			r_cnjg(&q__3, &b[j + k * b_dim1]);
+			q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, 
+				q__2.i = a[i__6].r * q__3.i + a[i__6].i * 
+				q__3.r;
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+				q__2.i;
+			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			i__4 = i__ + k * a_dim1;
+			i__5 = j + k * b_dim1;
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+				r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * ((
+				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
+				&b[j + k * b_dim1]), abs(r__4)));
+/* L80: */
+		    }
+/* L90: */
+		}
+	    } else {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = i__ + k * a_dim1;
+			i__7 = j + k * b_dim1;
+			q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
+				.i, q__2.i = a[i__6].r * b[i__7].i + a[i__6]
+				.i * b[i__7].r;
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+				q__2.i;
+			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			i__4 = i__ + k * a_dim1;
+			i__5 = j + k * b_dim1;
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+				r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * ((
+				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
+				&b[j + k * b_dim1]), abs(r__4)));
+/* L100: */
+		    }
+/* L110: */
+		}
+	    }
+	} else if (trana && tranb) {
+	    if (ctrana) {
+		if (ctranb) {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+			    r_cnjg(&q__4, &b[j + k * b_dim1]);
+			    q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, 
+				    q__2.i = q__3.r * q__4.i + q__3.i * 
+				    q__4.r;
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+				    + q__2.i;
+			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
+				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
+/* L120: */
+			}
+/* L130: */
+		    }
+		} else {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+			    i__6 = j + k * b_dim1;
+			    q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, 
+				    q__2.i = q__3.r * b[i__6].i + q__3.i * b[
+				    i__6].r;
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+				    + q__2.i;
+			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
+				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
+/* L140: */
+			}
+/* L150: */
+		    }
+		}
+	    } else {
+		if (ctranb) {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    i__6 = k + i__ * a_dim1;
+			    r_cnjg(&q__3, &b[j + k * b_dim1]);
+			    q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, 
+				    q__2.i = a[i__6].r * q__3.i + a[i__6].i * 
+				    q__3.r;
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+				    + q__2.i;
+			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
+				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
+/* L160: */
+			}
+/* L170: */
+		    }
+		} else {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    i__6 = k + i__ * a_dim1;
+			    i__7 = j + k * b_dim1;
+			    q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[
+				    i__7].i, q__2.i = a[i__6].r * b[i__7].i + 
+				    a[i__6].i * b[i__7].r;
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+				    + q__2.i;
+			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
+				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
+/* L180: */
+			}
+/* L190: */
+		    }
+		}
+	    }
+	}
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    i__4 = i__;
+	    q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = 
+		    alpha->r * ct[i__4].i + alpha->i * ct[i__4].r;
+	    i__5 = i__ + j * c_dim1;
+	    q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = 
+		    beta->r * c__[i__5].i + beta->i * c__[i__5].r;
+	    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+	    ct[i__3].r = q__1.r, ct[i__3].i = q__1.i;
+	    i__3 = i__ + j * c_dim1;
+	    g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), 
+		    abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + (
+		    r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, 
+		    abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs(
+		    r__6)));
+/* L200: */
+	}
+
+/*        Compute the error ratio for this result. */
+
+	*err = 0.f;
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    i__4 = i__ + j * cc_dim1;
+	    q__2.r = ct[i__3].r - cc[i__4].r, q__2.i = ct[i__3].i - cc[i__4]
+		    .i;
+	    q__1.r = q__2.r, q__1.i = q__2.i;
+	    erri = ((r__1 = q__1.r, abs(r__1)) + (r__2 = r_imag(&q__1), abs(
+		    r__2))) / *eps;
+	    if (g[i__] != 0.f) {
+		erri /= g[i__];
+	    }
+	    *err = f2cmax(*err,erri);
+	    if (*err * sqrt(*eps) >= 1.f) {
+		goto L230;
+	    }
+/* L210: */
+	}
+
+/* L220: */
+    }
+
+/*     If the loop completes, all results are at least half accurate. */
+    goto L250;
+
+/*     Report fatal error. */
+
+L230:
+    *fatal = TRUE_;
+    io___409.ciunit = *nout;
+    s_wsfe(&io___409);
+    e_wsfe();
+    i__1 = *m;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (*mv) {
+	    io___410.ciunit = *nout;
+	    s_wsfe(&io___410);
+	    do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer));
+	    do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real));
+	    do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real)
+		    );
+	    e_wsfe();
+	} else {
+	    io___411.ciunit = *nout;
+	    s_wsfe(&io___411);
+	    do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer));
+	    do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real)
+		    );
+	    do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real));
+	    e_wsfe();
+	}
+/* L240: */
+    }
+    if (*n > 1) {
+	io___412.ciunit = *nout;
+	s_wsfe(&io___412);
+	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
+	e_wsfe();
+    }
+
+L250:
+    return 0;
+
+
+/*     End of CMMCH. */
+
+} /* cmmch_ */
+
+logical lce_(complex *ri, complex *rj, integer *lr)
+{
+    /* System generated locals */
+    integer i__1, i__2, i__3;
+    logical ret_val;
+
+    /* Local variables */
+    integer i__;
+
+
+/*  Tests if two arrays are identical. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --rj;
+    --ri;
+
+    /* Function Body */
+    i__1 = *lr;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = i__;
+	i__3 = i__;
+	if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) {
+	    goto L20;
+	}
+/* L10: */
+    }
+    ret_val = TRUE_;
+    goto L30;
+L20:
+    ret_val = FALSE_;
+L30:
+    return ret_val;
+
+/*     End of LCE. */
+
+} /* lce_ */
+
+logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa,
+	 complex *as, integer *lda)
+{
+    /* System generated locals */
+    integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
+    logical ret_val;
+
+    /* Local variables */
+    integer ibeg, iend, i__, j;
+    logical upper;
+
+
+/*  Tests if selected elements in two arrays are equal. */
+
+/*  TYPE is 'ge' or 'he' or 'sy'. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    as_dim1 = *lda;
+    as_offset = 1 + as_dim1 * 1;
+    as -= as_offset;
+    aa_dim1 = *lda;
+    aa_offset = 1 + aa_dim1 * 1;
+    aa -= aa_offset;
+
+    /* Function Body */
+    upper = *(unsigned char *)uplo == 'U';
+    if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *lda;
+	    for (i__ = *m + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * aa_dim1;
+		i__4 = i__ + j * as_dim1;
+		if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) {
+		    goto L70;
+		}
+/* L10: */
+	    }
+/* L20: */
+	}
+    } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__,
+	     "sy", (ftnlen)2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (upper) {
+		ibeg = 1;
+		iend = j;
+	    } else {
+		ibeg = j;
+		iend = *n;
+	    }
+	    i__2 = ibeg - 1;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * aa_dim1;
+		i__4 = i__ + j * as_dim1;
+		if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) {
+		    goto L70;
+		}
+/* L30: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = iend + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * aa_dim1;
+		i__4 = i__ + j * as_dim1;
+		if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) {
+		    goto L70;
+		}
+/* L40: */
+	    }
+/* L50: */
+	}
+    }
+
+/* L60: */
+    ret_val = TRUE_;
+    goto L80;
+L70:
+    ret_val = FALSE_;
+L80:
+    return ret_val;
+
+/*     End of LCERES. */
+
+} /* lceres_ */
+
+/* Complex */ VOID cbeg_(complex * ret_val, logical *reset)
+{
+    /* System generated locals */
+    real r__1, r__2;
+    complex q__1;
+
+    /* Local variables */
+    static integer i__, j, ic, mi, mj;
+
+
+/*  Generates complex numbers as pairs of random numbers uniformly */
+/*  distributed between -0.5 and 0.5. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    if (*reset) {
+/*        Initialize local variables. */
+	mi = 891;
+	mj = 457;
+	i__ = 7;
+	j = 7;
+	ic = 0;
+	*reset = FALSE_;
+    }
+
+/*     The sequence of values of I or J is bounded between 1 and 999. */
+/*     If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */
+/*     If initial I or J = 4 or 8, the period will be 25. */
+/*     If initial I or J = 5, the period will be 10. */
+/*     IC is used to break up the period by skipping 1 value of I or J */
+/*     in 6. */
+
+    ++ic;
+L10:
+    i__ *= mi;
+    j *= mj;
+    i__ -= i__ / 1000 * 1000;
+    j -= j / 1000 * 1000;
+    if (ic >= 5) {
+	ic = 0;
+	goto L10;
+    }
+    r__1 = (i__ - 500) / 1001.f;
+    r__2 = (j - 500) / 1001.f;
+    q__1.r = r__1, q__1.i = r__2;
+     ret_val->r = q__1.r,  ret_val->i = q__1.i;
+    return ;
+
+/*     End of CBEG. */
+
+} /* cbeg_ */
+
+real sdiff_(real *x, real *y)
+{
+    /* System generated locals */
+    real ret_val;
+
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    ret_val = *x - *y;
+    return ret_val;
+
+/*     End of SDIFF. */
+
+} /* sdiff_ */
+
+/* Main program alias */ int cblat3_ () { MAIN__ (); return 0; }
diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c
new file mode 100644
index 000000000..059daccb5
--- /dev/null
+++ b/ctest/c_zblat3c_3m.c
@@ -0,0 +1,4897 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Common Block Declarations */
+
+struct {
+    integer infot, noutc;
+    logical ok, lerr;
+} infoc_;
+
+#define infoc_1 infoc_
+
+struct {
+    char srnamt[13];
+} srnamc_;
+
+#define srnamc_1 srnamc_
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {0.,0.};
+static doublecomplex c_b2 = {1.,0.};
+static integer c__9 = 9;
+static integer c__1 = 1;
+static integer c__3 = 3;
+static integer c__8 = 8;
+static integer c__5 = 5;
+static integer c__65 = 65;
+static integer c__7 = 7;
+static integer c__2 = 2;
+static doublereal c_b92 = 1.;
+static integer c__6 = 6;
+static logical c_true = TRUE_;
+static integer c__0 = 0;
+static logical c_false = FALSE_;
+
+/* Main program */ MAIN__(void)
+{
+    /* Initialized data */
+
+    static char snames[13*9] = "cblas_zgemm3m" "cblas_zhemm  " "cblas_zsymm  "
+	     "cblas_ztrmm  " "cblas_ztrsm  " "cblas_zherk  " "cblas_zsyrk  " 
+	    "cblas_zher2k " "cblas_zsyr2k ";
+
+    /* Format strings */
+    static char fmt_9997[] = "(\002 NUMBER OF VALUES OF \002,a,\002 IS LESS "
+	    "THAN 1 OR GREATER \002,\002THAN \002,i2)";
+    static char fmt_9996[] = "(\002 VALUE OF N IS LESS THAN 0 OR GREATER THA"
+	    "N \002,i2)";
+    static char fmt_9995[] = "(\002TESTS OF THE COMPLEX*16        LEVEL 3 BL"
+	    "AS\002,//\002 THE F\002,\002OLLOWING PARAMETER VALUES WILL BE US"
+	    "ED:\002)";
+    static char fmt_9994[] = "(\002   FOR N              \002,9i6)";
+    static char fmt_9993[] = "(\002   FOR ALPHA          \002,7(\002(\002,f4"
+	    ".1,\002,\002,f4.1,\002)  \002,:))";
+    static char fmt_9992[] = "(\002   FOR BETA           \002,7(\002(\002,f4"
+	    ".1,\002,\002,f4.1,\002)  \002,:))";
+    static char fmt_9984[] = "(\002 ERROR-CALL MYEXITS WILL NOT BE TESTED"
+	    "\002)";
+    static char fmt_9999[] = "(\002 ROUTINES PASS COMPUTATIONAL TESTS IF TES"
+	    "T RATIO IS LES\002,\002S THAN\002,f8.2)";
+    static char fmt_10002[] = "(\002 COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS"
+	    " ARE TESTED\002)";
+    static char fmt_10001[] = "(\002 ROW-MAJOR DATA LAYOUT IS TESTED\002)";
+    static char fmt_10000[] = "(\002 COLUMN-MAJOR DATA LAYOUT IS TESTED\002)";
+    static char fmt_9988[] = "(a13,l2)";
+    static char fmt_9990[] = "(\002 SUBPROGRAM NAME \002,a13,\002 NOT RECOGN"
+	    "IZED\002,/\002 ******* T\002,\002ESTS ABANDONED *******\002)";
+    static char fmt_9998[] = "(\002 RELATIVE MACHINE PRECISION IS TAKEN TO"
+	    " BE\002,1p,e9.1)";
+    static char fmt_9989[] = "(\002 ERROR IN ZMMCH -  IN-LINE DOT PRODUCTS A"
+	    "RE BEING EVALU\002,\002ATED WRONGLY.\002,/\002 ZMMCH WAS CALLED "
+	    "WITH TRANSA = \002,a1,\002AND TRANSB = \002,a1,/\002 AND RETURNE"
+	    "D SAME = \002,l1,\002 AND \002,\002 ERR = \002,f12.3,\002.\002,"
+	    "/\002 THIS MAY BE DUE TO FAULTS IN THE \002,\002ARITHMETIC OR TH"
+	    "E COMPILER.\002,/\002 ******* TESTS ABANDONED \002,\002******"
+	    "*\002)";
+    static char fmt_9987[] = "(1x,a13,\002 WAS NOT TESTED\002)";
+    static char fmt_9986[] = "(/\002 END OF TESTS\002)";
+    static char fmt_9985[] = "(/\002 ******* FATAL ERROR - TESTS ABANDONED *"
+	    "******\002)";
+    static char fmt_9991[] = "(\002 AMEND DATA FILE OR INCREASE ARRAY SIZES "
+	    "IN PROGRAM\002,/\002 ******* TESTS ABANDONED *******\002)";
+
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4, i__5;
+    doublereal d__1;
+    olist o__1;
+    cllist cl__1;
+
+    /* Local variables */
+    integer nalf, idim[9];
+    logical same;
+    integer nbet, ntra;
+    logical rewi;
+    extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, 
+	    integer *, integer *, logical *, logical *, logical *, integer *, 
+	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
+	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
+	     integer *), zchk2_(char *, doublereal *, doublereal *, 
+	    integer *, integer *, logical *, logical *, logical *, integer *, 
+	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
+	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
+	     integer *), zchk3_(char *, doublereal *, doublereal *, 
+	    integer *, integer *, logical *, logical *, logical *, integer *, 
+	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex 
+	    *, doublecomplex *, doublecomplex *, doublereal *, doublecomplex *
+	    , integer *), zchk4_(char *, doublereal *, doublereal *, 
+	    integer *, integer *, logical *, logical *, logical *, integer *, 
+	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
+	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
+	     integer *), zchk5_(char *, doublereal *, doublereal *, 
+	    integer *, integer *, logical *, logical *, logical *, integer *, 
+	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
+	    , doublecomplex *, doublecomplex *, doublereal *, doublecomplex *,
+	     integer *);
+    doublecomplex c__[4225]	/* was [65][65] */;
+    doublereal g[65];
+    integer i__, j;
+    extern doublereal ddiff_(doublereal *, doublereal *);
+    integer n;
+    logical fatal;
+    doublecomplex w[130];
+    logical trace;
+    integer nidim;
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
+	    integer *, doublereal *, doublereal *, logical *, integer *, 
+	    logical *);
+    char snaps[32];
+    integer isnum;
+    logical ltest[9];
+    doublecomplex aa[4225], ab[8450]	/* was [65][130] */, bb[4225], cc[
+	    4225], as[4225], bs[4225], cs[4225], ct[65];
+    logical sfatal, corder;
+    char snamet[13], transa[1], transb[1];
+    doublereal thresh;
+    logical rorder;
+    integer layout;
+    logical ltestt, tsterr;
+    extern /* Subroutine */ int cz3chke_(char *);
+    doublecomplex alf[7], bet[7];
+    doublereal eps, err;
+    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
+
+    /* Fortran I/O blocks */
+    static cilist io___2 = { 0, 5, 0, 0, 0 };
+    static cilist io___4 = { 0, 5, 0, 0, 0 };
+    static cilist io___7 = { 0, 5, 0, 0, 0 };
+    static cilist io___9 = { 0, 5, 0, 0, 0 };
+    static cilist io___11 = { 0, 5, 0, 0, 0 };
+    static cilist io___13 = { 0, 5, 0, 0, 0 };
+    static cilist io___15 = { 0, 5, 0, 0, 0 };
+    static cilist io___17 = { 0, 5, 0, 0, 0 };
+    static cilist io___19 = { 0, 6, 0, fmt_9997, 0 };
+    static cilist io___20 = { 0, 5, 0, 0, 0 };
+    static cilist io___23 = { 0, 6, 0, fmt_9996, 0 };
+    static cilist io___24 = { 0, 5, 0, 0, 0 };
+    static cilist io___26 = { 0, 6, 0, fmt_9997, 0 };
+    static cilist io___27 = { 0, 5, 0, 0, 0 };
+    static cilist io___29 = { 0, 5, 0, 0, 0 };
+    static cilist io___31 = { 0, 6, 0, fmt_9997, 0 };
+    static cilist io___32 = { 0, 5, 0, 0, 0 };
+    static cilist io___34 = { 0, 6, 0, fmt_9995, 0 };
+    static cilist io___35 = { 0, 6, 0, fmt_9994, 0 };
+    static cilist io___36 = { 0, 6, 0, fmt_9993, 0 };
+    static cilist io___37 = { 0, 6, 0, fmt_9992, 0 };
+    static cilist io___38 = { 0, 6, 0, 0, 0 };
+    static cilist io___39 = { 0, 6, 0, fmt_9984, 0 };
+    static cilist io___40 = { 0, 6, 0, 0, 0 };
+    static cilist io___41 = { 0, 6, 0, fmt_9999, 0 };
+    static cilist io___42 = { 0, 6, 0, 0, 0 };
+    static cilist io___45 = { 0, 6, 0, fmt_10002, 0 };
+    static cilist io___46 = { 0, 6, 0, fmt_10001, 0 };
+    static cilist io___47 = { 0, 6, 0, fmt_10000, 0 };
+    static cilist io___48 = { 0, 6, 0, 0, 0 };
+    static cilist io___50 = { 0, 5, 1, fmt_9988, 0 };
+    static cilist io___53 = { 0, 6, 0, fmt_9990, 0 };
+    static cilist io___55 = { 0, 6, 0, fmt_9998, 0 };
+    static cilist io___68 = { 0, 6, 0, fmt_9989, 0 };
+    static cilist io___69 = { 0, 6, 0, fmt_9989, 0 };
+    static cilist io___70 = { 0, 6, 0, fmt_9989, 0 };
+    static cilist io___71 = { 0, 6, 0, fmt_9989, 0 };
+    static cilist io___73 = { 0, 6, 0, 0, 0 };
+    static cilist io___74 = { 0, 6, 0, fmt_9987, 0 };
+    static cilist io___75 = { 0, 6, 0, 0, 0 };
+    static cilist io___82 = { 0, 6, 0, fmt_9986, 0 };
+    static cilist io___83 = { 0, 6, 0, fmt_9985, 0 };
+    static cilist io___84 = { 0, 6, 0, fmt_9991, 0 };
+
+
+
+/*  Test program for the COMPLEX*16          Level 3 Blas. */
+
+/*  The program must be driven by a short data file. The first 13 records */
+/*  of the file are read using list-directed input, the last 9 records */
+/*  are read using the format ( A13,L2 ). An annotated example of a data */
+/*  file can be obtained by deleting the first 3 characters from the */
+/*  following 22 lines: */
+/*  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE */
+/*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */
+/*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */
+/*  F        LOGICAL FLAG, T TO STOP ON FAILURES. */
+/*  T        LOGICAL FLAG, T TO TEST ERROR CALL MYEXITS. */
+/*  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */
+/*  16.0     THRESHOLD VALUE OF TEST RATIO */
+/*  6                 NUMBER OF VALUES OF N */
+/*  0 1 2 3 5 9       VALUES OF N */
+/*  3                 NUMBER OF VALUES OF ALPHA */
+/*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA */
+/*  3                 NUMBER OF VALUES OF BETA */
+/*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA */
+/*  ZGEMM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZHEMM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZSYMM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZTRMM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZTRSM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZHERK  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZSYRK  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. */
+
+/*  See: */
+
+/*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. */
+/*     A Set of Level 3 Basic Linear Algebra Subprograms. */
+
+/*     Technical Memorandum No.88 (Revision 1), Mathematics and */
+/*     Computer Science Division, Argonne National Laboratory, 9700 */
+/*     South Cass Avenue, Argonne, Illinois 60439, US. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+
+    infoc_1.noutc = 6;
+
+/*     Read name and unit number for snapshot output file and open file. */
+
+    s_rsle(&io___2);
+    do_lio(&c__9, &c__1, snaps, (ftnlen)32);
+    e_rsle();
+    s_rsle(&io___4);
+    do_lio(&c__3, &c__1, (char *)&ntra, (ftnlen)sizeof(integer));
+    e_rsle();
+    trace = ntra >= 0;
+    if (trace) {
+	o__1.oerr = 0;
+	o__1.ounit = ntra;
+	o__1.ofnmlen = 32;
+	o__1.ofnm = snaps;
+	o__1.orl = 0;
+	o__1.osta = "NEW";
+	o__1.oacc = 0;
+	o__1.ofm = 0;
+	o__1.oblnk = 0;
+	f_open(&o__1);
+    }
+/*     Read the flag that directs rewinding of the snapshot file. */
+    s_rsle(&io___7);
+    do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical));
+    e_rsle();
+    rewi = rewi && trace;
+/*     Read the flag that directs stopping on any failure. */
+    s_rsle(&io___9);
+    do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical));
+    e_rsle();
+/*     Read the flag that indicates whether error exits are to be tested. */
+    s_rsle(&io___11);
+    do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical));
+    e_rsle();
+/*     Read the flag that indicates whether row-major data layout to be tested. */
+    s_rsle(&io___13);
+    do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer));
+    e_rsle();
+/*     Read the threshold value of the test ratio */
+    s_rsle(&io___15);
+    do_lio(&c__5, &c__1, (char *)&thresh, (ftnlen)sizeof(doublereal));
+    e_rsle();
+
+/*     Read and check the parameter values for the tests. */
+
+/*     Values of N */
+    s_rsle(&io___17);
+    do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer));
+    e_rsle();
+    if (nidim < 1 || nidim > 9) {
+	s_wsfe(&io___19);
+	do_fio(&c__1, "N", (ftnlen)1);
+	do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer));
+	e_wsfe();
+	goto L220;
+    }
+    s_rsle(&io___20);
+    i__1 = nidim;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer));
+    }
+    e_rsle();
+    i__1 = nidim;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) {
+	    s_wsfe(&io___23);
+	    do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	    goto L220;
+	}
+/* L10: */
+    }
+/*     Values of ALPHA */
+    s_rsle(&io___24);
+    do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer));
+    e_rsle();
+    if (nalf < 1 || nalf > 7) {
+	s_wsfe(&io___26);
+	do_fio(&c__1, "ALPHA", (ftnlen)5);
+	do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer));
+	e_wsfe();
+	goto L220;
+    }
+    s_rsle(&io___27);
+    i__1 = nalf;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_lio(&c__7, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(
+		doublecomplex));
+    }
+    e_rsle();
+/*     Values of BETA */
+    s_rsle(&io___29);
+    do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer));
+    e_rsle();
+    if (nbet < 1 || nbet > 7) {
+	s_wsfe(&io___31);
+	do_fio(&c__1, "BETA", (ftnlen)4);
+	do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer));
+	e_wsfe();
+	goto L220;
+    }
+    s_rsle(&io___32);
+    i__1 = nbet;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_lio(&c__7, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof(
+		doublecomplex));
+    }
+    e_rsle();
+
+/*     Report values of parameters. */
+
+    s_wsfe(&io___34);
+    e_wsfe();
+    s_wsfe(&io___35);
+    i__1 = nidim;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer));
+    }
+    e_wsfe();
+    s_wsfe(&io___36);
+    i__1 = nalf;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(doublereal));
+    }
+    e_wsfe();
+    s_wsfe(&io___37);
+    i__1 = nbet;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(doublereal));
+    }
+    e_wsfe();
+    if (! tsterr) {
+	s_wsle(&io___38);
+	e_wsle();
+	s_wsfe(&io___39);
+	e_wsfe();
+    }
+    s_wsle(&io___40);
+    e_wsle();
+    s_wsfe(&io___41);
+    do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(doublereal));
+    e_wsfe();
+    s_wsle(&io___42);
+    e_wsle();
+    rorder = FALSE_;
+    corder = FALSE_;
+    if (layout == 2) {
+	rorder = TRUE_;
+	corder = TRUE_;
+	s_wsfe(&io___45);
+	e_wsfe();
+    } else if (layout == 1) {
+	rorder = TRUE_;
+	s_wsfe(&io___46);
+	e_wsfe();
+    } else if (layout == 0) {
+	corder = TRUE_;
+	s_wsfe(&io___47);
+	e_wsfe();
+    }
+    s_wsle(&io___48);
+    e_wsle();
+
+/*     Read names of subroutines and flags which indicate */
+/*     whether they are to be tested. */
+
+    for (i__ = 1; i__ <= 9; ++i__) {
+	ltest[i__ - 1] = FALSE_;
+/* L20: */
+    }
+L30:
+    i__1 = s_rsfe(&io___50);
+    if (i__1 != 0) {
+	goto L60;
+    }
+    i__1 = do_fio(&c__1, snamet, (ftnlen)13);
+    if (i__1 != 0) {
+	goto L60;
+    }
+    i__1 = do_fio(&c__1, (char *)&ltestt, (ftnlen)sizeof(logical));
+    if (i__1 != 0) {
+	goto L60;
+    }
+    i__1 = e_rsfe();
+    if (i__1 != 0) {
+	goto L60;
+    }
+    for (i__ = 1; i__ <= 9; ++i__) {
+	if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == 
+		0) {
+	    goto L50;
+	}
+/* L40: */
+    }
+    s_wsfe(&io___53);
+    do_fio(&c__1, snamet, (ftnlen)13);
+    e_wsfe();
+    s_stop("", (ftnlen)0);
+L50:
+    ltest[i__ - 1] = ltestt;
+    goto L30;
+
+L60:
+    cl__1.cerr = 0;
+    cl__1.cunit = 5;
+    cl__1.csta = 0;
+    f_clos(&cl__1);
+
+/*     Compute EPS (the machine precision). */
+
+    eps = 1.;
+L70:
+    d__1 = eps + 1.;
+    if (ddiff_(&d__1, &c_b92) == 0.) {
+	goto L80;
+    }
+    eps *= .5;
+    goto L70;
+L80:
+    eps += eps;
+    s_wsfe(&io___55);
+    do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(doublereal));
+    e_wsfe();
+
+/*     Check the reliability of ZMMCH using exact data. */
+
+    n = 32;
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = n;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__ + j * 65 - 66;
+/* Computing MAX */
+	    i__5 = i__ - j + 1;
+	    i__4 = f2cmax(i__5,0);
+	    ab[i__3].r = (doublereal) i__4, ab[i__3].i = 0.;
+/* L90: */
+	}
+	i__2 = j + 4224;
+	ab[i__2].r = (doublereal) j, ab[i__2].i = 0.;
+	i__2 = (j + 65) * 65 - 65;
+	ab[i__2].r = (doublereal) j, ab[i__2].i = 0.;
+	i__2 = j - 1;
+	c__[i__2].r = 0., c__[i__2].i = 0.;
+/* L100: */
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = j - 1;
+	i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3;
+	cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.;
+/* L110: */
+    }
+/*     CC holds the exact result. On exit from ZMMCH CT holds */
+/*     the result computed by ZMMCH. */
+    *(unsigned char *)transa = 'N';
+    *(unsigned char *)transb = 'N';
+    zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lze_(cc, ct, &n);
+    if (! same || err != 0.) {
+	s_wsfe(&io___68);
+	do_fio(&c__1, transa, (ftnlen)1);
+	do_fio(&c__1, transb, (ftnlen)1);
+	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
+	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal));
+	e_wsfe();
+	s_stop("", (ftnlen)0);
+    }
+    *(unsigned char *)transb = 'C';
+    zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lze_(cc, ct, &n);
+    if (! same || err != 0.) {
+	s_wsfe(&io___69);
+	do_fio(&c__1, transa, (ftnlen)1);
+	do_fio(&c__1, transb, (ftnlen)1);
+	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
+	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal));
+	e_wsfe();
+	s_stop("", (ftnlen)0);
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = j + 4224;
+	i__3 = n - j + 1;
+	ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.;
+	i__2 = (j + 65) * 65 - 65;
+	i__3 = n - j + 1;
+	ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.;
+/* L120: */
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = n - j;
+	i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3;
+	cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.;
+/* L130: */
+    }
+    *(unsigned char *)transa = 'C';
+    *(unsigned char *)transb = 'N';
+    zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lze_(cc, ct, &n);
+    if (! same || err != 0.) {
+	s_wsfe(&io___70);
+	do_fio(&c__1, transa, (ftnlen)1);
+	do_fio(&c__1, transb, (ftnlen)1);
+	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
+	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal));
+	e_wsfe();
+	s_stop("", (ftnlen)0);
+    }
+    *(unsigned char *)transb = 'C';
+    zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lze_(cc, ct, &n);
+    if (! same || err != 0.) {
+	s_wsfe(&io___71);
+	do_fio(&c__1, transa, (ftnlen)1);
+	do_fio(&c__1, transb, (ftnlen)1);
+	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
+	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal));
+	e_wsfe();
+	s_stop("", (ftnlen)0);
+    }
+
+/*     Test each subroutine in turn. */
+
+    for (isnum = 1; isnum <= 9; ++isnum) {
+	s_wsle(&io___73);
+	e_wsle();
+	if (! ltest[isnum - 1]) {
+/*           Subprogram is not to be tested. */
+	    s_wsfe(&io___74);
+	    do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13);
+	    e_wsfe();
+	} else {
+	    s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, (
+		    ftnlen)13);
+/*           Test error exits. */
+	    if (tsterr) {
+		cz3chke_(snames + (isnum - 1) * 13);
+		s_wsle(&io___75);
+		e_wsle();
+	    }
+/*           Test computations. */
+	    infoc_1.infot = 0;
+	    infoc_1.ok = TRUE_;
+	    fatal = FALSE_;
+	    switch (isnum) {
+		case 1:  goto L140;
+		case 2:  goto L150;
+		case 3:  goto L150;
+		case 4:  goto L160;
+		case 5:  goto L160;
+		case 6:  goto L170;
+		case 7:  goto L170;
+		case 8:  goto L180;
+		case 9:  goto L180;
+	    }
+/*           Test ZGEMM, 01. */
+L140:
+	    if (corder) {
+		zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test ZHEMM, 02, ZSYMM, 03. */
+L150:
+	    if (corder) {
+		zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test ZTRMM, 04, ZTRSM, 05. */
+L160:
+	    if (corder) {
+		zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
+			c__0);
+	    }
+	    if (rorder) {
+		zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
+			c__1);
+	    }
+	    goto L190;
+/*           Test ZHERK, 06, ZSYRK, 07. */
+L170:
+	    if (corder) {
+		zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test ZHER2K, 08, ZSYR2K, 09. */
+L180:
+	    if (corder) {
+		zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
+			ct, g, w, &c__0);
+	    }
+	    if (rorder) {
+		zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
+			ct, g, w, &c__1);
+	    }
+	    goto L190;
+
+L190:
+	    if (fatal && sfatal) {
+		goto L210;
+	    }
+	}
+/* L200: */
+    }
+    s_wsfe(&io___82);
+    e_wsfe();
+    goto L230;
+
+L210:
+    s_wsfe(&io___83);
+    e_wsfe();
+    goto L230;
+
+L220:
+    s_wsfe(&io___84);
+    e_wsfe();
+
+L230:
+    if (trace) {
+	cl__1.cerr = 0;
+	cl__1.cunit = ntra;
+	cl__1.csta = 0;
+	f_clos(&cl__1);
+    }
+    cl__1.cerr = 0;
+    cl__1.cunit = 6;
+    cl__1.csta = 0;
+    f_clos(&cl__1);
+    s_stop("", (ftnlen)0);
+
+
+/*     End of ZBLAT3. */
+
+    return 0;
+} /* MAIN__ */
+
+/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, 
+	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
+	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
+	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
+	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
+	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
+	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
+	g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char ich[3] = "NTC";
+
+    /* Format strings */
+    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7, i__8;
+    alist al__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int czgemm3m_(integer *, char *, char *, integer *
+	    , integer *, integer *, doublecomplex *, doublecomplex *, integer 
+	    *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *);
+    doublecomplex beta;
+    integer ldas, ldbs, ldcs;
+    logical same, null;
+    integer i__, k, m, n;
+    doublecomplex alpha;
+    logical isame[13], trana, tranb;
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     logical *, doublecomplex *);
+    integer nargs;
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
+	    integer *, doublereal *, doublereal *, logical *, integer *, 
+	    logical *);
+    logical reset;
+    integer ia, ib;
+    extern /* Subroutine */ int zprcn1_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, integer *, doublecomplex 
+	    *, integer *, integer *, doublecomplex *, integer *);
+    integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
+    char tranas[1], tranbs[1], transa[1], transb[1];
+    doublereal errmax;
+    extern logical lzeres_(char *, char *, integer *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *);
+    integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
+    doublecomplex als, bls;
+    doublereal err;
+    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
+
+    /* Fortran I/O blocks */
+    static cilist io___128 = { 0, 0, 0, fmt_9994, 0 };
+    static cilist io___131 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___133 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___134 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___135 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___136 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___137 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests ZGEMM. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+
+    nargs = 13;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDC to 1 more than minimum value if room. */
+	    ldc = m;
+	    if (ldc < *nmax) {
+		++ldc;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldc > *nmax) {
+		goto L100;
+	    }
+	    lcc = ldc * n;
+	    null = n <= 0 || m <= 0;
+
+	    i__3 = *nidim;
+	    for (ik = 1; ik <= i__3; ++ik) {
+		k = idim[ik];
+
+		for (ica = 1; ica <= 3; ++ica) {
+		    *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1]
+			    ;
+		    trana = *(unsigned char *)transa == 'T' || *(unsigned 
+			    char *)transa == 'C';
+
+		    if (trana) {
+			ma = k;
+			na = m;
+		    } else {
+			ma = m;
+			na = k;
+		    }
+/*                 Set LDA to 1 more than minimum value if room. */
+		    lda = ma;
+		    if (lda < *nmax) {
+			++lda;
+		    }
+/*                 Skip tests if not enough room. */
+		    if (lda > *nmax) {
+			goto L80;
+		    }
+		    laa = lda * na;
+
+/*                 Generate the matrix A. */
+
+		    zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[
+			    1], &lda, &reset, &c_b1);
+
+		    for (icb = 1; icb <= 3; ++icb) {
+			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
+				- 1];
+			tranb = *(unsigned char *)transb == 'T' || *(unsigned 
+				char *)transb == 'C';
+
+			if (tranb) {
+			    mb = n;
+			    nb = k;
+			} else {
+			    mb = k;
+			    nb = n;
+			}
+/*                    Set LDB to 1 more than minimum value if room. */
+			ldb = mb;
+			if (ldb < *nmax) {
+			    ++ldb;
+			}
+/*                    Skip tests if not enough room. */
+			if (ldb > *nmax) {
+			    goto L70;
+			}
+			lbb = ldb * nb;
+
+/*                    Generate the matrix B. */
+
+			zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, &
+				bb[1], &ldb, &reset, &c_b1);
+
+			i__4 = *nalf;
+			for (ia = 1; ia <= i__4; ++ia) {
+			    i__5 = ia;
+			    alpha.r = alf[i__5].r, alpha.i = alf[i__5].i;
+
+			    i__5 = *nbet;
+			    for (ib = 1; ib <= i__5; ++ib) {
+				i__6 = ib;
+				beta.r = bet[i__6].r, beta.i = bet[i__6].i;
+
+/*                          Generate the matrix C. */
+
+				zmake_("ge", " ", " ", &m, &n, &c__[c_offset],
+					 nmax, &cc[1], &ldc, &reset, &c_b1);
+
+				++nc;
+
+/*                          Save every datum before calling the */
+/*                          subroutine. */
+
+				*(unsigned char *)tranas = *(unsigned char *)
+					transa;
+				*(unsigned char *)tranbs = *(unsigned char *)
+					transb;
+				ms = m;
+				ns = n;
+				ks = k;
+				als.r = alpha.r, als.i = alpha.i;
+				i__6 = laa;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    as[i__7].r = aa[i__8].r, as[i__7].i = aa[
+					    i__8].i;
+/* L10: */
+				}
+				ldas = lda;
+				i__6 = lbb;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[
+					    i__8].i;
+/* L20: */
+				}
+				ldbs = ldb;
+				bls.r = beta.r, bls.i = beta.i;
+				i__6 = lcc;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[
+					    i__8].i;
+/* L30: */
+				}
+				ldcs = ldc;
+
+/*                          Call the subroutine. */
+
+				if (*trace) {
+				    zprcn1_(ntra, &nc, sname, iorder, transa, 
+					    transb, &m, &n, &k, &alpha, &lda, 
+					    &ldb, &beta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				czgemm3m_(iorder, transa, transb, &m, &n, &k, 
+					&alpha, &aa[1], &lda, &bb[1], &ldb, &
+					beta, &cc[1], &ldc);
+
+/*                          Check if error-exit was taken incorrectly. */
+
+				if (! infoc_1.ok) {
+				    io___128.ciunit = *nout;
+				    s_wsfe(&io___128);
+				    e_wsfe();
+				    *fatal = TRUE_;
+				    goto L120;
+				}
+
+/*                          See what data changed inside subroutines. */
+
+				isame[0] = *(unsigned char *)transa == *(
+					unsigned char *)tranas;
+				isame[1] = *(unsigned char *)transb == *(
+					unsigned char *)tranbs;
+				isame[2] = ms == m;
+				isame[3] = ns == n;
+				isame[4] = ks == k;
+				isame[5] = als.r == alpha.r && als.i == 
+					alpha.i;
+				isame[6] = lze_(&as[1], &aa[1], &laa);
+				isame[7] = ldas == lda;
+				isame[8] = lze_(&bs[1], &bb[1], &lbb);
+				isame[9] = ldbs == ldb;
+				isame[10] = bls.r == beta.r && bls.i == 
+					beta.i;
+				if (null) {
+				    isame[11] = lze_(&cs[1], &cc[1], &lcc);
+				} else {
+				    isame[11] = lzeres_("ge", " ", &m, &n, &
+					    cs[1], &cc[1], &ldc);
+				}
+				isame[12] = ldcs == ldc;
+
+/*                          If data was incorrectly changed, report */
+/*                          and return. */
+
+				same = TRUE_;
+				i__6 = nargs;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    same = same && isame[i__ - 1];
+				    if (! isame[i__ - 1]) {
+					io___131.ciunit = *nout;
+					s_wsfe(&io___131);
+					do_fio(&c__1, (char *)&i__, (ftnlen)
+						sizeof(integer));
+					e_wsfe();
+				    }
+/* L40: */
+				}
+				if (! same) {
+				    *fatal = TRUE_;
+				    goto L120;
+				}
+
+				if (! null) {
+
+/*                             Check the result. */
+
+				    zmmch_(transa, transb, &m, &n, &k, &alpha,
+					     &a[a_offset], nmax, &b[b_offset],
+					     nmax, &beta, &c__[c_offset], 
+					    nmax, &ct[1], &g[1], &cc[1], &ldc,
+					     eps, &err, fatal, nout, &c_true);
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L120;
+				    }
+				}
+
+/* L50: */
+			    }
+
+/* L60: */
+			}
+
+L70:
+			;
+		    }
+
+L80:
+		    ;
+		}
+
+/* L90: */
+	    }
+
+L100:
+	    ;
+	}
+
+/* L110: */
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___133.ciunit = *nout;
+	    s_wsfe(&io___133);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___134.ciunit = *nout;
+	    s_wsfe(&io___134);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___135.ciunit = *nout;
+	    s_wsfe(&io___135);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___136.ciunit = *nout;
+	    s_wsfe(&io___136);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+    }
+    goto L130;
+
+L120:
+    io___137.ciunit = *nout;
+    s_wsfe(&io___137);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, &
+	    lda, &ldb, &beta, &ldc);
+
+L130:
+    return 0;
+
+/* L9995: */
+
+/*     End of ZCHK1. */
+
+} /* zchk1_ */
+
+
+/* Subroutine */ int zprcn1_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *transa, char *transb, integer *m, integer *n, integer *
+	k, doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *
+	beta, integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002,"
+	    "a14,\002,\002,a14,\002,\002)";
+    static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002"
+	    ",\002,f4.1,\002) , C,\002,i3,\002).\002)";
+
+    /* Local variables */
+    char crc[14], cta[14], ctb[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___141 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___142 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(cta, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(cta, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transb == 'N') {
+	s_copy(ctb, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transb == 'T') {
+	s_copy(ctb, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___141.ciunit = *nout;
+    s_wsfe(&io___141);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cta, (ftnlen)14);
+    do_fio(&c__1, ctb, (ftnlen)14);
+    e_wsfe();
+    io___142.ciunit = *nout;
+    s_wsfe(&io___142);
+    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* zprcn1_ */
+
+
+/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, 
+	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
+	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
+	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
+	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
+	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
+	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
+	g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char ichs[2] = "LR";
+    static char ichu[2] = "UL";
+
+    /* Format strings */
+    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    alist al__1;
+
+    /* Local variables */
+    doublecomplex beta;
+    integer ldas, ldbs, ldcs;
+    logical same;
+    char side[1];
+    logical conj, left, null;
+    char uplo[1];
+    integer i__, m, n;
+    doublecomplex alpha;
+    logical isame[13];
+    char sides[1];
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     logical *, doublecomplex *);
+    integer nargs;
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
+	    integer *, doublereal *, doublereal *, logical *, integer *, 
+	    logical *);
+    logical reset;
+    char uplos[1];
+    integer ia, ib;
+    extern /* Subroutine */ int zprcn2_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, doublecomplex *, integer 
+	    *, integer *, doublecomplex *, integer *);
+    integer na, nc, im, in, ms, ns;
+    extern /* Subroutine */ int czhemm_(integer *, char *, char *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *);
+    doublereal errmax;
+    extern logical lzeres_(char *, char *, integer *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *);
+    extern /* Subroutine */ int czsymm_(integer *, char *, char *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *);
+    integer laa, lbb, lda, lcc, ldb, ldc, ics;
+    doublecomplex als, bls;
+    integer icu;
+    doublereal err;
+    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
+
+    /* Fortran I/O blocks */
+    static cilist io___181 = { 0, 0, 0, fmt_9994, 0 };
+    static cilist io___184 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___186 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___187 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___188 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___189 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___190 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests ZHEMM and ZSYMM. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 12;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDC to 1 more than minimum value if room. */
+	    ldc = m;
+	    if (ldc < *nmax) {
+		++ldc;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldc > *nmax) {
+		goto L90;
+	    }
+	    lcc = ldc * n;
+	    null = n <= 0 || m <= 0;
+/*           Set LDB to 1 more than minimum value if room. */
+	    ldb = m;
+	    if (ldb < *nmax) {
+		++ldb;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldb > *nmax) {
+		goto L90;
+	    }
+	    lbb = ldb * n;
+
+/*           Generate the matrix B. */
+
+	    zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &
+		    reset, &c_b1);
+
+	    for (ics = 1; ics <= 2; ++ics) {
+		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
+		left = *(unsigned char *)side == 'L';
+
+		if (left) {
+		    na = m;
+		} else {
+		    na = n;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = na;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L80;
+		}
+		laa = lda * na;
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+
+/*                 Generate the hermitian or symmetric matrix A. */
+
+		    zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax,
+			     &aa[1], &lda, &reset, &c_b1);
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+
+/*                       Generate the matrix C. */
+
+			    zmake_("ge", " ", " ", &m, &n, &c__[c_offset], 
+				    nmax, &cc[1], &ldc, &reset, &c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the */
+/*                       subroutine. */
+
+			    *(unsigned char *)sides = *(unsigned char *)side;
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    ms = m;
+			    ns = n;
+			    als.r = alpha.r, als.i = alpha.i;
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    i__5 = lbb;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldbs = ldb;
+			    bls.r = beta.r, bls.i = beta.i;
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L30: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (*trace) {
+				zprcn2_(ntra, &nc, sname, iorder, side, uplo, 
+					&m, &n, &alpha, &lda, &ldb, &beta, &
+					ldc)
+					;
+			    }
+			    if (*rewi) {
+				al__1.aerr = 0;
+				al__1.aunit = *ntra;
+				f_rew(&al__1);
+			    }
+			    if (conj) {
+				czhemm_(iorder, side, uplo, &m, &n, &alpha, &
+					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
+					1], &ldc);
+			    } else {
+				czsymm_(iorder, side, uplo, &m, &n, &alpha, &
+					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
+					1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				io___181.ciunit = *nout;
+				s_wsfe(&io___181);
+				e_wsfe();
+				*fatal = TRUE_;
+				goto L110;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)sides == *(unsigned 
+				    char *)side;
+			    isame[1] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[2] = ms == m;
+			    isame[3] = ns == n;
+			    isame[4] = als.r == alpha.r && als.i == alpha.i;
+			    isame[5] = lze_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    isame[7] = lze_(&bs[1], &bb[1], &lbb);
+			    isame[8] = ldbs == ldb;
+			    isame[9] = bls.r == beta.r && bls.i == beta.i;
+			    if (null) {
+				isame[10] = lze_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[10] = lzeres_("ge", " ", &m, &n, &cs[1],
+					 &cc[1], &ldc);
+			    }
+			    isame[11] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    io___184.ciunit = *nout;
+				    s_wsfe(&io___184);
+				    do_fio(&c__1, (char *)&i__, (ftnlen)
+					    sizeof(integer));
+				    e_wsfe();
+				}
+/* L40: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L110;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result. */
+
+				if (left) {
+				    zmmch_("N", "N", &m, &n, &m, &alpha, &a[
+					    a_offset], nmax, &b[b_offset], 
+					    nmax, &beta, &c__[c_offset], nmax,
+					     &ct[1], &g[1], &cc[1], &ldc, eps,
+					     &err, fatal, nout, &c_true);
+				} else {
+				    zmmch_("N", "N", &m, &n, &n, &alpha, &b[
+					    b_offset], nmax, &a[a_offset], 
+					    nmax, &beta, &c__[c_offset], nmax,
+					     &ct[1], &g[1], &cc[1], &ldc, eps,
+					     &err, fatal, nout, &c_true);
+				}
+				errmax = f2cmax(errmax,err);
+/*                          If got really bad answer, report and */
+/*                          return. */
+				if (*fatal) {
+				    goto L110;
+				}
+			    }
+
+/* L50: */
+			}
+
+/* L60: */
+		    }
+
+/* L70: */
+		}
+
+L80:
+		;
+	    }
+
+L90:
+	    ;
+	}
+
+/* L100: */
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___186.ciunit = *nout;
+	    s_wsfe(&io___186);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___187.ciunit = *nout;
+	    s_wsfe(&io___187);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___188.ciunit = *nout;
+	    s_wsfe(&io___188);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___189.ciunit = *nout;
+	    s_wsfe(&io___189);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+    }
+    goto L120;
+
+L110:
+    io___190.ciunit = *nout;
+    s_wsfe(&io___190);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, 
+	    &beta, &ldc);
+
+L120:
+    return 0;
+
+/* L9995: */
+
+/*     End of ZCHK2. */
+
+} /* zchk2_ */
+
+
+/* Subroutine */ int zprcn2_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *side, char *uplo, integer *m, integer *n, 
+	doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta,
+	 integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002,"
+	    "a14,\002,\002,a14,\002,\002)";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002,"
+	    "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)";
+
+    /* Local variables */
+    char cs[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___194 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___195 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)side == 'L') {
+	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cs, "    CblasRight", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___194.ciunit = *nout;
+    s_wsfe(&io___194);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cs, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    e_wsfe();
+    io___195.ciunit = *nout;
+    s_wsfe(&io___195);
+    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* zprcn2_ */
+
+
+/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, 
+	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
+	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
+	alf, integer *nmax, doublecomplex *a, doublecomplex *aa, 
+	doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex 
+	*bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, integer *
+	iorder)
+{
+    /* Initialized data */
+
+    static char ichu[2] = "UL";
+    static char icht[3] = "NTC";
+    static char ichd[2] = "UN";
+    static char ichs[2] = "LR";
+
+    /* Format strings */
+    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    doublecomplex z__1;
+    alist al__1;
+
+    /* Local variables */
+    char diag[1];
+    integer ldas, ldbs;
+    logical same;
+    char side[1];
+    logical left, null;
+    char uplo[1];
+    integer i__, j, m, n;
+    doublecomplex alpha;
+    char diags[1];
+    logical isame[13];
+    char sides[1];
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     logical *, doublecomplex *);
+    integer nargs;
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
+	    integer *, doublereal *, doublereal *, logical *, integer *, 
+	    logical *);
+    logical reset;
+    char uplos[1];
+    integer ia, na;
+    extern /* Subroutine */ int zprcn3_(integer *, integer *, char *, integer 
+	    *, char *, char *, char *, char *, integer *, integer *, 
+	    doublecomplex *, integer *, integer *);
+    integer nc, im, in, ms, ns;
+    char tranas[1], transa[1];
+    doublereal errmax;
+    extern logical lzeres_(char *, char *, integer *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *);
+    extern /* Subroutine */ int cztrmm_(integer *, char *, char *, char *, 
+	    char *, integer *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *), cztrsm_(integer *, char *, char *, char *, char *, 
+	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+	     doublecomplex *, integer *);
+    integer laa, icd, lbb, lda, ldb, ics;
+    doublecomplex als;
+    integer ict, icu;
+    doublereal err;
+    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
+
+    /* Fortran I/O blocks */
+    static cilist io___236 = { 0, 0, 0, fmt_9994, 0 };
+    static cilist io___239 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___241 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___242 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___243 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___244 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___245 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests ZTRMM and ZTRSM. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --g;
+    --ct;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+
+    nargs = 11;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+/*     Set up zero matrix for ZMMCH. */
+    i__1 = *nmax;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = *nmax;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__ + j * c_dim1;
+	    c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L10: */
+	}
+/* L20: */
+    }
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDB to 1 more than minimum value if room. */
+	    ldb = m;
+	    if (ldb < *nmax) {
+		++ldb;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldb > *nmax) {
+		goto L130;
+	    }
+	    lbb = ldb * n;
+	    null = m <= 0 || n <= 0;
+
+	    for (ics = 1; ics <= 2; ++ics) {
+		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
+		left = *(unsigned char *)side == 'L';
+		if (left) {
+		    na = m;
+		} else {
+		    na = n;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = na;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L130;
+		}
+		laa = lda * na;
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+
+		    for (ict = 1; ict <= 3; ++ict) {
+			*(unsigned char *)transa = *(unsigned char *)&icht[
+				ict - 1];
+
+			for (icd = 1; icd <= 2; ++icd) {
+			    *(unsigned char *)diag = *(unsigned char *)&ichd[
+				    icd - 1];
+
+			    i__3 = *nalf;
+			    for (ia = 1; ia <= i__3; ++ia) {
+				i__4 = ia;
+				alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+/*                          Generate the matrix A. */
+
+				zmake_("tr", uplo, diag, &na, &na, &a[
+					a_offset], nmax, &aa[1], &lda, &reset,
+					 &c_b1);
+
+/*                          Generate the matrix B. */
+
+				zmake_("ge", " ", " ", &m, &n, &b[b_offset], 
+					nmax, &bb[1], &ldb, &reset, &c_b1);
+
+				++nc;
+
+/*                          Save every datum before calling the */
+/*                          subroutine. */
+
+				*(unsigned char *)sides = *(unsigned char *)
+					side;
+				*(unsigned char *)uplos = *(unsigned char *)
+					uplo;
+				*(unsigned char *)tranas = *(unsigned char *)
+					transa;
+				*(unsigned char *)diags = *(unsigned char *)
+					diag;
+				ms = m;
+				ns = n;
+				als.r = alpha.r, als.i = alpha.i;
+				i__4 = laa;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    i__5 = i__;
+				    i__6 = i__;
+				    as[i__5].r = aa[i__6].r, as[i__5].i = aa[
+					    i__6].i;
+/* L30: */
+				}
+				ldas = lda;
+				i__4 = lbb;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    i__5 = i__;
+				    i__6 = i__;
+				    bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[
+					    i__6].i;
+/* L40: */
+				}
+				ldbs = ldb;
+
+/*                          Call the subroutine. */
+
+				if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen)
+					2) == 0) {
+				    if (*trace) {
+					zprcn3_(ntra, &nc, sname, iorder, 
+						side, uplo, transa, diag, &m, 
+						&n, &alpha, &lda, &ldb, (
+						ftnlen)13, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1);
+				    }
+				    if (*rewi) {
+					al__1.aerr = 0;
+					al__1.aunit = *ntra;
+					f_rew(&al__1);
+				    }
+				    cztrmm_(iorder, side, uplo, transa, diag, 
+					    &m, &n, &alpha, &aa[1], &lda, &bb[
+					    1], &ldb);
+				} else if (s_cmp(sname + 9, "sm", (ftnlen)2, (
+					ftnlen)2) == 0) {
+				    if (*trace) {
+					zprcn3_(ntra, &nc, sname, iorder, 
+						side, uplo, transa, diag, &m, 
+						&n, &alpha, &lda, &ldb, (
+						ftnlen)13, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1);
+				    }
+				    if (*rewi) {
+					al__1.aerr = 0;
+					al__1.aunit = *ntra;
+					f_rew(&al__1);
+				    }
+				    cztrsm_(iorder, side, uplo, transa, diag, 
+					    &m, &n, &alpha, &aa[1], &lda, &bb[
+					    1], &ldb);
+				}
+
+/*                          Check if error-exit was taken incorrectly. */
+
+				if (! infoc_1.ok) {
+				    io___236.ciunit = *nout;
+				    s_wsfe(&io___236);
+				    e_wsfe();
+				    *fatal = TRUE_;
+				    goto L150;
+				}
+
+/*                          See what data changed inside subroutines. */
+
+				isame[0] = *(unsigned char *)sides == *(
+					unsigned char *)side;
+				isame[1] = *(unsigned char *)uplos == *(
+					unsigned char *)uplo;
+				isame[2] = *(unsigned char *)tranas == *(
+					unsigned char *)transa;
+				isame[3] = *(unsigned char *)diags == *(
+					unsigned char *)diag;
+				isame[4] = ms == m;
+				isame[5] = ns == n;
+				isame[6] = als.r == alpha.r && als.i == 
+					alpha.i;
+				isame[7] = lze_(&as[1], &aa[1], &laa);
+				isame[8] = ldas == lda;
+				if (null) {
+				    isame[9] = lze_(&bs[1], &bb[1], &lbb);
+				} else {
+				    isame[9] = lzeres_("ge", " ", &m, &n, &bs[
+					    1], &bb[1], &ldb);
+				}
+				isame[10] = ldbs == ldb;
+
+/*                          If data was incorrectly changed, report and */
+/*                          return. */
+
+				same = TRUE_;
+				i__4 = nargs;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    same = same && isame[i__ - 1];
+				    if (! isame[i__ - 1]) {
+					io___239.ciunit = *nout;
+					s_wsfe(&io___239);
+					do_fio(&c__1, (char *)&i__, (ftnlen)
+						sizeof(integer));
+					e_wsfe();
+				    }
+/* L50: */
+				}
+				if (! same) {
+				    *fatal = TRUE_;
+				    goto L150;
+				}
+
+				if (! null) {
+				    if (s_cmp(sname + 9, "mm", (ftnlen)2, (
+					    ftnlen)2) == 0) {
+
+/*                                Check the result. */
+
+					if (left) {
+					    zmmch_(transa, "N", &m, &n, &m, &
+						    alpha, &a[a_offset], nmax,
+						     &b[b_offset], nmax, &
+						    c_b1, &c__[c_offset], 
+						    nmax, &ct[1], &g[1], &bb[
+						    1], &ldb, eps, &err, 
+						    fatal, nout, &c_true, (
+						    ftnlen)1, (ftnlen)1);
+					} else {
+					    zmmch_("N", transa, &m, &n, &n, &
+						    alpha, &b[b_offset], nmax,
+						     &a[a_offset], nmax, &
+						    c_b1, &c__[c_offset], 
+						    nmax, &ct[1], &g[1], &bb[
+						    1], &ldb, eps, &err, 
+						    fatal, nout, &c_true);
+					}
+				    } else if (s_cmp(sname + 9, "sm", (ftnlen)
+					    2, (ftnlen)2) == 0) {
+
+/*                                Compute approximation to original */
+/*                                matrix. */
+
+					i__4 = n;
+					for (j = 1; j <= i__4; ++j) {
+					    i__5 = m;
+					    for (i__ = 1; i__ <= i__5; ++i__) 
+						    {
+			  i__6 = i__ + j * c_dim1;
+			  i__7 = i__ + (j - 1) * ldb;
+			  c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i;
+			  i__6 = i__ + (j - 1) * ldb;
+			  i__7 = i__ + j * b_dim1;
+			  z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, 
+				  z__1.i = alpha.r * b[i__7].i + alpha.i * b[
+				  i__7].r;
+			  bb[i__6].r = z__1.r, bb[i__6].i = z__1.i;
+/* L60: */
+					    }
+/* L70: */
+					}
+
+					if (left) {
+					    zmmch_(transa, "N", &m, &n, &m, &
+						    c_b2, &a[a_offset], nmax, 
+						    &c__[c_offset], nmax, &
+						    c_b1, &b[b_offset], nmax, 
+						    &ct[1], &g[1], &bb[1], &
+						    ldb, eps, &err, fatal, 
+						    nout, &c_false);
+					} else {
+					    zmmch_("N", transa, &m, &n, &n, &
+						    c_b2, &c__[c_offset], 
+						    nmax, &a[a_offset], nmax, 
+						    &c_b1, &b[b_offset], nmax,
+						     &ct[1], &g[1], &bb[1], &
+						    ldb, eps, &err, fatal, 
+						    nout, &c_false);
+					}
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L150;
+				    }
+				}
+
+/* L80: */
+			    }
+
+/* L90: */
+			}
+
+/* L100: */
+		    }
+
+/* L110: */
+		}
+
+/* L120: */
+	    }
+
+L130:
+	    ;
+	}
+
+/* L140: */
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___241.ciunit = *nout;
+	    s_wsfe(&io___241);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___242.ciunit = *nout;
+	    s_wsfe(&io___242);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___243.ciunit = *nout;
+	    s_wsfe(&io___243);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___244.ciunit = *nout;
+	    s_wsfe(&io___244);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+    }
+    goto L160;
+
+L150:
+    io___245.ciunit = *nout;
+    s_wsfe(&io___245);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    if (*trace) {
+	zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &
+		alpha, &lda, &ldb);
+    }
+
+L160:
+    return 0;
+
+/* L9995: */
+
+/*     End of ZCHK3. */
+
+} /* zchk3_ */
+
+
+/* Subroutine */ int zprcn3_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *side, char *uplo, char *transa, char *diag, integer *m,
+	 integer *n, doublecomplex *alpha, integer *lda, integer *ldb)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002,"
+	    "a14,\002,\002,a14,\002,\002)";
+    static char fmt_9994[] = "(10x,2(a15,\002,\002),2(i3,\002,\002),\002 "
+	    "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)."
+	    "\002)";
+
+    /* Local variables */
+    char ca[14], cd[14], cs[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___251 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___252 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)side == 'L') {
+	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cs, "    CblasRight", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)diag == 'N') {
+	s_copy(cd, "  CblasNonUnit", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cd, "     CblasUnit", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___251.ciunit = *nout;
+    s_wsfe(&io___251);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cs, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    e_wsfe();
+    io___252.ciunit = *nout;
+    s_wsfe(&io___252);
+    do_fio(&c__1, ca, (ftnlen)14);
+    do_fio(&c__1, cd, (ftnlen)14);
+    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* zprcn3_ */
+
+
+/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, 
+	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
+	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
+	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
+	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
+	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
+	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
+	g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char icht[2] = "NC";
+    static char ichu[2] = "UL";
+
+    /* Format strings */
+    static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9995[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
+	    " \002,i3)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    doublecomplex z__1;
+    alist al__1;
+
+    /* Local variables */
+    doublecomplex beta;
+    integer ldas, ldcs;
+    logical same, conj;
+    doublecomplex bets;
+    doublereal rals;
+    logical tran, null;
+    char uplo[1];
+    integer i__, j, k, n;
+    doublecomplex alpha;
+    doublereal rbeta;
+    logical isame[13];
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     logical *, doublecomplex *);
+    integer nargs;
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
+	    integer *, doublereal *, doublereal *, logical *, integer *, 
+	    logical *);
+    doublereal rbets;
+    logical reset;
+    char trans[1];
+    logical upper;
+    char uplos[1];
+    integer ia, ib, jc, ma, na;
+    extern /* Subroutine */ int zprcn4_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, doublecomplex *, integer 
+	    *, doublecomplex *, integer *);
+    integer nc;
+    extern /* Subroutine */ int zprcn6_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, doublereal *, integer *, 
+	    doublereal *, integer *);
+    integer ik, in, jj, lj, ks, ns;
+    doublereal ralpha;
+    extern /* Subroutine */ int czherk_(integer *, char *, char *, integer *, 
+	    integer *, doublereal *, doublecomplex *, integer *, doublereal *,
+	     doublecomplex *, integer *);
+    doublereal errmax;
+    extern logical lzeres_(char *, char *, integer *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *);
+    char transs[1], transt[1];
+    extern /* Subroutine */ int czsyrk_(integer *, char *, char *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *);
+    integer laa, lda, lcc, ldc;
+    doublecomplex als;
+    integer ict, icu;
+    doublereal err;
+    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
+
+    /* Fortran I/O blocks */
+    static cilist io___294 = { 0, 0, 0, fmt_9992, 0 };
+    static cilist io___297 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___304 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___305 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___306 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___307 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___308 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___309 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests ZHERK and ZSYRK. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 10;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+
+    i__1 = *nidim;
+    for (in = 1; in <= i__1; ++in) {
+	n = idim[in];
+/*        Set LDC to 1 more than minimum value if room. */
+	ldc = n;
+	if (ldc < *nmax) {
+	    ++ldc;
+	}
+/*        Skip tests if not enough room. */
+	if (ldc > *nmax) {
+	    goto L100;
+	}
+	lcc = ldc * n;
+
+	i__2 = *nidim;
+	for (ik = 1; ik <= i__2; ++ik) {
+	    k = idim[ik];
+
+	    for (ict = 1; ict <= 2; ++ict) {
+		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
+		tran = *(unsigned char *)trans == 'C';
+		if (tran && ! conj) {
+		    *(unsigned char *)trans = 'T';
+		}
+		if (tran) {
+		    ma = k;
+		    na = n;
+		} else {
+		    ma = n;
+		    na = k;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = ma;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L80;
+		}
+		laa = lda * na;
+
+/*              Generate the matrix A. */
+
+		zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], &
+			lda, &reset, &c_b1);
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+		    upper = *(unsigned char *)uplo == 'U';
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+			if (conj) {
+			    ralpha = alpha.r;
+			    z__1.r = ralpha, z__1.i = 0.;
+			    alpha.r = z__1.r, alpha.i = z__1.i;
+			}
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+			    if (conj) {
+				rbeta = beta.r;
+				z__1.r = rbeta, z__1.i = 0.;
+				beta.r = z__1.r, beta.i = z__1.i;
+			    }
+			    null = n <= 0;
+			    if (conj) {
+				null = null || (k <= 0 || ralpha == 0.) && 
+					rbeta == 1.;
+			    }
+
+/*                       Generate the matrix C. */
+
+			    zmake_(sname + 7, uplo, " ", &n, &n, &c__[
+				    c_offset], nmax, &cc[1], &ldc, &reset, &
+				    c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the subroutine. */
+
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    *(unsigned char *)transs = *(unsigned char *)
+				    trans;
+			    ns = n;
+			    ks = k;
+			    if (conj) {
+				rals = ralpha;
+			    } else {
+				als.r = alpha.r, als.i = alpha.i;
+			    }
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    if (conj) {
+				rbets = rbeta;
+			    } else {
+				bets.r = beta.r, bets.i = beta.i;
+			    }
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (conj) {
+				if (*trace) {
+				    zprcn6_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &ralpha, &lda, &
+					    rbeta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				czherk_(iorder, uplo, trans, &n, &k, &ralpha, 
+					&aa[1], &lda, &rbeta, &cc[1], &ldc);
+			    } else {
+				if (*trace) {
+				    zprcn4_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &
+					    beta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				czsyrk_(iorder, uplo, trans, &n, &k, &alpha, &
+					aa[1], &lda, &beta, &cc[1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				io___294.ciunit = *nout;
+				s_wsfe(&io___294);
+				e_wsfe();
+				*fatal = TRUE_;
+				goto L120;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[1] = *(unsigned char *)transs == *(unsigned 
+				    char *)trans;
+			    isame[2] = ns == n;
+			    isame[3] = ks == k;
+			    if (conj) {
+				isame[4] = rals == ralpha;
+			    } else {
+				isame[4] = als.r == alpha.r && als.i == 
+					alpha.i;
+			    }
+			    isame[5] = lze_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    if (conj) {
+				isame[7] = rbets == rbeta;
+			    } else {
+				isame[7] = bets.r == beta.r && bets.i == 
+					beta.i;
+			    }
+			    if (null) {
+				isame[8] = lze_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[8] = lzeres_(sname + 7, uplo, &n, &n, &
+					cs[1], &cc[1], &ldc);
+			    }
+			    isame[9] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    io___297.ciunit = *nout;
+				    s_wsfe(&io___297);
+				    do_fio(&c__1, (char *)&i__, (ftnlen)
+					    sizeof(integer));
+				    e_wsfe();
+				}
+/* L30: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L120;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result column by column. */
+
+				if (conj) {
+				    *(unsigned char *)transt = 'C';
+				} else {
+				    *(unsigned char *)transt = 'T';
+				}
+				jc = 1;
+				i__5 = n;
+				for (j = 1; j <= i__5; ++j) {
+				    if (upper) {
+					jj = 1;
+					lj = j;
+				    } else {
+					jj = j;
+					lj = n - j + 1;
+				    }
+				    if (tran) {
+					zmmch_(transt, "N", &lj, &c__1, &k, &
+						alpha, &a[jj * a_dim1 + 1], 
+						nmax, &a[j * a_dim1 + 1], 
+						nmax, &beta, &c__[jj + j * 
+						c_dim1], nmax, &ct[1], &g[1], 
+						&cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    } else {
+					zmmch_("N", transt, &lj, &c__1, &k, &
+						alpha, &a[jj + a_dim1], nmax, 
+						&a[j + a_dim1], nmax, &beta, &
+						c__[jj + j * c_dim1], nmax, &
+						ct[1], &g[1], &cc[jc], &ldc, 
+						eps, &err, fatal, nout, &
+						c_true);
+				    }
+				    if (upper) {
+					jc += ldc;
+				    } else {
+					jc = jc + ldc + 1;
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L110;
+				    }
+/* L40: */
+				}
+			    }
+
+/* L50: */
+			}
+
+/* L60: */
+		    }
+
+/* L70: */
+		}
+
+L80:
+		;
+	    }
+
+/* L90: */
+	}
+
+L100:
+	;
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___304.ciunit = *nout;
+	    s_wsfe(&io___304);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___305.ciunit = *nout;
+	    s_wsfe(&io___305);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___306.ciunit = *nout;
+	    s_wsfe(&io___306);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___307.ciunit = *nout;
+	    s_wsfe(&io___307);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+    }
+    goto L130;
+
+L110:
+    if (n > 1) {
+	io___308.ciunit = *nout;
+	s_wsfe(&io___308);
+	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
+	e_wsfe();
+    }
+
+L120:
+    io___309.ciunit = *nout;
+    s_wsfe(&io___309);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    if (conj) {
+	zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, 
+		&rbeta, &ldc);
+    } else {
+	zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		beta, &ldc);
+    }
+
+L130:
+    return 0;
+
+/* L9994: */
+/* L9993: */
+
+/*     End of CCHK4. */
+
+} /* zchk4_ */
+
+
+/* Subroutine */ int zprcn4_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, 
+	doublecomplex *alpha, integer *lda, doublecomplex *beta, integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002,"
+	    "\002))";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C"
+	    ",\002,i3,\002).\002)";
+
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___313 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___314 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___313.ciunit = *nout;
+    s_wsfe(&io___313);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    do_fio(&c__1, ca, (ftnlen)14);
+    e_wsfe();
+    io___314.ciunit = *nout;
+    s_wsfe(&io___314);
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* zprcn4_ */
+
+
+
+/* Subroutine */ int zprcn6_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, doublereal 
+	*alpha, integer *lda, doublereal *beta, integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002,"
+	    "\002))";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3"
+	    ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)";
+
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___318 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___319 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___318.ciunit = *nout;
+    s_wsfe(&io___318);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    do_fio(&c__1, ca, (ftnlen)14);
+    e_wsfe();
+    io___319.ciunit = *nout;
+    s_wsfe(&io___319);
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* zprcn6_ */
+
+
+/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, 
+	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
+	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
+	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
+	ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, 
+	doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, 
+	doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w,
+	 integer *iorder)
+{
+    /* Initialized data */
+
+    static char icht[2] = "NC";
+    static char ichu[2] = "UL";
+
+    /* Format strings */
+    static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
+	    "TAKEN ON VALID\002,\002 CALL *******\002)";
+    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
+	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
+    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
+	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
+    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
+	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
+	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
+	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
+	    "ECT *******\002)";
+    static char fmt_9995[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
+	    " \002,i3)";
+    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
+	    "BER:\002)";
+
+    /* System generated locals */
+    integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
+    doublecomplex z__1, z__2;
+    alist al__1;
+
+    /* Local variables */
+    integer jjab;
+    doublecomplex beta;
+    integer ldas, ldbs, ldcs;
+    logical same, conj;
+    doublecomplex bets;
+    logical tran, null;
+    char uplo[1];
+    integer i__, j, k, n;
+    doublecomplex alpha;
+    doublereal rbeta;
+    logical isame[13];
+    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     logical *, doublecomplex *);
+    integer nargs;
+    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
+	    integer *, doublereal *, doublereal *, logical *, integer *, 
+	    logical *);
+    doublereal rbets;
+    logical reset;
+    char trans[1];
+    logical upper;
+    char uplos[1];
+    integer ia, ib, jc, ma, na, nc;
+    extern /* Subroutine */ int zprcn5_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, doublecomplex *, integer 
+	    *, integer *, doublecomplex *, integer *),
+	     zprcn7_(integer *, integer *, char *, integer *, char *, char *, 
+	    integer *, integer *, doublecomplex *, integer *, integer *, 
+	    doublereal *, integer *);
+    integer ik, in, jj, lj, ks, ns;
+    doublereal errmax;
+    extern logical lzeres_(char *, char *, integer *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *);
+    char transs[1], transt[1];
+    extern /* Subroutine */ int czher2k_(integer *, char *, char *, integer *,
+	     integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublereal *, doublecomplex *, 
+	    integer *);
+    integer laa, lbb, lda, lcc, ldb, ldc;
+    doublecomplex als;
+    integer ict, icu;
+    extern /* Subroutine */ int czsyr2k_(integer *, char *, char *, integer *,
+	     integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *);
+    doublereal err;
+    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
+
+    /* Fortran I/O blocks */
+    static cilist io___362 = { 0, 0, 0, fmt_9992, 0 };
+    static cilist io___365 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___373 = { 0, 0, 0, fmt_10000, 0 };
+    static cilist io___374 = { 0, 0, 0, fmt_10001, 0 };
+    static cilist io___375 = { 0, 0, 0, fmt_10002, 0 };
+    static cilist io___376 = { 0, 0, 0, fmt_10003, 0 };
+    static cilist io___377 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___378 = { 0, 0, 0, fmt_9996, 0 };
+
+
+
+/*  Tests ZHER2K and ZSYR2K. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --w;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    --as;
+    --aa;
+    --ab;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 12;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+
+    i__1 = *nidim;
+    for (in = 1; in <= i__1; ++in) {
+	n = idim[in];
+/*        Set LDC to 1 more than minimum value if room. */
+	ldc = n;
+	if (ldc < *nmax) {
+	    ++ldc;
+	}
+/*        Skip tests if not enough room. */
+	if (ldc > *nmax) {
+	    goto L130;
+	}
+	lcc = ldc * n;
+
+	i__2 = *nidim;
+	for (ik = 1; ik <= i__2; ++ik) {
+	    k = idim[ik];
+
+	    for (ict = 1; ict <= 2; ++ict) {
+		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
+		tran = *(unsigned char *)trans == 'C';
+		if (tran && ! conj) {
+		    *(unsigned char *)trans = 'T';
+		}
+		if (tran) {
+		    ma = k;
+		    na = n;
+		} else {
+		    ma = n;
+		    na = k;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = ma;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L110;
+		}
+		laa = lda * na;
+
+/*              Generate the matrix A. */
+
+		if (tran) {
+		    i__3 = *nmax << 1;
+		    zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], &
+			    lda, &reset, &c_b1);
+		} else {
+		    zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], &
+			    lda, &reset, &c_b1);
+		}
+
+/*              Generate the matrix B. */
+
+		ldb = lda;
+		lbb = laa;
+		if (tran) {
+		    i__3 = *nmax << 1;
+		    zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1]
+			    , &ldb, &reset, &c_b1);
+		} else {
+		    zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax,
+			     &bb[1], &ldb, &reset, &c_b1);
+		}
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+		    upper = *(unsigned char *)uplo == 'U';
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+			    if (conj) {
+				rbeta = beta.r;
+				z__1.r = rbeta, z__1.i = 0.;
+				beta.r = z__1.r, beta.i = z__1.i;
+			    }
+			    null = n <= 0;
+			    if (conj) {
+				null = null || (k <= 0 || alpha.r == 0. && 
+					alpha.i == 0.) && rbeta == 1.;
+			    }
+
+/*                       Generate the matrix C. */
+
+			    zmake_(sname + 7, uplo, " ", &n, &n, &c__[
+				    c_offset], nmax, &cc[1], &ldc, &reset, &
+				    c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the subroutine. */
+
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    *(unsigned char *)transs = *(unsigned char *)
+				    trans;
+			    ns = n;
+			    ks = k;
+			    als.r = alpha.r, als.i = alpha.i;
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    i__5 = lbb;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldbs = ldb;
+			    if (conj) {
+				rbets = rbeta;
+			    } else {
+				bets.r = beta.r, bets.i = beta.i;
+			    }
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L30: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (conj) {
+				if (*trace) {
+				    zprcn7_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &ldb,
+					     &rbeta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				czher2k_(iorder, uplo, trans, &n, &k, &alpha, 
+					&aa[1], &lda, &bb[1], &ldb, &rbeta, &
+					cc[1], &ldc);
+			    } else {
+				if (*trace) {
+				    zprcn5_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &ldb,
+					     &beta, &ldc);
+				}
+				if (*rewi) {
+				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);
+				}
+				czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, 
+					&aa[1], &lda, &bb[1], &ldb, &beta, &
+					cc[1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				io___362.ciunit = *nout;
+				s_wsfe(&io___362);
+				e_wsfe();
+				*fatal = TRUE_;
+				goto L150;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[1] = *(unsigned char *)transs == *(unsigned 
+				    char *)trans;
+			    isame[2] = ns == n;
+			    isame[3] = ks == k;
+			    isame[4] = als.r == alpha.r && als.i == alpha.i;
+			    isame[5] = lze_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    isame[7] = lze_(&bs[1], &bb[1], &lbb);
+			    isame[8] = ldbs == ldb;
+			    if (conj) {
+				isame[9] = rbets == rbeta;
+			    } else {
+				isame[9] = bets.r == beta.r && bets.i == 
+					beta.i;
+			    }
+			    if (null) {
+				isame[10] = lze_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[10] = lzeres_("he", uplo, &n, &n, &cs[1]
+					, &cc[1], &ldc);
+			    }
+			    isame[11] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    io___365.ciunit = *nout;
+				    s_wsfe(&io___365);
+				    do_fio(&c__1, (char *)&i__, (ftnlen)
+					    sizeof(integer));
+				    e_wsfe();
+				}
+/* L40: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L150;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result column by column. */
+
+				if (conj) {
+				    *(unsigned char *)transt = 'C';
+				} else {
+				    *(unsigned char *)transt = 'T';
+				}
+				jjab = 1;
+				jc = 1;
+				i__5 = n;
+				for (j = 1; j <= i__5; ++j) {
+				    if (upper) {
+					jj = 1;
+					lj = j;
+				    } else {
+					jj = j;
+					lj = n - j + 1;
+				    }
+				    if (tran) {
+					i__6 = k;
+					for (i__ = 1; i__ <= i__6; ++i__) {
+					    i__7 = i__;
+					    i__8 = (j - 1 << 1) * *nmax + k + 
+						    i__;
+					    z__1.r = alpha.r * ab[i__8].r - 
+						    alpha.i * ab[i__8].i, 
+						    z__1.i = alpha.r * ab[
+						    i__8].i + alpha.i * ab[
+						    i__8].r;
+					    w[i__7].r = z__1.r, w[i__7].i = 
+						    z__1.i;
+					    if (conj) {
+			  i__7 = k + i__;
+			  d_cnjg(&z__2, &alpha);
+			  i__8 = (j - 1 << 1) * *nmax + i__;
+			  z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, 
+				  z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[
+				  i__8].r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+					    } else {
+			  i__7 = k + i__;
+			  i__8 = (j - 1 << 1) * *nmax + i__;
+			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+					    }
+/* L50: */
+					}
+					i__6 = k << 1;
+					i__7 = *nmax << 1;
+					i__8 = *nmax << 1;
+					zmmch_(transt, "N", &lj, &c__1, &i__6,
+						 &c_b2, &ab[jjab], &i__7, &w[
+						1], &i__8, &beta, &c__[jj + j 
+						* c_dim1], nmax, &ct[1], &g[1]
+						, &cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    } else {
+					i__6 = k;
+					for (i__ = 1; i__ <= i__6; ++i__) {
+					    if (conj) {
+			  i__7 = i__;
+			  d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]);
+			  z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, 
+				  z__1.i = alpha.r * z__2.i + alpha.i * 
+				  z__2.r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+			  i__7 = k + i__;
+			  i__8 = (i__ - 1) * *nmax + j;
+			  z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, z__2.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  d_cnjg(&z__1, &z__2);
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+					    } else {
+			  i__7 = i__;
+			  i__8 = (k + i__ - 1) * *nmax + j;
+			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+			  i__7 = k + i__;
+			  i__8 = (i__ - 1) * *nmax + j;
+			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+					    }
+/* L60: */
+					}
+					i__6 = k << 1;
+					i__7 = *nmax << 1;
+					zmmch_("N", "N", &lj, &c__1, &i__6, &
+						c_b2, &ab[jj], nmax, &w[1], &
+						i__7, &beta, &c__[jj + j * 
+						c_dim1], nmax, &ct[1], &g[1], 
+						&cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    }
+				    if (upper) {
+					jc += ldc;
+				    } else {
+					jc = jc + ldc + 1;
+					if (tran) {
+					    jjab += *nmax << 1;
+					}
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L140;
+				    }
+/* L70: */
+				}
+			    }
+
+/* L80: */
+			}
+
+/* L90: */
+		    }
+
+/* L100: */
+		}
+
+L110:
+		;
+	    }
+
+/* L120: */
+	}
+
+L130:
+	;
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+	    io___373.ciunit = *nout;
+	    s_wsfe(&io___373);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___374.ciunit = *nout;
+	    s_wsfe(&io___374);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    e_wsfe();
+	}
+    } else {
+	if (*iorder == 0) {
+	    io___375.ciunit = *nout;
+	    s_wsfe(&io___375);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+	if (*iorder == 1) {
+	    io___376.ciunit = *nout;
+	    s_wsfe(&io___376);
+	    do_fio(&c__1, sname, (ftnlen)13);
+	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
+	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+    }
+    goto L160;
+
+L140:
+    if (n > 1) {
+	io___377.ciunit = *nout;
+	s_wsfe(&io___377);
+	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
+	e_wsfe();
+    }
+
+L150:
+    io___378.ciunit = *nout;
+    s_wsfe(&io___378);
+    do_fio(&c__1, sname, (ftnlen)13);
+    e_wsfe();
+    if (conj) {
+	zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		ldb, &rbeta, &ldc);
+    } else {
+	zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		ldb, &beta, &ldc);
+    }
+
+L160:
+    return 0;
+
+/* L9994: */
+/* L9993: */
+
+/*     End of ZCHK5. */
+
+} /* zchk5_ */
+
+
+/* Subroutine */ int zprcn5_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, 
+	doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta,
+	 integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002,"
+	    "\002))";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002"
+	    ",f4.1,\002), C,\002,i3,\002).\002)";
+
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___382 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___383 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___382.ciunit = *nout;
+    s_wsfe(&io___382);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    do_fio(&c__1, ca, (ftnlen)14);
+    e_wsfe();
+    io___383.ciunit = *nout;
+    s_wsfe(&io___383);
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* zprcn5_ */
+
+
+
+/* Subroutine */ int zprcn7_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, 
+	doublecomplex *alpha, integer *lda, integer *ldb, doublereal *beta, 
+	integer *ldc)
+{
+    /* Format strings */
+    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002,"
+	    "\002))";
+    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
+	    ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C,"
+	    "\002,i3,\002).\002)";
+
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+
+    /* Fortran I/O blocks */
+    static cilist io___387 = { 0, 0, 0, fmt_9995, 0 };
+    static cilist io___388 = { 0, 0, 0, fmt_9994, 0 };
+
+
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    io___387.ciunit = *nout;
+    s_wsfe(&io___387);
+    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
+    do_fio(&c__1, sname, (ftnlen)13);
+    do_fio(&c__1, crc, (ftnlen)14);
+    do_fio(&c__1, cu, (ftnlen)14);
+    do_fio(&c__1, ca, (ftnlen)14);
+    e_wsfe();
+    io___388.ciunit = *nout;
+    s_wsfe(&io___388);
+    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
+    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
+    do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal));
+    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
+    e_wsfe();
+    return 0;
+} /* zprcn7_ */
+
+
+/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, 
+	integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, 
+	integer *lda, logical *reset, doublecomplex *transl)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+    doublereal d__1;
+    doublecomplex z__1, z__2;
+
+    /* Local variables */
+    integer ibeg, iend;
+    extern /* Double Complex */ VOID zbeg_(doublecomplex *, logical *);
+    logical unit;
+    integer i__, j;
+    logical lower, upper;
+    integer jj;
+    logical gen, her, tri, sym;
+
+
+/*  Generates values for an M by N matrix A. */
+/*  Stores the values in the array AA in the data structure required */
+/*  by the routine, with unwanted elements set to rogue value. */
+
+/*  TYPE is 'ge', 'he', 'sy' or 'tr'. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --aa;
+
+    /* Function Body */
+    gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0;
+    her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0;
+    sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0;
+    tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0;
+    upper = (her || sym || tri) && *(unsigned char *)uplo == 'U';
+    lower = (her || sym || tri) && *(unsigned char *)uplo == 'L';
+    unit = tri && *(unsigned char *)diag == 'U';
+
+/*     Generate data in array A. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    if (gen || upper && i__ <= j || lower && i__ >= j) {
+		i__3 = i__ + j * a_dim1;
+		zbeg_(&z__2, reset);
+		z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i;
+		a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+		if (i__ != j) {
+/*                 Set some elements to zero */
+		    if (*n > 3 && j == *n / 2) {
+			i__3 = i__ + j * a_dim1;
+			a[i__3].r = 0., a[i__3].i = 0.;
+		    }
+		    if (her) {
+			i__3 = j + i__ * a_dim1;
+			d_cnjg(&z__1, &a[i__ + j * a_dim1]);
+			a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+		    } else if (sym) {
+			i__3 = j + i__ * a_dim1;
+			i__4 = i__ + j * a_dim1;
+			a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i;
+		    } else if (tri) {
+			i__3 = j + i__ * a_dim1;
+			a[i__3].r = 0., a[i__3].i = 0.;
+		    }
+		}
+	    }
+/* L10: */
+	}
+	if (her) {
+	    i__2 = j + j * a_dim1;
+	    i__3 = j + j * a_dim1;
+	    d__1 = a[i__3].r;
+	    z__1.r = d__1, z__1.i = 0.;
+	    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+	}
+	if (tri) {
+	    i__2 = j + j * a_dim1;
+	    i__3 = j + j * a_dim1;
+	    z__1.r = a[i__3].r + 1., z__1.i = a[i__3].i + 0.;
+	    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+	}
+	if (unit) {
+	    i__2 = j + j * a_dim1;
+	    a[i__2].r = 1., a[i__2].i = 0.;
+	}
+/* L20: */
+    }
+
+/*     Store elements in array AS in data structure required by routine. */
+
+    if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		i__4 = i__ + j * a_dim1;
+		aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i;
+/* L30: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = *m + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10, aa[i__3].i = 1e10;
+/* L40: */
+	    }
+/* L50: */
+	}
+    } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__,
+	     "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen)
+	    2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (upper) {
+		ibeg = 1;
+		if (unit) {
+		    iend = j - 1;
+		} else {
+		    iend = j;
+		}
+	    } else {
+		if (unit) {
+		    ibeg = j + 1;
+		} else {
+		    ibeg = j;
+		}
+		iend = *n;
+	    }
+	    i__2 = ibeg - 1;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10, aa[i__3].i = 1e10;
+/* L60: */
+	    }
+	    i__2 = iend;
+	    for (i__ = ibeg; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		i__4 = i__ + j * a_dim1;
+		aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i;
+/* L70: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = iend + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10, aa[i__3].i = 1e10;
+/* L80: */
+	    }
+	    if (her) {
+		jj = j + (j - 1) * *lda;
+		i__2 = jj;
+		i__3 = jj;
+		d__1 = aa[i__3].r;
+		z__1.r = d__1, z__1.i = -1e10;
+		aa[i__2].r = z__1.r, aa[i__2].i = z__1.i;
+	    }
+/* L90: */
+	}
+    }
+    return 0;
+
+/*     End of ZMAKE. */
+
+} /* zmake_ */
+
+/* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer *
+	n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, 
+	doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex *
+	c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex *
+	cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, 
+	integer *nout, logical *mv)
+{
+    /* Format strings */
+    static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS"
+	    " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002             "
+	    "          EXPECTED RE\002,\002SULT                    COMPUTED R"
+	    "ESULT\002)";
+    static char fmt_9998[] = "(1x,i7,2(\002  (\002,g15.6,\002,\002,g15.6,"
+	    "\002)\002))";
+    static char fmt_9997[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
+	    " \002,i3)";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
+	    cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
+    doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    /* Local variables */
+    doublereal erri;
+    integer i__, j, k;
+    logical trana, tranb, ctrana, ctranb;
+
+    /* Fortran I/O blocks */
+    static cilist io___409 = { 0, 0, 0, fmt_9999, 0 };
+    static cilist io___410 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___411 = { 0, 0, 0, fmt_9998, 0 };
+    static cilist io___412 = { 0, 0, 0, fmt_9997, 0 };
+
+
+
+/*  Checks the results of the computational tests. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --ct;
+    --g;
+    cc_dim1 = *ldcc;
+    cc_offset = 1 + cc_dim1 * 1;
+    cc -= cc_offset;
+
+    /* Function Body */
+    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 
+	    'C';
+    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 
+	    'C';
+    ctrana = *(unsigned char *)transa == 'C';
+    ctranb = *(unsigned char *)transb == 'C';
+
+/*     Compute expected result, one column at a time, in CT using data */
+/*     in A, B and C. */
+/*     Compute gauges in G. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    ct[i__3].r = 0., ct[i__3].i = 0.;
+	    g[i__] = 0.;
+/* L10: */
+	}
+	if (! trana && ! tranb) {
+	    i__2 = *kk;
+	    for (k = 1; k <= i__2; ++k) {
+		i__3 = *m;
+		for (i__ = 1; i__ <= i__3; ++i__) {
+		    i__4 = i__;
+		    i__5 = i__;
+		    i__6 = i__ + k * a_dim1;
+		    i__7 = k + j * b_dim1;
+		    z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, 
+			    z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[
+			    i__7].r;
+		    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+			    z__2.i;
+		    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+		    i__4 = i__ + k * a_dim1;
+		    i__5 = k + j * b_dim1;
+		    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(
+			    &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[
+			    i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * 
+			    b_dim1]), abs(d__4)));
+/* L20: */
+		}
+/* L30: */
+	    }
+	} else if (trana && ! tranb) {
+	    if (ctrana) {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+			i__6 = k + j * b_dim1;
+			z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, 
+				z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6]
+				.r;
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+				z__2.i;
+			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			i__4 = k + i__ * a_dim1;
+			i__5 = k + j * b_dim1;
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+				d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * ((
+				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
+				&b[k + j * b_dim1]), abs(d__4)));
+/* L40: */
+		    }
+/* L50: */
+		}
+	    } else {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = k + i__ * a_dim1;
+			i__7 = k + j * b_dim1;
+			z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
+				.i, z__2.i = a[i__6].r * b[i__7].i + a[i__6]
+				.i * b[i__7].r;
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+				z__2.i;
+			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			i__4 = k + i__ * a_dim1;
+			i__5 = k + j * b_dim1;
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+				d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * ((
+				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
+				&b[k + j * b_dim1]), abs(d__4)));
+/* L60: */
+		    }
+/* L70: */
+		}
+	    }
+	} else if (! trana && tranb) {
+	    if (ctranb) {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = i__ + k * a_dim1;
+			d_cnjg(&z__3, &b[j + k * b_dim1]);
+			z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, 
+				z__2.i = a[i__6].r * z__3.i + a[i__6].i * 
+				z__3.r;
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+				z__2.i;
+			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			i__4 = i__ + k * a_dim1;
+			i__5 = j + k * b_dim1;
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+				d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * ((
+				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
+				&b[j + k * b_dim1]), abs(d__4)));
+/* L80: */
+		    }
+/* L90: */
+		}
+	    } else {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = i__ + k * a_dim1;
+			i__7 = j + k * b_dim1;
+			z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
+				.i, z__2.i = a[i__6].r * b[i__7].i + a[i__6]
+				.i * b[i__7].r;
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+				z__2.i;
+			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			i__4 = i__ + k * a_dim1;
+			i__5 = j + k * b_dim1;
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+				d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * ((
+				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
+				&b[j + k * b_dim1]), abs(d__4)));
+/* L100: */
+		    }
+/* L110: */
+		}
+	    }
+	} else if (trana && tranb) {
+	    if (ctrana) {
+		if (ctranb) {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+			    d_cnjg(&z__4, &b[j + k * b_dim1]);
+			    z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, 
+				    z__2.i = z__3.r * z__4.i + z__3.i * 
+				    z__4.r;
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+				    + z__2.i;
+			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
+				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
+/* L120: */
+			}
+/* L130: */
+		    }
+		} else {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+			    i__6 = j + k * b_dim1;
+			    z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, 
+				    z__2.i = z__3.r * b[i__6].i + z__3.i * b[
+				    i__6].r;
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+				    + z__2.i;
+			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
+				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
+/* L140: */
+			}
+/* L150: */
+		    }
+		}
+	    } else {
+		if (ctranb) {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    i__6 = k + i__ * a_dim1;
+			    d_cnjg(&z__3, &b[j + k * b_dim1]);
+			    z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, 
+				    z__2.i = a[i__6].r * z__3.i + a[i__6].i * 
+				    z__3.r;
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+				    + z__2.i;
+			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
+				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
+/* L160: */
+			}
+/* L170: */
+		    }
+		} else {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    i__6 = k + i__ * a_dim1;
+			    i__7 = j + k * b_dim1;
+			    z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[
+				    i__7].i, z__2.i = a[i__6].r * b[i__7].i + 
+				    a[i__6].i * b[i__7].r;
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+				    + z__2.i;
+			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
+				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
+/* L180: */
+			}
+/* L190: */
+		    }
+		}
+	    }
+	}
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    i__4 = i__;
+	    z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = 
+		    alpha->r * ct[i__4].i + alpha->i * ct[i__4].r;
+	    i__5 = i__ + j * c_dim1;
+	    z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = 
+		    beta->r * c__[i__5].i + beta->i * c__[i__5].r;
+	    z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+	    ct[i__3].r = z__1.r, ct[i__3].i = z__1.i;
+	    i__3 = i__ + j * c_dim1;
+	    g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), 
+		    abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + (
+		    d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, 
+		    abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs(
+		    d__6)));
+/* L200: */
+	}
+
+/*        Compute the error ratio for this result. */
+
+	*err = 0.;
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    i__4 = i__ + j * cc_dim1;
+	    z__2.r = ct[i__3].r - cc[i__4].r, z__2.i = ct[i__3].i - cc[i__4]
+		    .i;
+	    z__1.r = z__2.r, z__1.i = z__2.i;
+	    erri = ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1), abs(
+		    d__2))) / *eps;
+	    if (g[i__] != 0.) {
+		erri /= g[i__];
+	    }
+	    *err = f2cmax(*err,erri);
+	    if (*err * sqrt(*eps) >= 1.) {
+		goto L230;
+	    }
+/* L210: */
+	}
+
+/* L220: */
+    }
+
+/*     If the loop completes, all results are at least half accurate. */
+    goto L250;
+
+/*     Report fatal error. */
+
+L230:
+    *fatal = TRUE_;
+    io___409.ciunit = *nout;
+    s_wsfe(&io___409);
+    e_wsfe();
+    i__1 = *m;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (*mv) {
+	    io___410.ciunit = *nout;
+	    s_wsfe(&io___410);
+	    do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer));
+	    do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal));
+	    do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(
+		    doublereal));
+	    e_wsfe();
+	} else {
+	    io___411.ciunit = *nout;
+	    s_wsfe(&io___411);
+	    do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer));
+	    do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(
+		    doublereal));
+	    do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal));
+	    e_wsfe();
+	}
+/* L240: */
+    }
+    if (*n > 1) {
+	io___412.ciunit = *nout;
+	s_wsfe(&io___412);
+	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
+	e_wsfe();
+    }
+
+L250:
+    return 0;
+
+
+/*     End of ZMMCH. */
+
+} /* zmmch_ */
+
+logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr)
+{
+    /* System generated locals */
+    integer i__1, i__2, i__3;
+    logical ret_val;
+
+    /* Local variables */
+    integer i__;
+
+
+/*  Tests if two arrays are identical. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --rj;
+    --ri;
+
+    /* Function Body */
+    i__1 = *lr;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	i__2 = i__;
+	i__3 = i__;
+	if (ri[i__2].r != rj[i__3].r || ri[i__2].i != rj[i__3].i) {
+	    goto L20;
+	}
+/* L10: */
+    }
+    ret_val = TRUE_;
+    goto L30;
+L20:
+    ret_val = FALSE_;
+L30:
+    return ret_val;
+
+/*     End of LZE. */
+
+} /* lze_ */
+
+logical lzeres_(char *type__, char *uplo, integer *m, integer *n, 
+	doublecomplex *aa, doublecomplex *as, integer *lda)
+{
+    /* System generated locals */
+    integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
+    logical ret_val;
+
+    /* Local variables */
+    integer ibeg, iend, i__, j;
+    logical upper;
+
+
+/*  Tests if selected elements in two arrays are equal. */
+
+/*  TYPE is 'ge' or 'he' or 'sy'. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    as_dim1 = *lda;
+    as_offset = 1 + as_dim1 * 1;
+    as -= as_offset;
+    aa_dim1 = *lda;
+    aa_offset = 1 + aa_dim1 * 1;
+    aa -= aa_offset;
+
+    /* Function Body */
+    upper = *(unsigned char *)uplo == 'U';
+    if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *lda;
+	    for (i__ = *m + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * aa_dim1;
+		i__4 = i__ + j * as_dim1;
+		if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) {
+		    goto L70;
+		}
+/* L10: */
+	    }
+/* L20: */
+	}
+    } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__,
+	     "sy", (ftnlen)2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (upper) {
+		ibeg = 1;
+		iend = j;
+	    } else {
+		ibeg = j;
+		iend = *n;
+	    }
+	    i__2 = ibeg - 1;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * aa_dim1;
+		i__4 = i__ + j * as_dim1;
+		if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) {
+		    goto L70;
+		}
+/* L30: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = iend + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * aa_dim1;
+		i__4 = i__ + j * as_dim1;
+		if (aa[i__3].r != as[i__4].r || aa[i__3].i != as[i__4].i) {
+		    goto L70;
+		}
+/* L40: */
+	    }
+/* L50: */
+	}
+    }
+
+/* L60: */
+    ret_val = TRUE_;
+    goto L80;
+L70:
+    ret_val = FALSE_;
+L80:
+    return ret_val;
+
+/*     End of LZERES. */
+
+} /* lzeres_ */
+
+/* Double Complex */ VOID zbeg_(doublecomplex * ret_val, logical *reset)
+{
+    /* System generated locals */
+    doublereal d__1, d__2;
+    doublecomplex z__1;
+
+    /* Local variables */
+    static integer i__, j, ic, mi, mj;
+
+
+/*  Generates complex numbers as pairs of random numbers uniformly */
+/*  distributed between -0.5 and 0.5. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    if (*reset) {
+/*        Initialize local variables. */
+	mi = 891;
+	mj = 457;
+	i__ = 7;
+	j = 7;
+	ic = 0;
+	*reset = FALSE_;
+    }
+
+/*     The sequence of values of I or J is bounded between 1 and 999. */
+/*     If initial I or J = 1,2,3,6,7 or 9, the period will be 50. */
+/*     If initial I or J = 4 or 8, the period will be 25. */
+/*     If initial I or J = 5, the period will be 10. */
+/*     IC is used to break up the period by skipping 1 value of I or J */
+/*     in 6. */
+
+    ++ic;
+L10:
+    i__ *= mi;
+    j *= mj;
+    i__ -= i__ / 1000 * 1000;
+    j -= j / 1000 * 1000;
+    if (ic >= 5) {
+	ic = 0;
+	goto L10;
+    }
+    d__1 = (i__ - 500) / 1001.;
+    d__2 = (j - 500) / 1001.;
+    z__1.r = d__1, z__1.i = d__2;
+     ret_val->r = z__1.r,  ret_val->i = z__1.i;
+    return ;
+
+/*     End of ZBEG. */
+
+} /* zbeg_ */
+
+doublereal ddiff_(doublereal *x, doublereal *y)
+{
+    /* System generated locals */
+    doublereal ret_val;
+
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    ret_val = *x - *y;
+    return ret_val;
+
+/*     End of DDIFF. */
+
+} /* ddiff_ */
+
+/* Main program alias */ int zblat3_ () { MAIN__ (); return 0; }

From 175e357f5d576e493cba9f180ad78800462f0c3f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 14:19:50 +0100
Subject: [PATCH 704/718] run apt-get update before fetching Ubuntu packages

---
 .github/workflows/dynamic_arch.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index 49721958a..669aa8116 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -42,6 +42,7 @@ jobs:
       - name: Install Dependencies
         run: |
           if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt-get update
             sudo apt-get install -y gfortran cmake ccache libtinfo5
           elif [ "$RUNNER_OS" == "macOS" ]; then
             # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.

From a1ec94c258ac4151eb69012310e0f694f7067ea2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 17:46:07 +0100
Subject: [PATCH 705/718] Readd proper f2c'd sources for the GEMM3M tests

---
 ctest/c_cblat3c_3m.c | 1490 +++++--------------------
 ctest/c_zblat3c_3m.c | 2454 +++++++++++++-----------------------------
 2 files changed, 1043 insertions(+), 2901 deletions(-)

diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c
index 9cfa26a41..b5d6bf9cb 100644
--- a/ctest/c_cblat3c_3m.c
+++ b/ctest/c_cblat3c_3m.c
@@ -10,25 +10,7 @@
 #undef I
 #endif
 
-#if defined(_WIN64)
-typedef long long BLASLONG;
-typedef unsigned long long BLASULONG;
-#else
-typedef long BLASLONG;
-typedef unsigned long BLASULONG;
-#endif
-
-#ifdef LAPACK_ILP64
-typedef BLASLONG blasint;
-#if defined(_WIN64)
-#define blasabs(x) llabs(x)
-#else
-#define blasabs(x) labs(x)
-#endif
-#else
-typedef int blasint;
-#define blasabs(x) abs(x)
-#endif
+#include "common.h"
 
 typedef blasint integer;
 
@@ -247,7 +229,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,259 +242,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -526,7 +254,7 @@ struct {
 #define infoc_1 infoc_
 
 struct {
-    char srnamt[13];
+    char srnamt[12];
 } srnamc_;
 
 #define srnamc_1 srnamc_
@@ -535,73 +263,25 @@ struct {
 
 static complex c_b1 = {0.f,0.f};
 static complex c_b2 = {1.f,0.f};
-static integer c__9 = 9;
 static integer c__1 = 1;
-static integer c__3 = 3;
-static integer c__8 = 8;
-static integer c__4 = 4;
 static integer c__65 = 65;
-static integer c__7 = 7;
 static integer c__6 = 6;
-static integer c__2 = 2;
 static real c_b91 = 1.f;
 static logical c_true = TRUE_;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ MAIN__(void)
+int /* Main program */ main(void)
 {
     /* Initialized data */
 
-    static char snames[13*9] = "cblas_cgemm3m" "cblas_chemm  " "cblas_csymm  "
-	     "cblas_ctrmm  " "cblas_ctrsm  " "cblas_cherk  " "cblas_csyrk  " 
-	    "cblas_cher2k " "cblas_csyr2k ";
-
-    /* Format strings */
-    static char fmt_9997[] = "(\002 NUMBER OF VALUES OF \002,a,\002 IS LESS "
-	    "THAN 1 OR GREATER \002,\002THAN \002,i2)";
-    static char fmt_9996[] = "(\002 VALUE OF N IS LESS THAN 0 OR GREATER THA"
-	    "N \002,i2)";
-    static char fmt_9995[] = "(\002 TESTS OF THE COMPLEX          LEVEL 3 BL"
-	    "AS\002,//\002 THE F\002,\002OLLOWING PARAMETER VALUES WILL BE US"
-	    "ED:\002)";
-    static char fmt_9994[] = "(\002   FOR N              \002,9i6)";
-    static char fmt_9993[] = "(\002   FOR ALPHA          \002,7(\002(\002,f4"
-	    ".1,\002,\002,f4.1,\002)  \002,:))";
-    static char fmt_9992[] = "(\002   FOR BETA           \002,7(\002(\002,f4"
-	    ".1,\002,\002,f4.1,\002)  \002,:))";
-    static char fmt_9984[] = "(\002 ERROR-CALL MYEXITS WILL NOT BE TESTED"
-	    "\002)";
-    static char fmt_9999[] = "(\002 ROUTINES PASS COMPUTATIONAL TESTS IF TES"
-	    "T RATIO IS LES\002,\002S THAN\002,f8.2)";
-    static char fmt_10002[] = "(\002 COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS"
-	    " ARE TESTED\002)";
-    static char fmt_10001[] = "(\002 ROW-MAJOR DATA LAYOUT IS TESTED\002)";
-    static char fmt_10000[] = "(\002 COLUMN-MAJOR DATA LAYOUT IS TESTED\002)";
-    static char fmt_9988[] = "(a13,l2)";
-    static char fmt_9990[] = "(\002 SUBPROGRAM NAME \002,a13,\002 NOT RECOGN"
-	    "IZED\002,/\002 ******* T\002,\002ESTS ABANDONED *******\002)";
-    static char fmt_9998[] = "(\002 RELATIVE MACHINE PRECISION IS TAKEN TO"
-	    " BE\002,1p,e9.1)";
-    static char fmt_9989[] = "(\002 ERROR IN CMMCH -  IN-LINE DOT PRODUCTS A"
-	    "RE BEING EVALU\002,\002ATED WRONGLY.\002,/\002 CMMCH WAS CALLED "
-	    "WITH TRANSA = \002,a1,\002AND TRANSB = \002,a1,/\002 AND RETURNE"
-	    "D SAME = \002,l1,\002 AND \002,\002 ERR = \002,f12.3,\002.\002,"
-	    "/\002 THIS MAY BE DUE TO FAULTS IN THE \002,\002ARITHMETIC OR TH"
-	    "E COMPILER.\002,/\002 ******* TESTS ABANDONED \002,\002******"
-	    "*\002)";
-    static char fmt_9987[] = "(1x,a13,\002 WAS NOT TESTED\002)";
-    static char fmt_9986[] = "(/\002 END OF TESTS\002)";
-    static char fmt_9985[] = "(/\002 ******* FATAL ERROR - TESTS ABANDONED *"
-	    "******\002)";
-    static char fmt_9991[] = "(\002 AMEND DATA FILE OR INCREASE ARRAY SIZES "
-	    "IN PROGRAM\002,/\002 ******* TESTS ABANDONED *******\002)";
+    static char snames[9][13] = {"cblas_cgemm3m ", "cblas_chemm ", "cblas_csymm ", 
+	    "cblas_ctrmm ", "cblas_ctrsm ", "cblas_cherk ", "cblas_csyrk ", 
+	    "cblas_cher2k", "cblas_csyr2k"};
 
     /* System generated locals */
     integer i__1, i__2, i__3, i__4, i__5;
     real r__1;
-    olist o__1;
-    cllist cl__1;
 
     /* Local variables */
     integer nalf, idim[9];
@@ -649,7 +329,7 @@ static logical c_false = FALSE_;
     complex aa[4225], ab[8450]	/* was [65][130] */, bb[4225], cc[4225], as[
 	    4225], bs[4225], cs[4225], ct[65];
     logical sfatal, corder;
-    char snamet[13], transa[1], transb[1];
+    char snamet[12], transa[1], transb[1];
     real thresh;
     logical rorder;
     extern /* Subroutine */ int cc3chke_(char *);
@@ -659,59 +339,13 @@ static logical c_false = FALSE_;
     extern logical lce_(complex *, complex *, integer *);
     complex bet[7];
     real eps, err;
-
-    /* Fortran I/O blocks */
-    static cilist io___2 = { 0, 5, 0, 0, 0 };
-    static cilist io___4 = { 0, 5, 0, 0, 0 };
-    static cilist io___7 = { 0, 5, 0, 0, 0 };
-    static cilist io___9 = { 0, 5, 0, 0, 0 };
-    static cilist io___11 = { 0, 5, 0, 0, 0 };
-    static cilist io___13 = { 0, 5, 0, 0, 0 };
-    static cilist io___15 = { 0, 5, 0, 0, 0 };
-    static cilist io___17 = { 0, 5, 0, 0, 0 };
-    static cilist io___19 = { 0, 6, 0, fmt_9997, 0 };
-    static cilist io___20 = { 0, 5, 0, 0, 0 };
-    static cilist io___23 = { 0, 6, 0, fmt_9996, 0 };
-    static cilist io___24 = { 0, 5, 0, 0, 0 };
-    static cilist io___26 = { 0, 6, 0, fmt_9997, 0 };
-    static cilist io___27 = { 0, 5, 0, 0, 0 };
-    static cilist io___29 = { 0, 5, 0, 0, 0 };
-    static cilist io___31 = { 0, 6, 0, fmt_9997, 0 };
-    static cilist io___32 = { 0, 5, 0, 0, 0 };
-    static cilist io___34 = { 0, 6, 0, fmt_9995, 0 };
-    static cilist io___35 = { 0, 6, 0, fmt_9994, 0 };
-    static cilist io___36 = { 0, 6, 0, fmt_9993, 0 };
-    static cilist io___37 = { 0, 6, 0, fmt_9992, 0 };
-    static cilist io___38 = { 0, 6, 0, 0, 0 };
-    static cilist io___39 = { 0, 6, 0, fmt_9984, 0 };
-    static cilist io___40 = { 0, 6, 0, 0, 0 };
-    static cilist io___41 = { 0, 6, 0, fmt_9999, 0 };
-    static cilist io___42 = { 0, 6, 0, 0, 0 };
-    static cilist io___45 = { 0, 6, 0, fmt_10002, 0 };
-    static cilist io___46 = { 0, 6, 0, fmt_10001, 0 };
-    static cilist io___47 = { 0, 6, 0, fmt_10000, 0 };
-    static cilist io___48 = { 0, 6, 0, 0, 0 };
-    static cilist io___50 = { 0, 5, 1, fmt_9988, 0 };
-    static cilist io___53 = { 0, 6, 0, fmt_9990, 0 };
-    static cilist io___55 = { 0, 6, 0, fmt_9998, 0 };
-    static cilist io___68 = { 0, 6, 0, fmt_9989, 0 };
-    static cilist io___69 = { 0, 6, 0, fmt_9989, 0 };
-    static cilist io___70 = { 0, 6, 0, fmt_9989, 0 };
-    static cilist io___71 = { 0, 6, 0, fmt_9989, 0 };
-    static cilist io___73 = { 0, 6, 0, 0, 0 };
-    static cilist io___74 = { 0, 6, 0, fmt_9987, 0 };
-    static cilist io___75 = { 0, 6, 0, 0, 0 };
-    static cilist io___82 = { 0, 6, 0, fmt_9986, 0 };
-    static cilist io___83 = { 0, 6, 0, fmt_9985, 0 };
-    static cilist io___84 = { 0, 6, 0, fmt_9991, 0 };
-
-
+    char tmpchar;
 
 /*  Test program for the COMPLEX          Level 3 Blas. */
 
 /*  The program must be driven by a short data file. The first 13 records */
 /*  of the file are read using list-directed input, the last 9 records */
-/*  are read using the format ( A13, L2 ). An annotated example of a data */
+/*  are read using the format ( A12, L2 ). An annotated example of a data */
 /*  file can be obtained by deleting the first 3 characters from the */
 /*  following 22 lines: */
 /*  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE */
@@ -756,16 +390,19 @@ static logical c_false = FALSE_;
     infoc_1.noutc = 6;
 
 /*     Read name and unit number for snapshot output file and open file. */
-
-    s_rsle(&io___2);
-    do_lio(&c__9, &c__1, snaps, (ftnlen)32);
-    e_rsle();
-    s_rsle(&io___4);
-    do_lio(&c__3, &c__1, (char *)&ntra, (ftnlen)sizeof(integer));
-    e_rsle();
+    char line[80];
+    
+    fgets(line,80,stdin);
+    sscanf(line,"'%s'",snaps);
+    fgets(line,80,stdin);
+#ifdef USE64BITINT
+    sscanf(line,"%ld",&ntra);
+#else
+    sscanf(line,"%d",&ntra);
+#endif
     trace = ntra >= 0;
     if (trace) {
-	o__1.oerr = 0;
+/*	o__1.oerr = 0;
 	o__1.ounit = ntra;
 	o__1.ofnmlen = 32;
 	o__1.ofnm = snaps;
@@ -774,147 +411,122 @@ static logical c_false = FALSE_;
 	o__1.oacc = 0;
 	o__1.ofm = 0;
 	o__1.oblnk = 0;
-	f_open(&o__1);
+	f_open(&o__1);*/
     }
 /*     Read the flag that directs rewinding of the snapshot file. */
-    s_rsle(&io___7);
-    do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical));
-    e_rsle();
-    rewi = rewi && trace;
+   fgets(line,80,stdin);
+   sscanf(line,"%d",&rewi);
+   rewi = rewi && trace;
 /*     Read the flag that directs stopping on any failure. */
-    s_rsle(&io___9);
-    do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical));
-    e_rsle();
-/*     Read the flag that indicates whether error exits are to be tested. */
-    s_rsle(&io___11);
-    do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical));
-    e_rsle();
-/*     Read the flag that indicates whether row-major data layout to be tested. */
-    s_rsle(&io___13);
-    do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer));
-    e_rsle();
-/*     Read the threshold value of the test ratio */
-    s_rsle(&io___15);
-    do_lio(&c__4, &c__1, (char *)&thresh, (ftnlen)sizeof(real));
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"%c",&tmpchar);
+   sfatal=FALSE_;
+   if (tmpchar=='T')sfatal=TRUE_;
+   fgets(line,80,stdin);
+   sscanf(line,"%c",&tmpchar);
+   tsterr=FALSE_;
+   if (tmpchar=='T')tsterr=TRUE_;
+   fgets(line,80,stdin);
+   sscanf(line,"%d",&layout);
+   fgets(line,80,stdin);
+   sscanf(line,"%f",&thresh);
+
 
 /*     Read and check the parameter values for the tests. */
 
 /*     Values of N */
-    s_rsle(&io___17);
-    do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer));
-    e_rsle();
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nidim);
+#else
+   sscanf(line,"%d",&nidim);
+#endif
+
     if (nidim < 1 || nidim > 9) {
-	s_wsfe(&io___19);
-	do_fio(&c__1, "N", (ftnlen)1);
-	do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer));
-	e_wsfe();
+	fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9");
 	goto L220;
     }
-    s_rsle(&io___20);
-    i__1 = nidim;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer));
-    }
-    e_rsle();
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2],
+    &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]);
+#else
+   sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2],
+    &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]);
+#endif
     i__1 = nidim;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) {
-	    s_wsfe(&io___23);
-	    do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer));
-	    e_wsfe();
+	fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n");
 	    goto L220;
 	}
 /* L10: */
     }
 /*     Values of ALPHA */
-    s_rsle(&io___24);
-    do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer));
-    e_rsle();
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nalf);
+#else
+   sscanf(line,"%d",&nalf);
+#endif
     if (nalf < 1 || nalf > 7) {
-	s_wsfe(&io___26);
-	do_fio(&c__1, "ALPHA", (ftnlen)5);
-	do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer));
-	e_wsfe();
+	fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n");
 	goto L220;
     }
-    s_rsle(&io___27);
-    i__1 = nalf;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex));
-    }
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i,
+   &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i);
+
+//    i__1 = nalf;
+//    for (i__ = 1; i__ <= i__1; ++i__) {
+//	do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex));
+//    }
 /*     Values of BETA */
-    s_rsle(&io___29);
-    do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer));
-    e_rsle();
-    if (nbet < 1 || nbet > 7) {
-	s_wsfe(&io___31);
-	do_fio(&c__1, "BETA", (ftnlen)4);
-	do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer));
-	e_wsfe();
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nbet);
+#else
+   sscanf(line,"%d",&nbet);
+#endif
+    if (nalf < 1 || nbet > 7) {
+	fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n");
 	goto L220;
     }
-    s_rsle(&io___32);
-    i__1 = nbet;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_lio(&c__6, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof(complex));
-    }
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i,
+   &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i);
+
 
 /*     Report values of parameters. */
 
-    s_wsfe(&io___34);
-    e_wsfe();
-    s_wsfe(&io___35);
-    i__1 = nidim;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer));
-    }
-    e_wsfe();
-    s_wsfe(&io___36);
-    i__1 = nalf;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(real));
-    }
-    e_wsfe();
-    s_wsfe(&io___37);
-    i__1 = nbet;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(real));
-    }
-    e_wsfe();
+    printf("TESTS OF THE COMPLEX    LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n");
+    printf(" FOR N");
+    for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]);
+    printf("\n");    
+    printf(" FOR ALPHA");
+    for (i__ =1; i__ <=nalf;++i__) printf(" (%f,%f)",alf[i__-1].r,alf[i__-1].i);
+    printf("\n");    
+    printf(" FOR BETA");
+    for (i__ =1; i__ <=nbet;++i__) printf(" (%f,%f)",bet[i__-1].r,bet[i__-1].i);
+    printf("\n");    
+
     if (! tsterr) {
-	s_wsle(&io___38);
-	e_wsle();
-	s_wsfe(&io___39);
-	e_wsfe();
+      printf(" ERROR-EXITS WILL NOT BE TESTED\n"); 
     }
-    s_wsle(&io___40);
-    e_wsle();
-    s_wsfe(&io___41);
-    do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(real));
-    e_wsfe();
-    s_wsle(&io___42);
-    e_wsle();
+    printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %f\n",thresh);
     rorder = FALSE_;
     corder = FALSE_;
     if (layout == 2) {
 	rorder = TRUE_;
 	corder = TRUE_;
-	s_wsfe(&io___45);
-	e_wsfe();
+        printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n");
     } else if (layout == 1) {
 	rorder = TRUE_;
-	s_wsfe(&io___46);
-	e_wsfe();
+        printf("ROW-MAJOR DATA LAYOUT IS TESTED\n");
     } else if (layout == 0) {
 	corder = TRUE_;
-	s_wsfe(&io___47);
-	e_wsfe();
+        printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n");
     }
-    s_wsle(&io___48);
-    e_wsle();
 
 /*     Read names of subroutines and flags which indicate */
 /*     whether they are to be tested. */
@@ -924,42 +536,33 @@ static logical c_false = FALSE_;
 /* L20: */
     }
 L30:
-    i__1 = s_rsfe(&io___50);
-    if (i__1 != 0) {
+   if (! fgets(line,80,stdin)) {
 	goto L60;
     }
-    i__1 = do_fio(&c__1, snamet, (ftnlen)13);
-    if (i__1 != 0) {
-	goto L60;
-    }
-    i__1 = do_fio(&c__1, (char *)&ltestt, (ftnlen)sizeof(logical));
-    if (i__1 != 0) {
-	goto L60;
-    }
-    i__1 = e_rsfe();
-    if (i__1 != 0) {
+   i__1 = sscanf(line,"%12c %c",snamet,&tmpchar);
+   ltestt=FALSE_;
+   if (tmpchar=='T')ltestt=TRUE_;
+    if (i__1 < 2) {
 	goto L60;
     }
     for (i__ = 1; i__ <= 9; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == 
+	if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == 
 		0) {
 	    goto L50;
 	}
 /* L40: */
     }
-    s_wsfe(&io___53);
-    do_fio(&c__1, snamet, (ftnlen)13);
-    e_wsfe();
-    s_stop("", (ftnlen)0);
+    printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet);
+    exit(1);
 L50:
     ltest[i__ - 1] = ltestt;
     goto L30;
 
 L60:
-    cl__1.cerr = 0;
+/*    cl__1.cerr = 0;
     cl__1.cunit = 5;
     cl__1.csta = 0;
-    f_clos(&cl__1);
+    f_clos(&cl__1);*/
 
 /*     Compute EPS (the machine precision). */
 
@@ -973,9 +576,7 @@ L70:
     goto L70;
 L80:
     eps += eps;
-    s_wsfe(&io___55);
-    do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(real));
-    e_wsfe();
+    printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps);
 
 /*     Check the reliability of CMMCH using exact data. */
 
@@ -1015,13 +616,12 @@ L80:
 	    &c__6, &c_true);
     same = lce_(cc, ct, &n);
     if (! same || err != 0.f) {
-	s_wsfe(&io___68);
-	do_fio(&c__1, transa, (ftnlen)1);
-	do_fio(&c__1, transb, (ftnlen)1);
-	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
-	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real));
-	e_wsfe();
-	s_stop("", (ftnlen)0);
+      printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
     }
     *(unsigned char *)transb = 'C';
     cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
@@ -1029,13 +629,12 @@ L80:
 	    &c__6, &c_true);
     same = lce_(cc, ct, &n);
     if (! same || err != 0.f) {
-	s_wsfe(&io___69);
-	do_fio(&c__1, transa, (ftnlen)1);
-	do_fio(&c__1, transb, (ftnlen)1);
-	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
-	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real));
-	e_wsfe();
-	s_stop("", (ftnlen)0);
+      printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
     }
     i__1 = n;
     for (j = 1; j <= i__1; ++j) {
@@ -1061,13 +660,12 @@ L80:
 	    &c__6, &c_true);
     same = lce_(cc, ct, &n);
     if (! same || err != 0.f) {
-	s_wsfe(&io___70);
-	do_fio(&c__1, transa, (ftnlen)1);
-	do_fio(&c__1, transb, (ftnlen)1);
-	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
-	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real));
-	e_wsfe();
-	s_stop("", (ftnlen)0);
+      printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
     }
     *(unsigned char *)transb = 'C';
     cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
@@ -1075,33 +673,26 @@ L80:
 	    &c__6, &c_true);
     same = lce_(cc, ct, &n);
     if (! same || err != 0.f) {
-	s_wsfe(&io___71);
-	do_fio(&c__1, transa, (ftnlen)1);
-	do_fio(&c__1, transb, (ftnlen)1);
-	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
-	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(real));
-	e_wsfe();
-	s_stop("", (ftnlen)0);
+      printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("CMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
     }
 
 /*     Test each subroutine in turn. */
 
     for (isnum = 1; isnum <= 9; ++isnum) {
-	s_wsle(&io___73);
-	e_wsle();
 	if (! ltest[isnum - 1]) {
 /*           Subprogram is not to be tested. */
-	    s_wsfe(&io___74);
-	    do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13);
-	    e_wsfe();
+           printf("%12s WAS NOT TESTED\n",snames[isnum-1]);
 	} else {
-	    s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, (
-		    ftnlen)13);
+	    s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, (
+		    ftnlen)12);
 /*           Test error exits. */
 	    if (tsterr) {
-		cc3chke_(snames + (isnum - 1) * 13);
-		s_wsle(&io___75);
-		e_wsle();
+		cc3chke_(snames[isnum - 1]);
 	    }
 /*           Test computations. */
 	    infoc_1.infot = 0;
@@ -1121,13 +712,13 @@ L80:
 /*           Test CGEMM, 01. */
 L140:
 	    if (corder) {
-		cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
 			 cc, cs, ct, g, &c__0);
 	    }
 	    if (rorder) {
-		cchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
 			 cc, cs, ct, g, &c__1);
@@ -1136,13 +727,13 @@ L140:
 /*           Test CHEMM, 02, CSYMM, 03. */
 L150:
 	    if (corder) {
-		cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
 			 cc, cs, ct, g, &c__0);
 	    }
 	    if (rorder) {
-		cchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
 			 cc, cs, ct, g, &c__1);
@@ -1151,13 +742,13 @@ L150:
 /*           Test CTRMM, 04, CTRSM, 05. */
 L160:
 	    if (corder) {
-		cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
 			c__0);
 	    }
 	    if (rorder) {
-		cchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
 			c__1);
@@ -1166,13 +757,13 @@ L160:
 /*           Test CHERK, 06, CSYRK, 07. */
 L170:
 	    if (corder) {
-		cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
 			 cc, cs, ct, g, &c__0);
 	    }
 	    if (rorder) {
-		cchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
 			 cc, cs, ct, g, &c__1);
@@ -1181,13 +772,13 @@ L170:
 /*           Test CHER2K, 08, CSYR2K, 09. */
 L180:
 	    if (corder) {
-		cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
 			ct, g, w, &c__0);
 	    }
 	    if (rorder) {
-		cchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
 			ct, g, w, &c__1);
@@ -1201,32 +792,29 @@ L190:
 	}
 /* L200: */
     }
-    s_wsfe(&io___82);
-    e_wsfe();
+    printf("\nEND OF TESTS\n");
     goto L230;
 
 L210:
-    s_wsfe(&io___83);
-    e_wsfe();
+    printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n");
     goto L230;
 
 L220:
-    s_wsfe(&io___84);
-    e_wsfe();
-
+    printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n");
+    printf("****** TESTS ABANDONED ******\n");
 L230:
     if (trace) {
-	cl__1.cerr = 0;
+/*	cl__1.cerr = 0;
 	cl__1.cunit = ntra;
 	cl__1.csta = 0;
-	f_clos(&cl__1);
+	f_clos(&cl__1);*/
     }
-    cl__1.cerr = 0;
+/*    cl__1.cerr = 0;
     cl__1.cunit = 6;
     cl__1.csta = 0;
     f_clos(&cl__1);
-    s_stop("", (ftnlen)0);
-
+    s_stop("", (ftnlen)0);*/
+     exit(0);
 
 /*     End of CBLAT3. */
 
@@ -1244,30 +832,9 @@ L230:
 
     static char ich[3] = "NTC";
 
-    /* Format strings */
-    static char fmt_9994[] = "(\002 ****** FATAL ERROR - ERROR-CALL MYEXIT T"
-	    "AKEN ON VALID\002,\002 CALL ******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
-
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6, i__7, i__8;
-    alist al__1;
 
     /* Local variables */
     complex beta;
@@ -1288,7 +855,11 @@ L230:
     extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer 
 	    *, char *, char *, integer *, integer *, integer *, complex *, 
 	    integer *, integer *, complex *, integer *);
-    integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
+    integer ia, ib, ma, mb, na, nb, nc, ik, im, in;
+    extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *, complex *, complex *, integer *);
+    integer ks, ms, ns;
     extern logical lceres_(char *, char *, integer *, integer *, complex *, 
 	    complex *, integer *);
     char tranas[1], tranbs[1], transa[1], transb[1];
@@ -1297,20 +868,6 @@ L230:
     extern logical lce_(complex *, complex *, integer *);
     complex als, bls;
     real err;
-    extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer *
-	    , integer *, integer *, complex *, complex *, integer *, complex *
-	    , integer *, complex *, complex *, integer *);
-
-    /* Fortran I/O blocks */
-    static cilist io___128 = { 0, 0, 0, fmt_9994, 0 };
-    static cilist io___131 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___133 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___134 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___135 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___136 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___137 = { 0, 0, 0, fmt_9996, 0 };
-
-
 
 /*  Tests CGEMM. */
 
@@ -1497,20 +1054,21 @@ L230:
 					    &ldb, &beta, &ldc);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1); */
 				}
-				ccgemm3m_(iorder, transa, transb, &m, &n, &k, 
-					&alpha, &aa[1], &lda, &bb[1], &ldb, &
+				ccgemm3m_(iorder, transa, transb, &m, &n, &k, &
+					alpha, &aa[1], &lda, &bb[1], &ldb, &
 					beta, &cc[1], &ldc);
 
 /*                          Check if error-exit was taken incorrectly. */
 
 				if (! infoc_1.ok) {
-				    io___128.ciunit = *nout;
-				    s_wsfe(&io___128);
-				    e_wsfe();
+//				    io___128.ciunit = *nout;
+//				    s_wsfe(&io___128);
+//				    e_wsfe();
+				    printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				    *fatal = TRUE_;
 				    goto L120;
 				}
@@ -1548,11 +1106,7 @@ L230:
 				for (i__ = 1; i__ <= i__6; ++i__) {
 				    same = same && isame[i__ - 1];
 				    if (! isame[i__ - 1]) {
-					io___131.ciunit = *nout;
-					s_wsfe(&io___131);
-					do_fio(&c__1, (char *)&i__, (ftnlen)
-						sizeof(integer));
-					e_wsfe();
+    				printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);;
 				    }
 /* L40: */
 				}
@@ -1606,51 +1160,34 @@ L100:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___133.ciunit = *nout;
-	    s_wsfe(&io___133);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___134.ciunit = *nout;
-	    s_wsfe(&io___134);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___135.ciunit = *nout;
-	    s_wsfe(&io___135);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___136.ciunit = *nout;
-	    s_wsfe(&io___136);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L130;
 
 L120:
-    io___137.ciunit = *nout;
-    s_wsfe(&io___137);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
     cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, &
 	    lda, &ldb, &beta, &ldc);
 
 L130:
     return 0;
 
-/* L9995: */
+/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */
+/*     $     3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */
+/*     $     ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */
 
 /*     End of CCHK1. */
 
@@ -1662,21 +1199,9 @@ L130:
 	k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer 
 	*ldc)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002,"
-	    "a14,\002,\002,a14,\002,\002)";
-    static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002"
-	    ",\002,f4.1,\002) , C,\002,i3,\002).\002)";
-
     /* Local variables */
     char crc[14], cta[14], ctb[14];
 
-    /* Fortran I/O blocks */
-    static cilist io___141 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___142 = { 0, 0, 0, fmt_9994, 0 };
-
-
     if (*(unsigned char *)transa == 'N') {
 	s_copy(cta, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
     } else if (*(unsigned char *)transa == 'T') {
@@ -1696,25 +1221,8 @@ L130:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___141.ciunit = *nout;
-    s_wsfe(&io___141);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cta, (ftnlen)14);
-    do_fio(&c__1, ctb, (ftnlen)14);
-    e_wsfe();
-    io___142.ciunit = *nout;
-    s_wsfe(&io___142);
-    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb);
+    printf("%d %d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
     return 0;
 } /* cprcn1_ */
 
@@ -1731,30 +1239,9 @@ L130:
     static char ichs[2] = "LR";
     static char ichu[2] = "UL";
 
-    /* Format strings */
-    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
-
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6, i__7;
-    alist al__1;
 
     /* Local variables */
     complex beta;
@@ -1798,17 +1285,6 @@ L130:
     integer icu;
     real err;
 
-    /* Fortran I/O blocks */
-    static cilist io___181 = { 0, 0, 0, fmt_9994, 0 };
-    static cilist io___184 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___186 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___187 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___188 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___189 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___190 = { 0, 0, 0, fmt_9996, 0 };
-
-
-
 /*  Tests CHEMM and CSYMM. */
 
 /*  Auxiliary routine for test program for Level 3 Blas. */
@@ -1974,9 +1450,9 @@ L130:
 					;
 			    }
 			    if (*rewi) {
-				al__1.aerr = 0;
+/*				al__1.aerr = 0;
 				al__1.aunit = *ntra;
-				f_rew(&al__1);
+				f_rew(&al__1);*/
 			    }
 			    if (conj) {
 				cchemm_(iorder, side, uplo, &m, &n, &alpha, &
@@ -1991,9 +1467,7 @@ L130:
 /*                       Check if error-exit was taken incorrectly. */
 
 			    if (! infoc_1.ok) {
-				io___181.ciunit = *nout;
-				s_wsfe(&io___181);
-				e_wsfe();
+    printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				*fatal = TRUE_;
 				goto L110;
 			    }
@@ -2028,11 +1502,7 @@ L130:
 			    for (i__ = 1; i__ <= i__5; ++i__) {
 				same = same && isame[i__ - 1];
 				if (! isame[i__ - 1]) {
-				    io___184.ciunit = *nout;
-				    s_wsfe(&io___184);
-				    do_fio(&c__1, (char *)&i__, (ftnlen)
-					    sizeof(integer));
-				    e_wsfe();
+				printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
 				}
 /* L40: */
 			    }
@@ -2090,51 +1560,34 @@ L90:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___186.ciunit = *nout;
-	    s_wsfe(&io___186);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___187.ciunit = *nout;
-	    s_wsfe(&io___187);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___188.ciunit = *nout;
-	    s_wsfe(&io___188);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___189.ciunit = *nout;
-	    s_wsfe(&io___189);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L120;
 
 L110:
-    io___190.ciunit = *nout;
-    s_wsfe(&io___190);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
     cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, 
 	    &beta, &ldc);
 
 L120:
     return 0;
 
-/* L9995: */
+/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */
+/*     $      ',', F4.1, '), C,', I3, ')    .' ) */
 
 /*     End of CCHK2. */
 
@@ -2145,21 +1598,9 @@ L120:
 	*iorder, char *side, char *uplo, integer *m, integer *n, complex *
 	alpha, integer *lda, integer *ldb, complex *beta, integer *ldc)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002,"
-	    "a14,\002,\002,a14,\002,\002)";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002,"
-	    "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)";
-
     /* Local variables */
     char cs[14], cu[14], crc[14];
 
-    /* Fortran I/O blocks */
-    static cilist io___194 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___195 = { 0, 0, 0, fmt_9994, 0 };
-
-
     if (*(unsigned char *)side == 'L') {
 	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
     } else {
@@ -2175,24 +1616,8 @@ L120:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___194.ciunit = *nout;
-    s_wsfe(&io___194);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cs, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    e_wsfe();
-    io___195.ciunit = *nout;
-    s_wsfe(&io___195);
-    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu);
+    printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
     return 0;
 } /* cprcn2_ */
 
@@ -2210,31 +1635,10 @@ L120:
     static char ichd[2] = "UN";
     static char ichs[2] = "LR";
 
-    /* Format strings */
-    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
-
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6, i__7;
     complex q__1;
-    alist al__1;
 
     /* Local variables */
     char diag[1];
@@ -2279,17 +1683,6 @@ L120:
     integer ict, icu;
     real err;
 
-    /* Fortran I/O blocks */
-    static cilist io___236 = { 0, 0, 0, fmt_9994, 0 };
-    static cilist io___239 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___241 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___242 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___243 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___244 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___245 = { 0, 0, 0, fmt_9996, 0 };
-
-
-
 /*  Tests CTRMM and CTRSM. */
 
 /*  Auxiliary routine for test program for Level 3 Blas. */
@@ -2444,14 +1837,14 @@ L120:
 				    if (*trace) {
 					cprcn3_(ntra, &nc, sname, iorder, 
 						side, uplo, transa, diag, &m, 
-						&n, &alpha, &lda, &ldb, (
-						ftnlen)13, (ftnlen)1, (ftnlen)
-						1, (ftnlen)1, (ftnlen)1);
+						&n, &alpha, &lda, &ldb/*, (
+						ftnlen)12, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1*/);
 				    }
 				    if (*rewi) {
-					al__1.aerr = 0;
+/*					al__1.aerr = 0;
 					al__1.aunit = *ntra;
-					f_rew(&al__1);
+					f_rew(&al__1);*/
 				    }
 				    cctrmm_(iorder, side, uplo, transa, diag, 
 					    &m, &n, &alpha, &aa[1], &lda, &bb[
@@ -2461,14 +1854,14 @@ L120:
 				    if (*trace) {
 					cprcn3_(ntra, &nc, sname, iorder, 
 						side, uplo, transa, diag, &m, 
-						&n, &alpha, &lda, &ldb, (
-						ftnlen)13, (ftnlen)1, (ftnlen)
-						1, (ftnlen)1, (ftnlen)1);
+						&n, &alpha, &lda, &ldb/*, (
+						ftnlen)12, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1*/);
 				    }
 				    if (*rewi) {
-					al__1.aerr = 0;
+/*					al__1.aerr = 0;
 					al__1.aunit = *ntra;
-					f_rew(&al__1);
+					f_rew(&al__1);*/
 				    }
 				    cctrsm_(iorder, side, uplo, transa, diag, 
 					    &m, &n, &alpha, &aa[1], &lda, &bb[
@@ -2478,9 +1871,7 @@ L120:
 /*                          Check if error-exit was taken incorrectly. */
 
 				if (! infoc_1.ok) {
-				    io___236.ciunit = *nout;
-				    s_wsfe(&io___236);
-				    e_wsfe();
+				    printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				    *fatal = TRUE_;
 				    goto L150;
 				}
@@ -2517,11 +1908,7 @@ L120:
 				for (i__ = 1; i__ <= i__4; ++i__) {
 				    same = same && isame[i__ - 1];
 				    if (! isame[i__ - 1]) {
-					io___239.ciunit = *nout;
-					s_wsfe(&io___239);
-					do_fio(&c__1, (char *)&i__, (ftnlen)
-						sizeof(integer));
-					e_wsfe();
+					printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
 				    }
 /* L50: */
 				}
@@ -2543,8 +1930,8 @@ L120:
 						    c_b1, &c__[c_offset], 
 						    nmax, &ct[1], &g[1], &bb[
 						    1], &ldb, eps, &err, 
-						    fatal, nout, &c_true, (
-						    ftnlen)1, (ftnlen)1);
+						    fatal, nout, &c_true/*, (
+						    ftnlen)1, (ftnlen)1*/);
 					} else {
 					    cmmch_("N", transa, &m, &n, &n, &
 						    alpha, &b[b_offset], nmax,
@@ -2631,44 +2018,25 @@ L130:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___241.ciunit = *nout;
-	    s_wsfe(&io___241);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___242.ciunit = *nout;
-	    s_wsfe(&io___242);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___243.ciunit = *nout;
-	    s_wsfe(&io___243);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___244.ciunit = *nout;
-	    s_wsfe(&io___244);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L160;
 
 L150:
-    io___245.ciunit = *nout;
-    s_wsfe(&io___245);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
     if (*trace) {
 	cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &
 		alpha, &lda, &ldb);
@@ -2677,7 +2045,9 @@ L150:
 L160:
     return 0;
 
-/* L9995: */
+/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $     '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ')         ', */
+/*     $      '      .' ) */
 
 /*     End of CCHK3. */
 
@@ -2688,21 +2058,9 @@ L160:
 	*iorder, char *side, char *uplo, char *transa, char *diag, integer *m,
 	 integer *n, complex *alpha, integer *lda, integer *ldb)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a14,\002,\002,"
-	    "a14,\002,\002,a14,\002,\002)";
-    static char fmt_9994[] = "(10x,2(a14,\002,\002),2(i3,\002,\002),\002 "
-	    "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)."
-	    "\002)";
-
     /* Local variables */
     char ca[14], cd[14], cs[14], cu[14], crc[14];
 
-    /* Fortran I/O blocks */
-    static cilist io___251 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___252 = { 0, 0, 0, fmt_9994, 0 };
-
-
     if (*(unsigned char *)side == 'L') {
 	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
     } else {
@@ -2730,24 +2088,9 @@ L160:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___251.ciunit = *nout;
-    s_wsfe(&io___251);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cs, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    e_wsfe();
-    io___252.ciunit = *nout;
-    s_wsfe(&io___252);
-    do_fio(&c__1, ca, (ftnlen)14);
-    do_fio(&c__1, cd, (ftnlen)14);
-    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    e_wsfe();
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu);
+    printf("         %s %s %d %d (%4.1f,%4.1f) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb);
+
     return 0;
 } /* cprcn3_ */
 
@@ -2764,33 +2107,10 @@ L160:
     static char icht[2] = "NC";
     static char ichu[2] = "UL";
 
-    /* Format strings */
-    static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9995[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
-	    " \002,i3)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
-
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6, i__7;
     complex q__1;
-    alist al__1;
 
     /* Local variables */
     complex beta;
@@ -2841,18 +2161,6 @@ L160:
     integer ict, icu;
     real err;
 
-    /* Fortran I/O blocks */
-    static cilist io___294 = { 0, 0, 0, fmt_9992, 0 };
-    static cilist io___297 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___304 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___305 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___306 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___307 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___308 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___309 = { 0, 0, 0, fmt_9996, 0 };
-
-
-
 /*  Tests CHERK and CSYRK. */
 
 /*  Auxiliary routine for test program for Level 3 Blas. */
@@ -2892,6 +2200,8 @@ L160:
     nc = 0;
     reset = TRUE_;
     errmax = 0.f;
+    rals = 1.f;
+    rbets = 1.f;
 
     i__1 = *nidim;
     for (in = 1; in <= i__1; ++in) {
@@ -2965,8 +2275,8 @@ L160:
 			    }
 			    null = n <= 0;
 			    if (conj) {
-				null = null || (k <= 0 || ralpha == 0.f) && 
-					rbeta == 1.f;
+				null = null || ((k <= 0 || ralpha == 0.f) && 
+					rbeta == 1.f);
 			    }
 
 /*                       Generate the matrix C. */
@@ -3022,9 +2332,9 @@ L160:
 					    rbeta, &ldc);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
 				ccherk_(iorder, uplo, trans, &n, &k, &ralpha, 
 					&aa[1], &lda, &rbeta, &cc[1], &ldc);
@@ -3035,9 +2345,9 @@ L160:
 					    beta, &ldc);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
 				ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, &
 					aa[1], &lda, &beta, &cc[1], &ldc);
@@ -3046,9 +2356,7 @@ L160:
 /*                       Check if error-exit was taken incorrectly. */
 
 			    if (! infoc_1.ok) {
-				io___294.ciunit = *nout;
-				s_wsfe(&io___294);
-				e_wsfe();
+				printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				*fatal = TRUE_;
 				goto L120;
 			    }
@@ -3091,11 +2399,7 @@ L160:
 			    for (i__ = 1; i__ <= i__5; ++i__) {
 				same = same && isame[i__ - 1];
 				if (! isame[i__ - 1]) {
-				    io___297.ciunit = *nout;
-				    s_wsfe(&io___297);
-				    do_fio(&c__1, (char *)&i__, (ftnlen)
-					    sizeof(integer));
-				    e_wsfe();
+				    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
 				}
 /* L30: */
 			    }
@@ -3179,52 +2483,30 @@ L100:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___304.ciunit = *nout;
-	    s_wsfe(&io___304);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___305.ciunit = *nout;
-	    s_wsfe(&io___305);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___306.ciunit = *nout;
-	    s_wsfe(&io___306);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___307.ciunit = *nout;
-	    s_wsfe(&io___307);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L130;
 
 L110:
     if (n > 1) {
-	io___308.ciunit = *nout;
-	s_wsfe(&io___308);
-	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
-	e_wsfe();
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d:\n",j);
     }
 
 L120:
-    io___309.ciunit = *nout;
-    s_wsfe(&io___309);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
     if (conj) {
 	cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, 
 		&rbeta, &ldc);
@@ -3236,8 +2518,12 @@ L120:
 L130:
     return 0;
 
-/* L9994: */
-/* L9993: */
+/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $     F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')               ', */
+/*     $      '          .' ) */
+/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */
+/*     $      '), C,', I3, ')          .' ) */
 
 /*     End of CCHK4. */
 
@@ -3248,21 +2534,9 @@ L130:
 	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
 	alpha, integer *lda, complex *beta, integer *ldc)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002,"
-	    "\002))";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C"
-	    ",\002,i3,\002).\002)";
-
     /* Local variables */
     char ca[14], cu[14], crc[14];
 
-    /* Fortran I/O blocks */
-    static cilist io___313 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___314 = { 0, 0, 0, fmt_9994, 0 };
-
-
     if (*(unsigned char *)uplo == 'U') {
 	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
     } else {
@@ -3280,23 +2554,8 @@ L130:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___313.ciunit = *nout;
-    s_wsfe(&io___313);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    do_fio(&c__1, ca, (ftnlen)14);
-    e_wsfe();
-    io___314.ciunit = *nout;
-    s_wsfe(&io___314);
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("(          %d %d (%4.1f,%4.1f) A %d (%4.1f,%4.1f) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc);
     return 0;
 } /* cprcn4_ */
 
@@ -3306,20 +2565,9 @@ L130:
 	*iorder, char *uplo, char *transa, integer *n, integer *k, real *
 	alpha, integer *lda, real *beta, integer *ldc)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002,"
-	    "\002))";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3"
-	    ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)";
-
     /* Local variables */
     char ca[14], cu[14], crc[14];
 
-    /* Fortran I/O blocks */
-    static cilist io___318 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___319 = { 0, 0, 0, fmt_9994, 0 };
-
-
     if (*(unsigned char *)uplo == 'U') {
 	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
     } else {
@@ -3337,23 +2585,8 @@ L130:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___318.ciunit = *nout;
-    s_wsfe(&io___318);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    do_fio(&c__1, ca, (ftnlen)14);
-    e_wsfe();
-    io___319.ciunit = *nout;
-    s_wsfe(&io___319);
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("(          %d %d %4.1f A %d %4.1f C %d\n",*n,*k,*alpha,*lda,*beta,*ldc);
     return 0;
 } /* cprcn6_ */
 
@@ -3370,32 +2603,10 @@ L130:
     static char icht[2] = "NC";
     static char ichu[2] = "UL";
 
-    /* Format strings */
-    static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9995[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
-	    " \002,i3)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
 
     /* System generated locals */
     integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
     complex q__1, q__2;
-    alist al__1;
 
     /* Local variables */
     integer jjab;
@@ -3444,18 +2655,6 @@ L130:
     integer ict, icu;
     real err;
 
-    /* Fortran I/O blocks */
-    static cilist io___362 = { 0, 0, 0, fmt_9992, 0 };
-    static cilist io___365 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___373 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___374 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___375 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___376 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___377 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___378 = { 0, 0, 0, fmt_9996, 0 };
-
-
-
 /*  Tests CHER2K and CSYR2K. */
 
 /*  Auxiliary routine for test program for Level 3 Blas. */
@@ -3578,8 +2777,8 @@ L130:
 			    }
 			    null = n <= 0;
 			    if (conj) {
-				null = null || (k <= 0 || alpha.r == 0.f && 
-					alpha.i == 0.f) && rbeta == 1.f;
+				null = null || ((k <= 0 || (alpha.r == 0.f && 
+					alpha.i == 0.f)) && rbeta == 1.f);
 			    }
 
 /*                       Generate the matrix C. */
@@ -3640,9 +2839,9 @@ L130:
 					     &rbeta, &ldc);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
 				ccher2k_(iorder, uplo, trans, &n, &k, &alpha, 
 					&aa[1], &lda, &bb[1], &ldb, &rbeta, &
@@ -3654,9 +2853,9 @@ L130:
 					     &beta, &ldc);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
 				ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, 
 					&aa[1], &lda, &bb[1], &ldb, &beta, &
@@ -3666,9 +2865,7 @@ L130:
 /*                       Check if error-exit was taken incorrectly. */
 
 			    if (! infoc_1.ok) {
-				io___362.ciunit = *nout;
-				s_wsfe(&io___362);
-				e_wsfe();
+				printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				*fatal = TRUE_;
 				goto L150;
 			    }
@@ -3708,11 +2905,7 @@ L130:
 			    for (i__ = 1; i__ <= i__5; ++i__) {
 				same = same && isame[i__ - 1];
 				if (! isame[i__ - 1]) {
-				    io___365.ciunit = *nout;
-				    s_wsfe(&io___365);
-				    do_fio(&c__1, (char *)&i__, (ftnlen)
-					    sizeof(integer));
-				    e_wsfe();
+				    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
 				}
 /* L40: */
 			    }
@@ -3745,7 +2938,7 @@ L130:
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
 					    i__7 = i__;
-					    i__8 = (j - 1 << 1) * *nmax + k + 
+					    i__8 = ((j - 1) << 1) * *nmax + k + 
 						    i__;
 					    q__1.r = alpha.r * ab[i__8].r - 
 						    alpha.i * ab[i__8].i, 
@@ -3757,14 +2950,14 @@ L130:
 					    if (conj) {
 			  i__7 = k + i__;
 			  r_cnjg(&q__2, &alpha);
-			  i__8 = (j - 1 << 1) * *nmax + i__;
+			  i__8 = ((j - 1) << 1) * *nmax + i__;
 			  q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, 
 				  q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[
 				  i__8].r;
 			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
 					    } else {
 			  i__7 = k + i__;
-			  i__8 = (j - 1 << 1) * *nmax + i__;
+			  i__8 = ((j - 1) << 1) * *nmax + i__;
 			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
 				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
 				  * ab[i__8].r;
@@ -3865,52 +3058,30 @@ L130:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___373.ciunit = *nout;
-	    s_wsfe(&io___373);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___374.ciunit = *nout;
-	    s_wsfe(&io___374);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___375.ciunit = *nout;
-	    s_wsfe(&io___375);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___376.ciunit = *nout;
-	    s_wsfe(&io___376);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L160;
 
 L140:
     if (n > 1) {
-	io___377.ciunit = *nout;
-	s_wsfe(&io___377);
-	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
-	e_wsfe();
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d:\n",j);
     }
 
 L150:
-    io___378.ciunit = *nout;
-    s_wsfe(&io___378);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
     if (conj) {
 	cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
 		ldb, &rbeta, &ldc);
@@ -3922,8 +3093,12 @@ L150:
 L160:
     return 0;
 
-/* L9994: */
-/* L9993: */
+/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */
+/*     $      ', C,', I3, ')           .' ) */
+/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */
+/*     $      ',', F4.1, '), C,', I3, ')    .' ) */
 
 /*     End of CCHK5. */
 
@@ -3934,21 +3109,10 @@ L160:
 	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
 	alpha, integer *lda, integer *ldb, complex *beta, integer *ldc)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002,"
-	    "\002))";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002"
-	    ",f4.1,\002), C,\002,i3,\002).\002)";
 
     /* Local variables */
     char ca[14], cu[14], crc[14];
 
-    /* Fortran I/O blocks */
-    static cilist io___382 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___383 = { 0, 0, 0, fmt_9994, 0 };
-
-
     if (*(unsigned char *)uplo == 'U') {
 	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
     } else {
@@ -3966,24 +3130,8 @@ L160:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___382.ciunit = *nout;
-    s_wsfe(&io___382);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    do_fio(&c__1, ca, (ftnlen)14);
-    e_wsfe();
-    io___383.ciunit = *nout;
-    s_wsfe(&io___383);
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
     return 0;
 } /* cprcn5_ */
 
@@ -3993,21 +3141,10 @@ L160:
 	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
 	alpha, integer *lda, integer *ldb, real *beta, integer *ldc)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a14,\002,"
-	    "\002))";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C,"
-	    "\002,i3,\002).\002)";
 
     /* Local variables */
     char ca[14], cu[14], crc[14];
 
-    /* Fortran I/O blocks */
-    static cilist io___387 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___388 = { 0, 0, 0, fmt_9994, 0 };
-
-
     if (*(unsigned char *)uplo == 'U') {
 	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
     } else {
@@ -4025,24 +3162,8 @@ L160:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___387.ciunit = *nout;
-    s_wsfe(&io___387);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    do_fio(&c__1, ca, (ftnlen)14);
-    e_wsfe();
-    io___388.ciunit = *nout;
-    s_wsfe(&io___388);
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(real));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("%d %d (%4.1f,%4.1f), A, %d, B, %d, %4.1f, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc);
     return 0;
 } /* cprcn7_ */
 
@@ -4101,7 +3222,7 @@ L160:
     for (j = 1; j <= i__1; ++j) {
 	i__2 = *m;
 	for (i__ = 1; i__ <= i__2; ++i__) {
-	    if (gen || upper && i__ <= j || lower && i__ >= j) {
+	    if (gen || (upper && i__ <= j) || (lower && i__ >= j)) {
 		i__3 = i__ + j * a_dim1;
 		cbeg_(&q__2, reset);
 		q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i;
@@ -4230,15 +3351,6 @@ L160:
 	real *g, complex *cc, integer *ldcc, real *eps, real *err, logical *
 	fatal, integer *nout, logical *mv)
 {
-    /* Format strings */
-    static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS"
-	    " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002             "
-	    "          EXPECTED RE\002,\002SULT                    COMPUTED R"
-	    "ESULT\002)";
-    static char fmt_9998[] = "(1x,i7,2(\002  (\002,g15.6,\002,\002,g15.6,"
-	    "\002)\002))";
-    static char fmt_9997[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
-	    " \002,i3)";
 
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
@@ -4251,14 +3363,6 @@ L160:
     integer i__, j, k;
     logical trana, tranb, ctrana, ctranb;
 
-    /* Fortran I/O blocks */
-    static cilist io___409 = { 0, 0, 0, fmt_9999, 0 };
-    static cilist io___410 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___411 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___412 = { 0, 0, 0, fmt_9997, 0 };
-
-
-
 /*  Checks the results of the computational tests. */
 
 /*  Auxiliary routine for test program for Level 3 Blas. */
@@ -4595,35 +3699,19 @@ L160:
 
 L230:
     *fatal = TRUE_;
-    io___409.ciunit = *nout;
-    s_wsfe(&io___409);
-    e_wsfe();
+    printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n");
+    printf("         EXPECTED RESULT                    COMPUTED RESULT\n");
     i__1 = *m;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	if (*mv) {
-	    io___410.ciunit = *nout;
-	    s_wsfe(&io___410);
-	    do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer));
-	    do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real));
-	    do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real)
-		    );
-	    e_wsfe();
+            printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i);
 	} else {
-	    io___411.ciunit = *nout;
-	    s_wsfe(&io___411);
-	    do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer));
-	    do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(real)
-		    );
-	    do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(real));
-	    e_wsfe();
+            printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i);
 	}
 /* L240: */
     }
     if (*n > 1) {
-	io___412.ciunit = *nout;
-	s_wsfe(&io___412);
-	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
-	e_wsfe();
+    	printf("      THESE ARE THE RESULTS FOR COLUMN %d\n",j);
     }
 
 L250:
@@ -4760,7 +3848,7 @@ logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa,
 	}
     }
 
-/* L60: */
+/*   60 CONTINUE */
     ret_val = TRUE_;
     goto L80;
 L70:
@@ -4851,4 +3939,4 @@ real sdiff_(real *x, real *y)
 
 } /* sdiff_ */
 
-/* Main program alias */ int cblat3_ () { MAIN__ (); return 0; }
+/* Main program alias */ /*int cblat3_ () { MAIN__ (); return 0; }*/
diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c
index 059daccb5..0c76f11e7 100644
--- a/ctest/c_zblat3c_3m.c
+++ b/ctest/c_zblat3c_3m.c
@@ -10,25 +10,7 @@
 #undef I
 #endif
 
-#if defined(_WIN64)
-typedef long long BLASLONG;
-typedef unsigned long long BLASULONG;
-#else
-typedef long BLASLONG;
-typedef unsigned long BLASULONG;
-#endif
-
-#ifdef LAPACK_ILP64
-typedef BLASLONG blasint;
-#if defined(_WIN64)
-#define blasabs(x) llabs(x)
-#else
-#define blasabs(x) labs(x)
-#endif
-#else
-typedef int blasint;
-#define blasabs(x) abs(x)
-#endif
+#include "common.h"
 
 typedef blasint integer;
 
@@ -40,14 +22,11 @@ typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
 #ifdef _MSC_VER
-static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
 static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
-static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
 static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
 #else
 static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
 static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
 static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
 #endif
 #define pCf(z) (*_pCf(z))
@@ -247,7 +226,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,259 +239,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -526,7 +251,7 @@ struct {
 #define infoc_1 infoc_
 
 struct {
-    char srnamt[13];
+    char srnamt[12];
 } srnamc_;
 
 #define srnamc_1 srnamc_
@@ -535,199 +260,78 @@ struct {
 
 static doublecomplex c_b1 = {0.,0.};
 static doublecomplex c_b2 = {1.,0.};
-static integer c__9 = 9;
 static integer c__1 = 1;
-static integer c__3 = 3;
-static integer c__8 = 8;
-static integer c__5 = 5;
 static integer c__65 = 65;
-static integer c__7 = 7;
-static integer c__2 = 2;
 static doublereal c_b92 = 1.;
 static integer c__6 = 6;
 static logical c_true = TRUE_;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ MAIN__(void)
+/* Main program  MAIN__() */ int main(void)
 {
     /* Initialized data */
 
-    static char snames[13*9] = "cblas_zgemm3m" "cblas_zhemm  " "cblas_zsymm  "
-	     "cblas_ztrmm  " "cblas_ztrsm  " "cblas_zherk  " "cblas_zsyrk  " 
-	    "cblas_zher2k " "cblas_zsyr2k ";
-
-    /* Format strings */
-    static char fmt_9997[] = "(\002 NUMBER OF VALUES OF \002,a,\002 IS LESS "
-	    "THAN 1 OR GREATER \002,\002THAN \002,i2)";
-    static char fmt_9996[] = "(\002 VALUE OF N IS LESS THAN 0 OR GREATER THA"
-	    "N \002,i2)";
-    static char fmt_9995[] = "(\002TESTS OF THE COMPLEX*16        LEVEL 3 BL"
-	    "AS\002,//\002 THE F\002,\002OLLOWING PARAMETER VALUES WILL BE US"
-	    "ED:\002)";
-    static char fmt_9994[] = "(\002   FOR N              \002,9i6)";
-    static char fmt_9993[] = "(\002   FOR ALPHA          \002,7(\002(\002,f4"
-	    ".1,\002,\002,f4.1,\002)  \002,:))";
-    static char fmt_9992[] = "(\002   FOR BETA           \002,7(\002(\002,f4"
-	    ".1,\002,\002,f4.1,\002)  \002,:))";
-    static char fmt_9984[] = "(\002 ERROR-CALL MYEXITS WILL NOT BE TESTED"
-	    "\002)";
-    static char fmt_9999[] = "(\002 ROUTINES PASS COMPUTATIONAL TESTS IF TES"
-	    "T RATIO IS LES\002,\002S THAN\002,f8.2)";
-    static char fmt_10002[] = "(\002 COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS"
-	    " ARE TESTED\002)";
-    static char fmt_10001[] = "(\002 ROW-MAJOR DATA LAYOUT IS TESTED\002)";
-    static char fmt_10000[] = "(\002 COLUMN-MAJOR DATA LAYOUT IS TESTED\002)";
-    static char fmt_9988[] = "(a13,l2)";
-    static char fmt_9990[] = "(\002 SUBPROGRAM NAME \002,a13,\002 NOT RECOGN"
-	    "IZED\002,/\002 ******* T\002,\002ESTS ABANDONED *******\002)";
-    static char fmt_9998[] = "(\002 RELATIVE MACHINE PRECISION IS TAKEN TO"
-	    " BE\002,1p,e9.1)";
-    static char fmt_9989[] = "(\002 ERROR IN ZMMCH -  IN-LINE DOT PRODUCTS A"
-	    "RE BEING EVALU\002,\002ATED WRONGLY.\002,/\002 ZMMCH WAS CALLED "
-	    "WITH TRANSA = \002,a1,\002AND TRANSB = \002,a1,/\002 AND RETURNE"
-	    "D SAME = \002,l1,\002 AND \002,\002 ERR = \002,f12.3,\002.\002,"
-	    "/\002 THIS MAY BE DUE TO FAULTS IN THE \002,\002ARITHMETIC OR TH"
-	    "E COMPILER.\002,/\002 ******* TESTS ABANDONED \002,\002******"
-	    "*\002)";
-    static char fmt_9987[] = "(1x,a13,\002 WAS NOT TESTED\002)";
-    static char fmt_9986[] = "(/\002 END OF TESTS\002)";
-    static char fmt_9985[] = "(/\002 ******* FATAL ERROR - TESTS ABANDONED *"
-	    "******\002)";
-    static char fmt_9991[] = "(\002 AMEND DATA FILE OR INCREASE ARRAY SIZES "
-	    "IN PROGRAM\002,/\002 ******* TESTS ABANDONED *******\002)";
+    static char snames[9][13] = { "cblas_zgemm3m ", "cblas_zhemm ", "cblas_zsymm ", "cblas_ztrmm ",
+     "cblas_ztrsm ", "cblas_zherk ", "cblas_zsyrk ", "cblas_zher2k", "cblas_zsyr2k"};
 
     /* System generated locals */
     integer i__1, i__2, i__3, i__4, i__5;
     doublereal d__1;
-    olist o__1;
-    cllist cl__1;
-
-    /* Local variables */
-    integer nalf, idim[9];
-    logical same;
-    integer nbet, ntra;
-    logical rewi;
-    extern /* Subroutine */ int zchk1_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
-	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
-	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
-	     integer *), zchk2_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
-	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
-	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
-	     integer *), zchk3_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
-	     doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex 
-	    *, doublecomplex *, doublecomplex *, doublereal *, doublecomplex *
-	    , integer *), zchk4_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
-	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
-	    , doublecomplex *, doublecomplex *, doublecomplex *, doublereal *,
-	     integer *), zchk5_(char *, doublereal *, doublereal *, 
-	    integer *, integer *, logical *, logical *, logical *, integer *, 
-	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
-	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
-	    doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *
-	    , doublecomplex *, doublecomplex *, doublereal *, doublecomplex *,
-	     integer *);
-    doublecomplex c__[4225]	/* was [65][65] */;
-    doublereal g[65];
-    integer i__, j;
-    extern doublereal ddiff_(doublereal *, doublereal *);
-    integer n;
-    logical fatal;
-    doublecomplex w[130];
-    logical trace;
-    integer nidim;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
-	    logical *);
-    char snaps[32];
-    integer isnum;
-    logical ltest[9];
-    doublecomplex aa[4225], ab[8450]	/* was [65][130] */, bb[4225], cc[
-	    4225], as[4225], bs[4225], cs[4225], ct[65];
-    logical sfatal, corder;
-    char snamet[13], transa[1], transb[1];
-    doublereal thresh;
-    logical rorder;
-    integer layout;
-    logical ltestt, tsterr;
-    extern /* Subroutine */ int cz3chke_(char *);
-    doublecomplex alf[7], bet[7];
-    doublereal eps, err;
-    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
-
-    /* Fortran I/O blocks */
-    static cilist io___2 = { 0, 5, 0, 0, 0 };
-    static cilist io___4 = { 0, 5, 0, 0, 0 };
-    static cilist io___7 = { 0, 5, 0, 0, 0 };
-    static cilist io___9 = { 0, 5, 0, 0, 0 };
-    static cilist io___11 = { 0, 5, 0, 0, 0 };
-    static cilist io___13 = { 0, 5, 0, 0, 0 };
-    static cilist io___15 = { 0, 5, 0, 0, 0 };
-    static cilist io___17 = { 0, 5, 0, 0, 0 };
-    static cilist io___19 = { 0, 6, 0, fmt_9997, 0 };
-    static cilist io___20 = { 0, 5, 0, 0, 0 };
-    static cilist io___23 = { 0, 6, 0, fmt_9996, 0 };
-    static cilist io___24 = { 0, 5, 0, 0, 0 };
-    static cilist io___26 = { 0, 6, 0, fmt_9997, 0 };
-    static cilist io___27 = { 0, 5, 0, 0, 0 };
-    static cilist io___29 = { 0, 5, 0, 0, 0 };
-    static cilist io___31 = { 0, 6, 0, fmt_9997, 0 };
-    static cilist io___32 = { 0, 5, 0, 0, 0 };
-    static cilist io___34 = { 0, 6, 0, fmt_9995, 0 };
-    static cilist io___35 = { 0, 6, 0, fmt_9994, 0 };
-    static cilist io___36 = { 0, 6, 0, fmt_9993, 0 };
-    static cilist io___37 = { 0, 6, 0, fmt_9992, 0 };
-    static cilist io___38 = { 0, 6, 0, 0, 0 };
-    static cilist io___39 = { 0, 6, 0, fmt_9984, 0 };
-    static cilist io___40 = { 0, 6, 0, 0, 0 };
-    static cilist io___41 = { 0, 6, 0, fmt_9999, 0 };
-    static cilist io___42 = { 0, 6, 0, 0, 0 };
-    static cilist io___45 = { 0, 6, 0, fmt_10002, 0 };
-    static cilist io___46 = { 0, 6, 0, fmt_10001, 0 };
-    static cilist io___47 = { 0, 6, 0, fmt_10000, 0 };
-    static cilist io___48 = { 0, 6, 0, 0, 0 };
-    static cilist io___50 = { 0, 5, 1, fmt_9988, 0 };
-    static cilist io___53 = { 0, 6, 0, fmt_9990, 0 };
-    static cilist io___55 = { 0, 6, 0, fmt_9998, 0 };
-    static cilist io___68 = { 0, 6, 0, fmt_9989, 0 };
-    static cilist io___69 = { 0, 6, 0, fmt_9989, 0 };
-    static cilist io___70 = { 0, 6, 0, fmt_9989, 0 };
-    static cilist io___71 = { 0, 6, 0, fmt_9989, 0 };
-    static cilist io___73 = { 0, 6, 0, 0, 0 };
-    static cilist io___74 = { 0, 6, 0, fmt_9987, 0 };
-    static cilist io___75 = { 0, 6, 0, 0, 0 };
-    static cilist io___82 = { 0, 6, 0, fmt_9986, 0 };
-    static cilist io___83 = { 0, 6, 0, fmt_9985, 0 };
-    static cilist io___84 = { 0, 6, 0, fmt_9991, 0 };
-
 
+    /* Builtin functions */
+    integer s_rsle(void), do_lio(void), e_rsle(void), f_open(void), s_wsfe(void), do_fio(void), 
+	    e_wsfe(void), s_wsle(void), e_wsle(void), s_rsfe(void), e_rsfe(void);
 
+    /* Local variables */
+    static integer nalf, idim[9];
+    static logical same;
+    static integer nbet, ntra;
+    static logical rewi;
+    extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    static doublecomplex c__[4225]	/* was [65][65] */;
+    static doublereal g[65];
+    static integer i__, j;
+    extern doublereal ddiff_(doublereal*, doublereal*);
+    static integer n;
+    static logical fatal;
+    static doublecomplex w[130];
+    static logical trace;
+    static integer nidim;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static char snaps[32];
+    static integer isnum;
+    static logical ltest[9];
+    static doublecomplex aa[4225], ab[8450]	/* was [65][130] */, bb[4225],
+	     cc[4225], as[4225], bs[4225], cs[4225], ct[65];
+    static logical sfatal, corder;
+    static char snamet[12], transa[1], transb[1];
+    static doublereal thresh;
+    static logical rorder;
+    static integer layout;
+    static logical ltestt, tsterr;
+    extern /* Subroutine */ int cz3chke_(char*, ftnlen);
+    static doublecomplex alf[7], bet[7];
+    static doublereal eps, err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
+    char tmpchar;
+    
 /*  Test program for the COMPLEX*16          Level 3 Blas. */
 
 /*  The program must be driven by a short data file. The first 13 records */
 /*  of the file are read using list-directed input, the last 9 records */
-/*  are read using the format ( A13,L2 ). An annotated example of a data */
+/*  are read using the format ( A12,L2 ). An annotated example of a data */
 /*  file can be obtained by deleting the first 3 characters from the */
 /*  following 22 lines: */
 /*  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE */
 /*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */
 /*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */
 /*  F        LOGICAL FLAG, T TO STOP ON FAILURES. */
-/*  T        LOGICAL FLAG, T TO TEST ERROR CALL MYEXITS. */
+/*  T        LOGICAL FLAG, T TO TEST ERROR EXITS. */
 /*  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */
 /*  16.0     THRESHOLD VALUE OF TEST RATIO */
 /*  6                 NUMBER OF VALUES OF N */
@@ -761,20 +365,34 @@ static logical c_false = FALSE_;
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Parameters .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
+/*     .. Executable Statements .. */
 
     infoc_1.noutc = 6;
 
 /*     Read name and unit number for snapshot output file and open file. */
 
-    s_rsle(&io___2);
-    do_lio(&c__9, &c__1, snaps, (ftnlen)32);
-    e_rsle();
-    s_rsle(&io___4);
-    do_lio(&c__3, &c__1, (char *)&ntra, (ftnlen)sizeof(integer));
-    e_rsle();
+    char line[80];
+    
+    fgets(line,80,stdin);
+    sscanf(line,"'%s'",snaps);
+    fgets(line,80,stdin);
+#ifdef USE64BITINT
+    sscanf(line,"%ld",&ntra);
+#else
+    sscanf(line,"%d",&ntra);
+#endif
     trace = ntra >= 0;
     if (trace) {
-	o__1.oerr = 0;
+/*	o__1.oerr = 0;
 	o__1.ounit = ntra;
 	o__1.ofnmlen = 32;
 	o__1.ofnm = snaps;
@@ -783,149 +401,119 @@ static logical c_false = FALSE_;
 	o__1.oacc = 0;
 	o__1.ofm = 0;
 	o__1.oblnk = 0;
-	f_open(&o__1);
+	f_open(&o__1);*/
     }
 /*     Read the flag that directs rewinding of the snapshot file. */
-    s_rsle(&io___7);
-    do_lio(&c__8, &c__1, (char *)&rewi, (ftnlen)sizeof(logical));
-    e_rsle();
-    rewi = rewi && trace;
+   fgets(line,80,stdin);
+   sscanf(line,"%d",&rewi);
+   rewi = rewi && trace;
 /*     Read the flag that directs stopping on any failure. */
-    s_rsle(&io___9);
-    do_lio(&c__8, &c__1, (char *)&sfatal, (ftnlen)sizeof(logical));
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"%c",&tmpchar);
+   sfatal=FALSE_;
+   if (tmpchar=='T')sfatal=TRUE_;
 /*     Read the flag that indicates whether error exits are to be tested. */
-    s_rsle(&io___11);
-    do_lio(&c__8, &c__1, (char *)&tsterr, (ftnlen)sizeof(logical));
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"%c",&tmpchar);
+   tsterr=FALSE_;
+   if (tmpchar=='T')tsterr=TRUE_;
 /*     Read the flag that indicates whether row-major data layout to be tested. */
-    s_rsle(&io___13);
-    do_lio(&c__3, &c__1, (char *)&layout, (ftnlen)sizeof(integer));
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"%d",&layout);
 /*     Read the threshold value of the test ratio */
-    s_rsle(&io___15);
-    do_lio(&c__5, &c__1, (char *)&thresh, (ftnlen)sizeof(doublereal));
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"%lf",&thresh);
 
 /*     Read and check the parameter values for the tests. */
 
 /*     Values of N */
-    s_rsle(&io___17);
-    do_lio(&c__3, &c__1, (char *)&nidim, (ftnlen)sizeof(integer));
-    e_rsle();
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%d",&nidim);
+#else
+   sscanf(line,"%d",&nidim);
+#endif
     if (nidim < 1 || nidim > 9) {
-	s_wsfe(&io___19);
-	do_fio(&c__1, "N", (ftnlen)1);
-	do_fio(&c__1, (char *)&c__9, (ftnlen)sizeof(integer));
-	e_wsfe();
-	goto L220;
+        fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9");
+        goto L220;
     }
-    s_rsle(&io___20);
-    i__1 = nidim;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_lio(&c__3, &c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer));
-    }
-    e_rsle();
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2],
+    &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]);
+#else
+   sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2],
+    &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]);
+#endif
     i__1 = nidim;
     for (i__ = 1; i__ <= i__1; ++i__) {
-	if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) {
-	    s_wsfe(&io___23);
-	    do_fio(&c__1, (char *)&c__65, (ftnlen)sizeof(integer));
-	    e_wsfe();
-	    goto L220;
-	}
+        if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) {
+        fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n");
+            goto L220;
+        }
 /* L10: */
     }
 /*     Values of ALPHA */
-    s_rsle(&io___24);
-    do_lio(&c__3, &c__1, (char *)&nalf, (ftnlen)sizeof(integer));
-    e_rsle();
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nalf);
+#else
+   sscanf(line,"%d",&nalf);
+#endif
     if (nalf < 1 || nalf > 7) {
-	s_wsfe(&io___26);
-	do_fio(&c__1, "ALPHA", (ftnlen)5);
-	do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer));
-	e_wsfe();
-	goto L220;
-    }
-    s_rsle(&io___27);
-    i__1 = nalf;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_lio(&c__7, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(
-		doublecomplex));
+        fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n");
+        goto L220;
     }
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i,
+   &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i);
+
 /*     Values of BETA */
-    s_rsle(&io___29);
-    do_lio(&c__3, &c__1, (char *)&nbet, (ftnlen)sizeof(integer));
-    e_rsle();
-    if (nbet < 1 || nbet > 7) {
-	s_wsfe(&io___31);
-	do_fio(&c__1, "BETA", (ftnlen)4);
-	do_fio(&c__1, (char *)&c__7, (ftnlen)sizeof(integer));
-	e_wsfe();
-	goto L220;
-    }
-    s_rsle(&io___32);
-    i__1 = nbet;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_lio(&c__7, &c__1, (char *)&bet[i__ - 1], (ftnlen)sizeof(
-		doublecomplex));
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nbet);
+#else
+   sscanf(line,"%d",&nbet);
+#endif
+    if (nalf < 1 || nbet > 7) {
+        fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n");
+        goto L220;
     }
-    e_rsle();
+   fgets(line,80,stdin);
+   sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i,
+   &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i);
 
 /*     Report values of parameters. */
 
-    s_wsfe(&io___34);
-    e_wsfe();
-    s_wsfe(&io___35);
-    i__1 = nidim;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_fio(&c__1, (char *)&idim[i__ - 1], (ftnlen)sizeof(integer));
-    }
-    e_wsfe();
-    s_wsfe(&io___36);
-    i__1 = nalf;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_fio(&c__2, (char *)&alf[i__ - 1], (ftnlen)sizeof(doublereal));
-    }
-    e_wsfe();
-    s_wsfe(&io___37);
-    i__1 = nbet;
-    for (i__ = 1; i__ <= i__1; ++i__) {
-	do_fio(&c__2, (char *)&bet[i__ - 1], (ftnlen)sizeof(doublereal));
-    }
-    e_wsfe();
+    printf("TESTS OF THE DOUBLE PRECISION COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n");
+    printf(" FOR N");
+    for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]);
+    printf("\n");    
+    printf(" FOR ALPHA");
+    for (i__ =1; i__ <=nalf;++i__) printf(" (%lf,%lf)",alf[i__-1].r,alf[i__-1].i);
+    printf("\n");    
+    printf(" FOR BETA");
+    for (i__ =1; i__ <=nbet;++i__) printf(" (%lf,%lf)",bet[i__-1].r,bet[i__-1].i);
+    printf("\n");    
+
     if (! tsterr) {
-	s_wsle(&io___38);
-	e_wsle();
-	s_wsfe(&io___39);
-	e_wsfe();
+      printf(" ERROR-EXITS WILL NOT BE TESTED\n"); 
     }
-    s_wsle(&io___40);
-    e_wsle();
-    s_wsfe(&io___41);
-    do_fio(&c__1, (char *)&thresh, (ftnlen)sizeof(doublereal));
-    e_wsfe();
-    s_wsle(&io___42);
-    e_wsle();
+
+    printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %lf\n",thresh);
     rorder = FALSE_;
     corder = FALSE_;
     if (layout == 2) {
 	rorder = TRUE_;
 	corder = TRUE_;
-	s_wsfe(&io___45);
-	e_wsfe();
+        printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n");
     } else if (layout == 1) {
 	rorder = TRUE_;
-	s_wsfe(&io___46);
-	e_wsfe();
+        printf("ROW-MAJOR DATA LAYOUT IS TESTED\n");
     } else if (layout == 0) {
 	corder = TRUE_;
-	s_wsfe(&io___47);
-	e_wsfe();
+        printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n");
     }
-    s_wsle(&io___48);
-    e_wsle();
 
 /*     Read names of subroutines and flags which indicate */
 /*     whether they are to be tested. */
@@ -935,42 +523,33 @@ static logical c_false = FALSE_;
 /* L20: */
     }
 L30:
-    i__1 = s_rsfe(&io___50);
-    if (i__1 != 0) {
-	goto L60;
-    }
-    i__1 = do_fio(&c__1, snamet, (ftnlen)13);
-    if (i__1 != 0) {
-	goto L60;
-    }
-    i__1 = do_fio(&c__1, (char *)&ltestt, (ftnlen)sizeof(logical));
-    if (i__1 != 0) {
-	goto L60;
+   if (! fgets(line,80,stdin)) {
+        goto L60;
     }
-    i__1 = e_rsfe();
-    if (i__1 != 0) {
-	goto L60;
+   i__1 = sscanf(line,"%12c %c",snamet,&tmpchar);
+   ltestt=FALSE_;
+   if (tmpchar=='T')ltestt=TRUE_;
+    if (i__1 < 2) {
+        goto L60;
     }
     for (i__ = 1; i__ <= 9; ++i__) {
-	if (s_cmp(snamet, snames + (i__ - 1) * 13, (ftnlen)13, (ftnlen)13) == 
-		0) {
-	    goto L50;
-	}
+        if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == 
+                0) {
+            goto L50;
+        }
 /* L40: */
     }
-    s_wsfe(&io___53);
-    do_fio(&c__1, snamet, (ftnlen)13);
-    e_wsfe();
-    s_stop("", (ftnlen)0);
+    printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet);
+    exit(1);
 L50:
     ltest[i__ - 1] = ltestt;
     goto L30;
 
 L60:
-    cl__1.cerr = 0;
+/*    cl__1.cerr = 0;
     cl__1.cunit = 5;
     cl__1.csta = 0;
-    f_clos(&cl__1);
+    f_clos(&cl__1);*/
 
 /*     Compute EPS (the machine precision). */
 
@@ -984,9 +563,7 @@ L70:
     goto L70;
 L80:
     eps += eps;
-    s_wsfe(&io___55);
-    do_fio(&c__1, (char *)&eps, (ftnlen)sizeof(doublereal));
-    e_wsfe();
+    printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps);
 
 /*     Check the reliability of ZMMCH using exact data. */
 
@@ -1023,30 +600,28 @@ L80:
     *(unsigned char *)transb = 'N';
     zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
 	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
-	    &c__6, &c_true);
+	    &c__6, &c_true, (ftnlen)1, (ftnlen)1);
     same = lze_(cc, ct, &n);
     if (! same || err != 0.) {
-	s_wsfe(&io___68);
-	do_fio(&c__1, transa, (ftnlen)1);
-	do_fio(&c__1, transb, (ftnlen)1);
-	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
-	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal));
-	e_wsfe();
-	s_stop("", (ftnlen)0);
+      printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
     }
     *(unsigned char *)transb = 'C';
     zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
 	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
-	    &c__6, &c_true);
+	    &c__6, &c_true, (ftnlen)1, (ftnlen)1);
     same = lze_(cc, ct, &n);
     if (! same || err != 0.) {
-	s_wsfe(&io___69);
-	do_fio(&c__1, transa, (ftnlen)1);
-	do_fio(&c__1, transb, (ftnlen)1);
-	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
-	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal));
-	e_wsfe();
-	s_stop("", (ftnlen)0);
+      printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
     }
     i__1 = n;
     for (j = 1; j <= i__1; ++j) {
@@ -1069,56 +644,48 @@ L80:
     *(unsigned char *)transb = 'N';
     zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
 	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
-	    &c__6, &c_true);
+	    &c__6, &c_true, (ftnlen)1, (ftnlen)1);
     same = lze_(cc, ct, &n);
     if (! same || err != 0.) {
-	s_wsfe(&io___70);
-	do_fio(&c__1, transa, (ftnlen)1);
-	do_fio(&c__1, transb, (ftnlen)1);
-	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
-	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal));
-	e_wsfe();
-	s_stop("", (ftnlen)0);
+      printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
     }
     *(unsigned char *)transb = 'C';
     zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
 	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
-	    &c__6, &c_true);
+	    &c__6, &c_true, (ftnlen)1, (ftnlen)1);
     same = lze_(cc, ct, &n);
     if (! same || err != 0.) {
-	s_wsfe(&io___71);
-	do_fio(&c__1, transa, (ftnlen)1);
-	do_fio(&c__1, transb, (ftnlen)1);
-	do_fio(&c__1, (char *)&same, (ftnlen)sizeof(logical));
-	do_fio(&c__1, (char *)&err, (ftnlen)sizeof(doublereal));
-	e_wsfe();
-	s_stop("", (ftnlen)0);
+      printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
     }
 
 /*     Test each subroutine in turn. */
 
     for (isnum = 1; isnum <= 9; ++isnum) {
-	s_wsle(&io___73);
-	e_wsle();
 	if (! ltest[isnum - 1]) {
 /*           Subprogram is not to be tested. */
-	    s_wsfe(&io___74);
-	    do_fio(&c__1, snames + (isnum - 1) * 13, (ftnlen)13);
-	    e_wsfe();
+           printf("%12s WAS NOT TESTED\n",snames[isnum-1]);
 	} else {
-	    s_copy(srnamc_1.srnamt, snames + (isnum - 1) * 13, (ftnlen)13, (
-		    ftnlen)13);
+	    s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, (
+		    ftnlen)12);
 /*           Test error exits. */
 	    if (tsterr) {
-		cz3chke_(snames + (isnum - 1) * 13);
-		s_wsle(&io___75);
-		e_wsle();
+		cz3chke_(snames[isnum - 1], (ftnlen)12);
 	    }
 /*           Test computations. */
 	    infoc_1.infot = 0;
 	    infoc_1.ok = TRUE_;
 	    fatal = FALSE_;
-	    switch (isnum) {
+	    switch ((int)isnum) {
 		case 1:  goto L140;
 		case 2:  goto L150;
 		case 3:  goto L150;
@@ -1132,76 +699,76 @@ L80:
 /*           Test ZGEMM, 01. */
 L140:
 	    if (corder) {
-		zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
-			 cc, cs, ct, g, &c__0);
+			 cc, cs, ct, g, &c__0, (ftnlen)12);
 	    }
 	    if (rorder) {
-		zchk1_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
-			 cc, cs, ct, g, &c__1);
+			 cc, cs, ct, g, &c__1, (ftnlen)12);
 	    }
 	    goto L190;
 /*           Test ZHEMM, 02, ZSYMM, 03. */
 L150:
 	    if (corder) {
-		zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
-			 cc, cs, ct, g, &c__0);
+			 cc, cs, ct, g, &c__0, (ftnlen)12);
 	    }
 	    if (rorder) {
-		zchk2_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
-			 cc, cs, ct, g, &c__1);
+			 cc, cs, ct, g, &c__1, (ftnlen)12);
 	    }
 	    goto L190;
 /*           Test ZTRMM, 04, ZTRSM, 05. */
 L160:
 	    if (corder) {
-		zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
-			c__0);
+			c__0, (ftnlen)12);
 	    }
 	    if (rorder) {
-		zchk3_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
-			c__1);
+			c__1, (ftnlen)12);
 	    }
 	    goto L190;
 /*           Test ZHERK, 06, ZSYRK, 07. */
 L170:
 	    if (corder) {
-		zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
-			 cc, cs, ct, g, &c__0);
+			 cc, cs, ct, g, &c__0, (ftnlen)12);
 	    }
 	    if (rorder) {
-		zchk4_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
-			 cc, cs, ct, g, &c__1);
+			 cc, cs, ct, g, &c__1, (ftnlen)12);
 	    }
 	    goto L190;
 /*           Test ZHER2K, 08, ZSYR2K, 09. */
 L180:
 	    if (corder) {
-		zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
-			ct, g, w, &c__0);
+			ct, g, w, &c__0, (ftnlen)12);
 	    }
 	    if (rorder) {
-		zchk5_(snames + (isnum - 1) * 13, &eps, &thresh, &c__6, &ntra,
+		zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
 			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
 			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
-			ct, g, w, &c__1);
+			ct, g, w, &c__1, (ftnlen)12);
 	    }
 	    goto L190;
 
@@ -1212,122 +779,66 @@ L190:
 	}
 /* L200: */
     }
-    s_wsfe(&io___82);
-    e_wsfe();
+    printf("\nEND OF TESTS\n");
     goto L230;
 
 L210:
-    s_wsfe(&io___83);
-    e_wsfe();
+    printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n");
     goto L230;
 
 L220:
-    s_wsfe(&io___84);
-    e_wsfe();
+    printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n");
+    printf("****** TESTS ABANDONED ******\n");
 
 L230:
     if (trace) {
-	cl__1.cerr = 0;
+/*	cl__1.cerr = 0;
 	cl__1.cunit = ntra;
 	cl__1.csta = 0;
-	f_clos(&cl__1);
+	f_clos(&cl__1);*/
     }
-    cl__1.cerr = 0;
+/*    cl__1.cerr = 0;
     cl__1.cunit = 6;
     cl__1.csta = 0;
-    f_clos(&cl__1);
-    s_stop("", (ftnlen)0);
-
+    f_clos(&cl__1);*/
+    exit(0);
 
 /*     End of ZBLAT3. */
 
-    return 0;
 } /* MAIN__ */
 
-/* Subroutine */ int zchk1_(char *sname, doublereal *eps, doublereal *thresh, 
-	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
-	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
-	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
-	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
-	g, integer *iorder)
+/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
-    static char ich[3] = "NTC";
-
-    /* Format strings */
-    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
+    static char ich[3+1] = "NTC";
 
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6, i__7, i__8;
-    alist al__1;
 
     /* Local variables */
-    extern /* Subroutine */ int czgemm3m_(integer *, char *, char *, integer *
-	    , integer *, integer *, doublecomplex *, doublecomplex *, integer 
-	    *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *);
-    doublecomplex beta;
-    integer ldas, ldbs, ldcs;
-    logical same, null;
-    integer i__, k, m, n;
-    doublecomplex alpha;
-    logical isame[13], trana, tranb;
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
-	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-	     logical *, doublecomplex *);
-    integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
-	    logical *);
-    logical reset;
-    integer ia, ib;
-    extern /* Subroutine */ int zprcn1_(integer *, integer *, char *, integer 
-	    *, char *, char *, integer *, integer *, integer *, doublecomplex 
-	    *, integer *, integer *, doublecomplex *, integer *);
-    integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
-    char tranas[1], tranbs[1], transa[1], transb[1];
-    doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *);
-    integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
-    doublecomplex als, bls;
-    doublereal err;
-    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
-
-    /* Fortran I/O blocks */
-    static cilist io___128 = { 0, 0, 0, fmt_9994, 0 };
-    static cilist io___131 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___133 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___134 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___135 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___136 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___137 = { 0, 0, 0, fmt_9996, 0 };
-
-
+    static doublecomplex beta;
+    static integer ldas, ldbs, ldcs;
+    static logical same, null;
+    static integer i__, k, m, n;
+    static doublecomplex alpha;
+    static logical isame[13], trana, tranb;
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static logical reset;
+    static integer ia, ib;
+    extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
+    extern /* Subroutine */ void czgemm3m_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static char tranas[1], tranbs[1], transa[1], transb[1];
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
+    static doublecomplex als, bls;
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZGEMM. */
 
@@ -1339,6 +850,17 @@ L230:
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
     /* Parameter adjustments */
     --idim;
     --alf;
@@ -1362,6 +884,7 @@ L230:
     a -= a_offset;
 
     /* Function Body */
+/*     .. Executable Statements .. */
 
     nargs = 13;
     nc = 0;
@@ -1418,7 +941,8 @@ L230:
 /*                 Generate the matrix A. */
 
 		    zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[
-			    1], &lda, &reset, &c_b1);
+			    1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (
+			    ftnlen)1);
 
 		    for (icb = 1; icb <= 3; ++icb) {
 			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
@@ -1447,7 +971,8 @@ L230:
 /*                    Generate the matrix B. */
 
 			zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, &
-				bb[1], &ldb, &reset, &c_b1);
+				bb[1], &ldb, &reset, &c_b1, (ftnlen)2, (
+				ftnlen)1, (ftnlen)1);
 
 			i__4 = *nalf;
 			for (ia = 1; ia <= i__4; ++ia) {
@@ -1462,7 +987,8 @@ L230:
 /*                          Generate the matrix C. */
 
 				zmake_("ge", " ", " ", &m, &n, &c__[c_offset],
-					 nmax, &cc[1], &ldc, &reset, &c_b1);
+					 nmax, &cc[1], &ldc, &reset, &c_b1, (
+					ftnlen)2, (ftnlen)1, (ftnlen)1);
 
 				++nc;
 
@@ -1511,23 +1037,23 @@ L230:
 				if (*trace) {
 				    zprcn1_(ntra, &nc, sname, iorder, transa, 
 					    transb, &m, &n, &k, &alpha, &lda, 
-					    &ldb, &beta, &ldc);
+					    &ldb, &beta, &ldc, (ftnlen)12, (
+					    ftnlen)1, (ftnlen)1);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
-				czgemm3m_(iorder, transa, transb, &m, &n, &k, 
-					&alpha, &aa[1], &lda, &bb[1], &ldb, &
-					beta, &cc[1], &ldc);
+				czgemm3m_(iorder, transa, transb, &m, &n, &k, &
+					alpha, &aa[1], &lda, &bb[1], &ldb, &
+					beta, &cc[1], &ldc, (ftnlen)1, (
+					ftnlen)1);
 
 /*                          Check if error-exit was taken incorrectly. */
 
 				if (! infoc_1.ok) {
-				    io___128.ciunit = *nout;
-				    s_wsfe(&io___128);
-				    e_wsfe();
+                                    printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				    *fatal = TRUE_;
 				    goto L120;
 				}
@@ -1553,7 +1079,8 @@ L230:
 				    isame[11] = lze_(&cs[1], &cc[1], &lcc);
 				} else {
 				    isame[11] = lzeres_("ge", " ", &m, &n, &
-					    cs[1], &cc[1], &ldc);
+					    cs[1], &cc[1], &ldc, (ftnlen)2, (
+					    ftnlen)1);
 				}
 				isame[12] = ldcs == ldc;
 
@@ -1565,11 +1092,7 @@ L230:
 				for (i__ = 1; i__ <= i__6; ++i__) {
 				    same = same && isame[i__ - 1];
 				    if (! isame[i__ - 1]) {
-					io___131.ciunit = *nout;
-					s_wsfe(&io___131);
-					do_fio(&c__1, (char *)&i__, (ftnlen)
-						sizeof(integer));
-					e_wsfe();
+	                                printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
 				    }
 /* L40: */
 				}
@@ -1586,7 +1109,8 @@ L230:
 					     &a[a_offset], nmax, &b[b_offset],
 					     nmax, &beta, &c__[c_offset], 
 					    nmax, &ct[1], &g[1], &cc[1], &ldc,
-					     eps, &err, fatal, nout, &c_true);
+					     eps, &err, fatal, nout, &c_true, 
+					    (ftnlen)1, (ftnlen)1);
 				    errmax = f2cmax(errmax,err);
 /*                             If got really bad answer, report and */
 /*                             return. */
@@ -1623,76 +1147,44 @@ L100:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___133.ciunit = *nout;
-	    s_wsfe(&io___133);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___134.ciunit = *nout;
-	    s_wsfe(&io___134);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___135.ciunit = *nout;
-	    s_wsfe(&io___135);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___136.ciunit = *nout;
-	    s_wsfe(&io___136);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L130;
 
 L120:
-    io___137.ciunit = *nout;
-    s_wsfe(&io___137);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
     zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, &
-	    lda, &ldb, &beta, &ldc);
+	    lda, &ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
 
 L130:
     return 0;
 
-/* L9995: */
+/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */
+/*     $     3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */
+/*     $     ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */
 
 /*     End of ZCHK1. */
 
 } /* zchk1_ */
 
 
-/* Subroutine */ int zprcn1_(integer *nout, integer *nc, char *sname, integer 
-	*iorder, char *transa, char *transb, integer *m, integer *n, integer *
-	k, doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *
-	beta, integer *ldc)
+/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002,"
-	    "a14,\002,\002,a14,\002,\002)";
-    static char fmt_9994[] = "(10x,3(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002) , A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002"
-	    ",\002,f4.1,\002) , C,\002,i3,\002).\002)";
-
     /* Local variables */
-    char crc[14], cta[14], ctb[14];
-
-    /* Fortran I/O blocks */
-    static cilist io___141 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___142 = { 0, 0, 0, fmt_9994, 0 };
-
+    static char crc[14], cta[14], ctb[14];
 
     if (*(unsigned char *)transa == 'N') {
 	s_copy(cta, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
@@ -1713,123 +1205,52 @@ L130:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___141.ciunit = *nout;
-    s_wsfe(&io___141);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cta, (ftnlen)14);
-    do_fio(&c__1, ctb, (ftnlen)14);
-    e_wsfe();
-    io___142.ciunit = *nout;
-    s_wsfe(&io___142);
-    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
-    return 0;
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cta,ctb);
+    printf("%d %d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+
+return 0;
 } /* zprcn1_ */
 
 
-/* Subroutine */ int zchk2_(char *sname, doublereal *eps, doublereal *thresh, 
-	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
-	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
-	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
-	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
-	g, integer *iorder)
+/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
-    static char ichs[2] = "LR";
-    static char ichu[2] = "UL";
-
-    /* Format strings */
-    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
+    static char ichs[2+1] = "LR";
+    static char ichu[2+1] = "UL";
 
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6, i__7;
-    alist al__1;
 
     /* Local variables */
-    doublecomplex beta;
-    integer ldas, ldbs, ldcs;
-    logical same;
-    char side[1];
-    logical conj, left, null;
-    char uplo[1];
-    integer i__, m, n;
-    doublecomplex alpha;
-    logical isame[13];
-    char sides[1];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
-	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-	     logical *, doublecomplex *);
-    integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
-	    logical *);
-    logical reset;
-    char uplos[1];
-    integer ia, ib;
-    extern /* Subroutine */ int zprcn2_(integer *, integer *, char *, integer 
-	    *, char *, char *, integer *, integer *, doublecomplex *, integer 
-	    *, integer *, doublecomplex *, integer *);
-    integer na, nc, im, in, ms, ns;
-    extern /* Subroutine */ int czhemm_(integer *, char *, char *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *);
-    doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *);
-    extern /* Subroutine */ int czsymm_(integer *, char *, char *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *);
-    integer laa, lbb, lda, lcc, ldb, ldc, ics;
-    doublecomplex als, bls;
-    integer icu;
-    doublereal err;
-    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
-
-    /* Fortran I/O blocks */
-    static cilist io___181 = { 0, 0, 0, fmt_9994, 0 };
-    static cilist io___184 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___186 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___187 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___188 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___189 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___190 = { 0, 0, 0, fmt_9996, 0 };
-
-
+    static doublecomplex beta;
+    static integer ldas, ldbs, ldcs;
+    static logical same;
+    static char side[1];
+    static logical isconj, left, null;
+    static char uplo[1];
+    static integer i__, m, n;
+    static doublecomplex alpha;
+    static logical isame[13];
+    static char sides[1];
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static logical reset;
+    static char uplos[1];
+    static integer ia, ib;
+    extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer na, nc, im, in, ms, ns;
+    extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static integer laa, lbb, lda, lcc, ldb, ldc, ics;
+    static doublecomplex als, bls;
+    static integer icu;
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHEMM and ZSYMM. */
 
@@ -1841,6 +1262,17 @@ L130:
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
     /* Parameter adjustments */
     --idim;
     --alf;
@@ -1864,7 +1296,8 @@ L130:
     a -= a_offset;
 
     /* Function Body */
-    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+/*     .. Executable Statements .. */
+    isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
 
     nargs = 12;
     nc = 0;
@@ -1903,7 +1336,7 @@ L130:
 /*           Generate the matrix B. */
 
 	    zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &
-		    reset, &c_b1);
+		    reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
 	    for (ics = 1; ics <= 2; ++ics) {
 		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
@@ -1931,7 +1364,8 @@ L130:
 /*                 Generate the hermitian or symmetric matrix A. */
 
 		    zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax,
-			     &aa[1], &lda, &reset, &c_b1);
+			     &aa[1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen)
+			    1, (ftnlen)1);
 
 		    i__3 = *nalf;
 		    for (ia = 1; ia <= i__3; ++ia) {
@@ -1946,7 +1380,8 @@ L130:
 /*                       Generate the matrix C. */
 
 			    zmake_("ge", " ", " ", &m, &n, &c__[c_offset], 
-				    nmax, &cc[1], &ldc, &reset, &c_b1);
+				    nmax, &cc[1], &ldc, &reset, &c_b1, (
+				    ftnlen)2, (ftnlen)1, (ftnlen)1);
 
 			    ++nc;
 
@@ -1992,30 +1427,28 @@ L130:
 			    if (*trace) {
 				zprcn2_(ntra, &nc, sname, iorder, side, uplo, 
 					&m, &n, &alpha, &lda, &ldb, &beta, &
-					ldc)
+					ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1)
 					;
 			    }
 			    if (*rewi) {
-				al__1.aerr = 0;
+/*				al__1.aerr = 0;
 				al__1.aunit = *ntra;
-				f_rew(&al__1);
+				f_rew(&al__1);*/
 			    }
-			    if (conj) {
+			    if (isconj) {
 				czhemm_(iorder, side, uplo, &m, &n, &alpha, &
 					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
-					1], &ldc);
+					1], &ldc, (ftnlen)1, (ftnlen)1);
 			    } else {
 				czsymm_(iorder, side, uplo, &m, &n, &alpha, &
 					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
-					1], &ldc);
+					1], &ldc, (ftnlen)1, (ftnlen)1);
 			    }
 
 /*                       Check if error-exit was taken incorrectly. */
 
 			    if (! infoc_1.ok) {
-				io___181.ciunit = *nout;
-				s_wsfe(&io___181);
-				e_wsfe();
+			    	printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				*fatal = TRUE_;
 				goto L110;
 			    }
@@ -2038,7 +1471,7 @@ L130:
 				isame[10] = lze_(&cs[1], &cc[1], &lcc);
 			    } else {
 				isame[10] = lzeres_("ge", " ", &m, &n, &cs[1],
-					 &cc[1], &ldc);
+					 &cc[1], &ldc, (ftnlen)2, (ftnlen)1);
 			    }
 			    isame[11] = ldcs == ldc;
 
@@ -2050,11 +1483,7 @@ L130:
 			    for (i__ = 1; i__ <= i__5; ++i__) {
 				same = same && isame[i__ - 1];
 				if (! isame[i__ - 1]) {
-				    io___184.ciunit = *nout;
-				    s_wsfe(&io___184);
-				    do_fio(&c__1, (char *)&i__, (ftnlen)
-					    sizeof(integer));
-				    e_wsfe();
+                                printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
 				}
 /* L40: */
 			    }
@@ -2072,13 +1501,15 @@ L130:
 					    a_offset], nmax, &b[b_offset], 
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
-					     &err, fatal, nout, &c_true);
+					     &err, fatal, nout, &c_true, (
+					    ftnlen)1, (ftnlen)1);
 				} else {
 				    zmmch_("N", "N", &m, &n, &n, &alpha, &b[
 					    b_offset], nmax, &a[a_offset], 
 					    nmax, &beta, &c__[c_offset], nmax,
 					     &ct[1], &g[1], &cc[1], &ldc, eps,
-					     &err, fatal, nout, &c_true);
+					     &err, fatal, nout, &c_true, (
+					    ftnlen)1, (ftnlen)1);
 				}
 				errmax = f2cmax(errmax,err);
 /*                          If got really bad answer, report and */
@@ -2112,76 +1543,44 @@ L90:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___186.ciunit = *nout;
-	    s_wsfe(&io___186);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___187.ciunit = *nout;
-	    s_wsfe(&io___187);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___188.ciunit = *nout;
-	    s_wsfe(&io___188);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___189.ciunit = *nout;
-	    s_wsfe(&io___189);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L120;
 
 L110:
-    io___190.ciunit = *nout;
-    s_wsfe(&io___190);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
     zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, 
-	    &beta, &ldc);
+	    &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
 
 L120:
     return 0;
 
-/* L9995: */
+/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */
+/*     $      ',', F4.1, '), C,', I3, ')    .' ) */
 
 /*     End of ZCHK2. */
 
 } /* zchk2_ */
 
 
-/* Subroutine */ int zprcn2_(integer *nout, integer *nc, char *sname, integer 
-	*iorder, char *side, char *uplo, integer *m, integer *n, 
-	doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta,
-	 integer *ldc)
+/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002,"
-	    "a14,\002,\002,a14,\002,\002)";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002), A,\002,i3,\002, B,\002,i3,\002, (\002,f4.1,\002,"
-	    "\002,f4.1,\002), \002,\002C,\002,i3,\002).\002)";
-
     /* Local variables */
-    char cs[14], cu[14], crc[14];
-
-    /* Fortran I/O blocks */
-    static cilist io___194 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___195 = { 0, 0, 0, fmt_9994, 0 };
-
+    static char cs[14], cu[14], crc[14];
 
     if (*(unsigned char *)side == 'L') {
 	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
@@ -2198,123 +1597,57 @@ L120:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___194.ciunit = *nout;
-    s_wsfe(&io___194);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cs, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    e_wsfe();
-    io___195.ciunit = *nout;
-    s_wsfe(&io___195);
-    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
-    return 0;
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu);
+    printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+
+return 0;
 } /* zprcn2_ */
 
 
-/* Subroutine */ int zchk3_(char *sname, doublereal *eps, doublereal *thresh, 
-	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *nmax, doublecomplex *a, doublecomplex *aa, 
-	doublecomplex *as, doublecomplex *b, doublecomplex *bb, doublecomplex 
-	*bs, doublecomplex *ct, doublereal *g, doublecomplex *c__, integer *
-	iorder)
+/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
-    static char ichu[2] = "UL";
-    static char icht[3] = "NTC";
-    static char ichd[2] = "UN";
-    static char ichs[2] = "LR";
-
-    /* Format strings */
-    static char fmt_9994[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
+    static char ichu[2+1] = "UL";
+    static char icht[3+1] = "NTC";
+    static char ichd[2+1] = "UN";
+    static char ichs[2+1] = "LR";
 
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6, i__7;
     doublecomplex z__1;
-    alist al__1;
 
     /* Local variables */
-    char diag[1];
-    integer ldas, ldbs;
-    logical same;
-    char side[1];
-    logical left, null;
-    char uplo[1];
-    integer i__, j, m, n;
-    doublecomplex alpha;
-    char diags[1];
-    logical isame[13];
-    char sides[1];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
-	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-	     logical *, doublecomplex *);
-    integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
-	    logical *);
-    logical reset;
-    char uplos[1];
-    integer ia, na;
-    extern /* Subroutine */ int zprcn3_(integer *, integer *, char *, integer 
-	    *, char *, char *, char *, char *, integer *, integer *, 
-	    doublecomplex *, integer *, integer *);
-    integer nc, im, in, ms, ns;
-    char tranas[1], transa[1];
-    doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *);
-    extern /* Subroutine */ int cztrmm_(integer *, char *, char *, char *, 
-	    char *, integer *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, integer *), cztrsm_(integer *, char *, char *, char *, char *, 
-	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
-	     doublecomplex *, integer *);
-    integer laa, icd, lbb, lda, ldb, ics;
-    doublecomplex als;
-    integer ict, icu;
-    doublereal err;
-    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
-
-    /* Fortran I/O blocks */
-    static cilist io___236 = { 0, 0, 0, fmt_9994, 0 };
-    static cilist io___239 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___241 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___242 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___243 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___244 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___245 = { 0, 0, 0, fmt_9996, 0 };
-
-
+    static char diag[1];
+    static integer ldas, ldbs;
+    static logical same;
+    static char side[1];
+    static logical left, null;
+    static char uplo[1];
+    static integer i__, j, m, n;
+    static doublecomplex alpha;
+    static char diags[1];
+    static logical isame[13];
+    static char sides[1];
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static logical reset;
+    static char uplos[1];
+    static integer ia, na;
+    extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen);
+    static integer nc, im, in, ms, ns;
+    static char tranas[1], transa[1];
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
+    static integer laa, icd, lbb, lda, ldb, ics;
+    static doublecomplex als;
+    static integer ict, icu;
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZTRMM and ZTRSM. */
 
@@ -2326,6 +1659,17 @@ L120:
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
     /* Parameter adjustments */
     --idim;
     --alf;
@@ -2346,6 +1690,7 @@ L120:
     a -= a_offset;
 
     /* Function Body */
+/*     .. Executable Statements .. */
 
     nargs = 11;
     nc = 0;
@@ -2421,12 +1766,14 @@ L120:
 
 				zmake_("tr", uplo, diag, &na, &na, &a[
 					a_offset], nmax, &aa[1], &lda, &reset,
-					 &c_b1);
+					 &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)
+					1);
 
 /*                          Generate the matrix B. */
 
 				zmake_("ge", " ", " ", &m, &n, &b[b_offset], 
-					nmax, &bb[1], &ldb, &reset, &c_b1);
+					nmax, &bb[1], &ldb, &reset, &c_b1, (
+					ftnlen)2, (ftnlen)1, (ftnlen)1);
 
 				++nc;
 
@@ -2471,42 +1818,42 @@ L120:
 					zprcn3_(ntra, &nc, sname, iorder, 
 						side, uplo, transa, diag, &m, 
 						&n, &alpha, &lda, &ldb, (
-						ftnlen)13, (ftnlen)1, (ftnlen)
+						ftnlen)12, (ftnlen)1, (ftnlen)
 						1, (ftnlen)1, (ftnlen)1);
 				    }
 				    if (*rewi) {
-					al__1.aerr = 0;
+/*					al__1.aerr = 0;
 					al__1.aunit = *ntra;
-					f_rew(&al__1);
+					f_rew(&al__1);*/
 				    }
 				    cztrmm_(iorder, side, uplo, transa, diag, 
 					    &m, &n, &alpha, &aa[1], &lda, &bb[
-					    1], &ldb);
+					    1], &ldb, (ftnlen)1, (ftnlen)1, (
+					    ftnlen)1, (ftnlen)1);
 				} else if (s_cmp(sname + 9, "sm", (ftnlen)2, (
 					ftnlen)2) == 0) {
 				    if (*trace) {
 					zprcn3_(ntra, &nc, sname, iorder, 
 						side, uplo, transa, diag, &m, 
 						&n, &alpha, &lda, &ldb, (
-						ftnlen)13, (ftnlen)1, (ftnlen)
+						ftnlen)12, (ftnlen)1, (ftnlen)
 						1, (ftnlen)1, (ftnlen)1);
 				    }
 				    if (*rewi) {
-					al__1.aerr = 0;
+/*					al__1.aerr = 0;
 					al__1.aunit = *ntra;
-					f_rew(&al__1);
+					f_rew(&al__1);*/
 				    }
 				    cztrsm_(iorder, side, uplo, transa, diag, 
 					    &m, &n, &alpha, &aa[1], &lda, &bb[
-					    1], &ldb);
+					    1], &ldb, (ftnlen)1, (ftnlen)1, (
+					    ftnlen)1, (ftnlen)1);
 				}
 
 /*                          Check if error-exit was taken incorrectly. */
 
 				if (! infoc_1.ok) {
-				    io___236.ciunit = *nout;
-				    s_wsfe(&io___236);
-				    e_wsfe();
+                                    printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				    *fatal = TRUE_;
 				    goto L150;
 				}
@@ -2531,7 +1878,8 @@ L120:
 				    isame[9] = lze_(&bs[1], &bb[1], &lbb);
 				} else {
 				    isame[9] = lzeres_("ge", " ", &m, &n, &bs[
-					    1], &bb[1], &ldb);
+					    1], &bb[1], &ldb, (ftnlen)2, (
+					    ftnlen)1);
 				}
 				isame[10] = ldbs == ldb;
 
@@ -2543,11 +1891,7 @@ L120:
 				for (i__ = 1; i__ <= i__4; ++i__) {
 				    same = same && isame[i__ - 1];
 				    if (! isame[i__ - 1]) {
-					io___239.ciunit = *nout;
-					s_wsfe(&io___239);
-					do_fio(&c__1, (char *)&i__, (ftnlen)
-						sizeof(integer));
-					e_wsfe();
+                                        printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
 				    }
 /* L50: */
 				}
@@ -2578,7 +1922,8 @@ L120:
 						    c_b1, &c__[c_offset], 
 						    nmax, &ct[1], &g[1], &bb[
 						    1], &ldb, eps, &err, 
-						    fatal, nout, &c_true);
+						    fatal, nout, &c_true, (
+						    ftnlen)1, (ftnlen)1);
 					}
 				    } else if (s_cmp(sname + 9, "sm", (ftnlen)
 					    2, (ftnlen)2) == 0) {
@@ -2612,7 +1957,8 @@ L120:
 						    c_b1, &b[b_offset], nmax, 
 						    &ct[1], &g[1], &bb[1], &
 						    ldb, eps, &err, fatal, 
-						    nout, &c_false);
+						    nout, &c_false, (ftnlen)1,
+						     (ftnlen)1);
 					} else {
 					    zmmch_("N", transa, &m, &n, &n, &
 						    c_b2, &c__[c_offset], 
@@ -2620,7 +1966,8 @@ L120:
 						    &c_b1, &b[b_offset], nmax,
 						     &ct[1], &g[1], &bb[1], &
 						    ldb, eps, &err, fatal, 
-						    nout, &c_false);
+						    nout, &c_false, (ftnlen)1,
+						     (ftnlen)1);
 					}
 				    }
 				    errmax = f2cmax(errmax,err);
@@ -2657,77 +2004,48 @@ L130:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___241.ciunit = *nout;
-	    s_wsfe(&io___241);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___242.ciunit = *nout;
-	    s_wsfe(&io___242);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___243.ciunit = *nout;
-	    s_wsfe(&io___243);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___244.ciunit = *nout;
-	    s_wsfe(&io___244);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L160;
 
 L150:
-    io___245.ciunit = *nout;
-    s_wsfe(&io___245);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
     if (*trace) {
 	zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &
-		alpha, &lda, &ldb);
+		alpha, &lda, &ldb, (ftnlen)12, (ftnlen)1, (ftnlen)1, (ftnlen)
+		1, (ftnlen)1);
     }
 
 L160:
     return 0;
 
-/* L9995: */
+/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $     '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ')         ', */
+/*     $      '      .' ) */
 
 /*     End of ZCHK3. */
 
 } /* zchk3_ */
 
 
-/* Subroutine */ int zprcn3_(integer *nout, integer *nc, char *sname, integer 
-	*iorder, char *side, char *uplo, char *transa, char *diag, integer *m,
-	 integer *n, doublecomplex *alpha, integer *lda, integer *ldb)
+/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,a15,\002,\002,"
-	    "a14,\002,\002,a14,\002,\002)";
-    static char fmt_9994[] = "(10x,2(a15,\002,\002),2(i3,\002,\002),\002 "
-	    "(\002,f4.1,\002,\002,f4.1,\002), A,\002,i3,\002, B,\002,i3,\002)."
-	    "\002)";
 
     /* Local variables */
-    char ca[14], cd[14], cs[14], cu[14], crc[14];
-
-    /* Fortran I/O blocks */
-    static cilist io___251 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___252 = { 0, 0, 0, fmt_9994, 0 };
-
+    static char ca[14], cd[14], cs[14], cu[14], crc[14];
 
     if (*(unsigned char *)side == 'L') {
 	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
@@ -2756,134 +2074,61 @@ L160:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___251.ciunit = *nout;
-    s_wsfe(&io___251);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cs, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    e_wsfe();
-    io___252.ciunit = *nout;
-    s_wsfe(&io___252);
-    do_fio(&c__1, ca, (ftnlen)14);
-    do_fio(&c__1, cd, (ftnlen)14);
-    do_fio(&c__1, (char *)&(*m), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    e_wsfe();
-    return 0;
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu);
+    printf("         %s %s %d %d (%4.1lf,%4.1lf) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb);
+
+return 0;
 } /* zprcn3_ */
 
 
-/* Subroutine */ int zchk4_(char *sname, doublereal *eps, doublereal *thresh, 
-	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
-	a, doublecomplex *aa, doublecomplex *as, doublecomplex *b, 
-	doublecomplex *bb, doublecomplex *bs, doublecomplex *c__, 
-	doublecomplex *cc, doublecomplex *cs, doublecomplex *ct, doublereal *
-	g, integer *iorder)
+/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
-    static char icht[2] = "NC";
-    static char ichu[2] = "UL";
-
-    /* Format strings */
-    static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9995[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
-	    " \002,i3)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
+    static char icht[2+1] = "NC";
+    static char ichu[2+1] = "UL";
 
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6, i__7;
     doublecomplex z__1;
-    alist al__1;
 
     /* Local variables */
-    doublecomplex beta;
-    integer ldas, ldcs;
-    logical same, conj;
-    doublecomplex bets;
-    doublereal rals;
-    logical tran, null;
-    char uplo[1];
-    integer i__, j, k, n;
-    doublecomplex alpha;
-    doublereal rbeta;
-    logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
-	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-	     logical *, doublecomplex *);
-    integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
-	    logical *);
-    doublereal rbets;
-    logical reset;
-    char trans[1];
-    logical upper;
-    char uplos[1];
-    integer ia, ib, jc, ma, na;
-    extern /* Subroutine */ int zprcn4_(integer *, integer *, char *, integer 
-	    *, char *, char *, integer *, integer *, doublecomplex *, integer 
-	    *, doublecomplex *, integer *);
-    integer nc;
-    extern /* Subroutine */ int zprcn6_(integer *, integer *, char *, integer 
-	    *, char *, char *, integer *, integer *, doublereal *, integer *, 
-	    doublereal *, integer *);
-    integer ik, in, jj, lj, ks, ns;
-    doublereal ralpha;
-    extern /* Subroutine */ int czherk_(integer *, char *, char *, integer *, 
-	    integer *, doublereal *, doublecomplex *, integer *, doublereal *,
-	     doublecomplex *, integer *);
-    doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *);
-    char transs[1], transt[1];
-    extern /* Subroutine */ int czsyrk_(integer *, char *, char *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *);
-    integer laa, lda, lcc, ldc;
-    doublecomplex als;
-    integer ict, icu;
-    doublereal err;
-    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
-
-    /* Fortran I/O blocks */
-    static cilist io___294 = { 0, 0, 0, fmt_9992, 0 };
-    static cilist io___297 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___304 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___305 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___306 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___307 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___308 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___309 = { 0, 0, 0, fmt_9996, 0 };
-
-
+    static doublecomplex beta;
+    static integer ldas, ldcs;
+    static logical same, isconj;
+    static doublecomplex bets;
+    static doublereal rals;
+    static logical tran, null;
+    static char uplo[1];
+    static integer i__, j, k, n;
+    static doublecomplex alpha;
+    static doublereal rbeta;
+    static logical isame[13];
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static doublereal rbets;
+    static logical reset;
+    static char trans[1];
+    static logical upper;
+    static char uplos[1];
+    static integer ia, ib, jc, ma, na;
+    extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer nc;
+    extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer ik, in, jj, lj, ks, ns;
+    static doublereal ralpha;
+    extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static char transs[1], transt[1];
+    extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static integer laa, lda, lcc, ldc;
+    static doublecomplex als;
+    static integer ict, icu;
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHERK and ZSYRK. */
 
@@ -2895,6 +2140,17 @@ L160:
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
     /* Parameter adjustments */
     --idim;
     --alf;
@@ -2918,12 +2174,15 @@ L160:
     a -= a_offset;
 
     /* Function Body */
-    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+/*     .. Executable Statements .. */
+    isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
 
     nargs = 10;
     nc = 0;
     reset = TRUE_;
     errmax = 0.;
+    rals = 1.;
+    rbets = 1.;
 
     i__1 = *nidim;
     for (in = 1; in <= i__1; ++in) {
@@ -2946,7 +2205,7 @@ L160:
 	    for (ict = 1; ict <= 2; ++ict) {
 		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
 		tran = *(unsigned char *)trans == 'C';
-		if (tran && ! conj) {
+		if (tran && ! isconj) {
 		    *(unsigned char *)trans = 'T';
 		}
 		if (tran) {
@@ -2970,7 +2229,7 @@ L160:
 /*              Generate the matrix A. */
 
 		zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], &
-			lda, &reset, &c_b1);
+			lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
 		for (icu = 1; icu <= 2; ++icu) {
 		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
@@ -2980,7 +2239,7 @@ L160:
 		    for (ia = 1; ia <= i__3; ++ia) {
 			i__4 = ia;
 			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
-			if (conj) {
+			if (isconj) {
 			    ralpha = alpha.r;
 			    z__1.r = ralpha, z__1.i = 0.;
 			    alpha.r = z__1.r, alpha.i = z__1.i;
@@ -2990,22 +2249,22 @@ L160:
 			for (ib = 1; ib <= i__4; ++ib) {
 			    i__5 = ib;
 			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
-			    if (conj) {
+			    if (isconj) {
 				rbeta = beta.r;
 				z__1.r = rbeta, z__1.i = 0.;
 				beta.r = z__1.r, beta.i = z__1.i;
 			    }
 			    null = n <= 0;
-			    if (conj) {
-				null = null || (k <= 0 || ralpha == 0.) && 
-					rbeta == 1.;
+			    if (isconj) {
+				null = null ||( (k <= 0 || ralpha == 0.) && 
+					rbeta == 1.);
 			    }
 
 /*                       Generate the matrix C. */
 
 			    zmake_(sname + 7, uplo, " ", &n, &n, &c__[
 				    c_offset], nmax, &cc[1], &ldc, &reset, &
-				    c_b1);
+				    c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
 			    ++nc;
 
@@ -3016,7 +2275,7 @@ L160:
 				    trans;
 			    ns = n;
 			    ks = k;
-			    if (conj) {
+			    if (isconj) {
 				rals = ralpha;
 			    } else {
 				als.r = alpha.r, als.i = alpha.i;
@@ -3030,7 +2289,7 @@ L160:
 /* L10: */
 			    }
 			    ldas = lda;
-			    if (conj) {
+			    if (isconj) {
 				rbets = rbeta;
 			    } else {
 				bets.r = beta.r, bets.i = beta.i;
@@ -3047,40 +2306,42 @@ L160:
 
 /*                       Call the subroutine. */
 
-			    if (conj) {
+			    if (isconj) {
 				if (*trace) {
 				    zprcn6_(ntra, &nc, sname, iorder, uplo, 
 					    trans, &n, &k, &ralpha, &lda, &
-					    rbeta, &ldc);
+					    rbeta, &ldc, (ftnlen)12, (ftnlen)
+					    1, (ftnlen)1);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
 				czherk_(iorder, uplo, trans, &n, &k, &ralpha, 
-					&aa[1], &lda, &rbeta, &cc[1], &ldc);
+					&aa[1], &lda, &rbeta, &cc[1], &ldc, (
+					ftnlen)1, (ftnlen)1);
 			    } else {
 				if (*trace) {
 				    zprcn4_(ntra, &nc, sname, iorder, uplo, 
 					    trans, &n, &k, &alpha, &lda, &
-					    beta, &ldc);
+					    beta, &ldc, (ftnlen)12, (ftnlen)1,
+					     (ftnlen)1);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
 				czsyrk_(iorder, uplo, trans, &n, &k, &alpha, &
-					aa[1], &lda, &beta, &cc[1], &ldc);
+					aa[1], &lda, &beta, &cc[1], &ldc, (
+					ftnlen)1, (ftnlen)1);
 			    }
 
 /*                       Check if error-exit was taken incorrectly. */
 
 			    if (! infoc_1.ok) {
-				io___294.ciunit = *nout;
-				s_wsfe(&io___294);
-				e_wsfe();
+                                printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				*fatal = TRUE_;
 				goto L120;
 			    }
@@ -3093,7 +2354,7 @@ L160:
 				    char *)trans;
 			    isame[2] = ns == n;
 			    isame[3] = ks == k;
-			    if (conj) {
+			    if (isconj) {
 				isame[4] = rals == ralpha;
 			    } else {
 				isame[4] = als.r == alpha.r && als.i == 
@@ -3101,7 +2362,7 @@ L160:
 			    }
 			    isame[5] = lze_(&as[1], &aa[1], &laa);
 			    isame[6] = ldas == lda;
-			    if (conj) {
+			    if (isconj) {
 				isame[7] = rbets == rbeta;
 			    } else {
 				isame[7] = bets.r == beta.r && bets.i == 
@@ -3111,7 +2372,8 @@ L160:
 				isame[8] = lze_(&cs[1], &cc[1], &lcc);
 			    } else {
 				isame[8] = lzeres_(sname + 7, uplo, &n, &n, &
-					cs[1], &cc[1], &ldc);
+					cs[1], &cc[1], &ldc, (ftnlen)2, (
+					ftnlen)1);
 			    }
 			    isame[9] = ldcs == ldc;
 
@@ -3123,11 +2385,7 @@ L160:
 			    for (i__ = 1; i__ <= i__5; ++i__) {
 				same = same && isame[i__ - 1];
 				if (! isame[i__ - 1]) {
-				    io___297.ciunit = *nout;
-				    s_wsfe(&io___297);
-				    do_fio(&c__1, (char *)&i__, (ftnlen)
-					    sizeof(integer));
-				    e_wsfe();
+                                    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
 				}
 /* L30: */
 			    }
@@ -3140,7 +2398,7 @@ L160:
 
 /*                          Check the result column by column. */
 
-				if (conj) {
+				if (isconj) {
 				    *(unsigned char *)transt = 'C';
 				} else {
 				    *(unsigned char *)transt = 'T';
@@ -3162,7 +2420,8 @@ L160:
 						nmax, &beta, &c__[jj + j * 
 						c_dim1], nmax, &ct[1], &g[1], 
 						&cc[jc], &ldc, eps, &err, 
-						fatal, nout, &c_true);
+						fatal, nout, &c_true, (ftnlen)
+						1, (ftnlen)1);
 				    } else {
 					zmmch_("N", transt, &lj, &c__1, &k, &
 						alpha, &a[jj + a_dim1], nmax, 
@@ -3170,7 +2429,7 @@ L160:
 						c__[jj + j * c_dim1], nmax, &
 						ct[1], &g[1], &cc[jc], &ldc, 
 						eps, &err, fatal, nout, &
-						c_true);
+						c_true, (ftnlen)1, (ftnlen)1);
 				    }
 				    if (upper) {
 					jc += ldc;
@@ -3211,89 +2470,57 @@ L100:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___304.ciunit = *nout;
-	    s_wsfe(&io___304);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___305.ciunit = *nout;
-	    s_wsfe(&io___305);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___306.ciunit = *nout;
-	    s_wsfe(&io___306);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___307.ciunit = *nout;
-	    s_wsfe(&io___307);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L130;
 
 L110:
     if (n > 1) {
-	io___308.ciunit = *nout;
-	s_wsfe(&io___308);
-	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
-	e_wsfe();
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d:\n",j);
     }
 
 L120:
-    io___309.ciunit = *nout;
-    s_wsfe(&io___309);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
-    if (conj) {
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    if (isconj) {
 	zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, 
-		&rbeta, &ldc);
+		&rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
     } else {
 	zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
-		beta, &ldc);
+		beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
     }
 
 L130:
     return 0;
 
-/* L9994: */
-/* L9993: */
+/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $     F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')               ', */
+/*     $      '          .' ) */
+/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */
+/*     $      '), C,', I3, ')          .' ) */
 
 /*     End of CCHK4. */
 
 } /* zchk4_ */
 
 
-/* Subroutine */ int zprcn4_(integer *nout, integer *nc, char *sname, integer 
-	*iorder, char *uplo, char *transa, integer *n, integer *k, 
-	doublecomplex *alpha, integer *lda, doublecomplex *beta, integer *ldc)
+/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002,"
-	    "\002))";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002), A,\002,i3,\002, (\002,f4.1,\002,\002,f4.1,\002), C"
-	    ",\002,i3,\002).\002)";
-
     /* Local variables */
-    char ca[14], cu[14], crc[14];
-
-    /* Fortran I/O blocks */
-    static cilist io___313 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___314 = { 0, 0, 0, fmt_9994, 0 };
-
+    static char ca[14], cu[14], crc[14];
 
     if (*(unsigned char *)uplo == 'U') {
 	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
@@ -3312,45 +2539,19 @@ L130:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___313.ciunit = *nout;
-    s_wsfe(&io___313);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    do_fio(&c__1, ca, (ftnlen)14);
-    e_wsfe();
-    io___314.ciunit = *nout;
-    s_wsfe(&io___314);
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
-    return 0;
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("(          %d %d (%4.1lf,%4.1lf) A %d (%4.1lf,%4.1lf) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc);
+
+return 0;
 } /* zprcn4_ */
 
 
 
-/* Subroutine */ int zprcn6_(integer *nout, integer *nc, char *sname, integer 
-	*iorder, char *uplo, char *transa, integer *n, integer *k, doublereal 
-	*alpha, integer *lda, doublereal *beta, integer *ldc)
+/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002,"
-	    "\002))";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),f4.1,\002, A,\002,i3"
-	    ",\002,\002,f4.1,\002, C,\002,i3,\002).\002)";
 
     /* Local variables */
-    char ca[14], cu[14], crc[14];
-
-    /* Fortran I/O blocks */
-    static cilist io___318 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___319 = { 0, 0, 0, fmt_9994, 0 };
-
+    static char ca[14], cu[14], crc[14];
 
     if (*(unsigned char *)uplo == 'U') {
 	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
@@ -3369,132 +2570,58 @@ L130:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___318.ciunit = *nout;
-    s_wsfe(&io___318);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    do_fio(&c__1, ca, (ftnlen)14);
-    e_wsfe();
-    io___319.ciunit = *nout;
-    s_wsfe(&io___319);
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
-    return 0;
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("(          %d %d %4.1lf A %d %4.1lf C %d\n",*n,*k,*alpha,*lda,*beta,*ldc);
+
+return 0;
 } /* zprcn6_ */
 
 
-/* Subroutine */ int zchk5_(char *sname, doublereal *eps, doublereal *thresh, 
-	integer *nout, integer *ntra, logical *trace, logical *rewi, logical *
-	fatal, integer *nidim, integer *idim, integer *nalf, doublecomplex *
-	alf, integer *nbet, doublecomplex *bet, integer *nmax, doublecomplex *
-	ab, doublecomplex *aa, doublecomplex *as, doublecomplex *bb, 
-	doublecomplex *bs, doublecomplex *c__, doublecomplex *cc, 
-	doublecomplex *cs, doublecomplex *ct, doublereal *g, doublecomplex *w,
-	 integer *iorder)
+/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
-    static char icht[2] = "NC";
-    static char ichu[2] = "UL";
-
-    /* Format strings */
-    static char fmt_9992[] = "(\002 ******* FATAL ERROR - ERROR-CALL MYEXIT "
-	    "TAKEN ON VALID\002,\002 CALL *******\002)";
-    static char fmt_9998[] = "(\002 ******* FATAL ERROR - PARAMETER NUMBER"
-	    " \002,i2,\002 WAS CH\002,\002ANGED INCORRECTLY *******\002)";
-    static char fmt_10000[] = "(\002 \002,a13,\002 PASSED THE COLUMN-MAJOR C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10001[] = "(\002 \002,a13,\002 PASSED THE ROW-MAJOR    C"
-	    "OMPUTATIONAL TESTS\002,\002 (\002,i6,\002 CALL\002,\002S)\002)";
-    static char fmt_10002[] = "(\002 \002,a13,\002 COMPLETED THE COLUMN-MAJO"
-	    "R COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_10003[] = "(\002 \002,a13,\002 COMPLETED THE ROW-MAJOR  "
-	    "  COMPUTATIONAL \002,\002TESTS (\002,i6,\002 CALLS)\002,/\002 **"
-	    "***** BUT WITH MAXIMUM TEST \002,\002RATIO \002,f8.2,\002 - SUSP"
-	    "ECT *******\002)";
-    static char fmt_9995[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
-	    " \002,i3)";
-    static char fmt_9996[] = "(\002 ******* \002,a13,\002 FAILED ON CALL NUM"
-	    "BER:\002)";
+    static char icht[2+1] = "NC";
+    static char ichu[2+1] = "UL";
 
     /* System generated locals */
     integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
     doublecomplex z__1, z__2;
-    alist al__1;
 
     /* Local variables */
-    integer jjab;
-    doublecomplex beta;
-    integer ldas, ldbs, ldcs;
-    logical same, conj;
-    doublecomplex bets;
-    logical tran, null;
-    char uplo[1];
-    integer i__, j, k, n;
-    doublecomplex alpha;
-    doublereal rbeta;
-    logical isame[13];
-    extern /* Subroutine */ int zmake_(char *, char *, char *, integer *, 
-	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
-	     logical *, doublecomplex *);
-    integer nargs;
-    extern /* Subroutine */ int zmmch_(char *, char *, integer *, integer *, 
-	    integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *, doublecomplex *, doublereal *, doublecomplex *, 
-	    integer *, doublereal *, doublereal *, logical *, integer *, 
-	    logical *);
-    doublereal rbets;
-    logical reset;
-    char trans[1];
-    logical upper;
-    char uplos[1];
-    integer ia, ib, jc, ma, na, nc;
-    extern /* Subroutine */ int zprcn5_(integer *, integer *, char *, integer 
-	    *, char *, char *, integer *, integer *, doublecomplex *, integer 
-	    *, integer *, doublecomplex *, integer *),
-	     zprcn7_(integer *, integer *, char *, integer *, char *, char *, 
-	    integer *, integer *, doublecomplex *, integer *, integer *, 
-	    doublereal *, integer *);
-    integer ik, in, jj, lj, ks, ns;
-    doublereal errmax;
-    extern logical lzeres_(char *, char *, integer *, integer *, 
-	    doublecomplex *, doublecomplex *, integer *);
-    char transs[1], transt[1];
-    extern /* Subroutine */ int czher2k_(integer *, char *, char *, integer *,
-	     integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublereal *, doublecomplex *, 
-	    integer *);
-    integer laa, lbb, lda, lcc, ldb, ldc;
-    doublecomplex als;
-    integer ict, icu;
-    extern /* Subroutine */ int czsyr2k_(integer *, char *, char *, integer *,
-	     integer *, doublecomplex *, doublecomplex *, integer *, 
-	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
-	    integer *);
-    doublereal err;
-    extern logical lze_(doublecomplex *, doublecomplex *, integer *);
-
-    /* Fortran I/O blocks */
-    static cilist io___362 = { 0, 0, 0, fmt_9992, 0 };
-    static cilist io___365 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___373 = { 0, 0, 0, fmt_10000, 0 };
-    static cilist io___374 = { 0, 0, 0, fmt_10001, 0 };
-    static cilist io___375 = { 0, 0, 0, fmt_10002, 0 };
-    static cilist io___376 = { 0, 0, 0, fmt_10003, 0 };
-    static cilist io___377 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___378 = { 0, 0, 0, fmt_9996, 0 };
-
-
+    static integer jjab;
+    static doublecomplex beta;
+    static integer ldas, ldbs, ldcs;
+    static logical same, isconj;
+    static doublecomplex bets;
+    static logical tran, null;
+    static char uplo[1];
+    static integer i__, j, k, n;
+    static doublecomplex alpha;
+    static doublereal rbeta;
+    static logical isame[13];
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static doublereal rbets;
+    static logical reset;
+    static char trans[1];
+    static logical upper;
+    static char uplos[1];
+    static integer ia, ib, jc, ma, na, nc;
+    extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer ik, in, jj, lj, ks, ns;
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static char transs[1], transt[1];
+    extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static integer laa, lbb, lda, lcc, ldb, ldc;
+    static doublecomplex als;
+    static integer ict, icu;
+    extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHER2K and ZSYR2K. */
 
@@ -3506,6 +2633,17 @@ L130:
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
     /* Parameter adjustments */
     --idim;
     --alf;
@@ -3525,7 +2663,8 @@ L130:
     --ab;
 
     /* Function Body */
-    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+/*     .. Executable Statements .. */
+    isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
 
     nargs = 12;
     nc = 0;
@@ -3553,7 +2692,7 @@ L130:
 	    for (ict = 1; ict <= 2; ++ict) {
 		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
 		tran = *(unsigned char *)trans == 'C';
-		if (tran && ! conj) {
+		if (tran && ! isconj) {
 		    *(unsigned char *)trans = 'T';
 		}
 		if (tran) {
@@ -3579,10 +2718,12 @@ L130:
 		if (tran) {
 		    i__3 = *nmax << 1;
 		    zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], &
-			    lda, &reset, &c_b1);
+			    lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)
+			    1);
 		} else {
 		    zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], &
-			    lda, &reset, &c_b1);
+			    lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)
+			    1);
 		}
 
 /*              Generate the matrix B. */
@@ -3592,10 +2733,12 @@ L130:
 		if (tran) {
 		    i__3 = *nmax << 1;
 		    zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1]
-			    , &ldb, &reset, &c_b1);
+			    , &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (
+			    ftnlen)1);
 		} else {
 		    zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax,
-			     &bb[1], &ldb, &reset, &c_b1);
+			     &bb[1], &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen)
+			    1, (ftnlen)1);
 		}
 
 		for (icu = 1; icu <= 2; ++icu) {
@@ -3611,22 +2754,22 @@ L130:
 			for (ib = 1; ib <= i__4; ++ib) {
 			    i__5 = ib;
 			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
-			    if (conj) {
+			    if (isconj) {
 				rbeta = beta.r;
 				z__1.r = rbeta, z__1.i = 0.;
 				beta.r = z__1.r, beta.i = z__1.i;
 			    }
 			    null = n <= 0;
-			    if (conj) {
-				null = null || (k <= 0 || alpha.r == 0. && 
-					alpha.i == 0.) && rbeta == 1.;
+			    if (isconj) {
+				null = null ||( (k <= 0 || (alpha.r == 0. && 
+					alpha.i == 0.)) && rbeta == 1.);
 			    }
 
 /*                       Generate the matrix C. */
 
 			    zmake_(sname + 7, uplo, " ", &n, &n, &c__[
 				    c_offset], nmax, &cc[1], &ldc, &reset, &
-				    c_b1);
+				    c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1);
 
 			    ++nc;
 
@@ -3656,7 +2799,7 @@ L130:
 /* L20: */
 			    }
 			    ldbs = ldb;
-			    if (conj) {
+			    if (isconj) {
 				rbets = rbeta;
 			    } else {
 				bets.r = beta.r, bets.i = beta.i;
@@ -3673,42 +2816,42 @@ L130:
 
 /*                       Call the subroutine. */
 
-			    if (conj) {
+			    if (isconj) {
 				if (*trace) {
 				    zprcn7_(ntra, &nc, sname, iorder, uplo, 
 					    trans, &n, &k, &alpha, &lda, &ldb,
-					     &rbeta, &ldc);
+					     &rbeta, &ldc, (ftnlen)12, (
+					    ftnlen)1, (ftnlen)1);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
 				czher2k_(iorder, uplo, trans, &n, &k, &alpha, 
 					&aa[1], &lda, &bb[1], &ldb, &rbeta, &
-					cc[1], &ldc);
+					cc[1], &ldc, (ftnlen)1, (ftnlen)1);
 			    } else {
 				if (*trace) {
 				    zprcn5_(ntra, &nc, sname, iorder, uplo, 
 					    trans, &n, &k, &alpha, &lda, &ldb,
-					     &beta, &ldc);
+					     &beta, &ldc, (ftnlen)12, (ftnlen)
+					    1, (ftnlen)1);
 				}
 				if (*rewi) {
-				    al__1.aerr = 0;
+/*				    al__1.aerr = 0;
 				    al__1.aunit = *ntra;
-				    f_rew(&al__1);
+				    f_rew(&al__1);*/
 				}
 				czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, 
 					&aa[1], &lda, &bb[1], &ldb, &beta, &
-					cc[1], &ldc);
+					cc[1], &ldc, (ftnlen)1, (ftnlen)1);
 			    }
 
 /*                       Check if error-exit was taken incorrectly. */
 
 			    if (! infoc_1.ok) {
-				io___362.ciunit = *nout;
-				s_wsfe(&io___362);
-				e_wsfe();
+                                printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
 				*fatal = TRUE_;
 				goto L150;
 			    }
@@ -3726,7 +2869,7 @@ L130:
 			    isame[6] = ldas == lda;
 			    isame[7] = lze_(&bs[1], &bb[1], &lbb);
 			    isame[8] = ldbs == ldb;
-			    if (conj) {
+			    if (isconj) {
 				isame[9] = rbets == rbeta;
 			    } else {
 				isame[9] = bets.r == beta.r && bets.i == 
@@ -3736,7 +2879,7 @@ L130:
 				isame[10] = lze_(&cs[1], &cc[1], &lcc);
 			    } else {
 				isame[10] = lzeres_("he", uplo, &n, &n, &cs[1]
-					, &cc[1], &ldc);
+					, &cc[1], &ldc, (ftnlen)2, (ftnlen)1);
 			    }
 			    isame[11] = ldcs == ldc;
 
@@ -3748,12 +2891,8 @@ L130:
 			    for (i__ = 1; i__ <= i__5; ++i__) {
 				same = same && isame[i__ - 1];
 				if (! isame[i__ - 1]) {
-				    io___365.ciunit = *nout;
-				    s_wsfe(&io___365);
-				    do_fio(&c__1, (char *)&i__, (ftnlen)
-					    sizeof(integer));
-				    e_wsfe();
-				}
+                                    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+    				}
 /* L40: */
 			    }
 			    if (! same) {
@@ -3765,7 +2904,7 @@ L130:
 
 /*                          Check the result column by column. */
 
-				if (conj) {
+				if (isconj) {
 				    *(unsigned char *)transt = 'C';
 				} else {
 				    *(unsigned char *)transt = 'T';
@@ -3785,7 +2924,7 @@ L130:
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
 					    i__7 = i__;
-					    i__8 = (j - 1 << 1) * *nmax + k + 
+					    i__8 = ((j - 1) << 1) * *nmax + k + 
 						    i__;
 					    z__1.r = alpha.r * ab[i__8].r - 
 						    alpha.i * ab[i__8].i, 
@@ -3794,17 +2933,17 @@ L130:
 						    i__8].r;
 					    w[i__7].r = z__1.r, w[i__7].i = 
 						    z__1.i;
-					    if (conj) {
+					    if (isconj) {
 			  i__7 = k + i__;
 			  d_cnjg(&z__2, &alpha);
-			  i__8 = (j - 1 << 1) * *nmax + i__;
+			  i__8 = ((j - 1) << 1) * *nmax + i__;
 			  z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, 
 				  z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[
 				  i__8].r;
 			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
 					    } else {
 			  i__7 = k + i__;
-			  i__8 = (j - 1 << 1) * *nmax + i__;
+			  i__8 = ((j - 1) << 1) * *nmax + i__;
 			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
 				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
 				  * ab[i__8].r;
@@ -3820,11 +2959,12 @@ L130:
 						1], &i__8, &beta, &c__[jj + j 
 						* c_dim1], nmax, &ct[1], &g[1]
 						, &cc[jc], &ldc, eps, &err, 
-						fatal, nout, &c_true);
+						fatal, nout, &c_true, (ftnlen)
+						1, (ftnlen)1);
 				    } else {
 					i__6 = k;
 					for (i__ = 1; i__ <= i__6; ++i__) {
-					    if (conj) {
+					    if (isconj) {
 			  i__7 = i__;
 			  d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]);
 			  z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, 
@@ -3861,7 +3001,8 @@ L130:
 						i__7, &beta, &c__[jj + j * 
 						c_dim1], nmax, &ct[1], &g[1], 
 						&cc[jc], &ldc, eps, &err, 
-						fatal, nout, &c_true);
+						fatal, nout, &c_true, (ftnlen)
+						1, (ftnlen)1);
 				    }
 				    if (upper) {
 					jc += ldc;
@@ -3905,90 +3046,57 @@ L130:
 
     if (errmax < *thresh) {
 	if (*iorder == 0) {
-	    io___373.ciunit = *nout;
-	    s_wsfe(&io___373);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
 	if (*iorder == 1) {
-	    io___374.ciunit = *nout;
-	    s_wsfe(&io___374);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    e_wsfe();
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
 	}
     } else {
 	if (*iorder == 0) {
-	    io___375.ciunit = *nout;
-	    s_wsfe(&io___375);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
 	if (*iorder == 1) {
-	    io___376.ciunit = *nout;
-	    s_wsfe(&io___376);
-	    do_fio(&c__1, sname, (ftnlen)13);
-	    do_fio(&c__1, (char *)&nc, (ftnlen)sizeof(integer));
-	    do_fio(&c__1, (char *)&errmax, (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)/n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******/n",errmax);
 	}
     }
     goto L160;
 
 L140:
     if (n > 1) {
-	io___377.ciunit = *nout;
-	s_wsfe(&io___377);
-	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
-	e_wsfe();
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d:\n",j);
     }
 
 L150:
-    io___378.ciunit = *nout;
-    s_wsfe(&io___378);
-    do_fio(&c__1, sname, (ftnlen)13);
-    e_wsfe();
-    if (conj) {
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    if (isconj) {
 	zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
-		ldb, &rbeta, &ldc);
+		ldb, &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
     } else {
 	zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
-		ldb, &beta, &ldc);
+		ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
     }
 
 L160:
     return 0;
 
-/* L9994: */
-/* L9993: */
+/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */
+/*     $      ', C,', I3, ')           .' ) */
+/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */
+/*     $      ',', F4.1, '), C,', I3, ')    .' ) */
 
 /*     End of ZCHK5. */
 
 } /* zchk5_ */
 
 
-/* Subroutine */ int zprcn5_(integer *nout, integer *nc, char *sname, integer 
-	*iorder, char *uplo, char *transa, integer *n, integer *k, 
-	doublecomplex *alpha, integer *lda, integer *ldb, doublecomplex *beta,
-	 integer *ldc)
+/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002,"
-	    "\002))";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002, (\002,f4.1,\002,\002"
-	    ",f4.1,\002), C,\002,i3,\002).\002)";
-
     /* Local variables */
-    char ca[14], cu[14], crc[14];
-
-    /* Fortran I/O blocks */
-    static cilist io___382 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___383 = { 0, 0, 0, fmt_9994, 0 };
-
+    static char ca[14], cu[14], crc[14];
 
     if (*(unsigned char *)uplo == 'U') {
 	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
@@ -4007,48 +3115,19 @@ L160:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___382.ciunit = *nout;
-    s_wsfe(&io___382);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    do_fio(&c__1, ca, (ftnlen)14);
-    e_wsfe();
-    io___383.ciunit = *nout;
-    s_wsfe(&io___383);
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*beta), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
-    return 0;
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+
+return 0;
 } /* zprcn5_ */
 
 
 
-/* Subroutine */ int zprcn7_(integer *nout, integer *nc, char *sname, integer 
-	*iorder, char *uplo, char *transa, integer *n, integer *k, 
-	doublecomplex *alpha, integer *lda, integer *ldb, doublereal *beta, 
-	integer *ldc)
+/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Format strings */
-    static char fmt_9995[] = "(1x,i6,\002: \002,a13,\002(\002,3(a15,\002,"
-	    "\002))";
-    static char fmt_9994[] = "(10x,2(i3,\002,\002),\002 (\002,f4.1,\002,\002"
-	    ",f4.1,\002), A,\002,i3,\002, B\002,i3,\002,\002,f4.1,\002, C,"
-	    "\002,i3,\002).\002)";
 
     /* Local variables */
-    char ca[14], cu[14], crc[14];
-
-    /* Fortran I/O blocks */
-    static cilist io___387 = { 0, 0, 0, fmt_9995, 0 };
-    static cilist io___388 = { 0, 0, 0, fmt_9994, 0 };
-
+    static char ca[14], cu[14], crc[14];
 
     if (*(unsigned char *)uplo == 'U') {
 	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
@@ -4067,31 +3146,14 @@ L160:
     } else {
 	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
     }
-    io___387.ciunit = *nout;
-    s_wsfe(&io___387);
-    do_fio(&c__1, (char *)&(*nc), (ftnlen)sizeof(integer));
-    do_fio(&c__1, sname, (ftnlen)13);
-    do_fio(&c__1, crc, (ftnlen)14);
-    do_fio(&c__1, cu, (ftnlen)14);
-    do_fio(&c__1, ca, (ftnlen)14);
-    e_wsfe();
-    io___388.ciunit = *nout;
-    s_wsfe(&io___388);
-    do_fio(&c__1, (char *)&(*n), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*k), (ftnlen)sizeof(integer));
-    do_fio(&c__2, (char *)&(*alpha), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*lda), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*ldb), (ftnlen)sizeof(integer));
-    do_fio(&c__1, (char *)&(*beta), (ftnlen)sizeof(doublereal));
-    do_fio(&c__1, (char *)&(*ldc), (ftnlen)sizeof(integer));
-    e_wsfe();
-    return 0;
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("%d %d (%4.1lf,%4.1lf), A, %d, B, %d, %4.1lf, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc);
+
+return 0;
 } /* zprcn7_ */
 
 
-/* Subroutine */ int zmake_(char *type__, char *uplo, char *diag, integer *m, 
-	integer *n, doublecomplex *a, integer *nmax, doublecomplex *aa, 
-	integer *lda, logical *reset, doublecomplex *transl)
+/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -4099,13 +3161,13 @@ L160:
     doublecomplex z__1, z__2;
 
     /* Local variables */
-    integer ibeg, iend;
-    extern /* Double Complex */ VOID zbeg_(doublecomplex *, logical *);
-    logical unit;
-    integer i__, j;
-    logical lower, upper;
-    integer jj;
-    logical gen, her, tri, sym;
+    static integer ibeg, iend;
+    extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*);
+    static logical unit;
+    static integer i__, j;
+    static logical lower, upper;
+    static integer jj;
+    static logical gen, her, tri, sym;
 
 
 /*  Generates values for an M by N matrix A. */
@@ -4122,6 +3184,13 @@ L160:
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. External Functions .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Executable Statements .. */
     /* Parameter adjustments */
     a_dim1 = *nmax;
     a_offset = 1 + a_dim1 * 1;
@@ -4143,7 +3212,7 @@ L160:
     for (j = 1; j <= i__1; ++j) {
 	i__2 = *m;
 	for (i__ = 1; i__ <= i__2; ++i__) {
-	    if (gen || upper && i__ <= j || lower && i__ >= j) {
+	    if (gen || (upper && i__ <= j) || (lower && i__ >= j)) {
 		i__3 = i__ + j * a_dim1;
 		zbeg_(&z__2, reset);
 		z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i;
@@ -4266,22 +3335,8 @@ L160:
 
 } /* zmake_ */
 
-/* Subroutine */ int zmmch_(char *transa, char *transb, integer *m, integer *
-	n, integer *kk, doublecomplex *alpha, doublecomplex *a, integer *lda, 
-	doublecomplex *b, integer *ldb, doublecomplex *beta, doublecomplex *
-	c__, integer *ldc, doublecomplex *ct, doublereal *g, doublecomplex *
-	cc, integer *ldcc, doublereal *eps, doublereal *err, logical *fatal, 
-	integer *nout, logical *mv)
+/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len)
 {
-    /* Format strings */
-    static char fmt_9999[] = "(\002 ******* FATAL ERROR - COMPUTED RESULT IS"
-	    " LESS THAN HAL\002,\002F ACCURATE *******\002,/\002             "
-	    "          EXPECTED RE\002,\002SULT                    COMPUTED R"
-	    "ESULT\002)";
-    static char fmt_9998[] = "(1x,i7,2(\002  (\002,g15.6,\002,\002,g15.6,"
-	    "\002)\002))";
-    static char fmt_9997[] = "(\002      THESE ARE THE RESULTS FOR COLUMN"
-	    " \002,i3)";
 
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
@@ -4289,18 +3344,11 @@ L160:
     doublereal d__1, d__2, d__3, d__4, d__5, d__6;
     doublecomplex z__1, z__2, z__3, z__4;
 
+    double sqrt(double);
     /* Local variables */
-    doublereal erri;
-    integer i__, j, k;
-    logical trana, tranb, ctrana, ctranb;
-
-    /* Fortran I/O blocks */
-    static cilist io___409 = { 0, 0, 0, fmt_9999, 0 };
-    static cilist io___410 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___411 = { 0, 0, 0, fmt_9998, 0 };
-    static cilist io___412 = { 0, 0, 0, fmt_9997, 0 };
-
-
+    static doublereal erri;
+    static integer i__, j, k;
+    static logical trana, tranb, ctrana, ctranb;
 
 /*  Checks the results of the computational tests. */
 
@@ -4312,6 +3360,14 @@ L160:
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Statement Functions .. */
+/*     .. Statement Function definitions .. */
+/*     .. Executable Statements .. */
     /* Parameter adjustments */
     a_dim1 = *lda;
     a_offset = 1 + a_dim1 * 1;
@@ -4638,35 +3694,19 @@ L160:
 
 L230:
     *fatal = TRUE_;
-    io___409.ciunit = *nout;
-    s_wsfe(&io___409);
-    e_wsfe();
+    printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n");
+    printf("         EXPECTED RESULT                    COMPUTED RESULT\n");
     i__1 = *m;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	if (*mv) {
-	    io___410.ciunit = *nout;
-	    s_wsfe(&io___410);
-	    do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer));
-	    do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal));
-	    do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(
-		    doublereal));
-	    e_wsfe();
-	} else {
-	    io___411.ciunit = *nout;
-	    s_wsfe(&io___411);
-	    do_fio(&c__1, (char *)&i__, (ftnlen)sizeof(integer));
-	    do_fio(&c__2, (char *)&cc[i__ + j * cc_dim1], (ftnlen)sizeof(
-		    doublereal));
-	    do_fio(&c__2, (char *)&ct[i__], (ftnlen)sizeof(doublereal));
-	    e_wsfe();
+            printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i);
+        } else {
+            printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i);
 	}
 /* L240: */
     }
     if (*n > 1) {
-	io___412.ciunit = *nout;
-	s_wsfe(&io___412);
-	do_fio(&c__1, (char *)&j, (ftnlen)sizeof(integer));
-	e_wsfe();
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d\n",j);
     }
 
 L250:
@@ -4677,14 +3717,14 @@ L250:
 
 } /* zmmch_ */
 
-logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr)
+logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1, i__2, i__3;
     logical ret_val;
 
     /* Local variables */
-    integer i__;
+    static integer i__;
 
 
 /*  Tests if two arrays are identical. */
@@ -4697,6 +3737,10 @@ logical lze_(doublecomplex *ri, doublecomplex *rj, integer *lr)
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Executable Statements .. */
     /* Parameter adjustments */
     --rj;
     --ri;
@@ -4722,16 +3766,15 @@ L30:
 
 } /* lze_ */
 
-logical lzeres_(char *type__, char *uplo, integer *m, integer *n, 
-	doublecomplex *aa, doublecomplex *as, integer *lda)
+logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
     logical ret_val;
 
     /* Local variables */
-    integer ibeg, iend, i__, j;
-    logical upper;
+    static integer ibeg, iend, i__, j;
+    static logical upper;
 
 
 /*  Tests if selected elements in two arrays are equal. */
@@ -4746,6 +3789,10 @@ logical lzeres_(char *type__, char *uplo, integer *m, integer *n,
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Executable Statements .. */
     /* Parameter adjustments */
     as_dim1 = *lda;
     as_offset = 1 + as_dim1 * 1;
@@ -4803,7 +3850,7 @@ logical lzeres_(char *type__, char *uplo, integer *m, integer *n,
 	}
     }
 
-/* L60: */
+/*   60 CONTINUE */
     ret_val = TRUE_;
     goto L80;
 L70:
@@ -4815,7 +3862,7 @@ L80:
 
 } /* lzeres_ */
 
-/* Double Complex */ VOID zbeg_(doublecomplex * ret_val, logical *reset)
+/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset)
 {
     /* System generated locals */
     doublereal d__1, d__2;
@@ -4836,6 +3883,11 @@ L80:
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Scalar Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Save statement .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Executable Statements .. */
     if (*reset) {
 /*        Initialize local variables. */
 	mi = 891;
@@ -4873,7 +3925,7 @@ L10:
 
 } /* zbeg_ */
 
-doublereal ddiff_(doublereal *x, doublereal *y)
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -4887,6 +3939,8 @@ doublereal ddiff_(doublereal *x, doublereal *y)
 /*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
 /*     Sven Hammarling, Numerical Algorithms Group Ltd. */
 
+/*     .. Scalar Arguments .. */
+/*     .. Executable Statements .. */
     ret_val = *x - *y;
     return ret_val;
 
@@ -4894,4 +3948,4 @@ doublereal ddiff_(doublereal *x, doublereal *y)
 
 } /* ddiff_ */
 
-/* Main program alias */ int zblat3_ () { MAIN__ (); return 0; }
+/* Main program alias */ /*int zblat3_ () { MAIN__ (); }*/

From 5d929d2706f92b5fa70122b865dafe72aac6ea84 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 21:00:57 +0100
Subject: [PATCH 706/718] avoid overriding the global USE_GEMM3M

---
 ctest/CMakeLists.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt
index d7baadee4..c56a78346 100644
--- a/ctest/CMakeLists.txt
+++ b/ctest/CMakeLists.txt
@@ -10,11 +10,6 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
 	set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
 endif()
 
-set (USE_GEMM3M 0)
-if (${ARCH} MATCHES x86|x86_64|ia64|mips)
-        set(USE_GEMM3M 1)
-endif ()
-
 if(WIN32)
 FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1
 "$ErrorActionPreference = \"Stop\"\n"

From 28f151808ea3fa3b39e02948d7b52616ce52cdfb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 21:01:46 +0100
Subject: [PATCH 707/718] Avoid overriding the global USE_GEMM3M

---
 test/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b4bf36cee..4ebd5348c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -22,9 +22,7 @@ if (BUILD_COMPLEX16)
 	list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
 endif()
 
-set (USE_GEMM3M 0)
-if (${ARCH} MATCHES x86|x86_64|ia64|mips)
-	set(USE_GEMM3M 1)
+if (USE_GEMM3M)
 	if (BUILD_COMPLEX)
 		list (APPEND OpenBLAS_Tests cblat3_3m)
 	endif ()

From 38283f678ed7683132f7b16f82fc6b13602f969a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 22:22:48 +0100
Subject: [PATCH 708/718] Fix portability problems

---
 utest/test_extensions/test_cgemv_t.c | 4 ++--
 utest/test_extensions/test_csbmv.c   | 6 +++---
 utest/test_extensions/test_idamin.c  | 8 +++++---
 utest/test_extensions/test_isamin.c  | 8 +++++---
 utest/test_extensions/test_zgemv_t.c | 4 ++--
 5 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c
index aa3281e66..cb4e5ad9e 100644
--- a/utest/test_extensions/test_cgemv_t.c
+++ b/utest/test_extensions/test_cgemv_t.c
@@ -126,7 +126,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n,
     srand_generate(data_cgemv_t.y_test, m * inc_y * 2);
 
     // Copy vector y for reference funcs
-    for (int i = 0; i < m * inc_y * 2; i++) {
+    for (i = 0; i < m * inc_y * 2; i++) {
         data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i];
     }
 
@@ -1129,4 +1129,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major)
     int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
     ASSERT_EQUAL(TRUE, passed);
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/utest/test_extensions/test_csbmv.c b/utest/test_extensions/test_csbmv.c
index 8e8ce4530..41c24a2b7 100644
--- a/utest/test_extensions/test_csbmv.c
+++ b/utest/test_extensions/test_csbmv.c
@@ -188,7 +188,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint
     char trans = 'N';
 
     // Symmetric band packed matrix for sbmv
-    float a[lda * n * 2];
+    float *a = (float*) malloc(lda * n * 2 * sizeof(float));
 
     // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test 
     srand_generate(data_csbmv.sp_matrix, n * (n + 1));
@@ -216,7 +216,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint
     // Find the differences between output vector caculated by csbmv and cgemv
     for (i = 0; i < n * inc_c * 2; i++)
         data_csbmv.c_test[i] -= data_csbmv.c_verify[i];
-
+    free(a);
     // Find the norm of differences
     return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c);
 }
@@ -603,4 +603,4 @@ CTEST(csbmv, xerbla_lda_invalid)
     int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
     ASSERT_EQUAL(TRUE, passed);
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c
index 6a7ed9d1e..bebe76dba 100644
--- a/utest/test_extensions/test_idamin.c
+++ b/utest/test_extensions/test_idamin.c
@@ -402,13 +402,14 @@ CTEST(idamin, min_idx_in_vec_tail){
 CTEST(idamin, min_idx_in_vec_tail_inc_1){
    blasint i;
    blasint N = ELEMENTS, inc = 1;
-   double x[ELEMENTS * inc];
+   double *x = (double*)malloc(ELEMENTS * inc * sizeof(double));
    for (i = 0; i < N * inc; i ++) {
       x[i] = i + 1000;
    }
 
    x[(N - 1) * inc] = 0.0f;
    blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   free(x);
    ASSERT_EQUAL(N, index);
 }
 
@@ -775,13 +776,14 @@ CTEST(idamin, c_api_min_idx_in_vec_tail){
 CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){
    blasint i;
    blasint N = ELEMENTS, inc = 1;
-   double x[ELEMENTS * inc];
+   double *x = (double*) malloc(ELEMENTS * inc * sizeof(double));
    for (i = 0; i < N * inc; i ++) {
       x[i] = i + 1000;
    }
 
    x[(N - 1) * inc] = 0.0;
    blasint index = cblas_idamin(N, x, inc);
+   free(x);
    ASSERT_EQUAL(N - 1, index);
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c
index 4ff235b83..673e656df 100644
--- a/utest/test_extensions/test_isamin.c
+++ b/utest/test_extensions/test_isamin.c
@@ -402,13 +402,14 @@ CTEST(isamin, min_idx_in_vec_tail){
 CTEST(isamin, min_idx_in_vec_tail_inc_1){
    blasint i;
    blasint N = ELEMENTS, inc = 1;
-   float x[ELEMENTS * inc];
+   float *x = (float*) (ELEMENTS * inc * sizeof(float));
    for (i = 0; i < N * inc; i ++) {
       x[i] = i + 1000;
    }
 
    x[(N - 1) * inc] = 0.0f;
    blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   free(x);
    ASSERT_EQUAL(N, index);
 }
 
@@ -775,13 +776,14 @@ CTEST(isamin, c_api_min_idx_in_vec_tail){
 CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){
    blasint i;
    blasint N = ELEMENTS, inc = 1;
-   float x[ELEMENTS * inc];
+   float *x = (float*)malloc(ELEMENTS * inc * sizeof(float));
    for (i = 0; i < N * inc; i ++) {
       x[i] = i + 1000;
    }
 
    x[(N - 1) * inc] = 0.0f;
    blasint index = cblas_isamin(N, x, inc);
+   free(x);
    ASSERT_EQUAL(N - 1, index);
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c
index 2e0ee65f0..b2d0b2713 100644
--- a/utest/test_extensions/test_zgemv_t.c
+++ b/utest/test_extensions/test_zgemv_t.c
@@ -126,7 +126,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n
     drand_generate(data_zgemv_t.y_test, m * inc_y * 2);
 
     // Copy vector y for reference funcs
-    for (int i = 0; i < m * inc_y * 2; i++)
+    for (i = 0; i < m * inc_y * 2; i++)
     {
         data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i];
     }
@@ -1133,4 +1133,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major)
     int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
     ASSERT_EQUAL(TRUE, passed);
 }
-#endif
\ No newline at end of file
+#endif

From f7ffab870b6992d14173b0439b803da04ba1ab12 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 26 Feb 2024 23:03:10 +0100
Subject: [PATCH 709/718] fix missing malloc

---
 utest/test_extensions/test_isamin.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c
index 673e656df..d93813e6f 100644
--- a/utest/test_extensions/test_isamin.c
+++ b/utest/test_extensions/test_isamin.c
@@ -402,7 +402,7 @@ CTEST(isamin, min_idx_in_vec_tail){
 CTEST(isamin, min_idx_in_vec_tail_inc_1){
    blasint i;
    blasint N = ELEMENTS, inc = 1;
-   float *x = (float*) (ELEMENTS * inc * sizeof(float));
+   float *x = (float*) malloc(ELEMENTS * inc * sizeof(float));
    for (i = 0; i < N * inc; i ++) {
       x[i] = i + 1000;
    }

From f81c1d4b598e16205f31f8146ad64e4069fe12f9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Feb 2024 07:19:52 +0100
Subject: [PATCH 710/718] Fix portability problem

---
 utest/test_extensions/test_zsbmv.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/utest/test_extensions/test_zsbmv.c b/utest/test_extensions/test_zsbmv.c
index afdb208c1..0e79dc0d8 100644
--- a/utest/test_extensions/test_zsbmv.c
+++ b/utest/test_extensions/test_zsbmv.c
@@ -188,7 +188,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin
     char trans = 'N';
 
     // Symmetric band packed matrix for sbmv
-    double a[lda * n * 2];
+    double *a = (double*) malloc(lda * n * 2 * sizeof(double));
 
     // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test 
     drand_generate(data_zsbmv.sp_matrix, n * (n + 1));
@@ -213,6 +213,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin
     BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda,
                     data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c);
 
+    free(a);
     // Find the differences between output vector caculated by zsbmv and zgemv
     for (i = 0; i < n * inc_c * 2; i++)
         data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i];
@@ -603,4 +604,4 @@ CTEST(zsbmv, xerbla_lda_invalid)
     int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
     ASSERT_EQUAL(TRUE, passed);
 }
-#endif
\ No newline at end of file
+#endif

From 8e872a91a946c4fe646b9bb85732f92e3ae6424e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Feb 2024 11:28:50 +0100
Subject: [PATCH 711/718] Fix erroneous mapping of SUM kernels to ASUM

---
 kernel/zarch/KERNEL.Z13 | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13
index 3bcc32197..fe82d81e6 100644
--- a/kernel/zarch/KERNEL.Z13
+++ b/kernel/zarch/KERNEL.Z13
@@ -35,10 +35,10 @@ DASUMKERNEL  = dasum.c
 CASUMKERNEL  = ../arm/zasum.c
 ZASUMKERNEL  = zasum.c
 
-SSUMKERNEL  = ../arm/asum.c
-DSUMKERNEL  = dasum.c
-CSUMKERNEL  = ../arm/zasum.c
-ZSUMKERNEL  = zasum.c
+SSUMKERNEL  = ../arm/sum.c
+DSUMKERNEL  = dsum.c
+CSUMKERNEL  = ../arm/zsum.c
+ZSUMKERNEL  = zsum.c
 
 SAXPYKERNEL  = ../arm/axpy.c
 DAXPYKERNEL  = daxpy.c

From 4266b393046f2b12c9f2b5724889245380c0293d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Feb 2024 17:52:45 +0100
Subject: [PATCH 712/718] Make building the benchmarks optional and handle
 dependency on other options

---
 CMakeLists.txt | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index af3050cf9..30adf3a66 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
 
 option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
 
+option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
+
 option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
 
 option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
@@ -458,9 +460,23 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
   endif()
 endif()
 
-if (BUILD_TESTING)
-  find_package(OpenMP REQUIRED)
+if (BUILD_BENCHMARKS)
+	#find_package(OpenMP REQUIRED)
   file(GLOB SOURCES "benchmark/*.c")
+  if (NOT USE_OPENMP)
+  file(GLOB REMFILE "benchmark/smallscaling.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
+  if (BUILD_WITHOUT_LAPACK)
+  file(GLOB REMFILE "benchmark/cholesky.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/geev.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
+  if (NOT USE_GEMM3M)
+  file(GLOB REMFILE "benchmark/gemm3m.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
   foreach(source ${SOURCES})
     get_filename_component(name ${source} NAME_WE)
     if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
@@ -477,7 +493,8 @@ if (BUILD_TESTING)
             (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
           add_executable(${target_name} ${source})
           target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
-          target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
+	    target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} )
+	  #  target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
           if (NOT "${define}" STREQUAL "DEFAULT")
             target_compile_definitions(${target_name} PRIVATE ${define})
           endif()

From cfc28c586ebcc840623d79560710a8485eea9ec2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Feb 2024 19:55:12 +0100
Subject: [PATCH 713/718] Exclude LAPACK testsuite and LAPACK-dependent
 benchmarks in no-LAPACK builds

---
 CMakeLists.txt | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30adf3a66..9fbe878e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -330,7 +330,7 @@ if (NOT NOFORTRAN)
   # Build test and ctest
   add_subdirectory(test)
  endif()
-  if (BUILD_TESTING)
+ if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
     add_subdirectory(lapack-netlib/TESTING)
   endif()
 endif()
@@ -472,6 +472,18 @@ if (BUILD_BENCHMARKS)
   list(REMOVE_ITEM SOURCES ${REMFILE})
   file(GLOB REMFILE "benchmark/geev.c")
   list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/gesv.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/getri.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/potrf.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/spmv.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/symv.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/linpack.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
   endif()
   if (NOT USE_GEMM3M)
   file(GLOB REMFILE "benchmark/gemm3m.c")

From d1409407a090a46512e1ecddc6633943ccef99be Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Feb 2024 21:05:59 +0100
Subject: [PATCH 714/718] Omit redundant prefixes or suffixes in library naming

---
 Makefile.system | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index cb5453cac..aadf3459a 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -1520,10 +1520,18 @@ ifndef LIBNAMEPREFIX
 LIBNAMEPREFIX =
 endif
 
+SYMPREFIX=$(SYMBOLPREFIX)
+ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
+SYMPREFIX=
+endif
+SYMSUFFIX=$(SYMBOLSUFFIX)
+ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
+SYMSUFFIX=
+endif
 ifndef LIBNAMESUFFIX
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
+LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
 else
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX)
+LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
 endif
 
 ifeq ($(OSNAME), CYGWIN_NT)

From c508a10cf2413fc5381376c4586303e0fa7ffbd0 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Mon, 26 Feb 2024 20:09:42 -0500
Subject: [PATCH 715/718] LoongArch64: Add cgemv LSX opt

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/cgemv_n_4_lsx.S       | 323 +++++++++++++++++++++++
 kernel/loongarch64/cgemv_t_4_lsx.S       | 290 ++++++++++++++++++++
 3 files changed, 616 insertions(+)
 create mode 100644 kernel/loongarch64/cgemv_n_4_lsx.S
 create mode 100644 kernel/loongarch64/cgemv_t_4_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index c7ef44035..befe68451 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -100,6 +100,9 @@ DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 
+CGEMVNKERNEL = cgemv_n_4_lsx.S
+CGEMVTKERNEL = cgemv_t_4_lsx.S
+
 CGEMMKERNEL  = cgemm_kernel_8x4_lsx.S
 CGEMMINCOPY  = cgemm_ncopy_8_lsx.S
 CGEMMITCOPY  = cgemm_tcopy_8_lsx.S
diff --git a/kernel/loongarch64/cgemv_n_4_lsx.S b/kernel/loongarch64/cgemv_n_4_lsx.S
new file mode 100644
index 000000000..cf8273797
--- /dev/null
+++ b/kernel/loongarch64/cgemv_n_4_lsx.S
@@ -0,0 +1,323 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M8      $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $vr1
+#define X0      $vr2
+#define X1      $vr3
+#define X2      $vr4
+#define X3      $vr5
+#define X4      $vr6
+#define X5      $vr7
+#define X6      $vr8
+#define X7      $vr9
+#define Y0      $vr10
+#define Y1      $vr11
+#define A0      $vr12
+#define A1      $vr13
+#define A2      $vr14
+#define A3      $vr15
+#define A4      $vr16
+#define A5      $vr17
+#define A6      $vr18
+#define A7      $vr19
+#define A8      $vr20
+#define A9      $vr21
+#define A10     $vr22
+#define A11     $vr23
+#define A12     $vr24
+#define A13     $vr25
+#define A14     $vr26
+#define A15     $vr27
+#define TMP0    $vr28
+#define TMP1    $vr29
+#define TMP2    $vr30
+
+#if !defined(CONJ)
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  0
+#else
+#define GXCONJ 1
+#define GCONJ  0
+#endif
+#else
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  1
+#else
+#define GXCONJ 1
+#define GCONJ  1
+#endif
+#endif
+
+.macro CLOAD_X_4
+    GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
+    GCOMPLEXMUL GXCONJ, \
+    vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_X_4_GAP
+    vldrepl.d   X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    vldrepl.d   X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    vldrepl.d   X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    vldrepl.d   X3,     T0,     0x00
+
+    GCOMPLEXMUL GXCONJ, \
+    vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_X_1
+    GLDREPL v, d, X0, X, 0x00
+    GCOMPLEXMUL GXCONJ, \
+    vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_Y_4
+    GLD v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro CLOAD_Y_4_GAP
+    fld.d   $f10,   Y,  0
+    fldx.d  $f13,   Y,  INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1
+    fld.d   $f11,   T0, 0
+    fldx.d  $f17,   T0, INC_Y
+    vpackev.d Y0, A1, Y0
+    vpackev.d Y1, A5, Y1
+.endm
+
+.macro CLOAD_Y_1
+    fld.d  $f10,   Y,  0
+.endm
+
+.macro CSTORE_Y_4
+    GST v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro CSTORE_Y_4_GAP
+    vstelm.d    Y0,     Y,      0,      0
+    PTR_ADD     T0,     Y,      INC_Y
+    vstelm.d    Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    vstelm.d    Y1,     T0,     0,      0
+    PTR_ADD     T0,     T0,     INC_Y
+    vstelm.d    Y1,     T0,     0,      1
+.endm
+
+.macro CSTORE_Y_1
+    fst.d   $f10,   Y,  0
+.endm
+
+.macro CGEMV_N_4x4
+    GLD_INC v, , 0x10,        \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, s, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, Y1, X2, A5,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, Y1, X3, A7,  Y1, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_1x4
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, s, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_1x1
+    fld.d   $f12,    PA0,    0
+    PTR_ADDI PA0,   PA0,    0x08
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
+    PTR_SRLI  J,      N,      2
+    beqz      J,      .L_\XW\()_N_3
+    PTR_SLLI  K_LDA,  LDA,    2
+    PTR_SUB   K_LDA,  K_LDA,  M8
+.L_\XW\()_N_L4:
+    CLOAD_\X_4
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       2
+    beqz      I,      .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4:
+    CLOAD_\Y_4
+    CGEMV_N_4x4
+    CSTORE_\Y_4
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+    PTR_ADDI    K,      K,      4
+    bnez        I,      .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x4
+    CSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2
+    bnez        J,      .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi        J,      N,      3
+    beqz        J,      .L_END
+.L_\XW\()_N_L1:
+    CLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x1
+    CSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI    J,      J,      -1
+    PTR_SUB     K_LDA,  LDA,    M8
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     X,      X,      INC_X
+    bnez        J,      .L_\XW\()_N_L1
+
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 7, 31
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    // Init VALPHA
+    vpackev.w      $vr0,   $vr1,   $vr0
+    vpackev.d      VALPHA, $vr0,   $vr0
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0 // Obtain the offset address
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 31
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/cgemv_t_4_lsx.S b/kernel/loongarch64/cgemv_t_4_lsx.S
new file mode 100644
index 000000000..ada349364
--- /dev/null
+++ b/kernel/loongarch64/cgemv_t_4_lsx.S
@@ -0,0 +1,290 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M8      $r30
+
+#define VALPHA  $vr0
+#define X0      $vr1
+#define X1      $vr2
+#define A0      $vr3
+#define A1      $vr4
+#define A2      $vr5
+#define A3      $vr6
+#define A4      $vr7
+#define A5      $vr8
+#define A6      $vr9
+#define A7      $vr10
+#define A8      $vr11
+#define A9      $vr12
+#define A10     $vr13
+#define A11     $vr14
+#define A12     $vr15
+#define A13     $vr16
+#define A14     $vr17
+#define A15     $vr18
+#define TP0     $vr19
+#define TP1     $vr20
+#define TP2     $vr21
+#define TP3     $vr22
+#define TP4     $vr23
+#define TP5     $vr24
+#define TP6     $vr25
+#define TP7     $vr26
+#define TMP0    $vr27
+#define TMP1    $vr28
+#define TMP2    $vr29
+#define Y0      $vr3
+#define Y1      $vr4
+#define Y2      $vr5
+#define Y3      $vr6
+#define Y4      $vr7
+#define Y5      $vr8
+#define Y6      $vr9
+#define Y7      $vr10
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+#define GXCONJ1 0
+#define GCONJ1  0
+#else
+#define GXCONJ1 1
+#define GCONJ1  0
+#endif
+
+#if !defined(XCONJ)
+#define GXCONJ2 0
+#define GCONJ2  0
+#else
+#define GXCONJ2 0
+#define GCONJ2  1
+#endif
+
+.macro ZERO_Y4
+    GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+
+.macro ZERO_Y1
+    GXOR v, v, TP0, TP0, TP0
+.endm
+
+.macro CLOAD_X4
+    GLD v, , X0, X, 0x00, X1, X, 0x10
+.endm
+
+.macro CLOAD_X4_GAP
+    fld.d       $f1,    X,    0x00
+    fldx.d      $f3,    X,    INC_X
+    PTR_ALSL    T0,     INC_X,      X,      1
+    fld.d       $f2,    T0,   0x00
+    fldx.d      $f4,    T0,   INC_X
+    vpackev.d   X0, A0, X0
+    vpackev.d   X1, A1, X1
+.endm
+
+.macro CGEMV_T_4x4
+    GLD_INC v, , 0x10,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, s, TP0, A0,  X0, TP0, TMP0, TMP1, TMP2, TP0, A1,  X1, TP0, TMP0, TMP1, TMP2, \
+    TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2, \
+    TP2, A4,  X0, TP2, TMP0, TMP1, TMP2, TP2, A5,  X1, TP2, TMP0, TMP1, TMP2, \
+    TP3, A6,  X0, TP3, TMP0, TMP1, TMP2, TP3, A7,  X1, TP3, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_T_LSX XW:req, X4:req
+    PTR_SRLI  J,      N,      2
+    beqz      J,      .L_\XW\()_N_3
+    PTR_SLLI  K_LDA,  LDA,    2
+    PTR_SUB   K_LDA,  K_LDA,  M8
+.L_\XW\()_N_L4:
+    ZERO_Y4
+    move      X,      X_ORG
+    PTR_SRLI  I,      M,       2
+    beqz      I,      .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4:
+    CLOAD_\X4
+    CGEMV_T_4x4
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  2
+    bnez        I,  .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    // Accumulated
+    GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.d   $f1,    X,      0x00
+    fld.d   $f11,   PA0,    0x00
+    fld.d   $f12,   PA1,    0x00
+    fld.d   $f13,   PA2,    0x00
+    fld.d   $f14,   PA3,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#else
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#endif
+
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, s, A0, A8,  X0, A0, TMP0, TMP1, TMP2, A1, A9,  X0, A1, TMP0, TMP1, TMP2, \
+    A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
+
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    vf, s, A8,  VALPHA, A0, A8,  TMP0, TMP1, TMP2, A9,  VALPHA, A1, A9,  TMP0, TMP1, TMP2,\
+    A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+    bnez        J,      .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi        J,      N,      3
+    beqz        J,      .L_END
+    PTR_SUB     K_LDA,  LDA,    M8
+.L_\XW\()_N_1:
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.d   $f3,    PA0,    0x00
+    fld.d   $f1,    X,      0x00
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x08
+    bnez      I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI  J,      J,      -1
+    fld.d     $f3,    Y,      0x00
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    fst.d     $f3,    Y,      0x00
+    PTR_ADD   PA0,    PA0,    K_LDA
+    PTR_ADD   Y,      Y,      INC_Y
+    bnez      J,      .L_\XW\()_N_1
+
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 8, 30
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    // Init VALPHA
+    vpackev.w      $vr0,   $vr1,   $vr0
+    vpackev.d      VALPHA, $vr0,   $vr0
+    move     X_ORG,  X
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    CGEMV_T_LSX GAP_0, X4
+.L_GAP_1: /* if (incx != 1) */
+    CGEMV_T_LSX GAP_1, X4_GAP
+.L_END:
+    pop_if_used 17 + 8, 30
+    jirl    $r0, $r1, 0x0
+    EPILOGUE

From 3f22fc22333ae7b739e0620d64710e3905ef1dc1 Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 27 Feb 2024 04:16:49 -0500
Subject: [PATCH 716/718] LoongArch64: Add zgemv LSX opt

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |   3 +
 kernel/loongarch64/loongarch64_asm.S     |   3 +
 kernel/loongarch64/zgemv_n_2_lsx.S       | 296 +++++++++++++++++++++++
 kernel/loongarch64/zgemv_t_2_lsx.S       | 268 ++++++++++++++++++++
 4 files changed, 570 insertions(+)
 create mode 100644 kernel/loongarch64/zgemv_n_2_lsx.S
 create mode 100644 kernel/loongarch64/zgemv_t_2_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index befe68451..15d4358be 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -118,6 +118,9 @@ CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 
+ZGEMVNKERNEL = zgemv_n_2_lsx.S
+ZGEMVTKERNEL = zgemv_t_2_lsx.S
+
 ZGEMMKERNEL  = zgemm_kernel_4x4_lsx.S
 ZGEMMONCOPY  = zgemm_ncopy_4_lsx.S
 ZGEMMOTCOPY  = zgemm_tcopy_4_lsx.S
diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S
index fee46d63e..d097b3045 100644
--- a/kernel/loongarch64/loongarch64_asm.S
+++ b/kernel/loongarch64/loongarch64_asm.S
@@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .ifeqs "\suf_op", "s"
     vpackod.d              \out,   \in,    \in
     \pre_op\()add.\suf_op  \out,   \out,   \in
+.else
+    vor.v \out, \in, \in
 .endif
 .endif
 
+
 .ifnb \more
     GCOMPLEXACC \pre_op, \suf_op, \more
 .endif
diff --git a/kernel/loongarch64/zgemv_n_2_lsx.S b/kernel/loongarch64/zgemv_n_2_lsx.S
new file mode 100644
index 000000000..efb376118
--- /dev/null
+++ b/kernel/loongarch64/zgemv_n_2_lsx.S
@@ -0,0 +1,296 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M16     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $vr1
+#define X0      $vr2
+#define X1      $vr3
+#define X2      $vr4
+#define X3      $vr5
+#define X4      $vr6
+#define X5      $vr7
+#define X6      $vr8
+#define X7      $vr9
+#define Y0      $vr10
+#define Y1      $vr11
+#define A0      $vr12
+#define A1      $vr13
+#define A2      $vr14
+#define A3      $vr15
+#define A4      $vr16
+#define A5      $vr17
+#define A6      $vr18
+#define A7      $vr19
+#define A8      $vr20
+#define A9      $vr21
+#define A10     $vr22
+#define A11     $vr23
+#define A12     $vr24
+#define A13     $vr25
+#define A14     $vr26
+#define A15     $vr27
+#define TMP0    $vr28
+#define TMP1    $vr29
+#define TMP2    $vr30
+
+#if !defined(CONJ)
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  0
+#else
+#define GXCONJ 1
+#define GCONJ  0
+#endif
+#else
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  1
+#else
+#define GXCONJ 1
+#define GCONJ  1
+#endif
+#endif
+
+.macro ZLOAD_X_2
+    GLD v, , X0, X, 0x00, X1, X, 0x10
+    GCOMPLEXMUL GXCONJ, \
+    vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2
+.endm
+
+.macro ZLOAD_X_2_GAP
+    vld         X0,     X,      0
+    PTR_ADD     T0,     X,      INC_X
+    vld         X1,     T0,     0
+
+    GCOMPLEXMUL GXCONJ, \
+    vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2
+.endm
+
+.macro ZLOAD_X_1
+    GLD v, , X0, X, 0x00
+    GCOMPLEXMUL GXCONJ, \
+    vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
+.endm
+
+.macro ZLOAD_Y_2
+    GLD v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro ZLOAD_Y_2_GAP
+    vld  $vr10, Y, 0
+    vldx $vr11, Y, INC_Y
+.endm
+
+.macro ZLOAD_Y_1
+    vld $vr10, Y, 0
+.endm
+
+.macro ZGEMV_N_2x2
+    GLD_INC v, , 0x10,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, d, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2
+.endm
+
+.macro ZGEMV_N_1x2
+    GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, d, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro ZGEMV_N_1x1
+    GLD_INC v, , 0x10, $vr12, PA0, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, d, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro ZSTORE_Y_2
+    GST v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro ZSTORE_Y_2_GAP
+    vst  Y0, Y, 0
+    vstx Y1, Y, INC_Y
+.endm
+
+.macro ZSTORE_Y_1
+    vst $vr10, Y, 0
+.endm
+
+.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req
+    PTR_SRLI  J,      N,      1
+    beqz      J,      .L_\XW\()_N_1
+    PTR_SLLI  K_LDA,  LDA,    1
+    PTR_SUB   K_LDA,  K_LDA,  M16
+.L_\XW\()_N_L2:
+    ZLOAD_\X_2
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       1
+    beqz      I,      .L_\XW\()_M_1
+.align 5
+.L_\XW\()_M_L2:
+    ZLOAD_\Y_2
+    ZGEMV_N_2x2
+    ZSTORE_\Y_2
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  1
+    PTR_ADDI    K,      K,      4
+    bnez        I,      .L_\XW\()_M_L2
+.L_\XW\()_M_1:
+    andi        I,      M,      1
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    ZLOAD_\Y_1
+    ZGEMV_N_1x2
+    ZSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  1
+    bnez        J,      .L_\XW\()_N_L2
+.L_\XW\()_N_1:
+    andi        J,      N,      1
+    beqz        J,      .L_END
+.L_\XW\()_N_L1:
+    ZLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    ZLOAD_\Y_1
+    ZGEMV_N_1x1
+    ZSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI    J,      J,      -1
+    PTR_SUB     K_LDA,  LDA,    M16
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     X,      X,      INC_X
+    bnez        J,      .L_\XW\()_N_L1
+
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 7, 31
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1
+    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
+    // Init VALPHA
+    vpackev.d      VALPHA,   $vr1,   $vr0
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA
+#else
+    GADD , d, PA1, PA0, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0 // Obtain the offset address
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 31
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/zgemv_t_2_lsx.S b/kernel/loongarch64/zgemv_t_2_lsx.S
new file mode 100644
index 000000000..2a0fc172e
--- /dev/null
+++ b/kernel/loongarch64/zgemv_t_2_lsx.S
@@ -0,0 +1,268 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M16     $r30
+
+#define VALPHA  $vr0
+#define X0      $vr1
+#define X1      $vr2
+#define A0      $vr3
+#define A1      $vr4
+#define A2      $vr5
+#define A3      $vr6
+#define A4      $vr7
+#define A5      $vr8
+#define A6      $vr9
+#define A7      $vr10
+#define A8      $vr11
+#define A9      $vr12
+#define A10     $vr13
+#define A11     $vr14
+#define A12     $vr15
+#define A13     $vr16
+#define A14     $vr17
+#define A15     $vr18
+#define TP0     $vr19
+#define TP1     $vr20
+#define TP2     $vr21
+#define TP3     $vr22
+#define TP4     $vr23
+#define TP5     $vr24
+#define TP6     $vr25
+#define TP7     $vr26
+#define TMP0    $vr27
+#define TMP1    $vr28
+#define TMP2    $vr29
+#define Y0      $vr3
+#define Y1      $vr4
+#define Y2      $vr5
+#define Y3      $vr6
+#define Y4      $vr7
+#define Y5      $vr8
+#define Y6      $vr9
+#define Y7      $vr10
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+#define GXCONJ1 0
+#define GCONJ1  0
+#else
+#define GXCONJ1 1
+#define GCONJ1  0
+#endif
+
+#if !defined(XCONJ)
+#define GXCONJ2 0
+#define GCONJ2  0
+#else
+#define GXCONJ2 0
+#define GCONJ2  1
+#endif
+
+.macro ZERO_Y2
+    GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1
+.endm
+
+.macro ZERO_Y1
+    GXOR v, v, TP0, TP0, TP0
+.endm
+
+.macro ZLOAD_X2
+    GLD v, , X0, X, 0x00, X1, X, 0x10
+.endm
+
+.macro ZLOAD_X2_GAP
+    vld        X0,     X,      0
+    vldx       X1,     X,      INC_X
+.endm
+
+.macro ZGEMV_T_2x2
+    GLD_INC v, , 0x10,        \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, d, TP0, A0,  X0, TP0, TMP0, TMP1, TMP2, TP0, A1,  X1, TP0, TMP0, TMP1, TMP2, \
+    TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2
+.endm
+
+.macro ZGEMV_T_LSX XW:req, X2:req
+    PTR_SRLI  J,      N,      1
+    beqz      J,      .L_\XW\()_N_1
+    PTR_SLLI  K_LDA,  LDA,    1
+    PTR_SUB   K_LDA,  K_LDA,  M16
+.L_\XW\()_N_L2:
+    ZERO_Y2
+    move      X,      X_ORG
+    PTR_SRLI  I,      M,       1
+    beqz      I,      .L_\XW\()_M_1
+.align 5
+.L_\XW\()_M_L2:
+    ZLOAD_\X2
+    ZGEMV_T_2x2
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  1
+    bnez        I,  .L_\XW\()_M_L2
+.L_\XW\()_M_1:
+    // Accumulated
+    GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1
+    andi        I,      M,      1
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10
+#else
+    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
+#endif
+
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, d, A0, A8,  X0, A0, TMP0, TMP1, TMP2, A1, A9,  X0, A1, TMP0, TMP1, TMP2
+
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    vld  A8, Y, 0x00
+    vldx A9, Y, INC_Y
+
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    vf, d, A8,  VALPHA, A0, A8,  TMP0, TMP1, TMP2, A9,  VALPHA, A1, A9,  TMP0, TMP1, TMP2
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    vst  $vr11, Y,   0x00
+    vstx $vr12, Y,   INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  1
+    bnez        J,      .L_\XW\()_N_L2
+.L_\XW\()_N_1:
+    andi        J,      N,      1
+    beqz        J,      .L_END
+    PTR_SUB     K_LDA,  LDA,    M16
+.L_\XW\()_N_L1:
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    GLD v, , A0, PA0, 0x00, X0, X, 0x00
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x10
+    bnez      I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI  J,      J,      -1
+    vld       A0,     Y,      0x00
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    vst       $vr3,   Y,      0x00
+    PTR_ADD   PA0,    PA0,    K_LDA
+    PTR_ADD   Y,      Y,      INC_Y
+    bnez      J,      .L_\XW\()_N_L1
+
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0
+    push_if_used 17 + 8, 30
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
+    // Init VALPHA
+    vpackev.d      VALPHA,   $vr1,   $vr0
+    move     X_ORG,  X
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA
+#else
+    GADD , d, PA1, PA0, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    ZGEMV_T_LSX GAP_0, X2
+.L_GAP_1: /* if (incx != 1) */
+    ZGEMV_T_LSX GAP_1, X2_GAP
+.L_END:
+    pop_if_used 17 + 8, 30
+    jirl    $r0, $r1, 0x0
+    EPILOGUE

From 8e05c053be8b889697b17f6847d94a8e4ce3c12b Mon Sep 17 00:00:00 2001
From: gxw <guxiwei-hf@loongson.cn>
Date: Tue, 27 Feb 2024 21:17:01 -0500
Subject: [PATCH 717/718] LoongArch64:Fixed the failed test cases
 test_{c/z}gemv_n in test_extensions

---
 kernel/loongarch64/cgemv_n_8_lasx.S | 34 ++++++++++++++---------------
 kernel/loongarch64/zgemv_n_4_lasx.S | 18 +++++++--------
 2 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S
index b078e3227..ba38a9573 100644
--- a/kernel/loongarch64/cgemv_n_8_lasx.S
+++ b/kernel/loongarch64/cgemv_n_8_lasx.S
@@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
                    X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
     GCOMPLEXMUL GXCONJ, \
-    xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-    X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-    X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-    X3, X3, VALPHA, TMP0, TMP1, TMP2, \
-    X4, X4, VALPHA, TMP0, TMP1, TMP2, \
-    X5, X5, VALPHA, TMP0, TMP1, TMP2, \
-    X6, X6, VALPHA, TMP0, TMP1, TMP2, \
-    X7, X7, VALPHA, TMP0, TMP1, TMP2
+    xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+    X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+    X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+    X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+    X7, VALPHA, X7, TMP0, TMP1, TMP2
 .endm
 
 .macro CLOAD_X_8_GAP
@@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d  X7,     T0,     0x00
 
     GCOMPLEXMUL GXCONJ, \
-    xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-    X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-    X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-    X3, X3, VALPHA, TMP0, TMP1, TMP2, \
-    X4, X4, VALPHA, TMP0, TMP1, TMP2, \
-    X5, X5, VALPHA, TMP0, TMP1, TMP2, \
-    X6, X6, VALPHA, TMP0, TMP1, TMP2, \
-    X7, X7, VALPHA, TMP0, TMP1, TMP2
+    xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+    X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+    X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+    X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+    X7, VALPHA, X7, TMP0, TMP1, TMP2
 .endm
 
 .macro CLOAD_Y_8
@@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro CLOAD_X_1
     GLDREPL xv, d, X0, X, 0x00
     GCOMPLEXMUL GXCONJ, \
-    xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
+    xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
 .endm
 
 .macro CLOAD_Y_1
diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S
index 98b1a6f7d..26edf1ed7 100644
--- a/kernel/loongarch64/zgemv_n_4_lasx.S
+++ b/kernel/loongarch64/zgemv_n_4_lasx.S
@@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30
     GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
     GCOMPLEXMUL GXCONJ, \
-    xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-    X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-    X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-    X3, X3, VALPHA, TMP0, TMP1, TMP2
+    xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2
 .endm
 
 .macro ZLOAD_X_4_GAP
@@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvpermi.q   X3,     X3,     0
 
     GCOMPLEXMUL GXCONJ, \
-    xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-    X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-    X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-    X3, X3, VALPHA, TMP0, TMP1, TMP2
+    xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2
 .endm
 
 .macro ZLOAD_Y_4
@@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GLD xv, , X0, X, 0x00
     GPERMI xv, q, X0, X0, 0
     GCOMPLEXMUL GXCONJ, \
-    xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2
+    xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
 .endm
 
 .macro ZGEMV_N_1x1

From b2db064285bfaa36ac44b3a61cb4930cf0076d8d Mon Sep 17 00:00:00 2001
From: pengxu <pengxu@loongson.cn>
Date: Tue, 27 Feb 2024 10:47:49 +0800
Subject: [PATCH 718/718] Optimized sgemv and dgemv kernel LSX for LoongArch

---
 kernel/loongarch64/KERNEL.LOONGSON2K1000 |  12 +
 kernel/loongarch64/dgemv_n_lsx.S         | 229 +++++++++++++++++++
 kernel/loongarch64/dgemv_t_lsx.S         | 279 +++++++++++++++++++++++
 kernel/loongarch64/sgemv_n_lsx.S         | 227 ++++++++++++++++++
 kernel/loongarch64/sgemv_t_lsx.S         | 275 ++++++++++++++++++++++
 5 files changed, 1022 insertions(+)
 create mode 100644 kernel/loongarch64/dgemv_n_lsx.S
 create mode 100644 kernel/loongarch64/dgemv_t_lsx.S
 create mode 100644 kernel/loongarch64/sgemv_n_lsx.S
 create mode 100644 kernel/loongarch64/sgemv_t_lsx.S

diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
index c7ef44035..5b54a2ada 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S
 CSUMKERNEL = csum_lsx.S
 ZSUMKERNEL = csum_lsx.S
 
+SGEMVNKERNEL = sgemv_n_lsx.S
+SGEMVTKERNEL = sgemv_t_lsx.S
+
+DGEMVNKERNEL = dgemv_n_lsx.S
+DGEMVTKERNEL = dgemv_t_lsx.S
+
 DGEMMKERNEL    = dgemm_kernel_8x4.S
 DGEMMINCOPY    = dgemm_ncopy_8_lsx.S
 DGEMMITCOPY    = dgemm_tcopy_8_lsx.S
@@ -100,6 +106,9 @@ DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 
+CGEMVNKERNEL = cgemv_n_4_lsx.S
+CGEMVTKERNEL = cgemv_t_4_lsx.S
+
 CGEMMKERNEL  = cgemm_kernel_8x4_lsx.S
 CGEMMINCOPY  = cgemm_ncopy_8_lsx.S
 CGEMMITCOPY  = cgemm_tcopy_8_lsx.S
@@ -115,6 +124,9 @@ CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 
+ZGEMVNKERNEL = zgemv_n_2_lsx.S
+ZGEMVTKERNEL = zgemv_t_2_lsx.S
+
 ZGEMMKERNEL  = zgemm_kernel_4x4_lsx.S
 ZGEMMONCOPY  = zgemm_ncopy_4_lsx.S
 ZGEMMOTCOPY  = zgemm_tcopy_4_lsx.S
diff --git a/kernel/loongarch64/dgemv_n_lsx.S b/kernel/loongarch64/dgemv_n_lsx.S
new file mode 100644
index 000000000..9a0141fb1
--- /dev/null
+++ b/kernel/loongarch64/dgemv_n_lsx.S
@@ -0,0 +1,229 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param */
+#define M      $r4
+#define N      $r5
+#define A      $r7
+#define LDA    $r8
+#define X      $r9
+#define INCX   $r10
+#define Y      $r11
+#define INCY   $r6
+#define BUFFER $r16
+#define ALPHA  $f0
+
+#define YORIG  $r18
+#define T0     $r19
+#define T1     $r20
+#define XX     $r12
+#define YY     $r13
+#define I      $r14
+#define J      $r15
+#define AO1    $r23
+#define AO2    $r24
+#define IX     $r25
+#define IY     $r26
+#define II     $r27
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11
+#define U1     $vr12
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define VALPHA $vr10
+
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+
+
+    PROLOGUE
+
+    LDARG  INCY,    $sp,  0
+    LDARG  BUFFER,  $sp,  8
+
+    addi.d  $sp, $sp, -80
+
+    SDARG  $r23,  $sp,    0
+    SDARG  $r24,  $sp,    8
+    SDARG  $r25,  $sp,    16
+    SDARG  $r26,  $sp,    32
+    SDARG  $r27,  $sp,    40
+    SDARG  $r28,  $sp,    48
+    SDARG  $r29,  $sp,    56
+    SDARG  $r30,  $sp,    64
+    ST     ALPHA, $sp,    72
+
+    vldrepl.d  VALPHA, $sp, 72
+
+    slli.d     LDA,     LDA,  BASE_SHIFT
+    slli.d     INCX,    INCX, BASE_SHIFT
+    slli.d     INCY,    INCY, BASE_SHIFT
+
+    bge    $r0,    M,    .L999
+    bge    $r0,    N,    .L999
+
+    move   J,      $r0
+    move   IX,     $r0
+
+    move   AO1,    A  //a_ptr
+    move   XX,     X
+    move   YY,     Y
+
+    beq    J,      M,     .L999
+
+.L01:
+    vldx        U0,    XX,    IX
+    vshuf4i.d   U0,    U0,    0x00
+
+    vfmul.d     U1,    VALPHA,  U0  //temp1
+
+    move        IY,    $r0
+    move        II,    $r0
+    move        I,     $r0
+
+    srai.d      T0,    M,     2  //n/4
+    beq         I,     T0,    .L03
+
+.L02:
+    vldx        U2,    AO1,    II
+    addi.d      II,    II,     16
+    vldx        U7,    AO1,    II
+
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+    add.d       T3,    T2,    INCY
+    add.d       T4,    T3,    INCY
+
+    fldx.d      a1,    YY,    T1
+    fldx.d      a2,    YY,    T2
+    fldx.d      a3,    YY,    T3
+    fldx.d      a4,    YY,    T4
+
+    vextrins.d  U3,    U4,    0x10
+    vextrins.d  U5,    U6,    0x10
+
+    vfmadd.d    U3,    U1,    U2,    U3
+    vfmadd.d    U5,    U1,    U7,    U5
+
+    vextrins.d  U4,    U3,    0x01
+    vextrins.d  U6,    U5,    0x01
+
+    fstx.d     a1,     YY,    T1
+    fstx.d     a2,     YY,    T2
+    fstx.d     a3,     YY,    T3
+    fstx.d     a4,     YY,    T4
+
+    add.d      IY,     T4,    INCY
+    addi.d     II,     II,    16
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L02
+
+.L03:
+    andi       T0,     M,     2
+    beq        $r0,    T0,    .L04
+
+    addi.d     T1,     $r0,   4
+    mod.d      T1,     M,     T1
+    sub.d      II,     M,     T1
+    slli.d     II,     II,    BASE_SHIFT
+
+    move       T1,    IY
+    add.d      T2,    T1,    INCY
+
+    vldx       U2,    AO1,    II
+
+    fldx.d     a1,    YY,    T1
+    fldx.d     a2,    YY,    T2
+
+    vextrins.d  U3,    U4,    0x10
+
+    vfmadd.d    U3,    U1,    U2,    U3
+
+    vextrins.d  U4,    U3,    0x01
+
+    fstx.d     a1,     YY,    T1
+    fstx.d     a2,     YY,    T2
+
+    add.d      IY,     T2,    INCY
+
+.L04:
+    andi       T0,     M,     1
+    beq        $r0,    T0,    .L05
+
+    addi.d     II,    M,     -1
+    slli.d     II,    II,    BASE_SHIFT
+
+    fldx.d     a1,    AO1,   II
+    fldx.d     a3,    YY,    IY
+
+    fmadd.d    a3,    $f12,  a1,    a3
+
+    fstx.d     a3,    YY,    IY
+
+    add.d      IY,    IY,    INCY
+
+.L05:
+    add.d      AO1,   AO1,   LDA
+    add.d      IX,    IX,    INCX
+
+    addi.d     J,     J,     1
+    blt        J,     N,    .L01
+
+.L999:
+    LDARG  $r23,  $sp,    0
+    LDARG  $r24,  $sp,    8
+    LDARG  $r25,  $sp,    16
+    LDARG  $r26,  $sp,    32
+    LDARG  $r27,  $sp,    40
+    LDARG  $r28,  $sp,    48
+    LDARG  $r29,  $sp,    56
+    LDARG  $r30,  $sp,    64
+    LD     ALPHA, $sp,    72
+    addi.d  $sp, $sp, 80
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_t_lsx.S b/kernel/loongarch64/dgemv_t_lsx.S
new file mode 100644
index 000000000..76f0d9bdc
--- /dev/null
+++ b/kernel/loongarch64/dgemv_t_lsx.S
@@ -0,0 +1,279 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param */
+#define M      $r4
+#define N      $r5
+#define A      $r7
+#define LDA    $r8
+#define X      $r9
+#define INCX   $r10
+#define Y      $r11
+#define INCY   $r6
+#define BUFFER $r16
+#define ALPHA  $f0
+
+#define YORIG  $r18
+#define T0     $r19
+#define T1     $r20
+#define AO3    $r12
+#define AO4    $r13
+#define I      $r14
+#define J      $r15
+#define AO1    $r23
+#define AO2    $r24
+#define IX     $r25
+#define IY     $r26
+#define II     $r27
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11
+#define U1     $vr12
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define VALPHA $vr10
+
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+
+
+    PROLOGUE
+
+    LDARG  INCY,    $sp,  0
+    LDARG  BUFFER,  $sp,  8
+
+    addi.d  $sp, $sp, -80
+
+    SDARG  $r23,  $sp,    0
+    SDARG  $r24,  $sp,    8
+    SDARG  $r25,  $sp,    16
+    SDARG  $r26,  $sp,    32
+    SDARG  $r27,  $sp,    40
+    SDARG  $r28,  $sp,    48
+    SDARG  $r29,  $sp,    56
+    SDARG  $r30,  $sp,    64
+    ST     ALPHA, $sp,    72
+
+    vldrepl.d  VALPHA, $sp, 72
+
+    slli.d     LDA,     LDA,  BASE_SHIFT
+    slli.d     INCX,    INCX, BASE_SHIFT
+    slli.d     INCY,    INCY, BASE_SHIFT
+
+    bge    $r0,    M,    .L999
+    bge    $r0,    N,    .L999
+
+    move   J,      $r0
+    move   IY,     $r0
+
+    move   AO1,    A  //a_ptr1
+
+    srai.d     T0,     N,     2  //n/4
+    beq        J,      T0,    .L04
+
+.L01: /* j<n/4 */
+    vxor.v    U0,     U0,   U0
+    vxor.v    U7,     U7,   U7
+    add.d    AO2,    AO1,  LDA
+    add.d    AO3,    AO2,  LDA
+    add.d    AO4,    AO3,  LDA
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,     $r0
+
+    beq       $r0,    M,    .L03
+
+.L02:  /* i<m */
+    vldx        U1,    X,    IX
+    fldx.d      $f2,   AO1,   II
+    fldx.d      $f3,   AO2,   II
+    fldx.d      $f4,   AO3,   II
+    fldx.d      $f5,   AO4,   II
+
+    vshuf4i.d   U1,    U1,    0x00
+    vextrins.d  U2,    U3,    0x10
+    vextrins.d  U4,    U5,    0x10
+
+    vfmadd.d    U0,    U2,    U1,    U0  //temp1,2
+    vfmadd.d    U7,    U4,    U1,    U7  //temp3,4
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    8
+    addi.d     I,      I,     1
+    blt        I,      M,    .L02
+
+.L03:
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+    add.d       T3,    T2,    INCY
+    add.d       T4,    T3,    INCY
+
+    fldx.d      $f3,    Y,    T1
+    fldx.d      $f4,    Y,    T2
+    fldx.d      $f5,    Y,    T3
+    fldx.d      $f6,    Y,    T4
+
+    vextrins.d  U3,    U4,    0x10
+    vextrins.d  U5,    U6,    0x10
+
+    vfmadd.d    U3,    VALPHA,    U0,    U3
+    vfmadd.d    U5,    VALPHA,    U7,    U5
+
+    vextrins.d  U4,    U3,    0x01
+    vextrins.d  U6,    U5,    0x01
+
+    fstx.d     $f3,     Y,    T1
+    fstx.d     $f4,     Y,    T2
+    fstx.d     $f5,     Y,    T3
+    fstx.d     $f6,     Y,    T4
+
+    slli.d     T1,     LDA,     2
+    add.d      AO1,    AO1,     T1
+    add.d      IY,     T4,      INCY
+
+    addi.d     J,      J,     1
+    blt        J,      T0,    .L01
+
+.L04: /* if(n&2) */
+    andi       T0,     N,     2
+    beq        $r0,    T0,    .L07
+
+    vxor.v    U0,     U0,   U0
+
+    add.d    AO2,    AO1,  LDA
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,     $r0
+
+    beq       $r0,    M,    .L06
+
+.L05:  /* i<m */
+    vldx        U1,    X,    IX
+    fldx.d      $f2,   AO1,   II
+    fldx.d      $f3,   AO2,   II
+
+    vshuf4i.d   U1,    U1,    0x00
+    vextrins.d  U2,    U3,    0x10
+
+    vfmadd.d    U0,    U2,    U1,    U0  //temp1,2
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    8
+    addi.d     I,      I,     1
+    blt        I,      M,    .L05
+
+.L06:
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+
+    fldx.d      a1,    Y,    T1
+    fldx.d      a2,    Y,    T2
+
+    vextrins.d  U3,    U4,    0x10
+
+    vfmadd.d    U3,    VALPHA,    U0,    U3
+
+    vextrins.d  U4,    U3,    0x01
+
+    fstx.d     a1,     Y,    T1
+    fstx.d     a2,     Y,    T2
+
+    slli.d     T0,     LDA,     1
+    add.d      AO1,    AO1,     T0
+    add.d      IY,     T2,      INCY
+
+.L07: /* if(n&1) */
+    andi       T0,     N,     1
+    beq        $r0,    T0,    .L999
+
+    MTC       a1,     $r0
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,      $r0
+
+    beq       $r0,    M,    .L09
+
+.L08:  /* i<m */
+    fldx.d      a3,    X,     IX
+    fldx.d      a4,    AO1,   II
+
+    fmadd.d     a1,    a4,    a3,    a1  //temp1
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    8
+    addi.d     I,      I,     1
+    blt        I,      M,    .L08
+
+.L09:
+    fldx.d      a3,    Y,    IY
+
+    fmadd.d    a3,    ALPHA,    a1,    a3
+
+    fstx.d     a3,     Y,    IY
+
+    add.d      AO1,    AO1,     LDA
+    add.d      IY,     IY,      INCY
+
+.L999:
+    LDARG  $r23,  $sp,    0
+    LDARG  $r24,  $sp,    8
+    LDARG  $r25,  $sp,    16
+    LDARG  $r26,  $sp,    32
+    LDARG  $r27,  $sp,    40
+    LDARG  $r28,  $sp,    48
+    LDARG  $r29,  $sp,    56
+    LDARG  $r30,  $sp,    64
+    LD     ALPHA, $sp,    72
+    addi.d  $sp, $sp, 80
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemv_n_lsx.S b/kernel/loongarch64/sgemv_n_lsx.S
new file mode 100644
index 000000000..3b1b519fc
--- /dev/null
+++ b/kernel/loongarch64/sgemv_n_lsx.S
@@ -0,0 +1,227 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param */
+#define M      $r4
+#define N      $r5
+#define A      $r7
+#define LDA    $r8
+#define X      $r9
+#define INCX   $r10
+#define Y      $r11
+#define INCY   $r6
+#define BUFFER $r16
+#define ALPHA  $f0
+
+#define YORIG  $r18
+#define T0     $r19
+#define T1     $r20
+#define XX     $r12
+#define YY     $r13
+#define I      $r14
+#define J      $r15
+#define AO1    $r23
+#define AO2    $r24
+#define IX     $r25
+#define IY     $r26
+#define II     $r27
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11
+#define U1     $vr12
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define VALPHA $vr10
+
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+
+
+    PROLOGUE
+
+    LDARG  INCY,    $sp,  0
+    LDARG  BUFFER,  $sp,  8
+
+    addi.d  $sp, $sp, -80
+
+    SDARG  $r23,  $sp,    0
+    SDARG  $r24,  $sp,    8
+    SDARG  $r25,  $sp,    16
+    SDARG  $r26,  $sp,    32
+    SDARG  $r27,  $sp,    40
+    SDARG  $r28,  $sp,    48
+    SDARG  $r29,  $sp,    56
+    SDARG  $r30,  $sp,    64
+    ST     ALPHA, $sp,    72
+
+    vldrepl.w  VALPHA, $sp, 72
+
+    slli.d     LDA,     LDA,  BASE_SHIFT
+    slli.d     INCX,    INCX, BASE_SHIFT
+    slli.d     INCY,    INCY, BASE_SHIFT
+
+    bge    $r0,    M,    .L999
+    bge    $r0,    N,    .L999
+
+    move   J,      $r0
+    move   IX,     $r0
+
+    move   AO1,    A  //a_ptr
+    move   XX,     X
+    move   YY,     Y
+
+    beq    J,      M,     .L999
+
+.L01:
+    vldx        U0,    XX,    IX
+    vpermi.w    U0,    U0,    0x00
+
+    vfmul.s     U1,    VALPHA,  U0  //temp1
+
+    move        IY,    $r0
+    move        II,    $r0
+    move        I,     $r0
+
+    srai.d      T0,    M,     2  //n/4
+    beq         I,     T0,    .L03
+
+.L02:
+    vldx        U2,    AO1,    II
+
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+    add.d       T3,    T2,    INCY
+    add.d       T4,    T3,    INCY
+
+    fldx.s      a1,    YY,    T1
+    fldx.s      a2,    YY,    T2
+    fldx.s      a3,    YY,    T3
+    fldx.s      a4,    YY,    T4
+
+    vextrins.w  U3,    U4,    0x10
+    vextrins.w  U3,    U5,    0x20
+    vextrins.w  U3,    U6,    0x30
+
+    vfmadd.s    U3,    U1,    U2,    U3
+
+    vextrins.w  U4,    U3,    0x01
+    vextrins.w  U5,    U3,    0x02
+    vextrins.w  U6,    U3,    0x03
+
+    fstx.s     a1,     YY,    T1
+    fstx.s     a2,     YY,    T2
+    fstx.s     a3,     YY,    T3
+    fstx.s     a4,     YY,    T4
+
+    add.d      IY,     T4,    INCY
+    addi.d     II,     II,    16
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L02
+
+.L03:
+    andi       T0,     M,     2
+    beq        $r0,    T0,    .L04
+
+    addi.d     T1,     $r0,   4
+    mod.d      T1,     M,     T1
+    sub.d      II,     M,     T1
+    slli.d     II,     II,    BASE_SHIFT
+
+    move       T1,    IY
+    add.d      T2,    T1,    INCY
+
+    fldx.s     a1,     AO1,   II
+    addi.d     T0,     II,    4
+    fldx.s     a2,     AO1,   T0
+
+    fldx.s     a3,    YY,    T1
+    fldx.s     a4,    YY,    T2
+
+    fmadd.s    a3,    $f12,  a1,    a3
+    fmadd.s    a4,    $f12,  a2,    a4
+
+    fstx.s     a3,     YY,    T1
+    fstx.s     a4,     YY,    T2
+
+    add.d      IY,     T2,    INCY
+
+.L04:
+    andi       T0,     M,     1
+    beq        $r0,    T0,    .L05
+
+    addi.d     II,    M,     -1
+    slli.d     II,    II,    BASE_SHIFT
+
+    fldx.s     a1,    AO1,   II
+    fldx.s     a3,    YY,    IY
+
+    fmadd.s    a3,    $f12,  a1,    a3
+
+    fstx.s     a3,    YY,    IY
+
+    add.d      IY,    IY,    INCY
+
+.L05:
+    add.d      AO1,   AO1,   LDA
+    add.d      IX,    IX,    INCX
+
+    addi.d     J,     J,     1
+    blt        J,     N,    .L01
+
+.L999:
+    LDARG  $r23,  $sp,    0
+    LDARG  $r24,  $sp,    8
+    LDARG  $r25,  $sp,    16
+    LDARG  $r26,  $sp,    32
+    LDARG  $r27,  $sp,    40
+    LDARG  $r28,  $sp,    48
+    LDARG  $r29,  $sp,    56
+    LDARG  $r30,  $sp,    64
+    LD     ALPHA, $sp,    72
+    addi.d  $sp, $sp, 80
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemv_t_lsx.S b/kernel/loongarch64/sgemv_t_lsx.S
new file mode 100644
index 000000000..b817f0bbf
--- /dev/null
+++ b/kernel/loongarch64/sgemv_t_lsx.S
@@ -0,0 +1,275 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param */
+#define M      $r4
+#define N      $r5
+#define A      $r7
+#define LDA    $r8
+#define X      $r9
+#define INCX   $r10
+#define Y      $r11
+#define INCY   $r6
+#define BUFFER $r16
+#define ALPHA  $f0
+
+#define YORIG  $r18
+#define T0     $r19
+#define T1     $r20
+#define AO3    $r12
+#define AO4    $r13
+#define I      $r14
+#define J      $r15
+#define AO1    $r23
+#define AO2    $r24
+#define IX     $r25
+#define IY     $r26
+#define II     $r27
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11
+#define U1     $vr12
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define VALPHA $vr10
+
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+
+
+    PROLOGUE
+
+    LDARG  INCY,    $sp,  0
+    LDARG  BUFFER,  $sp,  8
+
+    addi.d  $sp, $sp, -80
+
+    SDARG  $r23,  $sp,    0
+    SDARG  $r24,  $sp,    8
+    SDARG  $r25,  $sp,    16
+    SDARG  $r26,  $sp,    32
+    SDARG  $r27,  $sp,    40
+    SDARG  $r28,  $sp,    48
+    SDARG  $r29,  $sp,    56
+    SDARG  $r30,  $sp,    64
+    ST     ALPHA, $sp,    72
+
+    vldrepl.w  VALPHA, $sp, 72
+
+    slli.d     LDA,     LDA,  BASE_SHIFT
+    slli.d     INCX,    INCX, BASE_SHIFT
+    slli.d     INCY,    INCY, BASE_SHIFT
+
+    bge    $r0,    M,    .L999
+    bge    $r0,    N,    .L999
+
+    move   J,      $r0
+    move   IY,     $r0
+
+    move   AO1,    A  //a_ptr1
+
+    srai.d     T0,     N,     2  //n/4
+    beq        J,      T0,    .L04
+
+.L01: /* j<n/4 */
+    vxor.v    U0,     U0,   U0
+    add.d    AO2,    AO1,  LDA
+    add.d    AO3,    AO2,  LDA
+    add.d    AO4,    AO3,  LDA
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,     $r0
+
+    beq       $r0,    M,    .L03
+
+.L02:  /* i<m */
+    vldx        U1,    X,    IX
+    fldx.s      $f2,   AO1,   II
+    fldx.s      $f3,   AO2,   II
+    fldx.s      $f4,   AO3,   II
+    fldx.s      $f5,   AO4,   II
+
+    vpermi.w    U1,    U1,    0x00
+    vextrins.w  U2,    U3,    0x10
+    vextrins.w  U2,    U4,    0x20
+    vextrins.w  U2,    U5,    0x30
+
+    vfmadd.s    U0,    U2,    U1,    U0  //temp1,2,3,4
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    4
+    addi.d     I,      I,     1
+    blt        I,      M,    .L02
+
+.L03:
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+    add.d       T3,    T2,    INCY
+    add.d       T4,    T3,    INCY
+
+    fldx.s      a1,    Y,    T1
+    fldx.s      a2,    Y,    T2
+    fldx.s      a3,    Y,    T3
+    fldx.s      a4,    Y,    T4
+
+    vextrins.w  U3,    U4,    0x10
+    vextrins.w  U3,    U5,    0x20
+    vextrins.w  U3,    U6,    0x30
+
+    vfmadd.s    U3,    VALPHA,    U0,    U3
+
+    vextrins.w  U4,    U3,    0x01
+    vextrins.w  U5,    U3,    0x02
+    vextrins.w  U6,    U3,    0x03
+
+    fstx.s     a1,     Y,    T1
+    fstx.s     a2,     Y,    T2
+    fstx.s     a3,     Y,    T3
+    fstx.s     a4,     Y,    T4
+
+    slli.d     T1,     LDA,     2
+    add.d      AO1,    AO1,     T1
+    add.d      IY,     T4,      INCY
+
+    addi.d     J,      J,     1
+    blt        J,      T0,    .L01
+
+.L04: /* if(n&2) */
+    andi       T0,     N,     2
+    beq        $r0,    T0,    .L07
+
+    MTC       a1,     $r0
+    MTC       a2,     $r0
+
+    add.d    AO2,    AO1,  LDA
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,     $r0
+
+    beq       $r0,    M,    .L06
+
+.L05:  /* i<m */
+    fldx.s      a3,    X,     IX
+    fldx.s      a4,    AO1,   II
+    fldx.s      a5,    AO2,   II
+
+    fmadd.s     a1,    a4,    a3,    a1  //temp1
+    fmadd.s     a2,    a5,    a3,    a2  //temp2
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    4
+    addi.d     I,      I,     1
+    blt        I,      M,    .L05
+
+.L06:
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+
+    fldx.s      a3,    Y,    T1
+    fldx.s      a4,    Y,    T2
+
+    fmadd.s    a3,    ALPHA,    a1,    a3
+    fmadd.s    a4,    ALPHA,    a2,    a4
+
+    fstx.s     a3,     Y,    T1
+    fstx.s     a4,     Y,    T2
+
+    slli.d     T0,     LDA,     1
+    add.d      AO1,    AO1,     T0
+    add.d      IY,     T2,      INCY
+
+.L07: /* if(n&1) */
+    andi       T0,     N,     1
+    beq        $r0,    T0,    .L999
+
+    MTC       a1,     $r0
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,      $r0
+
+    beq       $r0,    M,    .L09
+
+.L08:  /* i<m */
+    fldx.s      a3,    X,     IX
+    fldx.s      a4,    AO1,   II
+
+    fmadd.s     a1,    a4,    a3,    a1  //temp1
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    4
+    addi.d     I,      I,     1
+    blt        I,      M,    .L08
+
+.L09:
+    fldx.s      a3,    Y,    IY
+
+    fmadd.s    a3,    ALPHA,    a1,    a3
+
+    fstx.s     a3,     Y,    IY
+
+    add.d      AO1,    AO1,     LDA
+    add.d      IY,     IY,      INCY
+
+.L999:
+    LDARG  $r23,  $sp,    0
+    LDARG  $r24,  $sp,    8
+    LDARG  $r25,  $sp,    16
+    LDARG  $r26,  $sp,    32
+    LDARG  $r27,  $sp,    40
+    LDARG  $r28,  $sp,    48
+    LDARG  $r29,  $sp,    56
+    LDARG  $r30,  $sp,    64
+    LD     ALPHA, $sp,    72
+    addi.d  $sp, $sp, 80
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE